#if defined(USE_CUDA)
#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include <arith.h>
#include <codegen.h>
#include <disjoint_set.h>
#include <executor.h>
#include <executor_launch_params.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <fusion_segmenter.h>
#include <grouped_reduction.h>
#include <inlining.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
#include <ir_graphviz.h>
#include <ir_iostream.h>
#include <ir_utils.h>
#include <iter_visitor.h>
#include <kernel_cache.h>
#include <kernel_expr_evaluator.h>
#include <kernel_ir.h>
#include <kernel_ir_dispatch.h>
#include <lower2device.h>
#include <lower_magic_zero.h>
#include <mutator.h>
#include <ops/all_ops.h>
#include <register_interface.h>
#include <root_domain_map.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/reduction_utils.h>
#include <scheduler/utils.h>
#include <test/test_gpu_validator.h>
#include <test/test_utils.h>
#include <transform_replay.h>
#include <transform_rfactor.h>

#include <test/cpp/jit/test_utils.h>
#include <torch/csrc/jit/api/function_impl.h>
#include <parser.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/torch.h>

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAStream.h>

#include <algorithm>
#include <iostream>
#include <sstream>
#include <thread>

// Tests go in torch::jit
namespace torch {
namespace jit {

using namespace torch::jit::fuser::cuda;
using namespace at::indexing;

TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
  fusion.addInput(tv0);
  fusion.addOutput(tv1);
  // tv1[I0, R1] = tv0[I0, I1]

  // Interface should just be a direct split with a Parallel type. We can
  // include the parallelize call if we do this.
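  // Splitting by NamedScalar::getParallelDim(ParallelType::TIDx) makes the
  // inner extent equal to blockDim.x, so the launch-time thread count
  // determines the split factor.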
  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
  // tv1[I0, R1o, R1i{TIDx}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({2});
  tv2->setMemoryType(MemoryType::Global);
  // tv2[I0, R1oo, Ir1i{TIDx}] = tv0[I0, I1]
  // tv1[I0, R1i{TIDx}] = tv2[I0, R1oo, Ir1i{TIDx}]

  tv0->computeAt(tv1, 1);

  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->axis(0)->parallelize(ParallelType::BIDx);

  constexpr int numel_x = 65000, numel_y = 1024;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({numel_x, numel_y}, options);

  // How many threads to use for the block reduction
  constexpr int runtime_threadIdx_dim = 128;

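  // LaunchParams binds launch dimensions up front; the fourth argument fixes
  // the TIDx (blockDim.x) extent to 128, and -1 leaves a dimension to be
  // inferred at launch time.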
  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {input}, lparams);
  auto cg_outputs = fe.runFusion({input}, lparams);

  auto aten_output = input.to(at::kDouble).sum({1});
  testValidate(
      &fusion,
      cg_outputs,
      {input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  TensorView* tv2 = makeSymbolicTensor(2);
  TensorView* tv3 = makeSymbolicTensor(2);
  TensorView* tv4 = sub(tv2, tv3);
  TensorView* tv5 = add(tv1, tv4);
  TensorView* tv6 = sub(tv5, tv0);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  fusion.addInput(tv3);
  fusion.addOutput(tv6);
  // t6 = ((t1 + (t2 - t3)) - t0)

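  // No scheduling is applied; keeping every intermediate in global memory
  // exercises the default path, where each expression is materialized to its
  // own gmem buffer.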
  tv4->setMemoryType(MemoryType::Global);
  tv5->setMemoryType(MemoryType::Global);
  tv6->setMemoryType(MemoryType::Global);

  constexpr int M = 32, N = 810;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t1 = at::randn({M, N}, options);
  at::Tensor t2 = at::randn({M, N}, options);
  at::Tensor t3 = at::randn({M, N}, options);

  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;

  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t1, t2, t3});
  auto cg_outputs = fe.runFusion({t0, t1, t2, t3});

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionConstCheck_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto one = IrBuilder::create<Int>(1);
  TORCH_CHECK(one->isConstScalar());

  auto one_x2 = mul(one, one);
  TORCH_CHECK(one_x2->isConstScalar());

  auto one_x3 = mul(one_x2, one);
  TORCH_CHECK(one_x3->isConstScalar());

  auto one_x4 = mul(one_x3, one);
  TORCH_CHECK(one_x4->isConstScalar());
}
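
// A minimal complementary sketch (an addition, not part of the original
// suite): an Int created without a value is a symbolic scalar, as used by the
// runtime-tiling tests further below, and should not be reported as constant.
TEST_F(NVFuserTest, FusionConstCheckSymbolic_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto symbolic = IrBuilder::create<Int>();
  TORCH_CHECK(!symbolic->isConstScalar());

  // Combining a constant with a symbolic value should also be non-constant
  auto prod = mul(symbolic, IrBuilder::create<Int>(1));
  TORCH_CHECK(!prod->isConstScalar());
}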

TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) {
  const std::vector<int64_t> tensor_dims_in = {128, 128};
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(0));
  TensorView* tv2 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
  fusion.addOutput(tv2);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn(tensor_dims_in, options);
  at::Tensor cg_output = at::empty({tensor_dims_in[0]}, options);

  // Schedule
  tv2->split(1, 32);
  tv2->split(1, 4); // unroll

  auto tv2_rf = tv2->rFactor({-3, -2});

  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  tv2_rf->axis(0)->parallelize(ParallelType::BIDx);
  tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
  tv2_rf->axis(-2)->parallelize(ParallelType::Unroll);

  tv1->computeAt(tv2_rf, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {input});
  auto cg_outputs = fe.runFusion({input});

  auto aten_output = (input + 0).to(at::kDouble).sum(1);

  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
}

// Test isZeroInt
TEST_F(NVFuserTest, FusionIsZeroInt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Int* x = IrBuilder::create<Int>(0);
  Int* y = IrBuilder::create<Int>(1);
  Val* z = mul(x, y);
  TORCH_CHECK(x->isZeroInt());
  TORCH_CHECK(!y->isZeroInt());
  TORCH_CHECK(!z->isZeroInt());
}

// Test isOneInt
TEST_F(NVFuserTest, FusionIsOneInt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Int* x = IrBuilder::create<Int>(1);
  Int* y = IrBuilder::create<Int>(1);
  Val* z = mul(x, y);
  TORCH_CHECK(x->isOneInt());
  TORCH_CHECK(y->isOneInt());
  TORCH_CHECK(!z->isOneInt());
}

// This is to verify no cycle of computeAt is created. A more complex
// variation of this pattern appears in one of the Python tests
// (test_random_topo).
TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  // Common intermediate tensor
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  // tv1 -> tv2
  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
  // tv1 -> tv3 -> tv4
  auto tv3 = add(tv1, IrBuilder::create<Double>(3));
  auto tv4 = add(tv3, IrBuilder::create<Double>(4));

  // NOTE: This should no longer occur as of PR #201.
  // The order of adding outputs matters. If tv3 is added before tv4,
  // it should be fine. However, if tv4 is added before tv3, there
  // will be a cycle of tv3->tv4 and tv4->tv3. tv3->tv4 is created
  // first, and then tv4->tv3 is created at the final phase of
  // computeAt (ComputeAt::setupOutputs).
  fusion.addOutput(tv2);
  fusion.addOutput(tv4);
  fusion.addOutput(tv3);

  tv0->computeAt(tv2, -1);

  TORCH_CHECK(tv3->hasComputeAt());
  TORCH_CHECK(!tv4->hasComputeAt());

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(100, options);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = t1 + 3;
  auto t4 = t3 + 4;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  auto cg_outputs = fe.runFusion({aten_input});

  std::vector<at::Tensor> aten_outputs = {t2, t4, t3};
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4));

  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  fusion.addOutput(tv4);

  tv1->computeAt(tv3, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 10}, options);

  auto t1 = aten_input + 1;
  auto t2 = aten_input + 2;
  auto t3 = t1 + 3;
  auto t4 = t1 + 4;

  std::vector<at::Tensor> aten_outputs = {t2, t3, t4};

  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));

  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));

  TensorView* tv5 = add(tv1, tv3);

  fusion.addOutput(tv2);
  fusion.addOutput(tv4);
  fusion.addOutput(tv5);

  tv1->computeAt(tv5, -1);
  tv3->computeAt(tv5, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 10}, options);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto t5 = t1 + t3;

  std::vector<at::Tensor> aten_outputs = {t2, t4, t5};

  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) {
  for (const auto i : c10::irange(2)) {
    Fusion fusion;
    FusionGuard fg(&fusion);

    TensorView* tv0 = makeSymbolicTensor(1);
    fusion.addInput(tv0);

    TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));

    TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
    TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));

    TensorView* tv5 = add(tv1, tv3);

    fusion.addOutput(tv2);
    fusion.addOutput(tv4);
    fusion.addOutput(tv5);

    const int tile = 32;

    tv1->split(-1, tile);
    tv2->split(-1, tile);
    tv3->split(-1, tile);
    tv4->split(-1, tile);
    tv5->split(-1, tile);

    auto compute_at_outer = tv1;
    auto compute_at_inner = tv3;
    if (i == 1) {
      std::swap(compute_at_inner, compute_at_outer);
    }

    compute_at_outer->computeAt(tv5, -2);
    compute_at_inner->computeAt(tv5, -1);

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor aten_input = at::randn({100}, options);
    auto t1 = aten_input + 1;
    auto t2 = t1 + 2;
    auto t3 = aten_input + 3;
    auto t4 = t3 + 4;
    auto t5 = t1 + t3;

    std::vector<at::Tensor> aten_outputs = {t2, t4, t5};

    std::vector<at::Tensor> cg_outputs = {
        at::empty_like(aten_input, options),
        at::empty_like(aten_input, options),
        at::empty_like(aten_input, options)};

    FusionExecutor fe;
    fe.compileFusion(&fusion, {aten_input});
    fe.runFusion({aten_input}, cg_outputs);

    testValidate(
        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
  }
}

TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // First tree
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);

  // Second tree
  TensorView* tv4 = makeSymbolicTensor(1);
  fusion.addInput(tv4);
  TensorView* tv5 = add(tv4, IrBuilder::create<Double>(5));
  TensorView* tv6 = add(tv5, IrBuilder::create<Double>(6));
  TensorView* tv7 = add(tv5, IrBuilder::create<Double>(7));
  fusion.addOutput(tv6);
  fusion.addOutput(tv7);

  tv1->computeAt(tv2, -1);
  tv5->computeAt(tv6, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  at::Tensor t4 = at::rand_like(t0, options);

  auto t1 = t0 + 1;
  auto t2 = t1 + 2;
  auto t3 = t1 + 3;
  auto t5 = t4 + 5;
  auto t6 = t5 + 6;
  auto t7 = t5 + 7;

  std::vector<at::Tensor> aten_outputs = {t2, t3, t6, t7};
  std::vector<IValue> aten_inputs = {t0, t4};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(t0, options),
      at::empty_like(t0, options),
      at::empty_like(t0, options),
      at::empty_like(t0, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  fe.runFusion(aten_inputs, cg_outputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
  TensorView* tv5 = add(tv2, tv4);

  fusion.addOutput(tv1);
  fusion.addOutput(tv3);
  fusion.addOutput(tv5);

  tv2->computeAt(tv5, -1);
  tv4->computeAt(tv5, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto t5 = t2 + t4;

  std::vector<at::Tensor> aten_outputs = {t1, t3, t5};

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv1, tv2);
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));

  fusion.addOutput(tv4);

  tv1->split(0, 32);
  tv2->split(0, 32);
  tv3->split(0, 32);
  tv4->split(0, 32);

  tv3->computeAt(tv4, -2);
  tv1->computeAt(tv3, -1);
  tv2->computeAt(tv3, -2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);

  auto t1 = aten_input + 1;
  auto t2 = aten_input + 2;
  auto t3 = t1 + t2;
  auto aten_output = t3 + 4;

  at::Tensor cg_output = at::empty_like(aten_input, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, {cg_output});

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
  TensorView* tv5 = add(tv2, tv4);

  fusion.addOutput(tv5);

  TensorView* tvs[] = {tv1, tv2, tv3, tv4, tv5};
  for (auto tv : tvs) {
    tv->split(0, 2);
    tv->split(0, 4);
    tv->split(0, 8);
  }

  // computeAt into inner loop nests
  tv1->computeAt(tv2, -1);
  tv3->computeAt(tv4, -2);

  tv2->computeAt(tv5, -4);
  tv4->computeAt(tv5, -3);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto aten_output = t2 + t4;

  at::Tensor cg_output = at::empty_like(aten_input, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, {cg_output});

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

// Test predication of grid reduction
TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) {
  const int gdimx = 4;
  const int bdimx = 128;

  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
  TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1);
  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(2));

  fusion.addOutput(tv3);
  fusion.addOutput(tv2);

  tv1->split(1, bdimx);
  tv1->split(1, gdimx);
  tv3->split(1, bdimx);
  tv3->split(1, gdimx);

  TensorView* tv1_rf = tv1->rFactor({1});

  tv1->computeAt(tv2, -1);

  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1_rf->axis(0)->parallelize(ParallelType::BIDy);
  tv2->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(-2)->parallelize(ParallelType::BIDx);
  tv1_rf->axis(-2)->parallelize(ParallelType::BIDx);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);

  tv3->axis(3)->parallelize(ParallelType::TIDx);
  tv3->axis(2)->parallelize(ParallelType::BIDx);
  tv3->axis(0)->parallelize(ParallelType::BIDy);

  int numel_x = 100;
  int numel_y = 1000;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);

  auto t2 = -aten_input.to(at::kDouble).sum({1});
  auto t3 = aten_input + 2.0;

  std::vector<at::Tensor> aten_outputs = {t3, t2};

  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options), at::empty({numel_x}, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionLSTMCell_CUDA) {
  const int hidden_features = 512;
  const int batch_size = 64;

  Fusion fusion;
  FusionGuard fg(&fusion);

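  // The 16 tensor inputs are the four addends of each of the four gate
  // pre-activations. Standard LSTM cell math:
  //   i = sigmoid(.), f = sigmoid(.), g = tanh(.), o = sigmoid(.)
  //   cy = f * cx + i * g
  //   hy = o * tanh(cy)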
  TensorView* tvs[16];
  for (const auto i : c10::irange(16)) {
    tvs[i] = makeSymbolicTensor(2);
    fusion.addInput(tvs[i]);
  }

  auto ingate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]));

  auto forgetgate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]));

  auto cellgate = unaryOp(
      UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]));

  auto outgate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]));

  auto cx = makeContigTensor(2);
  fusion.addInput(cx);

  auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate));

  auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy));

  fusion.addOutput(cy);
  fusion.addOutput(hy);

  std::vector<c10::IValue> aten_inputs;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor large_tensor0 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor1 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor2 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor3 =
      at::randn({batch_size, hidden_features * 4}, options);

  auto chunked0 = large_tensor0.chunk(4, 1);
  auto chunked1 = large_tensor1.chunk(4, 1);
  auto chunked2 = large_tensor2.chunk(4, 1);
  auto chunked3 = large_tensor3.chunk(4, 1);

  aten_inputs.insert(aten_inputs.end(), chunked0.begin(), chunked0.end());
  aten_inputs.insert(aten_inputs.end(), chunked1.begin(), chunked1.end());
  aten_inputs.insert(aten_inputs.end(), chunked2.begin(), chunked2.end());
  aten_inputs.insert(aten_inputs.end(), chunked3.begin(), chunked3.end());

  auto at_ingate =
      chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid();
  auto at_forgetgate =
      chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid();
  auto at_cellgate =
      chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh();
  auto at_outgate =
      chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid();

  auto at_cx = at::randn({batch_size, hidden_features}, options);
  aten_inputs.push_back(at_cx);
  auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate));
  auto at_hy = at_outgate.mul(at_cy.tanh());

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionReductionHalf_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
  auto tv3 = sum(tv2, {2});
  auto tv4 = castOp(DataType::Half, tv3);

  fusion.addOutput(tv4);

  const auto options =
      at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({8, 8, 16}, options);

  auto reduction_tv = tv3;

  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, *reduction_params);

  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");

  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  auto aten_output = aten_input.add(1.0).to(at::kDouble).sum({2});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionReduceSingle_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({100, 1});
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100, 1}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input});

  auto aten_output = aten_input.to(at::kDouble).sum({1});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);

  TensorView* tv1 = reductionOp(
      BinaryOpType::Add, {red_dim, 2}, IrBuilder::create<Double>(0), tv0);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, *reduction_params);
  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);

  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);

  TensorView* tv2 = reductionOp(
      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv1);
  fusion.addOutput(tv2);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");

  scheduleReduction(&fusion, *reduction_params);
  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({1, 2});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);

  TensorView* tv1 = reductionOp(
      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);

  TensorView* tv2 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
  fusion.addOutput(tv2);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, *reduction_params);
  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({2, 1});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

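// A "trivial" reduction reduces over an axis of extent 1 (e.g. a broadcast
// dimension), so lowering can replace the ReductionOp with a plain set (copy)
// op instead of an actual reduction.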
TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({10, 20, 1});
  fusion.addInput(tv0);
  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);
  fusion.addOutput(tv1);

  TORCH_CHECK(
      ir_utils::getReductionOps(&fusion, true /* ignore_trivial */).empty(),
      "Trivial reduction picked up by fusion");

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 20, 1}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  auto cg_outputs = fe.runFusion({aten_input});
  auto aten_output = aten_input.to(at::kDouble).sum({2});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTrivialReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int w = 1, x = 1, y = 7, z = 8;

  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeConcreteTensor({w, x, y, z});
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = sum(tv1, {0});
  auto tv3 = sum(tv2, {0});
  auto tv4 = add(tv3, tv0);

  fusion.addOutput(tv4);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, z}, options);
  at::Tensor t1 = at::randn({w, x, y, z}, options);
  auto aten_output = t1.to(at::kDouble).sum({0}).sum({0}).add(t0);

  std::vector<IValue> aten_inputs = {t0, t1};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTrivialReduction3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int v = 1, w = 1, x = 1, y = 7, z = 8;

  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeConcreteTensor({v, w, x, y, z});
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = sum(tv1, {0, 1, 2});
  auto tv3 = add(tv2, tv0);

  fusion.addOutput(tv3);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, z}, options);
  at::Tensor t1 = at::randn({v, w, x, y, z}, options);
  auto aten_output = t1.sum({0, 1, 2}).add(t0);

  std::vector<IValue> aten_inputs = {t0, t1};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

// Make sure trivial reductions are correctly detected even with
// scheduling applied.
TEST_F(NVFuserTest, FusionDetectTrivialReduction1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = broadcast(tv0, {false, true});
  auto tv2 = sum(tv1, {1});
  fusion.addOutput(tv2);

  tv2->split(1, 4);
  tv2->split(1, 8);
  auto tv3 = tv2->rFactor({-1});
  auto tv4 = tv2->rFactor({-1});

  auto tv5 = broadcast(tv0, {true, false});
  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
  auto tv7 = sub(tv6, IrBuilder::create<Double>(1));
  auto tv8 = sum(tv7, {0});
  fusion.addOutput(tv8);

  auto tv9 = broadcast(tv0, {false, true, true});
  auto tv10 = sum(tv9, {1});
  auto tv11 = sum(tv10, {1});
  fusion.addOutput(tv11);

  tv8->split(0, 3);
  tv10->split(1, 4);
  tv11->split(1, 5);

  tv0->computeAt(tv2, -1);
  tv0->computeAt(tv8, -1);
  tv0->computeAt(tv11, 1);

  // Test indexing to gmem-backed tensors
  tv3->setMemoryType(MemoryType::Global);
  tv8->setMemoryType(MemoryType::Global);

  GpuLower gpulw(&fusion);

  // No ReductionOp should be generated as all the reduction
  // exprs should be replaced with a unary set op.
  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
    TORCH_CHECK(!expr->isA<ReductionOp>());
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {t0, t0, t0}, __LINE__, __FILE__);
}

// Test detection of partially trivial reduction
TEST_F(NVFuserTest, FusionDetectTrivialReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  fusion.addOutput(tv2);

  tv1->split(1, 1);
  // tv1->axis(1): non-trivial
  // tv1->axis(2): trivial

  auto tv3 = tv1->rFactor({-1});

  // Just to suppress register-allocation warning
  tv0->computeAt(tv2, 1);
  tv3->computeAt(tv1, -1);

  GpuLower gpulw(&fusion);

  // tv3's reduction axis is a trivial reduction. The only
  // ReductionOp should be for tv1.
  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
    if (expr->isA<ReductionOp>()) {
      auto reduction_out =
          expr->as<ReductionOp>()->outputs()[0]->as<TensorView>();
      TORCH_CHECK(reduction_out->name() == 1);
    }
  }
}

TEST_F(NVFuserTest, FusionInputsIdLookup_CUDA) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({16, 8, 8}, options);
  at::Tensor t1 = at::randn({8, 8}, options);
  at::Tensor t2 = at::randn({6, 4}, options);

  // create a cache with max size 2;
  torch::jit::fuser::cuda::InputsIdLookup inputs_id_lookup(2);

  // basic check: inputs that differ only in scalar value share an encoding
  auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0});
  auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5});
  TORCH_CHECK(id_0.id == id_0_lookup.id);
  TORCH_CHECK(inputs_id_lookup.size() == 1);
  TORCH_CHECK(id_0.eviction == false);

  // new input: same shapes, but a different signature because the scalar
  // input is missing
  auto id_1 = inputs_id_lookup.lookupId({t0, t1});
  auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1});
  TORCH_CHECK(id_1.id == id_1_lookup.id);
  TORCH_CHECK(inputs_id_lookup.size() == 2);
  TORCH_CHECK(id_1.eviction == false);

  // eviction should happen at this point
  auto id_2 = inputs_id_lookup.lookupId({t2, t1});
  TORCH_CHECK(id_2.id != id_0.id);
  TORCH_CHECK(id_2.id != id_1.id);
  TORCH_CHECK(inputs_id_lookup.size() == 2);
  TORCH_CHECK(id_2.eviction == true);
  TORCH_CHECK(id_2.evict_id == id_0.id);

  // look at input 1 again
  auto id_1_relook = inputs_id_lookup.lookupId({t0, t1});
  TORCH_CHECK(id_1_relook.id == id_1.id);
  TORCH_CHECK(id_1_relook.eviction == false);
}

TEST_F(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({64, 8, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // pass with identical shape
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(complyWith(t0, tensor_type));

  // pass with dynamic shape
  auto t1 = at::randn({16, 16, 8}, options);
  TORCH_CHECK(complyWith(t1, tensor_type));

  // broadcasting semantic change failure
  auto t2 = at::randn({16, 1, 8}, options);
  TORCH_CHECK(!complyWith(t2, tensor_type));

  // contiguity failure via slicing
  auto t3 = t0.slice(1, 0, 8, 2);
  TORCH_CHECK(!complyWith(t3, tensor_type));

  // contiguity failure via slicing
  auto t4 = t0.slice(2, 0, 8, 2);
  TORCH_CHECK(!complyWith(t4, tensor_type));

  // rank failure
  auto t5 = at::randn({16, 8, 8, 8}, options);
  TORCH_CHECK(!complyWith(t5, tensor_type));

  // contiguity on stride 1 dimension with implicit broadcasting
  auto t = at::randn({4}, options);
  auto t6 = t.unsqueeze(1).expand({4, 8});
  TORCH_CHECK(complyWith(t6, TensorType::create(t6)));
}

TEST_F(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 1, 8});
  std::vector<int64_t> strides_vec({8, 8, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // broadcasting semantic change
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(!complyWith(t0, tensor_type));

  // dtype failure
  auto t1 = at::randn({16, 1, 8}, options.dtype(at::kHalf));
  TORCH_CHECK(!complyWith(t1, tensor_type));

  // matching dtype passes
  auto t2 = at::randn({16, 1, 8}, options);
  TORCH_CHECK(complyWith(t2, tensor_type));

  // device inconsistency shouldn't fail
  auto t3 = at::randn({16, 1, 8}, options.device(at::kCPU, 0));
  TORCH_CHECK(complyWith(t3, tensor_type));
}

TEST_F(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({64, 1, 8});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // failing permutation
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(!complyWith(t0, tensor_type));

  // passing with dynamic shape
  auto t1 = t0.permute({0, 2, 1});
  TORCH_CHECK(complyWith(t1, tensor_type));
}

TEST_F(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({128, 16, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // contiguity check passes even though the strides differ
  auto t0 = at::randn({16, 16, 8}, options);
  TORCH_CHECK(complyWith(t0, tensor_type));

  // passing with dynamic shape
  auto t1 = t0.slice(1, 0, 16, 2);
  TORCH_CHECK(complyWith(t1, tensor_type));
}

TEST_F(NVFuserTest, FusionDisjointSet_CUDA) {
  DisjointSets<int> set;

  const std::set<int> group_x({0, 1, 2});
  const std::set<int> group_y({3, 4, 5});
  const std::set<int> group_z({6, 7, 8});
  const std::vector<std::set<int>> groups({group_x, group_y, group_z});
  std::set<int> group_all;
  std::for_each(groups.begin(), groups.end(), [&](const auto& g) {
    group_all.insert(g.begin(), g.end());
  });

  // Initially, nothing should be considered equivalent
  for (auto i : group_all) {
    for (auto j : group_all) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }

  // Declare the values in group_x equivalent
  for (auto i : group_x) {
    for (auto j : group_x) {
      set.mapEntries(i, j);
      TORCH_CHECK(set.mappingExists(i));
      TORCH_CHECK(set.mappingExists(j));
    }
  }

  // All values in group_x should be equivalent with each other
  for (auto i : group_x) {
    for (auto j : group_x) {
      TORCH_CHECK(set.permissiveAreMapped(i, j));
    }
  }
  // But nothing else should be equivalent
  for (auto i : group_all) {
    for (auto j : group_y) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
    for (auto j : group_z) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }

  // Declare the values in group_y equivalent
  for (auto i : group_y) {
    for (auto j : group_y) {
      set.mapEntries(i, j);
      TORCH_CHECK(set.mappingExists(i));
      TORCH_CHECK(set.mappingExists(j));
    }
  }

  // group_x should still be equivalent
  for (auto i : group_x) {
    for (auto j : group_x) {
      TORCH_CHECK(set.permissiveAreMapped(i, j));
    }
  }
  // group_y should now be equivalent
  for (auto i : group_y) {
    for (auto j : group_y) {
      TORCH_CHECK(set.permissiveAreMapped(i, j));
    }
  }
  // But group_z should not be equivalent with anything yet
  for (auto i : group_all) {
    for (auto j : group_z) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }

  // Declare the values in group_z equivalent
  for (auto i : group_z) {
    for (auto j : group_z) {
      set.mapEntries(i, j);
      TORCH_CHECK(set.mappingExists(i));
      TORCH_CHECK(set.mappingExists(j));
    }
  }

  // Now each of the three groups should be equivalent within each
  // group
  for (const auto gi : c10::irange(groups.size())) {
    for (const auto gj : c10::irange(groups.size())) {
      for (auto i : groups[gi]) {
        for (auto j : groups[gj]) {
          TORCH_CHECK(
              (gi == gj && set.permissiveAreMapped(i, j)) ||
              (gi != gj && !set.permissiveAreMapped(i, j)));
        }
      }
    }
  }

  std::vector<int> all_elements = set.getAllElements().vector();
  std::sort(all_elements.begin(), all_elements.end());
  std::vector<int> group_all_vec(group_all.begin(), group_all.end());
  std::sort(group_all_vec.begin(), group_all_vec.end());
  TORCH_CHECK(all_elements == group_all_vec);

  set.clear();
  TORCH_CHECK(set.getAllElements().vector().size() == 0);

  // All cleared. Nothing should be considered equivalent.
  for (auto i : group_all) {
    for (auto j : group_all) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }
}
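
// A minimal follow-up sketch (an addition, not part of the original suite):
// equivalence is transitive, so chaining pairwise mapEntries calls joins a
// whole group even when some pairs are never mapped directly.
TEST_F(NVFuserTest, FusionDisjointSetTransitive_CUDA) {
  DisjointSets<int> set;
  set.mapEntries(0, 1);
  set.mapEntries(1, 2);
  // 0 and 2 were never mapped directly, but share a set through 1
  TORCH_CHECK(set.permissiveAreMapped(0, 2));
  TORCH_CHECK(!set.permissiveAreMapped(0, 3));
}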

TEST_F(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  auto tv1 = makeSymbolicTensor(2);
  auto tv2 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);

  auto tv3 = broadcast(tv0, {true, false});
  auto tv4 = add(tv3, tv1);
  auto tv5 = add(tv3, tv2);

  fusion.addOutput(tv4);
  fusion.addOutput(tv5);

  // In order to do this, tv1->axis(1) and tv2->axis(1) must have the
  // same size, but we can't prove it, so this should throw an error.
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(tv3->computeAt(tv4, -1));
}

TEST_F(NVFuserTest, FusionBiasGeluFwd_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

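  // Constants of the tanh GELU approximation,
  //   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
  // with k_079 ~= sqrt(2/pi) and k_004 the cubic coefficient.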
  const float k_079 = 0.79788456;
  const float k_004 = 0.044715;

  // bias vector
  auto t0 = makeSymbolicTensor(1, DataType::Half);
  fusion.addInput(t0);
  auto t1 = castOp(DataType::Float, t0);
  // input tensor
  auto t2 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t2);
  auto t3 = castOp(DataType::Float, t2);
  auto t4 = broadcast(t1, {true, true, false});
  auto t5 = add(t4, t3);
  auto t6 = mul(t5, IrBuilder::create<Double>(0.5));
  auto t7 = mul(t5, IrBuilder::create<Double>(k_079));
  auto t8 = mul(t5, IrBuilder::create<Double>(k_004));
  auto t9 = mul(t8, t5);
  auto t10 = add(t9, IrBuilder::create<Int>(1));
  auto t11 = mul(t7, t10);
  auto t12 = unaryOp(UnaryOpType::Tanh, t11);
  auto t13 = add(t12, IrBuilder::create<Double>(1));
  auto t14 = mul(t6, t13);
  auto t15 = castOp(DataType::Half, t14);
  fusion.addOutput(t15);

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::manual_seed(0);
  std::vector<int64_t> input_shape{6, 512, 4096};
  std::vector<int64_t> bias_shape{4096};

  auto at_input = at::randn(input_shape, options);
  auto at_bias = at::randn(bias_shape, options);

  auto at_x =
      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
  auto aten_output_float =
      at_x * 0.5 * (1.0 + (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh());
  auto aten_output = aten_output_float.to(c10::ScalarType::Half);

  std::vector<IValue> aten_inputs = {at_bias, at_input};
  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionBiasGeluBwd_CUDA) {
  if (at::cuda::getDeviceProperties(0)->major < 6) {
    return;
  }
  Fusion fusion;
  FusionGuard fg(&fusion);

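  // Same constants as the forward pass; k_010 = 3 * k_079 * k_004 arises from
  // differentiating the tanh argument k_079 * (x + k_004 * x^3).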
  const float k_079 = 0.79788456;
  const float k_004 = 0.044715;
  const float k_010 = 0.1070322243;

  // gradient tensor
  auto t0 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t0);
  auto t1 = castOp(DataType::Float, t0);
  // bias tensor
  auto t2 = makeSymbolicTensor(1, DataType::Half);
  fusion.addInput(t2);
  auto t3 = castOp(DataType::Float, t2);
  // input tensor
  auto t4 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t4);
  auto t5 = castOp(DataType::Float, t4);
  auto t6 = broadcast(t3, {true, true, false});
  auto t7 = add(t6, t5);
  auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
  auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
  auto t10 = mul(t9, t7);
  auto t11 = add(t10, IrBuilder::create<Int>(1));
  auto t12 = mul(t8, t11);
  auto t13 = unaryOp(UnaryOpType::Tanh, t12);
  auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
  auto t15 = mul(t13, t13);
  auto t16 = unaryOp(UnaryOpType::Neg, t15);
  auto t17 = add(t16, IrBuilder::create<Int>(1));
  auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
  auto t19 = mul(t18, t7);
  auto t20 = add(t19, IrBuilder::create<Double>(k_079));
  auto t21 = mul(t17, t20);
  auto t22 = mul(t14, t21);
  auto t23 = add(t13, IrBuilder::create<Int>(1));
  auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
  auto t25 = add(t22, t24);
  auto t26 = mul(t25, t1);
  // Save float output for validation
  fusion.addOutput(t26);
  auto t27 = castOp(DataType::Half, t26);
  fusion.addOutput(t27);

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::manual_seed(1);
  std::vector<int64_t> input_shape{6, 512, 4096};
  std::vector<int64_t> bias_shape{4096};
  auto at_input = at::randn(input_shape, options);
  auto at_bias = at::randn(bias_shape, options);
  auto at_grad = at::randn(input_shape, options);

  auto at_x =
      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
  auto at_tanh_out = (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh();
  auto at_ff = 0.5 * at_x *
          ((1 - at_tanh_out * at_tanh_out) * (k_079 + k_010 * at_x * at_x)) +
      0.5 * (1 + at_tanh_out);
  auto at_out = at_ff * at_grad;
  auto at_out_half = at_out.to(c10::ScalarType::Half);

  std::vector<IValue> aten_inputs = {at_grad, at_bias, at_input};
  std::vector<at::Tensor> aten_outputs = {at_out, at_out_half};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

// Reproducer of issue #459
TEST_F(NVFuserTest, FusionIssue459_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);

  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
  auto tv3 = broadcast(tv2, {true, false});
  auto tv4 = add(tv1, tv3);

  // Create two outputs from the final arithmetic result
  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
  fusion.addOutput(tv5);
  auto tv6 = add(tv4, IrBuilder::create<Double>(1));
  fusion.addOutput(tv6);

  // Scheduling
  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
    output->merge(-2, -1);
  }
  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
    output->split(0, 128);
  }

  tv0->computeAt(tv5, -1);

  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(1)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  const int numel_x = 10;
  const int numel_y = 20;
  auto t0 = at::randn({numel_x}, options);
  auto t1 = at::randn({numel_y, numel_x}, options);
  auto aten_output = (t0 + 1).unsqueeze(0) + t1 + 1;

  std::vector<IValue> aten_inputs = {t0, t1};

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion,
      cg_outputs,
      aten_inputs,
      {aten_output, aten_output},
      __LINE__,
      __FILE__);
}

TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
  fusion.addOutput(tv3);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);

  tv0->computeAt(tv3, -1);

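  // Place the two intermediates in different memory spaces so both the
  // shared- and global-memory indexing paths are exercised.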
  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Global);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  auto aten_input = at::randn({12, 34}, options);
  at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Symbolic integers we will use for runtime tiling
  Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
  Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
  Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
  // Compile-time integer for tiling
  int n_smem_tile = 32;

  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);

  // Broadcast tv0 to [M, K, *]
  TensorView* tv2 = broadcast(tv0, {false, false, true});
  // Broadcast tv1 to [*, K, N]
  TensorView* tv3 = broadcast(tv1, {true, false, false});

  // Pointwise multiplication resulting in tv4[M, K, N]
  TensorView* tv4 = mul(tv2, tv3);

  // Sum the K-dim
  TensorView* tv5 = sum(tv4, {1});

  // Register inputs and outputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv5);

  // Register runtime tile dims as inputs
  fusion.addInput(symbolic_m_tile_dim);
  fusion.addInput(symbolic_split_k_tile_dim);
  fusion.addInput(symbolic_block_k_tile_dim);

  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
  // dims are inserted
  // [M, rK, N]
  tv5->split(2, n_smem_tile);
  // [M, rK, No, Ni{32}]
  tv5->split(1, symbolic_block_k_tile_dim);
  // [M, rKo, rKi{i2}, No, Ni{32}]
  tv5->split(1, symbolic_split_k_tile_dim);
  // [M, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
  tv5->split(0, symbolic_m_tile_dim);
  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]

  // Reorder so all outer tiles are in the leftmost 3 positions
  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
  // [Mo, No, rKoo, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
  tv5->reorder({{1, 5}, {5, 1}});

  // Factor out the outer reduction IterDomain, then perform the inter-CTA
  // reduction followed by the intra-CTA reduction
  // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}]
  // [Mo, No, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
  auto tv6 = tv5->rFactor({2});

  // Scope computations
  tv6->computeAt(tv5, 2);

  // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}]
  // [Mo, No, Ki{i2}, Mi{i0}, Ni{32}, rKoo, Koi{i1}]
  tv6->reorder({
      {5, -2},
      {6, -1},
      {2, 2},
      {3, 3},
      {4, 4},
  });

  // Setup compute at schedule
  tv0->computeAt(tv6, 3);
  tv1->computeAt(tv6, 3);
  tv4->computeAt(tv6, -1);

  // Cache smem tiles
  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);
  tv6->setMemoryType(MemoryType::Shared);

  tv5->axis(0)->parallelize(ParallelType::BIDz);
  tv5->axis(1)->parallelize(ParallelType::BIDy);

  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
  for (auto tv : tv_list) {
    tv->axis(-2)->parallelize(ParallelType::TIDz);
    tv->axis(-1)->parallelize(ParallelType::TIDy);
  }

  constexpr int M = 31, K = 65, N = 32;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);

  at::Tensor aten_output =
      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);

  // A, B, m_tile_dim, split_k, intra_cta_tile
  std::vector<IValue> aten_inputs = {t0, t1, 3, 4, 5};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

// Reproducer of issue 408
TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = sum(tv1, {1});
  fusion.addOutput(tv2);

  tv2->split(0, 4);

1775 auto tv3 = tv2->cacheBefore();
1776
1777 tv0->computeAt(tv3, -1);
1778 tv3->computeAt(tv2, -1);
1779
1780 tv3->axis(-1)->parallelize(ParallelType::TIDx);
1781
1782 const int numel_x = 100;
1783 const int numel_y = 200;
1784 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1785
1786 at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
1787 at::Tensor cg_output = at::empty({numel_x}, options);
1788
1789 auto aten_output = (aten_input + 1).to(at::kDouble).sum({1});
1790
1791 FusionExecutor fe;
1792 fe.compileFusion(&fusion, {aten_input});
1793 fe.runFusion({aten_input}, {cg_output});
1794
1795 testValidate(
1796 &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
1797}
1798
1799TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) {
1800 Fusion fusion;
1801 FusionGuard fg(&fusion);
1802
1803 auto tv0 = makeSymbolicTensor(3);
1804 fusion.addInput(tv0);
1805 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1806 auto tv2 = sum(tv1, {1});
1807 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
1808 fusion.addOutput(tv2);
1809 fusion.addOutput(tv3);
1810
1811 auto tv4 = tv2->cacheBefore();
1812
1813 tv4->computeAt(tv3, 1);
1814 tv0->computeAt(tv4, -1);
1815
1816 tv3->axis(0)->parallelize(ParallelType::BIDx);
1817 tv1->axis(-1)->parallelize(ParallelType::TIDx);
1818 tv2->axis(-1)->parallelize(ParallelType::TIDx);
1819 tv3->axis(-1)->parallelize(ParallelType::TIDx);
1820 tv4->axis(-1)->parallelize(ParallelType::TIDx);
1821
1822 const int numel_x = 10;
1823 const int numel_y = 20;
1824 const int numel_z = 30;
1825 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1826
1827 at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options);
1828 auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
1829 auto t3 = t2 + 1;
1830 std::vector<at::Tensor> aten_outputs = {t2, t3};
1831
1832 FusionExecutor fe;
1833 fe.compileFusion(&fusion, {aten_input});
1834 auto cg_outputs = fe.runFusion({aten_input});
1835
1836 testValidate(
1837 &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
1838}
1839
1840TEST_F(NVFuserTest, FusionIssue367_CUDA) {
1841 Fusion fusion;
1842 FusionGuard fg(&fusion);
1843
1844 // Symbolic integers we will use for runtime tiling
1845 Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
1846 Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
1847 Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
1848 // Compile-time integer for tiling
1849 int n_smem_tile = 32;
1850
1851 // Symbolic 2D tensors TV0[M, K], TV1[K, N]
1852 TensorView* tv0 = makeSymbolicTensor(2);
1853 TensorView* tv1 = makeSymbolicTensor(2);
1854
1855 // Broadcast tv0 to [M, K, *]
1856 TensorView* tv2 = broadcast(tv0, {false, false, true});
1857 // Broadcast tv1 to [*, K, N]
1858 TensorView* tv3 = broadcast(tv1, {true, false, false});
1859
  // Pointwise multiplication resulting in tv4[M, K, N]
1861 TensorView* tv4 = mul(tv2, tv3);
1862
1863 // Sum the K-dim
1864 TensorView* tv5 = sum(tv4, {1});
1865
1866 // Register inputs and outputs
1867 fusion.addInput(tv0);
1868 fusion.addInput(tv1);
1869 fusion.addOutput(tv5);
1870
1871 // Register runtime tile dims as inputs
1872 fusion.addInput(symbolic_m_tile_dim);
1873 fusion.addInput(symbolic_split_k_tile_dim);
1874 fusion.addInput(symbolic_block_k_tile_dim);
1875
1876 // Make a 3D tile, mix of symbolic and constant, do in reverse order because
1877 // dims are inserted
  // [M, rK, N]
1879 tv5->split(2, n_smem_tile);
1880 tv5->split(1, symbolic_block_k_tile_dim);
1881 tv5->split(1, symbolic_split_k_tile_dim);
1882 tv5->split(0, symbolic_m_tile_dim);
1883 // [Mo, Mi, Koo, Koi, Ki, No, Ni]
1884 tv5->reorder({{1, 5}, {5, 1}});
1885 // [Mo, No, Koo, Koi, Ki, Mi, Ni]
1886
1887 auto tv6 = tv5->rFactor({2});
1888 auto tv7 = tv5->rFactor({2});
1889 // [Mo, No, rKoo, Koi, Ki, Mi, Ni]
1890 // [Mo, No, rKoi, rKi, Mi, Ni]
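  // Calling rFactor twice builds a three-stage reduction cascade: tv6
  // reduces rKoo into a first partial result, tv7 reduces rKoi into a
  // second, and tv5 performs the final rKi reduction. Roughly:
  //   tv6 = partial_reduce(tv4, Koo)
  //   tv7 = partial_reduce(tv6, Koi)
  //   tv5 = reduce(tv7, Ki)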
1891
1892 // Scope computations
1893 tv6->computeAt(tv5, 2);
1894
1895 tv0->computeAt(tv6, 3);
1896 tv1->computeAt(tv6, 3);
1897 tv4->computeAt(tv6, -1);
1898
1899 // Cache smem tiles
1900 tv2->setMemoryType(MemoryType::Shared);
1901 tv3->setMemoryType(MemoryType::Shared);
1902 tv4->setMemoryType(MemoryType::Local);
1903 tv6->setMemoryType(MemoryType::Local);
1904 tv7->setMemoryType(MemoryType::Local);
1905
1906 tv5->axis(0)->parallelize(ParallelType::BIDz);
1907 tv5->axis(1)->parallelize(ParallelType::BIDy);
1908
1909 std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6, tv7};
1910 for (auto tv : tv_list) {
1911 tv->axis(-2)->parallelize(ParallelType::TIDz);
1912 tv->axis(-1)->parallelize(ParallelType::TIDy);
1913 }
1914 tv2->axis(3)->parallelize(ParallelType::TIDx);
1915 tv3->axis(3)->parallelize(ParallelType::TIDx);
1916 tv4->axis(3)->parallelize(ParallelType::TIDx);
1917 tv6->axis(3)->parallelize(ParallelType::TIDx);
1918 tv7->axis(2)->parallelize(ParallelType::TIDx);
1919
1920 tv2->axis(4)->parallelize(ParallelType::BIDx);
1921 tv3->axis(4)->parallelize(ParallelType::BIDx);
1922 tv4->axis(4)->parallelize(ParallelType::BIDx);
1923 tv6->axis(4)->parallelize(ParallelType::BIDx);
1924 tv7->axis(3)->parallelize(ParallelType::BIDx);
1925 tv5->axis(2)->parallelize(ParallelType::BIDx);
1926
1927 constexpr int M = 3, K = 6, N = 16;
1928
1929 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1930
1931 at::Tensor t0 = at::randn({M, K}, options);
1932 at::Tensor t1 = at::randn({K, N}, options);
1933
1934 // A, B, m, split_k, block_k
1935 std::vector<IValue> aten_inputs = {t0, t1, 2, 2, 3};
1936 at::Tensor aten_output =
1937 mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
1938
1939 torch::jit::fuser::cuda::FusionExecutor fe;
1940 fe.compileFusion(&fusion, aten_inputs);
1941 auto cg_outputs = fe.runFusion(aten_inputs);
1942
1943 testValidate(
1944 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
1945}
1946
1947TEST_F(NVFuserTest, FusionIssue468_CUDA) {
1948 Fusion fusion;
1949 FusionGuard fg(&fusion);
1950
1951 auto tv0 = makeSymbolicTensor(2);
1952 fusion.addInput(tv0);
1953 auto tv1 = sum(tv0, {1});
1954 auto tv2 = sum(tv1, {0});
1955 fusion.addOutput(tv2);
1956
1957 tv1->axis(0)->parallelize(ParallelType::TIDy);
1958 tv1->axis(1)->parallelize(ParallelType::TIDx);
1959
1960 tv2->axis(0)->parallelize(ParallelType::TIDy);
1961
1962 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1963 at::Tensor aten_input = at::randn({10, 100}, options);
1964 at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0});
1965
1966 FusionExecutor fe;
1967 fe.compileFusion(&fusion, {aten_input});
1968 auto cg_outputs = fe.runFusion({aten_input});
1969
1970 testValidate(
1971 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
1972}
1973
1974TEST_F(NVFuserTest, FusionIssue363_CUDA) {
1975 Fusion fusion;
1976 FusionGuard fg(&fusion);
1977
1978 // Symbolic 2D tensors TV0[M, K], TV1[K, N]
1979 TensorView* tv0 = makeSymbolicTensor(2);
1980 TensorView* tv1 = makeSymbolicTensor(2);
1981
1982 // Broadcast tv0 to [M, K, *]
1983 TensorView* tv2 = broadcast(tv0, {false, false, true});
1984 // Broadcast tv1 to [*, K, N]
1985 TensorView* tv3 = broadcast(tv1, {true, false, false});
1986
  // Pointwise multiplication resulting in tv4[M, K, N]
1988 TensorView* tv4 = mul(tv2, tv3);
1989
1990 // Sum the K-dim
1991 TensorView* tv5 = sum(tv4, {1});
1992
1993 // Register inputs and outputs
1994 fusion.addInput(tv0);
1995 fusion.addInput(tv1);
1996 fusion.addOutput(tv5);
1997
1998 tv2->setMemoryType(MemoryType::Global);
1999 tv3->setMemoryType(MemoryType::Global);
2000 tv4->setMemoryType(MemoryType::Global);
2001
2002 tv0->computeAt(tv5, -1);
2003 tv1->computeAt(tv5, -1);
2004
2005 tv5->axis(0)->parallelize(ParallelType::BIDz);
2006 tv5->axis(1)->parallelize(ParallelType::BIDy);
2007
2008 tv5->axis(2)->parallelize(ParallelType::BIDx);
2009
2010 constexpr int M = 3, K = 6, N = 16;
2011
2012 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2013
2014 at::Tensor t0 = at::randn({M, K}, options);
2015 at::Tensor t1 = at::randn({K, N}, options);
2016 at::Tensor aten_output =
2017 mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
2018
2019 std::vector<IValue> aten_inputs = {t0, t1};
2020
2021 torch::jit::fuser::cuda::FusionExecutor fe;
2022 fe.compileFusion(&fusion, aten_inputs);
2023 auto cg_outputs = fe.runFusion(aten_inputs);
2024
2025 testValidate(
2026 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2027}
2028
2029TEST_F(NVFuserTest, FusionIssue484_CUDA) {
2030 Fusion fusion;
2031 FusionGuard fg(&fusion);
2032
2033 auto tv0 = makeSymbolicTensor(2);
2034 fusion.addInput(tv0);
2035 auto tv1 = sum(tv0, {1});
2036 auto tv2 = add(tv1, IrBuilder::create<Double>(0));
2037 fusion.addOutput(tv2);
2038
2039 tv1->setMemoryType(MemoryType::Global);
2040 tv1->axis(1)->parallelize(ParallelType::TIDx);
2041
2042 constexpr int M = 100;
2043
2044 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2045
2046 at::Tensor aten_input = at::randn({M, M}, options);
2047 at::Tensor aten_output = aten_input.to(at::kDouble).sum({1});
2048
2049 torch::jit::fuser::cuda::FusionExecutor fe;
2050 fe.compileFusion(&fusion, {aten_input});
2051 auto cg_outputs = fe.runFusion({aten_input});
2052
2053 testValidate(
2054 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
2055}
2056
2057TEST_F(NVFuserTest, FusionIssue329_CUDA) {
2058 Fusion fusion;
2059 FusionGuard fg(&fusion);
2060
2061 auto tv0 = makeSymbolicTensor(2);
2062 fusion.addInput(tv0);
2063 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
2064 auto tv2 = sum(tv1, {1});
2065 fusion.addOutput(tv2);
2066 auto tv3 = sum(tv1, {1});
2067 fusion.addOutput(tv3);
2068
2069 tv1->computeAt(tv2, -1);
2070
2071 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2072
2073 std::vector<int64_t> t0_shape{17, 19};
2074 auto aten_input = at::randn(t0_shape, options);
2075 auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
2076 auto t3 = (aten_input + 1).to(at::kDouble).sum({1});
2077 std::vector<at::Tensor> aten_outputs = {t2, t3};
2078
2079 FusionExecutor fe;
2080 fe.compileFusion(&fusion, {aten_input});
2081 auto cg_outputs = fe.runFusion({aten_input});
2082
2083 testValidate(
2084 &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
2085}
2086
2087TEST_F(NVFuserTest, FusionIssue382_CUDA) {
2088 Fusion fusion;
2089 FusionGuard fg(&fusion);
2090
2091 auto tv0 = makeSymbolicTensor(2);
2092 fusion.addInput(tv0);
2093
2094 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
2095 auto tv2 = broadcast(tv1, {false, false, true});
2096 auto tv3 = makeSymbolicTensor(3);
2097 fusion.addInput(tv3);
2098 auto tv4 = add(tv2, tv3);
2099 fusion.addOutput(tv4);
2100
2101 tv2->merge(1);
2102 tv4->merge(1);
2103
2104 tv1->computeAt(tv4, 1);
2105
2106 tv4->axis(0)->parallelize(ParallelType::BIDx);
2107
2108 tv1->setMemoryType(MemoryType::Global);
2109 tv2->setMemoryType(MemoryType::Global);
2110
2111 const int numel_x = 12;
2112 const int numel_y = 34;
2113 const int numel_z = 56;
2114
2115 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2116 at::manual_seed(0);
2117 auto t0 = at::randn({numel_x, numel_y}, options);
2118 auto t3 = at::randn({numel_x, numel_y, numel_z}, options);
2119
2120 std::vector<IValue> aten_inputs = {t0, t3};
2121 auto aten_output = (t0 + 1).unsqueeze(-1) + t3;
2122
2123 FusionExecutor fe;
2124 fe.compileFusion(&fusion, aten_inputs);
2125 auto cg_outputs = fe.runFusion(aten_inputs);
2126
2127 testValidate(
2128 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2129}
2130
2131TEST_F(NVFuserTest, FusionIssue507_CUDA) {
2132 Fusion fusion;
2133 FusionGuard fg(&fusion);
2134
2135 auto tv0 = makeSymbolicTensor(2);
2136 fusion.addInput(tv0);
2137 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
2138 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
2139 fusion.addOutput(tv2);
2140
2141 tv1->setMemoryType(MemoryType::Shared);
2142
2143 tv1->axis(1)->parallelize(ParallelType::TIDx);
2144 tv2->axis(1)->parallelize(ParallelType::TIDx);
2145 tv1->axis(0)->parallelize(ParallelType::BIDx);
2146 tv2->axis(0)->parallelize(ParallelType::BIDx);
2147
2148 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2149
2150 std::vector<int64_t> t0_shape{17, 19};
2151 auto aten_input = at::randn(t0_shape, options);
2152 auto t1 = (aten_input + 1);
2153 auto aten_output = (t1 + 1);
2154
2155 FusionExecutor fe;
2156 fe.compileFusion(&fusion, {aten_input});
2157 auto cg_outputs = fe.runFusion({aten_input});
2158
2159 testValidate(
2160 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
2161}
2162
2163TEST_F(NVFuserTest, FusionIssue532_CUDA) {
2164 Fusion fusion;
2165 FusionGuard fg(&fusion);
2166
2167 // Algorithm
2168 TensorView* tv0 = makeSymbolicTensor(1);
2169 TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
2170 TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
2171 fusion.addInput(tv0);
2172 fusion.addOutput(tv2);
2173
2174 const int M_BLOCK = 64;
2175 const int M_THREAD = 4;
2176
2177 tv2->split(0, M_BLOCK);
2178 // tv2: [M/M_BLOCK, M_BLOCK]
2179 tv1->computeAt(tv2, 1);
2180 // tv1: [M/M_BLOCK, M_BLOCK]
2181
2182 tv1->split(-1, M_BLOCK / M_THREAD);
2183 // tv1: [M/M_BLOCK, M_THREAD, M_BLOCK / M_THREAD]
2184
2185 tv2->split(-1, M_THREAD);
2186 // tv2: [M/M_BLOCK, M_BLOCK / M_THREAD, M_THREAD]
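  // Note the mismatched inner factors: tv1's inner extent is
  // M_BLOCK / M_THREAD (16) while tv2's is M_THREAD (4), so producer and
  // consumer deliberately end up with different inner tilings under the
  // same outer M_BLOCK tile.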
2187
2188 constexpr int M = 1000;
2189
2190 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2191 at::manual_seed(0);
2192 at::Tensor t0 = at::randn({M}, options);
2193 std::vector<IValue> aten_inputs = {t0};
2194
2195 FusionExecutor fe;
2196 fe.compileFusion(&fusion, aten_inputs);
2197 auto outputs = fe.runFusion(aten_inputs);
2198
2199 at::Tensor aten_output = t0 + 1 + 1;
2200
2201 testValidate(
2202 &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2203}
2204
2205TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) {
2206 Fusion fusion;
2207 FusionGuard fg(&fusion);
2208
2209 // Algorithm
2210 TensorView* tv0 = makeSymbolicTensor(1);
2211 TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
2212 TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
2213 fusion.addInput(tv0);
2214 fusion.addOutput(tv2);
2215
2216 tv2->split(0, 32);
2217 tv1->computeAt(tv2, -1);
2218
2219 tv2->axis(1)->parallelize(ParallelType::Unswitch);
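  // ParallelType::Unswitch hoists predicate evaluation out of the loop
  // nest: codegen emits, roughly,
  //   if (entire tile in bounds) { loop without per-iteration predicates }
  //   else                       { loop with per-iteration predicates }
  // so the common case runs without per-element bounds checks.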
2220
2221 constexpr int M = 1000;
2222
2223 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2224 at::manual_seed(0);
2225 at::Tensor t0 = at::randn({M}, options);
2226 std::vector<IValue> aten_inputs = {t0};
2227
2228 FusionExecutor fe;
2229 fe.compileFusion(&fusion, aten_inputs);
2230 auto outputs = fe.runFusion(aten_inputs);
2231
2232 at::Tensor aten_output = t0 + 1 + 1;
2233
2234 testValidate(
2235 &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2236}
2237
2238TEST_F(NVFuserTest, FusionIssue549_CUDA) {
2239 Fusion fusion;
2240 FusionGuard fg(&fusion);
2241
2242 // Set up your input tensor views
2243 TensorView* tv0 = makeSymbolicTensor(2); // M, K
2244 TensorView* tv1 = makeSymbolicTensor(2); // K, N
2245 fusion.addInput(tv0);
2246 fusion.addInput(tv1);
2247
2248 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
2249
2250 TensorView* tv3 = broadcast(tv2, {false, false, true});
  // tv3[I0, I1, B] = tv2[I0, I1]
2252
2253 TensorView* tv4 = broadcast(tv1, {true, false, false});
2254 // tv4[B, I1, I2] = tv1[I1, I2]
2255
2256 // tv5[I0, I1, I2] = tv3[I0, I1, B] * tv4[B, I1, I2]
2257 TensorView* tv5 = mul(tv3, tv4);
2258 // tv6[I0, R1, I2] = tv5[I0, I1, I2]
2259 TensorView* tv6 = sum(tv5, {1});
2260 fusion.addOutput(tv6);
2261
2262 tv6->split(1, 32);
2263 // tv6[I0, R1o, R1i{32}, I2]
2264
2265 auto tv7 = tv6->rFactor({1});
2266 // tv7[I0, R1o, I1i{32}, I2] = tv5[I0, I1, I2]
2267 // tv6[I0, , R1i{32}, I2] = tv7[I0, R1o, I1i{32}, I2]
2268
2269 tv6->split(0, 4);
2270 tv6->split(-1, 4);
  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
2273
2274 tv0->computeAt(tv6, -1);
2275 tv1->computeAt(tv6, -1);
2276
2277 // tv7[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
2278 // tv6[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}]
  // --> (the '|' marks the compute-at position)
2280 // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
2281 // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
2282 // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
2283
2284 tv0->computeAt(tv7, -1);
2285 tv1->computeAt(tv7, -1);
2286 // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
2287 // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
2288 // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
2289
2290 tv6->axis(0)->parallelize(ParallelType::BIDz);
2291 tv6->axis(1)->parallelize(ParallelType::TIDz);
2292
2293 tv6->axis(-2)->parallelize(ParallelType::BIDy);
2294 tv6->axis(-1)->parallelize(ParallelType::TIDy);
2295
2296 tv6->axis(2)->parallelize(ParallelType::TIDx);
2297 tv7->axis(2)->parallelize(ParallelType::TIDx);
2298
2299 constexpr int M = 65, K = 33, N = 17;
2300
2301 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2302
2303 at::Tensor t0 = at::randn({M, K}, options);
2304 at::Tensor t1 = at::randn({K, N}, options);
2305
  // Let's specify a few bounds in launch params to make sure it works
2307 LaunchParams lparams(1, -1, -1, 32, 4, 4);
2308
2309 FusionExecutor fe;
2310 fe.compileFusion(&fusion, {t0, t1}, lparams);
2311 fe.runFusion({t0, t1}, lparams);
2312
2313 // Make sure bad launch params throws
2314 // TODO: Re-enable once we have parallelization validation in.
2315 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2316 // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6)));
2317
2318 // Don't specify any launch params
2319 auto cg_outputs = fe.runFusion({t0, t1});
2320
2321 auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble));
2322
2323 testValidate(
2324 &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
2325}
2326
2327TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) {
2328 FusionExecutor fe;
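  // compileRtc/runRtc compile and launch a handwritten kernel string
  // directly, skipping fusion IR construction, scheduling, and codegen, so
  // the runtime entry points can be exercised in isolation.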
2329 std::string kernel = R"(
2330__global__ void kernel1(Tensor<float, 1> T0, Tensor<float, 1> T1) {
2331 if(threadIdx.x==0){
2332 for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) {
2333 T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2;
2334 }
2335 }
2336}
2337 )";
2338 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2339 LaunchParams lp(
2340 256, // gdimx
2341 1, // gdimy
2342 1, // gdimz
2343 1, // bdimx
2344 1, // bdimy
2345 1 // bdimz
2346 );
2347 lp.setSmem(0);
2348 const auto options =
2349 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2350 const std::vector<int64_t> tensor_dims = {8};
2351 auto in0 = at::randn(tensor_dims, options);
2352 auto out0 = at::empty_like(in0);
2353 fe.runRtc(lp, {in0, out0});
2354
2355 auto out_ref = in0 * 2;
2356 TORCH_CHECK(out_ref.allclose(out0));
2357}
2358
2359TEST_F(NVFuserTest, FusionSerialWelford_CUDA) {
2360 FusionExecutor fe;
2361 int x = 128, y = 64, z = 64;
2362
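  // welfordCombine merges two partial (avg, M2, N) triples using the
  // standard parallel variance update (cf. Chan et al.); a sketch:
  //   delta = avg_b - avg_a;
  //   N     = N_a + N_b;
  //   avg   = avg_a + delta * N_b / N;
  //   M2    = M2_a + M2_b + delta * delta * N_a * N_b / N;
  // Each input element enters the reduction as the triple (x, 0.f, 1).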
2363 std::string kernel = R"(
2364__global__ void kernel1(
2365 Tensor<float,3> inp,
2366 Tensor<float,1> out_var,
2367 Tensor<float,1> out_avg
2368){
2369 for(int i0=0;i0<inp.size[0];i0++){
2370 float tmp_M2=0;
2371 float tmp_avg=0;
2372 long tmp_N=0;
2373 for(int i1=0;i1<inp.size[1];i1++){
2374 for(int i2=0;i2<inp.size[2];i2++){
2375 welfordCombine(
2376 tmp_avg,
2377 tmp_M2,
2378 tmp_N,
2379 inp[i0*inp.stride[0]+
2380 i1*inp.stride[1]+
2381 i2*inp.stride[2]],
2382 0.f,
2383 (long)1
2384 );
2385 }
2386 }
2387 out_var[i0*out_var.stride[0]]=
2388 tmp_M2/(tmp_N);
2389 out_avg[i0*out_avg.stride[0]]=
2390 tmp_avg;
2391 }
2392}
2393 )";
2394 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2395 LaunchParams lp(
2396 1, // gdimx
2397 1, // gdimy
2398 1, // gdimz
2399 1, // bdimx
2400 1, // bdimy
2401 1 // bdimz
2402 );
2403 lp.setSmem(0);
2404 const auto options =
2405 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2406 const std::vector<int64_t> tensor_dims = {x, y, z};
2407 auto in0 = at::randn(tensor_dims, options);
2408 auto out_var = at::empty({x}, options);
2409 auto out_avg = at::empty({x}, options);
2410 fe.runRtc(lp, {in0, out_var, out_avg});
2411
2412 TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
2413 TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2414}
2415
2416TEST_F(NVFuserTest, FusionBlockWelford_CUDA) {
2417 FusionExecutor fe;
2418 int x = 7, y = 8, z = 9;
2419
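  // This kernel folds precomputed per-row statistics (init_avg, init_var,
  // init_N) into a block-wide Welford over in0. Since Welford carries
  // M2 = var * N rather than the variance itself, init_var is scaled by
  // init_N before being passed to welfordCombine.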
2420 std::string kernel = R"(
2421__global__ void kernel1(
2422 Tensor<float,2> inp,
2423 Tensor<float,1> out_avg,
2424 Tensor<float,1> out_var,
2425 Tensor<float,1> init_avg,
2426 Tensor<float,1> init_var,
2427 Tensor<long,0> init_N
2428){
  // The actual generated kernel will use dynamic shared memory;
  // static buffers here are just for prototyping
2431 __shared__ float mem_avg[512];
2432 __shared__ float mem_M2[512];
2433 __shared__ long mem_N[512];
2434 float in=inp[threadIdx.x*inp.stride[0]+
2435 threadIdx.y*inp.stride[1]];
2436 float tmp_avg=0;
2437 float tmp_M2=0;
2438 long tmp_N=0;
2439 blockWelford<false,true,false>(
2440 tmp_avg,
2441 tmp_M2,
2442 tmp_N,
2443 in,
2444 0.f,
2445 (long)1,
2446 threadIdx,
2447 blockDim,
2448 (float*)mem_avg,
2449 (float*)mem_M2,
2450 (long*)mem_N,
2451 (bool)(threadIdx.x<inp.size[0]),
2452 0.f);
2453 __syncthreads();
2454 if(threadIdx.x<out_var.size[0] && threadIdx.y==0){
2455 welfordCombine(
2456 tmp_avg,
2457 tmp_M2,
2458 tmp_N,
2459 init_avg[threadIdx.x*init_avg.stride[0]],
2460 init_var[threadIdx.x*init_var.stride[0]]*init_N[0],
2461 init_N[0]
2462 );
2463 out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
2464 out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
2465 }
2466}
2467 )";
2468 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2469 LaunchParams lp(
2470 1, // gdimx
2471 1, // gdimy
2472 1, // gdimz
2473 x, // bdimx
2474 y, // bdimy
2475 1 // bdimz
2476 );
2477 lp.setSmem(0);
2478 const auto options =
2479 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2480 const std::vector<int64_t> tensor_dims = {x, y};
2481 const std::vector<int64_t> init_dims = {x, z};
2482
2483 // generate initial values
2484 auto init_in = at::randn(init_dims, options);
2485 auto init_var = init_in.var({1}, false);
2486 auto init_avg = init_in.mean({1});
2487 auto init_N =
2488 at::tensor(z, at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0));
2489
2490 auto in0 = at::randn(tensor_dims, options);
2491
2492 // run kernel
2493 auto out_var = at::zeros({x}, options);
2494 auto out_avg = at::zeros({x}, options);
2495 fe.runRtc(lp, {in0, out_avg, out_var, init_avg, init_var, init_N});
2496
2497 // compare with reference output
2498 auto cat_tensor = at::cat({init_in, in0}, 1);
2499 TORCH_CHECK(cat_tensor.var({1}, false).allclose(out_var));
2500 TORCH_CHECK(
2501 cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2502}
2503
2504TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) {
2505 FusionExecutor fe;
2506 int x = 7, y = 8, z = 9;
2507
  // Needs IValue support for an integer input as the initial count
2509 std::string kernel = R"(
2510__global__ void kernel1(
2511 Tensor<float,3> inp,
2512 Tensor<float,1> out_avg,
2513 Tensor<float,1> out_var
2514){
  // The actual generated kernel will use dynamic shared memory;
  // static buffers here are just for prototyping
2517 __shared__ float mem_avg[512];
2518 __shared__ float mem_M2[512];
2519 __shared__ long mem_N[512];
2520 float in=inp[threadIdx.x*inp.stride[0]+
2521 threadIdx.y*inp.stride[1]+
2522 threadIdx.z*inp.stride[2]];
2523 float tmp_avg=0;
2524 float tmp_M2=0;
2525 long tmp_N=0;
2526 block_sync::init();
2527 blockWelford<false,true,true>(
2528 tmp_avg,
2529 tmp_M2,
2530 tmp_N,
2531 in,
2532 0.f,
2533 (long) 1,
2534 threadIdx,
2535 blockDim,
2536 (float*)mem_avg,
2537 (float*)mem_M2,
2538 (long*)mem_N,
2539 (bool)(threadIdx.x<inp.size[0]),
2540 0.f);
2541 __syncthreads();
2542 if(threadIdx.x<out_var.size[0] && threadIdx.y==0 && threadIdx.z==0){
    out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
2544 out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
2545 }
2546}
2547 )";
2548 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2549 LaunchParams lp(
2550 1, // gdimx
2551 1, // gdimy
2552 1, // gdimz
2553 x, // bdimx
2554 y, // bdimy
2555 z // bdimz
2556 );
2557 lp.setSmem(0);
2558 const auto options =
2559 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2560 const std::vector<int64_t> tensor_dims = {x, y, z};
2561 auto in0 = at::randn(tensor_dims, options);
2562 auto out_var = at::empty({x}, options);
2563 auto out_avg = at::empty({x}, options);
2564 fe.runRtc(lp, {in0, out_avg, out_var});
2565
2566 TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
2567 TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2568}
2569
2570TEST_F(NVFuserTest, FusionGridWelfordNoInit_CUDA) {
2571 FusionExecutor fe;
2572 int x = 128, y = 64, z = 128;
2573
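  // gridWelford performs a cross-grid reduction: each block writes its
  // partial (avg, M2, N) triple to the global work buffers, and sync_flag
  // lets the last-arriving block combine all partials, which is why only
  // the last block writes the final result in the kernel below.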
2574 std::string kernel = R"(
2575__global__ void kernel1(
2576 Tensor<float,3> inp,
2577 Tensor<float,1> out_avg,
2578 Tensor<float,1> out_var,
2579 Tensor<float,1> work_buf_avg,
2580 Tensor<float,1> work_buf_M2,
2581 Tensor<long,1> work_buf_N,
2582 Tensor<int64_t,1> sync_flag
2583){
2584 __shared__ float shared_buf_avg[512];
2585 __shared__ float shared_buf_M2[512];
2586 __shared__ long shared_buf_N[512];
2587 float tmp_avg=0;
2588 float tmp_M2=0;
2589 long tmp_N=0;
2590 float in = inp[ blockIdx.x * inp.stride[0]+
2591 blockIdx.y * inp.stride[1]+
2592 threadIdx.x * inp.stride[2]];
2593 block_sync::init();
2594 welford::gridWelford<
2595 true,true,false,
2596 true,false,false,
2597 false
2598 >(
2599 tmp_avg,
2600 tmp_M2,
2601 tmp_N,
2602 in,
2603 0.f,
2604 (long) 1,
2605 &work_buf_avg[0],
2606 &work_buf_M2[0],
2607 &work_buf_N[0],
2608 sync_flag,
2609 (float*)shared_buf_avg,
2610 (float*)shared_buf_M2,
2611 (long*)shared_buf_N,
2612 threadIdx.x<out_var.size[0],
2613 threadIdx.x<out_var.size[0],
2614 0.f,
2615 0,
2616 1);
2617 if(blockIdx.x == gridDim.x - 1 && blockIdx.y == gridDim.y - 1){
2618 out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
2619 out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/tmp_N;
2620 }
2621}
2622 )";
2623 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2624 LaunchParams lp(
2625 x, // gdimx
2626 y, // gdimy
2627 1, // gdimz
2628 z, // bdimx
2629 1, // bdimy
2630 1 // bdimz
2631 );
2632 lp.setSmem(0);
2633 const auto options =
2634 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2635 const auto options_int =
2636 at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2637
2638 const std::vector<int64_t> tensor_dims = {x, y, z};
2639 auto in0 = at::randn(tensor_dims, options);
2640
2641 auto out_avg = at::empty({z}, options);
2642 auto out_var = at::empty({z}, options);
2643 auto work_buf_avg = at::empty({x * y * z}, options);
2644 auto work_buf_var = at::empty({x * y * z}, options);
2645 auto work_buf_N = at::empty({x * y * z}, options_int);
2646 auto sync_flag = at::zeros({1}, options_int);
2647 fe.runRtc(
2648 lp,
2649 {in0,
2650 out_avg,
2651 out_var,
2652 work_buf_avg,
2653 work_buf_var,
2654 work_buf_N,
2655 sync_flag});
2656 std::vector<int64_t> dims{0, 1};
2657
2658 TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2659 TORCH_CHECK(in0.var(dims, false).allclose(out_var));
2660}
2661
2662TEST_F(NVFuserTest, FusionWelfordOp_CUDA) {
2663 Fusion fusion;
2664 FusionGuard fg(&fusion);
2665
2666 int M = 64, N = 128;
2667
2668 auto tv0 = makeSymbolicTensor(2);
2669 fusion.addInput(tv0);
2670 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2671 auto tvs = Welford(tv1, {1});
2672 auto tv_avg = tvs.avg;
2673 auto tv_M2 = tvs.var_sum;
2674 auto tv_N = tvs.n;
2675 fusion.addOutput(tv_avg);
2676 fusion.addOutput(tv_M2);
2677 fusion.addOutput(tv_N);
2678
2679 tv_avg->split(1, 32);
2680 tv_avg->split(0, 32);
2681 tv_avg->split(0, 4);
2682 tv_avg->reorder({{-1, -3}, {-3, -1}});
2683 tv1->computeAt(tv_avg, -1);
2684
2685 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2686 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2687 at::manual_seed(0);
2688 at::Tensor t0 = at::randn({M, N}, options);
2689
2690 FusionExecutor fe;
2691 fe.compileFusion(&fusion, {t0});
2692 auto outputs = fe.runFusion({t0});
2693
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2695 outputs[1] /= N;
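  // Worked example (population variance): for x = {1, 2, 3}, mean = 2 and
  // M2 = (1-2)^2 + (2-2)^2 + (3-2)^2 = 2, so M2 / N = 2 / 3, which is what
  // t0.var({1}, /*unbiased=*/false) computes.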
2696
2697 testValidate(
2698 fe.kernel(),
2699 outputs,
2700 {t0},
2701 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2702 __LINE__,
2703 __FILE__);
2704}
2705
2706TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) {
2707 Fusion fusion;
2708 FusionGuard fg(&fusion);
2709
2710 int M = 64, N = 128;
2711
2712 auto tv0 = makeSymbolicTensor(2);
2713 fusion.addInput(tv0);
2714 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2715 auto tvs = Welford(tv1, {1});
2716 auto tv_avg = tvs.avg;
2717 auto tv_M2 = tvs.var_sum;
2718 auto tv_N = tvs.n;
2719 fusion.addOutput(tv_avg);
2720 fusion.addOutput(tv_M2);
2721 fusion.addOutput(tv_N);
2722
2723 tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
2724
2725 tv1->computeAt(tv_avg, -1);
2726
2728 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2729 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2730 at::manual_seed(0);
2731 at::Tensor t0 = at::randn({M, N}, options);
2732 at::Tensor t_var = at::empty({M}, options);
2733 at::Tensor t_avg = at::empty({M}, options);
2734 at::Tensor t_N = at::empty({M}, options_int);
2735
2736 FusionExecutor fe;
2737 fe.compileFusion(&fusion, {t0});
2738 auto outputs = fe.runFusion({t0});
2739
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2741 outputs[1] /= N;
2742
2743 testValidate(
2744 fe.kernel(),
2745 outputs,
2746 {t0},
2747 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2748 __LINE__,
2749 __FILE__);
2750}
2751
2752TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) {
2753 Fusion fusion;
2754 FusionGuard fg(&fusion);
2755
2756 int M = 64, N = 128;
2757
2758 auto tv0 = makeSymbolicTensor(2);
2759 fusion.addInput(tv0);
2760 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2761 auto tvs = Welford(tv1, {1});
2762 auto tv_avg = tvs.avg;
2763 auto tv_M2 = tvs.var_sum;
2764 auto tv_N = tvs.n;
2765 fusion.addOutput(tv_avg);
2766 fusion.addOutput(tv_M2);
2767 fusion.addOutput(tv_N);
2768
2769 tv_avg->axis(0)->parallelize(ParallelType::TIDx);
2770 tv_avg->axis(-1)->parallelize(ParallelType::BIDx);
2771
2772 tv1->computeAt(tv_avg, -1);
2773
2774 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2775 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2776 at::manual_seed(0);
2777 at::Tensor t0 = at::randn({M, N}, options);
2778 at::Tensor t_avg = at::empty({M}, options);
2779 at::Tensor t_var = at::empty({M}, options);
2780 at::Tensor t_N = at::empty({M}, options_int);
2781
2782 FusionExecutor fe;
2783 fe.compileFusion(&fusion, {t0});
2784 auto outputs = fe.runFusion({t0});
2785
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2787 outputs[1] /= N;
2788
2789 testValidate(
2790 fe.kernel(),
2791 outputs,
2792 {t0},
2793 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2794 __LINE__,
2795 __FILE__);
2796}
2797
2798TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) {
2799 Fusion fusion;
2800 FusionGuard fg(&fusion);
2801
2802 int M = 64, N = 128;
2803
2804 auto tv0 = makeSymbolicTensor(2);
2805 fusion.addInput(tv0);
2806 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2807 auto tvs = Welford(tv1, {1});
2808 auto tv_avg = tvs.avg;
2809 auto tv_M2 = tvs.var_sum;
2810 auto tv_N = tvs.n;
2811 fusion.addOutput(tv_avg);
2812 fusion.addOutput(tv_M2);
2813 fusion.addOutput(tv_N);
2814
2815 tv_avg->split(1, 4);
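  // ir_utils::rfactorHelper is used here instead of TensorView::rFactor
  // because Welford has three sibling outputs (avg, var_sum, n) that must
  // be rfactored together so their transformations stay consistent.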
2816 ir_utils::rfactorHelper(tvs.avg, {2});
2817 tv1->computeAt(tv_avg, -1);
2818
2819 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2820 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2821 at::manual_seed(0);
2822 at::Tensor t0 = at::randn({M, N}, options);
2823 at::Tensor t_avg = at::empty({M}, options);
2824 at::Tensor t_var = at::empty({M}, options);
2825 at::Tensor t_N = at::empty({M}, options_int);
2826
2827 FusionExecutor fe;
2828 fe.compileFusion(&fusion, {t0});
2829 auto outputs = fe.runFusion({t0});
2830
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2832 outputs[1] /= N;
2833
2834 testValidate(
2835 fe.kernel(),
2836 outputs,
2837 {t0},
2838 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2839 __LINE__,
2840 __FILE__);
2841}
2842
2843TEST_F(NVFuserTest, FusionWelfordSchedule_CUDA) {
2844 Fusion fusion;
2845 FusionGuard fg(&fusion);
2846
2847 int M = 64, N = 128;
2848
2849 auto tv0 = makeSymbolicTensor(2);
2850 fusion.addInput(tv0);
2851 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2852 auto tvs = Welford(tv1, {1});
2853 auto tv_avg = tvs.avg;
2854 auto tv_M2 = tvs.var_sum;
2855 auto tv_N = tvs.n;
2856 fusion.addOutput(tv_avg);
2857 fusion.addOutput(tv_M2);
2858 fusion.addOutput(tv_N);
2859
2860 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2861 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2862 at::manual_seed(0);
2863 at::Tensor t0 = at::randn({M, N}, options);
  // TODO: Why do we use launch params from here but not the scheduling?
2865 auto reduction_params = getReductionHeuristics(&fusion, {t0});
2866 scheduleReduction(&fusion, *reduction_params);
2867
2868 auto lparams = reduction_params->lparams;
2869 FusionExecutor fe;
2870 fe.compileFusion(&fusion, {t0}, lparams);
2871 auto outputs = fe.runFusion({t0}, lparams);
2872
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2874 outputs[1] /= N;
2875
2876 auto at_avg = t0.mean({1});
2877 auto at_var = t0.var({1}, false);
2878 auto at_n = at::ones({M}, options_int) * N;
2879
2880 testValidate(
2881 fe.kernel(),
2882 outputs,
2883 {t0},
2884 {at_avg, at_var, at_n},
2885 __LINE__,
2886 __FILE__,
2887 "validate welford",
2888 reduction_params->lparams);
2889}
2890
2891namespace {
2892void testWelford(DataType dtype, int red_axis, int odim, int rdim) {
2893 const int axis = red_axis;
2894 at::ScalarType aten_dtype = data_type_to_aten(dtype);
2895
2896 Fusion fusion;
2897 FusionGuard fg(&fusion);
2898 TensorView* tv0 = makeSymbolicTensor(2, dtype);
2899 bool is_fp16 = dtype == DataType::Half;
2900 bool is_bf16 = dtype == DataType::BFloat16;
2901 TensorView* tv0_cast = tv0;
2902 if (is_fp16 || is_bf16) {
2903 tv0_cast = castOp(DataType::Float, tv0);
2904 }
2905 fusion.addInput(tv0);
2906 auto tv1 = mul(tv0_cast, IrBuilder::create<Double>(1));
2907 auto tvs = Welford(tv1, {axis});
2908 auto tv_avg = tvs.avg;
2909 auto tv_M2 = tvs.var_sum;
2910 auto tv_N = tvs.n;
2911
2912 TensorView* avg_cast = tv_avg;
2913 TensorView* M2_cast = tv_M2;
2914
2915 if (is_fp16) {
2916 avg_cast = castOp(DataType::Half, tv_avg);
2917 M2_cast = castOp(DataType::Half, tv_M2);
2918 }
2919 if (is_bf16) {
2920 avg_cast = castOp(DataType::BFloat16, tv_avg);
2921 M2_cast = castOp(DataType::BFloat16, tv_M2);
2922 }
2923
2924 fusion.addOutput(avg_cast);
2925 fusion.addOutput(M2_cast);
2926 fusion.addOutput(tv_N);
2927
2928 auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
2929 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2930 at::manual_seed(0);
2931 std::vector<TensorView*> outputs_of_red;
2932 at::Tensor aten_input =
2933 (axis ? at::randn({odim, rdim}, options)
2934 : at::randn({rdim, odim}, options));
2935
2936 if (is_fp16 || is_bf16) {
2937 outputs_of_red.push_back(avg_cast);
2938 outputs_of_red.push_back(M2_cast);
2939 }
2940
2941 auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
2942 scheduleReduction(&fusion, *reduction_params);
2943
2944 auto lparams = reduction_params->lparams;
2945
2946 FusionExecutor fe;
2947 fe.compileFusion(&fusion, {aten_input}, lparams);
2948 auto outputs = fe.runFusion({aten_input}, lparams);
2949
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by rdim to get the variance
2952
2953 outputs[1] /= rdim;
2954
2955 auto at_avg = aten_input.mean({axis});
2956 auto at_var = aten_input.var({axis}, false);
2957 auto at_n =
2958 (axis ? at::ones({odim, rdim}, options)
2959 : at::ones({rdim, odim}, options));
2960 at_n = at_n.sum({axis});
2961
2962 testValidate(
2963 fe.kernel(),
2964 outputs,
2965 {aten_input},
2966 {at_avg, at_var, at_n},
2967 __LINE__,
2968 __FILE__,
2969 "validate welford",
2970 reduction_params->lparams);
2971}
2972} // namespace
2973
2974TEST_F(NVFuserTest, FusionWelfordShmoo_CUDA) {
2975 std::vector<DataType> dtypes = {
2976 DataType::Double, DataType::Float, DataType::Half};
  // TODO: enable this for complex. Currently, complex silently yields
  // wrong results:
2979 // Detected abs error of: 3.8062
2980 // absolute tolerance was set to 2.23704e-06
2981 // and relative tolerance set to 2.23704e-08
2982#if !defined(USE_ROCM)
2983 if (at::cuda::getDeviceProperties(0)->major >= 8) {
2984 dtypes.insert(dtypes.end(), DataType::BFloat16);
2985 }
2986#endif
2987
2988 std::vector<int> red_axis = {1, 0};
2989 std::vector<int> output_dims = {160, 320};
2990 std::vector<int> red_dims;
2991
  // Cut down the number of iterations by testing only every other power
  // of 2.
2994 for (int i = 1; i <= 1024 * 1024; i <<= 2) {
2995 red_dims.push_back(i);
2996 }
2997
2998 for (auto dtype : dtypes) {
2999 for (auto& axis : red_axis) {
3000 for (auto& odim : output_dims) {
3001 for (auto& rdim : red_dims) {
        // TODO: The original Welford algorithm keeps a running sum of
        // squared differences, i.e. M_{2,n} in the notation of
        // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance,
        // which can overflow to inf for large reduction sizes in half
        // precision. Skip too-large volumes for half for now; further
        // numerical experiments may be needed to re-design this.
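        // For instance, with E[x^2] = 1 for standard-normal inputs, a raw
        // sum of squares over 65536 elements already sits near the
        // half-precision maximum of ~65504.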
3010 if (rdim > 32768 &&
3011 (dtype == DataType::Half || dtype == DataType::BFloat16)) {
3012 continue;
3013 }
3014 testWelford(dtype, axis, odim, rdim);
3015 }
3016 }
3017 }
3018 }
3019}
3020
3021namespace {
3022void testVarMean(at::ScalarType dtype, int correction, bool keepdim) {
3023 auto fusion = std::make_unique<Fusion>();
3024 FusionGuard fg(fusion.get());
3025
3026 int M = 64, N = 128;
3027
3028 auto tv0 = makeSymbolicTensor(2, aten_to_data_type(dtype));
3029 fusion->addInput(tv0);
3030 auto tvs = variance_mean(tv0, {1}, correction, keepdim);
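  // variance_mean computes var = sum((x - mean)^2) / (N - correction):
  // correction = 0 gives the population variance, correction = 1 the
  // unbiased sample variance, matching at::var_mean below.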
3031 auto tv_mean = tvs.mean;
3032 auto tv_var = tvs.var;
3033 fusion->addOutput(tv_var);
3034 fusion->addOutput(tv_mean);
3035
3036 auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
3037 at::manual_seed(0);
3038 at::Tensor t0 = at::randn({M, N}, options);
3039
3040 FusionExecutorCache executor_cache(std::move(fusion));
3041 auto outputs = executor_cache.runFusionWithInputs({t0});
3042
3043 auto at_var_mean = at::var_mean(t0, {1}, correction, keepdim);
3044 std::vector<at::Tensor> aten_outputs = {
3045 std::get<0>(at_var_mean), std::get<1>(at_var_mean)};
3046
3047 testValidate(
3048 executor_cache.fusion(), outputs, {t0}, aten_outputs, __LINE__, __FILE__);
3049}
3050} // namespace
3051
3052TEST_F(NVFuserTest, FusionVarMean_CUDA) {
3053 std::vector<at::ScalarType> dtypes = {at::kFloat, at::kDouble};
3054 std::vector<int> corrections = {0, 1};
3055 std::vector<bool> keepdims = {false, true};
3056 for (auto correction : corrections) {
3057 for (auto keepdim : keepdims) {
3058 for (auto dtype : dtypes) {
3059 testVarMean(dtype, correction, keepdim);
3060 }
3061 }
3062 }
3063}
3064
3065TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) {
3066 Fusion fusion;
3067 FusionGuard fg(&fusion);
3068
3069 // Set up your input tensor views
3070
3071 TensorView* tv0 = makeSymbolicTensor(2); // K, M
3072 TensorView* tv1 = makeSymbolicTensor(2); // N, K
3073 fusion.addInput(tv0);
3074 fusion.addInput(tv1);
3075
3076 TensorView* tv0_t = transpose(tv0);
3077 TensorView* tv1_t = transpose(tv1);
3078
3079 TensorView* tv2 = broadcast(tv0_t, {false, false, true});
  // tv2[I0, I1, B] = tv0_t[I0, I1]
3081
3082 TensorView* tv3 = broadcast(tv1_t, {true, false, false});
  // tv3[B, I1, I2] = tv1_t[I1, I2]
3084
3085 // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2]
3086 TensorView* tv4 = mul(tv2, tv3);
3087 // tv5[I0, R1, I2] = tv4[I0, I1, I2]
3088 TensorView* tv5 = sum(tv4, {1});
3089 fusion.addOutput(tv5);
3090
3091 tv5->split(1, 32);
3092 // tv5[I0, R1o, R1i{32}, I2]
3093
3094 auto tv6 = tv5->rFactor({1});
3095 // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2]
3096 // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2]
3097
3098 tv5->split(0, 4);
3099 tv5->split(-1, 4);
  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
3102
3103 tv0_t->computeAt(tv5, -1);
3104 tv1_t->computeAt(tv5, -1);
3105
3106 // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
3107 // tv5[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}]
  // --> (the '|' marks the compute-at position)
3109 // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
3110 // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
3111 // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
3112
3113 tv0_t->computeAt(tv6, -1);
3114 tv1_t->computeAt(tv6, -1);
3115 // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
3116 // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
3117 // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
3118
3119 tv5->axis(0)->parallelize(ParallelType::BIDz);
3120 tv5->axis(1)->parallelize(ParallelType::TIDz);
3121
3122 tv5->axis(-2)->parallelize(ParallelType::BIDy);
3123 tv5->axis(-1)->parallelize(ParallelType::TIDy);
3124
3125 tv5->axis(2)->parallelize(ParallelType::TIDx);
3126 tv6->axis(2)->parallelize(ParallelType::TIDx);
3127
3128 constexpr int M = 65, K = 33, N = 17;
3129
3130 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3131
3132 at::Tensor t0 = at::randn({K, M}, options);
3133 at::Tensor t1 = at::randn({N, K}, options);
3134
  // Let's specify a few bounds in launch params to make sure it works
3136 LaunchParams lparams(1, -1, -1, 32, 4, 4);
3137 FusionExecutor fe;
3138 fe.compileFusion(&fusion, {t0, t1}, lparams);
3139 fe.runFusion({t0, t1}, lparams);
3140
3141 // Don't specify any launch params
3142 auto cg_outputs = fe.runFusion({t0, t1});
3143
3144 auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble));
3145
3146 testValidate(
3147 &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
3148}
3149
3150TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) {
3151 Fusion fusion;
3152 FusionGuard fg(&fusion);
3153
3154 const int tidx = 32;
3155 const int dimx = 32;
3156 const int dimy = 16;
3157 const int dimz = 130;
3158
3159 // Set up your input tensor views
3160 TensorView* input_tv0 = makeSymbolicTensor(3);
3161 fusion.addInput(input_tv0);
3162
3163 TensorView* input_t = transpose(input_tv0, 1, 2);
3164
3165 TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_t);
3166 TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
3167 TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});
3168
  // Replicate input_t and exp_tv1 as input_t_copy and exp_tv1_copy because
  // exp_tv1 is going to be computed at sum_exp_rf_tv5.
3171 TensorView* input_t_copy = transpose(input_tv0, 1, 2);
3172 TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_t_copy);
3173
3174 TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
3175
3176 fusion.addOutput(output_tv4);
3177
3178 bcast_sum_tv3->split(-1, tidx);
3179
3180 sum_exp_tv2->split(-1, tidx);
3181 TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
3182
3183 output_tv4->split(-1, tidx);
3184
3185 input_t->computeAt(sum_exp_rf_tv5, -1);
3186 input_t_copy->computeAt(output_tv4, -1);
3187
3188 TensorView* tensors_to_parallelize[] = {
3189 sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
3190
3191 for (auto tv : tensors_to_parallelize) {
3192 tv->axis(0)->parallelize(ParallelType::BIDx);
3193 tv->axis(1)->parallelize(ParallelType::BIDy);
3194 tv->axis(-1)->parallelize(ParallelType::TIDx);
3195 }
3196
3197 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3198 at::Tensor input = at::randn({dimx, dimz, dimy}, options);
3199
3200 at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);
3201
3202 FusionExecutor fe;
3203 fe.compileFusion(&fusion, {input});
3204 fe.runFusion({input}, {cg_output});
3205
3206 auto aten_input_t = at::transpose(input, 1, 2);
3207 auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false);
3208
3209 testValidate(
3210 &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
3211}
3212
3213TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) {
3214 // Case 1
3215 // tv1 = tv0 * 0.5
3216 // tv2 = tv1 * -1
3217 // tv3 = tv1 + 3
3218 // tv4 = tv1 * 2
3219 // tv5 = tv3 + tv2
3220 // tv6 = tv5 + tv4
3221 // tv7 = tv1 + tv4
3222 Fusion fusion;
3223 FusionGuard fg(&fusion);
3224
3225 TensorView* tv0 = makeSymbolicTensor(2);
3226 fusion.addInput(tv0);
3227
3228 tv0 = transpose(tv0);
3229
3230 TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
3231 TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
3232 TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
3233 TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
3234 TensorView* tv5 = add(tv3, tv2);
3235
3236 TensorView* tv6 = add(tv5, tv4);
3237 TensorView* tv7 = add(tv1, tv4);
3238
3239 fusion.addOutput(tv6);
3240 fusion.addOutput(tv7);
3241
  // Let's set up to actually run
3243 tv7->merge(0);
3244 tv7->split(0, 128);
3245 tv7->split(0, 4);
3246
3247 tv7->axis(0)->parallelize(ParallelType::BIDx);
3248
3249 tv0->computeAt(tv7, 1);
3250
  // The compute-at position of the last tensor should be zero.
3252 TORCH_CHECK(
3253 tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
3254 tv7->getMaxProducerPosition() == 1);
3255 TORCH_CHECK(
3256 tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
3257 tv6->getMaxProducerPosition() == 1);
  // The compute-at position of every other tensor should be 1.
3259 for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
3260 TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
3261 }
3262
3263 for (Val* val : fusion.vals()) {
3264 if (!val->isFusionInput() &&
3265 val->getValType().value() == ValType::TensorView) {
3266 TensorView* tv = static_cast<TensorView*>(val);
3267 tv->axis(1)->parallelize(ParallelType::Unroll);
3268 tv->axis(-1)->parallelize(ParallelType::TIDx);
3269 }
3270 }
3271
3272 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3273
3274 at::Tensor aten_input = at::randn({129, 127}, options);
3275
3276 FusionExecutor fe;
3277 fe.compileFusion(&fusion, {aten_input});
3278 auto cg_outputs = fe.runFusion({aten_input});
3279
3280 at::Tensor aten_input_t = aten_input.t();
3281
3282 auto t1 = aten_input_t.mul({0.5});
3283 auto t2 = t1.mul({-1.0});
3284 auto t3 = t1.add({3.0});
3285 auto t4 = t1.mul({2.0});
3286 auto t5 = t3.add(t2);
3287 auto t6 = t5.add(t4);
3288 auto t7 = t1.add(t4);
3289
3290 std::vector<at::Tensor> aten_outputs = {t6, t7};
3291
3292 testValidate(
3293 &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
3294}
3295
3296TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) {
3297 // Case 2
3298 // tv1 = tv0 * -1
3299 // tv2 = tv0 + 3
3300 // tv3 = tv0 * 2
3301 // tv4 = tv2 + tv1
3302 // tv5 = tv4 + tv3
3303 // tv6 = tv5 + tv3
3304 Fusion fusion;
3305 FusionGuard fg(&fusion);
3306
3307 TensorView* tv0 = makeSymbolicTensor(2);
3308 fusion.addInput(tv0);
3309
3310 tv0 = transpose(tv0);
3311
3312 TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
3313 TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
3314 TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
3315 TensorView* tv4 = add(tv2, tv1);
3316
3317 TensorView* tv5 = add(tv4, tv3);
3318 TensorView* tv6 = add(tv5, tv3);
3319
3320 fusion.addOutput(tv5);
3321 fusion.addOutput(tv6);
3322
  // Let's set up to actually run
3324 tv6->merge(0);
3325 tv6->split(0, 128);
3326 tv6->split(0, 4);
3327
3328 tv6->axis(0)->parallelize(ParallelType::BIDx);
3329
3330 tv0->computeAt(tv6, 1);
3331
3332 for (Val* val : fusion.vals()) {
3333 if (!val->isFusionInput() &&
3334 val->getValType().value() == ValType::TensorView) {
3335 TensorView* tv = static_cast<TensorView*>(val);
3336
3337 tv->axis(1)->parallelize(ParallelType::Unroll);
3338 tv->axis(-1)->parallelize(ParallelType::TIDx);
3339 }
3340 }
3341
3342 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3343 at::Tensor input = at::randn({129, 127}, options);
3344
3345 FusionExecutor fe;
3346 fe.compileFusion(&fusion, {input});
3347 auto cg_outputs = fe.runFusion({input});
3348
3349 auto input_t = input.t();
3350 auto t1 = input_t.mul({-1.0});
3351 auto t2 = input_t.add({3.0});
3352 auto t3 = input_t.mul({2.0});
3353 auto t4 = t2.add(t1);
3354 auto t5 = t4.add(t3);
3355 auto t6 = t5.add(t3);
3356
3357 std::vector<at::Tensor> aten_outputs = {t5, t6};
3358
3359 testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
3360}
3361
3362TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) {
3363 // Case 3
3364 // T2 = T1 * 0.979361
3365 // T3 = T2 * T0
3366 Fusion fusion;
3367 FusionGuard fg(&fusion);
3368
3369 TensorView* tv0 = makeSymbolicTensor(4);
3370 fusion.addInput(tv0);
3371
3372 tv0 = permute(tv0, {3, 0, 1, 2});
3373
3374 TensorView* tv1 = makeSymbolicTensor(4);
3375 fusion.addInput(tv1);
3376
3377 tv1 = permute(tv1, {3, 0, 1, 2});
3378
3379 TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
3380 TensorView* tv3 = mul(tv2, tv0);
3381
3382 fusion.addOutput(tv3);
3383
  // Let's set up to actually run
3385 while (tv3->nDims() > 1)
3386 tv3->merge(0);
3387 tv3->split(0, 128);
3388 tv3->split(0, 4);
3389
3390 tv0->computeAt(tv3, 1);
3391 tv1->computeAt(tv3, 1);
3392
3393 tv3->axis(0)->parallelize(ParallelType::BIDx);
3394
3395 for (Val* val : fusion.vals()) {
3396 if (!val->isFusionInput() &&
3397 val->getValType().value() == ValType::TensorView) {
3398 TensorView* tv = static_cast<TensorView*>(val);
3399
3400 tv->axis(1)->parallelize(ParallelType::Unroll);
3401 tv->axis(-1)->parallelize(ParallelType::TIDx);
3402 }
3403 }
3404
3405 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3406 at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
3407 at::Tensor t1 = at::rand_like(t0, options);
3408
3409 std::vector<IValue> aten_inputs = {t0, t1};
3410
3411 FusionExecutor fe;
3412 fe.compileFusion(&fusion, aten_inputs);
3413 auto cg_outputs = fe.runFusion(aten_inputs);
3414
3415 auto t0_t = t0.permute({3, 0, 1, 2});
3416 auto t1_t = t1.permute({3, 0, 1, 2});
3417 auto t2 = t1_t.mul({0.979361});
3418 auto aten_output = t2.mul(t0_t);
3419
3420 testValidate(
3421 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3422}
3423
3424TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) {
3425 // Case 4
3426 // T4 = T2 - T3
3427 // T5 = T1 + T4
3428 // T6 = T5 - T0
3429 Fusion fusion;
3430 FusionGuard fg(&fusion);
3431
3432 TensorView* tv0 = makeSymbolicTensor(4);
3433 fusion.addInput(tv0);
3434
3435 tv0 = permute(tv0, {3, 0, 1, 2});
3436
3437 TensorView* tv1 = makeSymbolicTensor(4);
3438 fusion.addInput(tv1);
3439
3440 tv1 = permute(tv1, {3, 0, 1, 2});
3441
3442 TensorView* tv2 = makeSymbolicTensor(4);
3443 fusion.addInput(tv2);
3444
3445 tv2 = permute(tv2, {3, 0, 1, 2});
3446
3447 TensorView* tv3 = makeSymbolicTensor(4);
3448 fusion.addInput(tv3);
3449
3450 tv3 = permute(tv3, {3, 0, 1, 2});
3451
3452 TensorView* tv4 = sub(tv2, tv3);
3453 TensorView* tv5 = add(tv1, tv4);
3454 TensorView* tv6 = sub(tv5, tv0);
3455
3456 fusion.addOutput(tv6);
3457
  // Let's set up to actually run
3459 while (tv6->nDims() > 1)
3460 tv6->merge(0);
3461 tv6->split(0, 128);
3462 tv6->split(0, 4);
3463
3464 tv0->computeAt(tv6, 1);
3465 tv1->computeAt(tv6, 1);
3466 tv2->computeAt(tv6, 1);
3467 tv3->computeAt(tv6, 1);
3468
3469 tv6->axis(0)->parallelize(ParallelType::BIDx);
3470
3471 for (Val* val : fusion.vals()) {
3472 if (!val->isFusionInput() &&
3473 val->getValType().value() == ValType::TensorView) {
3474 TensorView* tv = static_cast<TensorView*>(val);
3475
3476 tv->axis(1)->parallelize(ParallelType::Unroll);
3477 tv->axis(-1)->parallelize(ParallelType::TIDx);
3478 }
3479 }
3480
3481 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3482 at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
3483 at::Tensor t1 = at::rand_like(t0, options);
3484 at::Tensor t2 = at::rand_like(t0, options);
3485 at::Tensor t3 = at::rand_like(t0, options);
3486
3487 std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
3488
3489 FusionExecutor fe;
3490 fe.compileFusion(&fusion, aten_inputs);
3491 auto cg_outputs = fe.runFusion(aten_inputs);
3492
3493 auto t0_t = t0.permute({3, 0, 1, 2});
3494 auto t1_t = t1.permute({3, 0, 1, 2});
3495 auto t2_t = t2.permute({3, 0, 1, 2});
3496 auto t3_t = t3.permute({3, 0, 1, 2});
3497 auto t4 = t2_t.sub(t3_t);
3498 auto t5 = t1_t.add(t4);
3499 auto aten_output = t5.sub(t0_t);
3500
3501 testValidate(
3502 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3503}
3504
3505TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) {
3506 // Case 5
3507 // tv2 = tv0 + 2.0
3508 // tv3 = tv1 * tv2
3509 Fusion fusion;
3510 FusionGuard fg(&fusion);
3511
3512 // Set up your input tensor views
3513 TensorView* tv0 = makeSymbolicTensor(2);
3514 fusion.addInput(tv0);
3515 tv0 = transpose(tv0);
3516 TensorView* tv1 = makeSymbolicTensor(2);
3517 fusion.addInput(tv1);
3518 tv1 = transpose(tv1);
3519 TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
3520 TensorView* tv3 = mul(tv1, tv2);
3521 fusion.addOutput(tv3);
3522
3523 tv3->merge(0);
3524 tv3->split(-1, 8);
3525 tv3->split(-1, 4);
3526
3527 tv0->computeAt(tv3, 1);
3528 tv1->computeAt(tv3, 1);
3529 tv3->axis(0)->parallelize(ParallelType::BIDx);
3530
3531 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3532 at::Tensor t0 = at::randn({63, 65}, options);
3533 at::Tensor t1 = at::rand_like(t0, options);
3534
3535 std::vector<IValue> aten_inputs = {t0, t1};
3536
3537 FusionExecutor fe;
3538 fe.compileFusion(&fusion, aten_inputs);
3539 auto cg_outputs = fe.runFusion(aten_inputs);
3540
3541 auto t2 = t0.t().add(2.0);
3542 auto aten_output = t1.t().mul(t2);
3543
3544 testValidate(
3545 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3546}
3547
3548TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) {
3549 Fusion fusion;
3550 FusionGuard fg(&fusion);
3551
3552 TensorView* tv0 = makeSymbolicTensor(2);
3553 fusion.addInput(tv0);
3554 tv0 = transpose(tv0);
3555 TensorView* tv1 = makeSymbolicTensor(2);
3556 fusion.addInput(tv1);
3557 tv1 = transpose(tv1);
3558 TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
3559 TensorView* tv3 = mul(tv1, tv2);
3560 fusion.addOutput(tv3);
3561
3562 tv2->merge(0);
3563 tv2->split(-1, 8);
3564 tv2->split(-1, 4);
3565 tv3->merge(0);
3566 tv3->split(-1, 8);
3567
3568 tv0->computeAt(tv3, 1);
3569 tv1->computeAt(tv3, 1);
3570
3571 tv3->axis(0)->parallelize(ParallelType::BIDx);
3572
3573 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3574 at::Tensor t0 = at::randn({63, 65}, options);
3575 at::Tensor t1 = at::rand_like(t0, options);
3576
3577 std::vector<IValue> aten_inputs = {t0, t1};
3578
3579 FusionExecutor fe;
3580 fe.compileFusion(&fusion, aten_inputs);
3581 auto cg_outputs = fe.runFusion(aten_inputs);
3582
3583 auto t2 = t0.t().add(2.0);
3584 auto aten_output = t1.t().mul(t2);
3585
3586 testValidate(
3587 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3588}
3589
3590TEST_F(NVFuserTest, FusionSegmentReducePointwise_CUDA) {
3591 auto fusion = std::make_unique<Fusion>();
3592 FusionGuard fg(fusion.get());
3593
3594 TensorView* tv0 = makeSymbolicTensor(2);
3595 TensorView* tv1 = makeSymbolicTensor(1);
3596 TensorView* tv2 = makeSymbolicTensor(2);
3597
3598 fusion->addInput(tv0);
3599 fusion->addInput(tv1);
3600 fusion->addInput(tv2);
3601
3602 TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
3603 TensorView* tv4 =
3604 max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
3605 TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce,
3606 // keeps normalization scheduler away)
3607 TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce)
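  // The post-reduction broadcast pattern in tv6 cannot be scheduled together
  // with the reduction group, so the segmenter is expected to split this
  // fusion into exactly two kernels (checked below).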
3608
3609 fusion->addOutput(tv6);
3610
3611 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3612 at::Tensor t0 = at::randn({128, 65}, options);
3613 at::Tensor t1 = at::randn({65}, options);
3614 at::Tensor t2 = at::randn({128, 65}, options);
3615
3616 auto t3 = t0.add(1.0);
3617 auto t4 = std::get<0>(at::max(t3, 0));
3618 auto t5 = t4.add(t1);
3619 auto t6 = t5.add(t2);
3620
3621 FusionExecutorCache executor_cache(std::move(fusion));
3622
3623 auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
3624
3625 TORCH_CHECK(
3626 executor_cache.getMostRecentKernelRuntime()->isSegmented(),
3627 "segmentation didn't happen");
3628 TORCH_CHECK(
3629 executor_cache.getMostRecentKernelRuntime()
3630 ->fusionSegments()
3631 ->groups()
3632 .size() == 2,
3633 "segmentation didn't happen as expected");
3634
3635 testValidate(
3636 executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__);
3637}
3638
3639TEST_F(NVFuserTest, FusionMultipleVectorize_CUDA) {
3640 auto fusion = std::make_unique<Fusion>();
3641 FusionGuard fg(fusion.get());
3642
3643 TensorView* tv0 = makeContigTensor(1);
3644 TensorView* tv1 = makeContigTensor(1);
3645
3646 fusion->addInput(tv0);
3647 fusion->addInput(tv1);
3648
3649 TensorView* tv3 = add(tv0, tv1);
3650 fusion->addOutput(tv3);
3651
3652 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3653 at::Tensor t0 = at::randn({40960}, options);
3654 at::Tensor t1 = at::randn({40960}, options);
3655 auto t2 = t0 + t1;
3656
3657 FusionExecutorCache executor_cache(std::move(fusion));
3658 executor_cache.profile(true);
3659
3660 auto outputs = executor_cache.runFusionWithInputs({t0, t1});
3661 auto runtime1 = executor_cache.getMostRecentKernelRuntime();
3662 auto log1 =
3663 executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
3664 TORCH_CHECK(log1 != nullptr);
3665 TORCH_CHECK(log1->vectorize);
3666
3667 testValidate(
3668 executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
3669
3670 t0 = at::randn({40964}, options);
3671 t1 = at::randn({40964}, options);
3672 t2 = t0 + t1;
3673
3674 outputs = executor_cache.runFusionWithInputs({t0, t1});
3675 auto runtime2 = executor_cache.getMostRecentKernelRuntime();
3676 auto log2 =
3677 executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
3678 TORCH_CHECK(log2 != nullptr);
3679 TORCH_CHECK(log2->vectorize);
3680
3681 testValidate(
3682 executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
3683
3684 t0 = at::randn({40962}, options);
3685 t1 = at::randn({40962}, options);
3686 t2 = t0 + t1;
3687
3688 outputs = executor_cache.runFusionWithInputs({t0, t1});
3689 auto runtime3 = executor_cache.getMostRecentKernelRuntime();
3690 auto log3 =
3691 executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
3692 TORCH_CHECK(log3 != nullptr);
3693 TORCH_CHECK(log3->vectorize);
3694
3695 testValidate(
3696 executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
3697
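  // 40960 and 40964 are both divisible by 4, so they can reuse the same
  // 4-wide vectorized kernel; 40962 is only divisible by 2, which should
  // force a different compiled runtime.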
3698 TORCH_CHECK(runtime1 == runtime2);
3699 TORCH_CHECK(runtime1 != runtime3);
3700}
3701
3702TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) {
3703 Fusion fusion;
3704 FusionGuard fg(&fusion);
3705
3706 TensorView* tv0 = makeContigTensor(3);
3707
3708 fusion.addInput(tv0);
3709
3710 auto tv1 = unaryOp(UnaryOpType::Sin, tv0);
3711
3712 fusion.addOutput(tv1);
3713
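  // Stage the global read of tv0 and the global write of tv1 through
  // register caches; the Vectorize axes set below then apply to the
  // contiguous global accesses.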
3714 auto tv0_cache = tv0->cacheAfter();
3715
3716 auto tv1_cache = tv1->cacheBefore();
3717
3718 tv1->merge(0);
3719 tv1->merge(0);
3720 tv1->split(0, 4);
3721 tv1->split(0, 128);
3722
3723 tv1->axis(0)->parallelize(ParallelType::BIDx);
3724 tv1->axis(1)->parallelize(ParallelType::TIDx);
3725
3726 tv0->computeAt(tv1, 2);
3727
3728 tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
3729 tv1->axis(2)->parallelize(ParallelType::Vectorize);
3730
3731 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3732
3733 at::Tensor aten_input = at::empty({2, 6, 32}, options);
3734
3735 FusionExecutor fe;
3736 fe.compileFusion(&fusion, {aten_input});
3737 auto cg_outputs = fe.runFusion({aten_input});
3738
3739 at::Tensor aten_output = aten_input.sin();
3740
3741 testValidate(
3742 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
3743}
3744
3745TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) {
3746 Fusion fusion;
3747 FusionGuard fg(&fusion);
3748 // dimensionality of the problem
3749 int nDims = 3;
3750
3751 // Set up your input tensor views
3752 TensorView* tv0 = makeContigTensor(nDims);
3753 TensorView* tv1 = makeContigTensor(nDims);
3754
3755 // Register your inputs
3756 fusion.addInput(tv0);
3757 fusion.addInput(tv1);
3758
3759 // Do math with it; it returns a `Val*` that can be static_cast back to a
3760 // TensorView
3761 TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
3762 TensorView* tv3 = add(tv0, tv2);
3763
3764 // Register your outputs
3765 fusion.addOutput(tv3);
3766
3767 auto tv0_cache = tv0->cacheAfter();
3768 auto tv1_cache = tv1->cacheAfter();
3769 auto tv3_cache = tv3->cacheBefore();
3770
3771 // Do transformations; remember, transformations are replayed from outputs
3772 // to inputs. The statements don't have to be in this exact order.
3773 tv3->merge(1);
3774
3775 // Inner split is the vectorization width; the outer splits set up unroll/unswitch
3776 tv3->split(1, 2);
3777 tv3->split(0, 3);
3778 tv3->split(0, 1);
3779
3780 // [bidx, unswitch, unroll{3}, tidx, vectorize{2}]
3781
3782 // Parallelize TV3
3783 tv3->axis(0)->parallelize(ParallelType::BIDx);
3784 tv3->axis(1)->parallelize(ParallelType::Unswitch);
3785 tv3->axis(2)->parallelize(ParallelType::Unroll);
3786 tv3->axis(3)->parallelize(ParallelType::TIDx);
3787
3788 tv3->reorder({{4, 2}});
3789 // [bidx, unswitch, vectorize{2}, unroll{3}, tidx]
3790
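  // Replay tv3's transformations onto all other tensors in the fusion and
  // copy its parallelization.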
3791 TransformPropagatorWithCheck propagator(tv3);
3792 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
3793 scheduler_utils::parallelizeAllLike(tv3);
3794
3795 tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
3796 tv1_cache->axis(2)->parallelize(ParallelType::Vectorize);
3797 tv3->axis(2)->parallelize(ParallelType::Vectorize);
3798
3799 // For all inputs, computeAt into the output inline; temporaries should be
3800 // squeezed between them
3801 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
3802 tv1->computeAt(tv3, -1, ComputeAtMode::MostInlined);
3803
3804 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3805
3806 at::Tensor input1 = at::randn({64, 2, 128}, options);
3807 at::Tensor input2 = at::rand_like(input1);
3808 at::Tensor output = at::empty_like(input1);
3809
3810 FusionExecutor fe;
3811 fe.compileFusion(&fusion, {input1, input2});
3812 fe.runFusion({input1, input2}, {output});
3813
3814 at::Tensor tv2_ref = input2 + 2.0;
3815 at::Tensor output_ref = input1 + tv2_ref;
3816
3817 TORCH_CHECK(output_ref.equal(output));
3818}
3819
3820TEST_F(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) {
3821 auto fusion = std::make_unique<Fusion>();
3822 FusionGuard fg(fusion.get());
3823
3824 std::vector<int64_t> input_shape{32, 64, 8};
3825 const int kReductionAxis = 1;
3826
3827 auto tv0 = TensorViewBuilder()
3828 .ndims(input_shape.size())
3829 .dtype(DataType::Double)
3830 .build();
3831
3832 fusion->addInput(tv0);
3833
3834 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
3835 auto tv2 = sum(tv1, {2}); // Group 0
3836
3837 auto output = softmax(tv2, kReductionAxis); // Group 1
3838 fusion->addOutput(output);
3839
3840 auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0);
3841 at::Tensor at_x = at::randn(input_shape, options);
3842
3843 FusionExecutorCache executor_cache(std::move(fusion));
3844
3845 auto outputs = executor_cache.runFusionWithInputs({at_x});
3846
3847 auto t1 = at_x.add(1.0);
3848 auto t2 = t1.sum({2});
3849 auto t3 = at::_softmax(t2.to(at::kDouble), -1, false);
3850
3851 auto optimized_fusion = executor_cache.getMostRecentKernelRuntime();
3852 TORCH_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen");
3853 TORCH_CHECK(
3854 optimized_fusion->fusionSegments()->groups().size() == 2,
3855 "segmentation didn't happen as expected");
3856
3857 testValidate(
3858 executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__);
3859}
3860
3861TEST_F(NVFuserTest, FusionSwizzle1_CUDA) {
3862 Fusion fusion;
3863 FusionGuard fg(&fusion);
3864
3865 auto tv0 = makeSymbolicTensor(1);
3866 fusion.addInput(tv0);
3867 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3868 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
3869 fusion.addOutput(tv2);
3870
3871 tv2->split(0, 7);
3872 tv2->split(0, 9);
3873
3874 tv0->computeAt(tv2, 1);
3875
3876 tv2->axis(0)->parallelize(ParallelType::BIDx);
3877
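  // tv1 is written and read with transposed thread mappings; the transpose
  // swizzle permutes its shared-memory layout (roughly, to avoid bank
  // conflicts on the transposed access).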
3878 tv1->setMemoryType(MemoryType::Shared);
3879 tv1->swizzle(SwizzleType::Transpose, {1, 2});
3880
3881 tv1->axis(1)->parallelize(ParallelType::TIDx);
3882 tv1->axis(2)->parallelize(ParallelType::TIDy);
3883
3884 tv2->axis(1)->parallelize(ParallelType::TIDx);
3885 tv2->axis(2)->parallelize(ParallelType::TIDy);
3886
3887 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3888 at::Tensor t0 = at::randn({100}, options);
3889
3890 std::vector<IValue> aten_inputs = {t0};
3891
3892 FusionExecutor fe;
3893 fe.compileFusion(&fusion, aten_inputs);
3894 auto cg_outputs = fe.runFusion(aten_inputs);
3895
3896 auto aten_output = (t0 + 1) * 2;
3897
3898 testValidate(
3899 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3900}
3901
3902TEST_F(NVFuserTest, FusionSwizzle2_CUDA) {
3903 Fusion fusion;
3904 FusionGuard fg(&fusion);
3905
3906 auto tv0 = makeSymbolicTensor(1);
3907 fusion.addInput(tv0);
3908 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3909 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
3910 fusion.addOutput(tv2);
3911
3912 tv1->split(-1, 4);
3913 tv1->split(-2, 4);
3914
3915 tv2->split(-1, 4);
3916 tv2->split(-2, 4);
3917
3918 tv0->computeAt(tv2, 1);
3919
3920 tv2->reorder({{-1, -2}});
3921
3922 tv1->setMemoryType(MemoryType::Shared);
3923 tv1->swizzle(SwizzleType::Transpose, {-2, -1});
3924
3925 tv2->axis(0)->parallelize(ParallelType::BIDx);
3926 tv2->axis(-1)->parallelize(ParallelType::TIDx);
3927 tv2->axis(-2)->parallelize(ParallelType::TIDy);
3928 tv1->axis(-1)->parallelize(ParallelType::TIDx);
3929 tv1->axis(-2)->parallelize(ParallelType::TIDy);
3930
3931 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3932 at::Tensor t0 = at::randn({123}, options);
3933
3934 std::vector<IValue> aten_inputs = {t0};
3935
3936 FusionExecutor fe;
3937 fe.compileFusion(&fusion, aten_inputs);
3938 auto cg_outputs = fe.runFusion(aten_inputs);
3939
3940 auto aten_output = (t0 + 1) * 2;
3941
3942 testValidate(
3943 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3944}
3945
3946TEST_F(NVFuserTest, FusionGridPersistence_CUDA) {
3947 Fusion fusion;
3948 FusionGuard fg(&fusion);
3949
3950 auto tv0 = makeSymbolicTensor(1);
3951 fusion.addInput(tv0);
3952
3953 auto tv1 = sum(tv0, {0});
3954 auto tv2 = broadcast(tv1, {true});
3955 auto tv3 = add(tv0, tv2);
3956 fusion.addOutput(tv3);
3957
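  // All axes are bound to grid dimensions with no block-level parallelism,
  // so the reduction and its broadcast consumer communicate across the whole
  // grid: a grid-persistent kernel.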
3958 std::vector<TensorView*> tvs = {tv1, tv2, tv3};
3959 for (auto tv : tvs) {
3960 tv->split(0, 2);
3961 tv->axis(0)->parallelize(ParallelType::BIDx);
3962 tv->axis(1)->parallelize(ParallelType::BIDy);
3963 }
3964
3965 const int numel_x = 10;
3966
3967 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3968 at::Tensor input = at::randn({numel_x}, options);
3969
3970 FusionExecutor fe;
3971 fe.compileFusion(&fusion, {input});
3972 auto out = fe.runFusion({input});
3973
3974 auto aten_output = input.sum({0}).unsqueeze(-1).add(input);
3975
3976 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
3977}
3978
3979TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) {
3980 Fusion fusion;
3981 FusionGuard fg(&fusion);
3982
3983 auto tv0 = makeSymbolicTensor(2);
3984 fusion.addInput(tv0);
3985
3986 auto tv1 = sum(tv0, {0});
3987 auto tv2 = broadcast(tv1, {true, false});
3988 auto tv3 = add(tv0, tv2);
3989 fusion.addOutput(tv3);
3990
3991 std::vector<TensorView*> tvs = {tv1, tv2, tv3};
3992 for (auto tv : tvs) {
3993 tv->split(0, 2);
3994 tv->axis(0)->parallelize(ParallelType::BIDx);
3995 tv->axis(1)->parallelize(ParallelType::TIDy);
3996 tv->axis(2)->parallelize(ParallelType::TIDx);
3997 }
3998
3999 const int numel_x = 10;
4000 const int numel_y = 3;
4001
4002 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4003 at::Tensor input = at::randn({numel_x, numel_y}, options);
4004
4005 FusionExecutor fe;
4006 fe.compileFusion(&fusion, {input});
4007 auto out = fe.runFusion({input});
4008
4009 auto aten_output = input.sum({0}).unsqueeze(0).add(input);
4010
4011 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
4012}
4013
4014TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) {
4015 Fusion fusion;
4016 FusionGuard fg(&fusion);
4017
4018 auto tv0 = makeSymbolicTensor(1);
4019 fusion.addInput(tv0);
4020
4021 auto tvs = Welford(tv0, {0});
4022 auto tv4 = add(tvs.avg, tvs.var_sum);
4023 auto tv5 = broadcast(tv4, {true});
4024 auto tv6 = add(tv0, tv5);
4025 fusion.addOutput(tv6);
4026
4027 std::vector<TensorView*> schedule_tvs = {
4028 tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
4029
4030 for (auto tv : schedule_tvs) {
4031 tv->split(0, 2);
4032 tv->axis(0)->parallelize(ParallelType::BIDx);
4033 tv->axis(1)->parallelize(ParallelType::BIDy);
4034 }
4035
4036 const int numel_x = 10;
4037
4038 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4039 at::Tensor input = at::randn({numel_x}, options);
4040
4041 FusionExecutor fe;
4042 fe.compileFusion(&fusion, {input});
4043 auto out = fe.runFusion({input});
4044
4045 auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
4046 .unsqueeze(-1)
4047 .add(input);
4048
4049 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
4050}
4051
4052TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) {
4053 Fusion fusion;
4054 FusionGuard fg(&fusion);
4055
4056 auto tv0 = makeSymbolicTensor(2);
4057 fusion.addInput(tv0);
4058
4059 auto tvs = Welford(tv0, {0});
4060 auto tv4 = add(tvs.avg, tvs.var_sum);
4061 auto tv5 = broadcast(tv4, {true, false});
4062 auto tv6 = add(tv0, tv5);
4063 fusion.addOutput(tv6);
4064
4065 std::vector<TensorView*> schedule_tvs = {
4066 tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
4067 for (auto tv : schedule_tvs) {
4068 tv->split(0, 2);
4069 tv->axis(0)->parallelize(ParallelType::BIDx);
4070 tv->axis(1)->parallelize(ParallelType::TIDy);
4071 tv->axis(2)->parallelize(ParallelType::TIDx);
4072 }
4073 tv4->axis(0)->parallelize(ParallelType::TIDx);
4074
4075 const int numel_x = 10;
4076 const int numel_y = 3;
4077
4078 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4079 at::Tensor input = at::randn({numel_x, numel_y}, options);
4080
4081 FusionExecutor fe;
4082 fe.compileFusion(&fusion, {input});
4083 auto out = fe.runFusion({input});
4084
4085 auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
4086 .unsqueeze(0)
4087 .add(input);
4088
4089 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
4090}
4091
4092TEST_F(NVFuserTest, FusionIssue633_CUDA) {
4093 Fusion fusion;
4094 FusionGuard fg(&fusion);
4095
4096 const int dx = 10;
4097 const int dy = 11;
4098 const int dz = 12;
4099
4100 auto tv0 = makeConcreteTensor({dx, dy, dz});
4101 fusion.addInput(tv0);
4102 auto tv1 = makeConcreteTensor({dx, dy, 1});
4103 fusion.addInput(tv1);
4104 auto tv2 = add(tv0, tv1);
4105 fusion.addOutput(tv2);
4106
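  // tv1's innermost dimension has extent 1; merging it into the other dims
  // exercises the indexing/predication path from issue #633.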
4107 tv2->merge(1);
4108 tv2->merge(0);
4109 tv2->split(-1, 128);
4110
4111 tv2->axis(0)->parallelize(ParallelType::BIDx);
4112 tv2->axis(1)->parallelize(ParallelType::TIDx);
4113
4114 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4115 at::Tensor t0 = at::randn({dx, dy, dz}, options);
4116 at::Tensor t1 = at::randn({dx, dy, 1}, options);
4117 std::vector<IValue> aten_inputs = {t0, t1};
4118
4119 FusionExecutor fe;
4120 fe.compileFusion(&fusion, aten_inputs);
4121 auto cg_outputs = fe.runFusion(aten_inputs);
4122
4123 auto aten_output = t0 + t1;
4124
4125 testValidate(
4126 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4127}
4128
4129TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) {
4130 Fusion fusion;
4131 FusionGuard fg(&fusion);
4132
4133 std::vector<int64_t> shape{17, 19};
4134
4135 auto tv0 = makeSymbolicTensor(1);
4136 fusion.addInput(tv0);
4137 auto tv1 = makeSymbolicTensor(2);
4138 fusion.addInput(tv1);
4139 auto tv2 = broadcast(tv0, {false, true});
4140 auto tv3 = add(tv1, tv2);
4141 fusion.addOutput(tv3);
4142
4143 tv3->split(1, 128);
4144 tv0->computeAt(tv3, 2);
4145
4146 for (auto tv : {tv2, tv3}) {
4147 tv->axis(-1)->parallelize(ParallelType::TIDx);
4148 }
4149
4150 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4151 at::Tensor t0 = at::randn({shape[0]}, options);
4152 at::Tensor t1 = at::randn(shape, options);
4153 std::vector<IValue> aten_inputs = {t0, t1};
4154
4155 FusionExecutor fe;
4156 fe.compileFusion(&fusion, aten_inputs);
4157 auto cg_outputs = fe.runFusion(aten_inputs);
4158
4159 auto t3 = t0.unsqueeze(-1).expand(shape) + t1;
4160
4161 testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__);
4162}
4163
4164TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) {
4165 Fusion fusion;
4166 FusionGuard fg(&fusion);
4167
4168 auto tv0 = makeContigTensor(2);
4169 auto tv1 = makeContigTensor(2);
4170 fusion.addInput(tv0);
4171 fusion.addInput(tv1);
4172
4173 auto tv2 = add(tv0, tv1);
4174 fusion.addOutput(tv2);
4175
4176 const int kTDX = 64;
4177 const int kVecSize = 4;
4178 const int kNumElems = kTDX * kVecSize;
4179
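  // MisalignedVectorize relaxes Vectorize's base-address alignment
  // requirement. A minimal sketch of the idea (hypothetical shape of the
  // generated code, not actual codegen output):
  //   for (i = 0;    i < head; ++i)    out[i] = in[i];        // scalar head
  //   for (i = head; i < tail; i += 4) vec4(out + i, in + i); // aligned body
  //   for (i = tail; i < n;    ++i)    out[i] = in[i];        // scalar tail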
4180 tv2->split(1, kNumElems);
4181
4182 auto c0 = tv0->cacheAfter();
4183 auto c1 = tv1->cacheAfter();
4184 auto c2 = tv2->cacheBefore();
4185
4186 tv2->split(-1, kVecSize);
4187
4188 c0->computeAt(tv2, -2);
4189 c1->computeAt(tv2, -2);
4190
4191 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4192 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4193
4194 tv2->axis(0)->parallelize(ParallelType::BIDx);
4195 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4196 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4197
4198 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4199 const int bx = 128;
4200 const int by = 457;
4201 at::Tensor t0 = at::randn({bx, by}, options);
4202 at::Tensor t1 = at::randn({bx, by}, options);
4203
4204 std::vector<IValue> aten_inputs = {t0, t1};
4205
4206 FusionExecutor fe;
4207 fe.compileFusion(&fusion, aten_inputs);
4208 auto cg_outputs = fe.runFusion(aten_inputs);
4209
4210 auto aten_output = t0 + t1;
4211 testValidate(
4212 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4213}
4214
4215TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) {
4216 Fusion fusion;
4217 FusionGuard fg(&fusion);
4218
4219 auto tv0 = makeContigTensor(4);
4220 auto tv1 = makeContigTensor(4);
4221 fusion.addInput(tv0);
4222 fusion.addInput(tv1);
4223
4224 auto tv2 = add(tv0, tv1);
4225 fusion.addOutput(tv2);
4226
4227 tv2->reorder({{0, 1}, {1, 0}});
4228 tv2->merge(-2);
4229
4230 const int kTDX = 64;
4231 const int kVecSize = 2;
4232 const int kNumElems = kTDX * kVecSize;
4233
4234 tv2->split(-1, kNumElems);
4235
4236 auto c0 = tv0->cacheAfter();
4237 auto c1 = tv1->cacheAfter();
4238 auto c2 = tv2->cacheBefore();
4239
4240 tv2->split(0, 128);
4241 tv2->split(-1, kVecSize);
4242
4243 c0->computeAt(tv2, -2);
4244 c1->computeAt(tv2, -2);
4245
4246 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4247 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4248
4249 tv2->axis(0)->parallelize(ParallelType::BIDx);
4250 tv2->axis(1)->parallelize(ParallelType::BIDy);
4251 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4252 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4253
4254 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4255 const int n = 32;
4256 const int c = 127;
4257 const int h = 51;
4258 const int w = 23;
4259 at::Tensor t0 = at::randn({n, c, h, w}, options);
4260 at::Tensor t1 = at::randn({n, c, h, w}, options);
4261
4262 std::vector<IValue> aten_inputs = {t0, t1};
4263
4264 FusionExecutor fe;
4265 fe.compileFusion(&fusion, aten_inputs);
4266 auto cg_outputs = fe.runFusion(aten_inputs);
4267
4268 auto aten_output = t0 + t1;
4269 testValidate(
4270 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4271}
4272
4273TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) {
4274 Fusion fusion;
4275 FusionGuard fg(&fusion);
4276
4277 constexpr int kNumDims = 4;
4278 constexpr int kTDX = 64;
4279 constexpr int kVecSize = 2;
4280 constexpr int kNumElems = kTDX * kVecSize;
4281
4282 auto tv0 = makeSymbolicTensor(kNumDims);
4283 auto tv1 = makeSymbolicTensor(kNumDims);
4284 fusion.addInput(tv0);
4285 fusion.addInput(tv1);
4286
4287 auto tv2 = add(tv0, tv1);
4288 fusion.addOutput(tv2);
4289
4290 // Create caches for vectorization
4291 auto c0 = tv0->cacheAfter();
4292 auto c1 = tv1->cacheAfter();
4293 auto c2 = tv2->cacheBefore();
4294
4295 // Merge all dimensions together except inner-most dim
4296 for (const auto idx : c10::irange(kNumDims - 2)) {
4297 tv2->merge(0);
4298 }
4299 // Split inner-most dim
4300 tv2->split(-1, kNumElems);
4301 tv2->split(-1, kVecSize);
4302 TransformPropagatorWithCheck propagator(tv2);
4303 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
4304
4305 c0->computeAt(tv2, -2);
4306 c1->computeAt(tv2, -2);
4307
4308 // Parallelization Strategy
4309 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4310 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4311
4312 tv2->axis(0)->parallelize(ParallelType::BIDx);
4313 tv2->axis(2)->parallelize(ParallelType::TIDx);
4314 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4315
4316 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4317 const int n = 5;
4318 const int c = 3;
4319 const int h = 51;
4320 const int w = 257;
4321 at::Tensor t0 = at::randn({n, c, h, w}, options);
4322 at::Tensor t1 = at::randn({n, c, h, w}, options);
4323
4324 std::vector<IValue> aten_inputs = {t0, t1};
4325
4326 FusionExecutor fe;
4327 fe.compileFusion(&fusion, aten_inputs);
4328 auto cg_outputs = fe.runFusion(aten_inputs);
4329
4330 auto aten_output = t0 + t1;
4331 testValidate(
4332 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4333}
4334
4335TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) {
4336 Fusion fusion;
4337 FusionGuard fg(&fusion);
4338
4339 constexpr int kNumDims = 4;
4340 constexpr int kTDX = 64;
4341 constexpr int kVecSize = 2;
4342 constexpr int kNumElems = kTDX * kVecSize;
4343 std::vector<int64_t> bcast_shape{1, 1, 1, -1};
4344
4345 auto tv0 = makeContigTensor(kNumDims);
4346 auto tv1 = TensorViewBuilder().shape(bcast_shape).build();
4347 fusion.addInput(tv0);
4348 fusion.addInput(tv1);
4349
4350 auto tv2 = add(tv0, tv1);
4351 fusion.addOutput(tv2);
4352
4353 // Create caches for vectorization
4354 auto c0 = tv0->cacheAfter();
4355 auto c1 = tv1->cacheAfter();
4356 auto c2 = tv2->cacheBefore();
4357
4358 // Merge all dimensions together
4359 // Backward merge order is necessary for vectorize validation
4360 for (int idx = kNumDims - 1; idx > 0; --idx) {
4361 tv2->merge(idx - 1);
4362 }
4363 tv2->split(-1, kNumElems);
4364 tv2->split(-1, kVecSize);
4365 TransformPropagatorWithCheck propagator(tv2);
4366 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
4367
4368 c0->computeAt(tv2, -2);
4369 c1->computeAt(tv2, -2);
4370
4371 // Parallelization Strategy
4372 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4373 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4374
4375 tv2->axis(0)->parallelize(ParallelType::BIDx);
4376 tv2->axis(1)->parallelize(ParallelType::TIDx);
4377 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4378
4379 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4380 const int n = 32;
4381 const int c = 128;
4382 const int h = 51;
4383 const int w = 23;
4384 at::Tensor t0 = at::randn({n, c, h, w}, options);
4385 at::Tensor t1 = at::randn({1, 1, 1, w}, options);
4386
4387 std::vector<IValue> aten_inputs = {t0, t1};
4388
4389 FusionExecutor fe;
4390 // TODO: throw assertion - cannot merge non-contiguous vectorization axes
4391 // Make sure compilation fails
4392 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4393 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4394}
4395
4396TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) {
4397 Fusion fusion;
4398 FusionGuard fg(&fusion);
4399
4400 auto tv0 = makeContigTensor(2);
4401 auto tv1 = makeContigTensor(2);
4402
4403 fusion.addInput(tv0);
4404 fusion.addInput(tv1);
4405
4406 auto tv2 = add(tv0, tv1);
4407
4408 auto tv3 = sum(tv2, {-1});
4409
4410 fusion.addOutput(tv3);
4411
4412 auto c0 = tv0->cacheAfter();
4413 auto c1 = tv1->cacheAfter();
4414
4415 tv3->split(-1, 128 * 4);
4416 tv3->split(-1, 4);
4417 // Reduce outer dim first
4418 auto tv4 = tv3->rFactor({-3, -1});
4419 // tv3 will do the cross-thread (TIDx) reduction
4420
4421 tv0->computeAt(tv3, 1);
4422 tv1->computeAt(tv3, 1);
4423
4424 tv3->axis(0)->parallelize(ParallelType::BIDx);
4425
4426 tv0->computeAt(tv4, -2);
4427 tv1->computeAt(tv4, -2);
4428
4429 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4430 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4431
4432 tv4->axis(-2)->parallelize(ParallelType::TIDx);
4433 tv3->axis(1)->parallelize(ParallelType::TIDx);
4434
4435 tv2->computeAt(tv4, -1);
4436
4437 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4438 const int bx = 128;
4439 const int by = 2050;
4440 at::Tensor t0 = at::randn({bx, by}, options);
4441 at::Tensor t1 = at::randn({bx, by}, options);
4442
4443 std::vector<IValue> aten_inputs = {t0, t1};
4444
4445 FusionExecutor fe;
4446 fe.compileFusion(&fusion, aten_inputs);
4447 auto cg_outputs = fe.runFusion(aten_inputs);
4448
4449 auto aten_output = t0.add(t1).sum(1);
4450 testValidate(
4451 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4452}
4453
4454TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) {
4455 Fusion fusion;
4456 FusionGuard fg(&fusion);
4457
4458 auto tv0 = makeContigTensor(2);
4459 auto tv1 = makeContigTensor(2);
4460
4461 fusion.addInput(tv0);
4462 fusion.addInput(tv1);
4463
4464 auto tv2 = add(tv0, tv1);
4465 fusion.addOutput(tv2);
4466
4467 tv2->split(1, 16);
4468 tv2->split(1, 64);
4469
4470 tv2->axis(0)->parallelize(ParallelType::BIDx);
4471 tv2->axis(2)->parallelize(ParallelType::TIDx);
4472
4473 auto c0 = tv0->cacheAfter();
4474 auto c1 = tv1->cacheAfter();
4475 auto c2 = tv2->cacheBefore();
4476
4477 c0->computeAt(tv2, -2);
4478 c1->computeAt(tv2, -2);
4479
4480 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4481 for (auto tv : vectorized_tvs) {
4482 tv->split(-1, 4);
4483 // Vectorize the wrong dimension
4484 tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize);
4485 }
4486
4487 FusionExecutor fe;
4488 // Make sure compilation fails
4489 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4490 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4491}
4492
4493TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) {
4494 Fusion fusion;
4495 FusionGuard fg(&fusion);
4496
4497 auto tv0 = makeSymbolicTensor(2);
4498 auto tv1 = makeSymbolicTensor(2);
4499
4500 fusion.addInput(tv0);
4501 fusion.addInput(tv1);
4502
4503 auto tv2 = add(tv0, tv1);
4504 fusion.addOutput(tv2);
4505
4506 const int kTDX = 64;
4507 const int kVecSize = 4;
4508 const int kNumElems = kTDX * kVecSize;
4509
4510 tv2->split(1, kNumElems);
4511
4512 auto c0 = tv0->cacheAfter();
4513 auto c1 = tv1->cacheAfter();
4514
4515 tv2->split(-1, kVecSize);
4516
4517 c0->computeAt(tv2, -2);
4518 c1->computeAt(tv2, -2);
4519
4520 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4521 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4522
4523 tv2->axis(0)->parallelize(ParallelType::BIDx);
4524 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4525
4526 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4527 const int bx = 128;
4528 const int by = 2049;
4529 at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
4530 at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
4531 std::vector<IValue> aten_inputs = {t0, t1};
4532
4533 FusionExecutor fe;
4534 fe.compileFusion(&fusion, aten_inputs);
4535 auto cg_outputs = fe.runFusion(aten_inputs);
4536
4537 auto aten_output = t0 + t1;
4538 testValidate(
4539 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4540}
4541
4542TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) {
4543 Fusion fusion;
4544 FusionGuard fg(&fusion);
4545
4546 auto tv0 = makeSymbolicTensor(2);
4547 auto tv1 = makeSymbolicTensor(2);
4548
4549 fusion.addInput(tv0);
4550 fusion.addInput(tv1);
4551
4552 auto tv2 = add(tv0, tv1);
4553 fusion.addOutput(tv2);
4554
4555 const int kTDX = 64;
4556 const int kVecSize = 4;
4557 const int kNumElems = kTDX * kVecSize;
4558
4559 tv2->split(1, kNumElems);
4560
4561 auto c0 = tv0->cacheAfter();
4562 auto c1 = tv1->cacheAfter();
4563 auto c2 = tv2->cacheBefore();
4564
4565 tv2->split(-1, kVecSize);
4566
4567 c0->computeAt(tv2, -2);
4568 c1->computeAt(tv2, -2);
4569
4570 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4571 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4572
4573 tv2->axis(0)->parallelize(ParallelType::BIDx);
4574 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4575 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4576
4577 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4578 const int bx = 128;
4579 const int by = 2049;
4580 at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
4581 at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
4582 std::vector<IValue> aten_inputs = {t0, t1};
4583
4584 FusionExecutor fe;
4585 fe.compileFusion(&fusion, aten_inputs);
4586
4587 // Failure because the input and output tensors do not have the same stride
4588 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4589 ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
4590}
4591
4592TEST_F(NVFuserTest, FusionVectorization1_CUDA) {
4593 Fusion fusion;
4594 FusionGuard fg(&fusion);
4595
4596 auto tv0 = makeSymbolicTensor(2);
4597
4598 auto tv1 = makeSymbolicTensor(2);
4599 fusion.addInput(tv0);
4600 fusion.addInput(tv1);
4601
4602 auto tv2 = add(tv0, tv1);
4603 fusion.addOutput(tv2);
4604
4605 tv2->split(1, 16);
4606 tv2->split(1, 64);
4607
4608 tv2->axis(0)->parallelize(ParallelType::BIDx);
4609 tv2->axis(2)->parallelize(ParallelType::TIDx);
4610
4611 auto c0 = tv0->cacheAfter();
4612 auto c1 = tv1->cacheAfter();
4613 auto c2 = tv2->cacheBefore();
4614
4615 c0->computeAt(tv2, -2);
4616 c1->computeAt(tv2, -2);
4617
4618 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4619 for (auto tv : vectorized_tvs) {
4620 tv->split(-1, 4);
4621 tv->axis(-1)->parallelize(ParallelType::Vectorize);
4622 }
4623
4624 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4625 const int bx = 128;
4626 const int by = 2048;
4627 at::Tensor t0 = at::randn({bx, by}, options);
4628 at::Tensor t1 = at::randn({bx, by}, options);
4629
4630 std::vector<IValue> aten_inputs = {t0, t1};
4631
4632 FusionExecutor fe;
4633 fe.compileFusion(&fusion, aten_inputs);
4634 auto cg_outputs = fe.runFusion(aten_inputs);
4635
4636 auto aten_output = t0 + t1;
4637 testValidate(
4638 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4639}
4640
4641TEST_F(NVFuserTest, FusionVectorization2_CUDA) {
4642 Fusion fusion;
4643 FusionGuard fg(&fusion);
4644
4645 auto tv0 = makeSymbolicTensor(2);
4646
4647 auto tv1 = makeSymbolicTensor(2);
4648 fusion.addInput(tv0);
4649 fusion.addInput(tv1);
4650
4651 auto tv2 = add(tv0, tv1);
4652 fusion.addOutput(tv2);
4653
4654 tv2->split(1, 16);
4655 tv2->split(1, 64);
4656
4657 tv2->axis(0)->parallelize(ParallelType::BIDx);
4658 tv2->axis(2)->parallelize(ParallelType::TIDx);
4659
4660 auto c0 = tv0->cacheAfter();
4661 auto c1 = tv1->cacheAfter();
4662 auto c2 = tv2->cacheBefore();
4663
4664 c0->computeAt(tv2, -2);
4665 c1->computeAt(tv2, -2);
4666
4667 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4668 for (auto tv : vectorized_tvs) {
4669 tv->split(-1, 4);
4670 // Vectorize the wrong dimension
4671 tv->axis(-2)->parallelize(ParallelType::Vectorize);
4672 }
4673
4674 FusionExecutor fe;
4675 // Make sure compilation fails
4676 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4677 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4678}
4679
4680TEST_F(NVFuserTest, FusionVectorization3_CUDA) {
4681 Fusion fusion;
4682 FusionGuard fg(&fusion);
4683
4684 auto tv0 = makeSymbolicTensor(2);
4685
4686 auto tv1 = makeSymbolicTensor(2);
4687 fusion.addInput(tv0);
4688 fusion.addInput(tv1);
4689
4690 auto tv2 = add(tv0, tv1);
4691 fusion.addOutput(tv2);
4692
4693 tv2->split(1, 16);
4694 tv2->split(1, 64);
4695
4696 tv2->axis(0)->parallelize(ParallelType::BIDx);
4697 tv2->axis(2)->parallelize(ParallelType::TIDx);
4698
4699 auto c0 = tv0->cacheAfter();
4700 auto c1 = tv1->cacheAfter();
4701 auto c2 = tv2->cacheBefore();
4702
4703 c0->computeAt(tv2, -2);
4704 c1->computeAt(tv2, -2);
4705
4706 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4707 for (auto tv : vectorized_tvs) {
4708 tv->split(-1, 4);
4709 tv->axis(-1)->parallelize(ParallelType::Vectorize);
4710 }
4711
4712 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4713 const int bx = 128;
4714 const int by = 2049;
4715 at::Tensor t0 = at::randn({bx, by}, options);
4716 at::Tensor t1 = at::randn({bx, by}, options);
4717 std::vector<IValue> aten_inputs = {t0, t1};
4718
4719 FusionExecutor fe;
4720 fe.compileFusion(&fusion, aten_inputs);
4721 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4722 ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
4723
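  // by = 2049 is not divisible by the vector width 4, so this run must
  // throw. Offsetting the base pointer by one element (below) breaks the
  // 16-byte alignment of the 4-wide accesses, so that run throws as well.
  // Slicing from offset 4 keeps the base aligned and the extent (2044)
  // divisible by 4, so the final run succeeds.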
4724 aten_inputs[0] = t0.index({"...", Slice(1)});
4725 aten_inputs[1] = t1.index({"...", Slice(1)});
4726 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4727 ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
4728
4729 t0 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
4730 t1 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
4731 aten_inputs = {t0, t1};
4732 auto cg_outputs = fe.runFusion(aten_inputs);
4733
4734 auto aten_output = t0 + t1;
4735 testValidate(
4736 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4737}
4738
4739TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) {
4740 Fusion fusion;
4741 FusionGuard fg(&fusion);
4742
4743 auto tv0 = makeSymbolicTensor(2);
4744
4745 auto tv1 = makeSymbolicTensor(2);
4746 fusion.addInput(tv0);
4747 fusion.addInput(tv1);
4748
4749 auto tv2 = add(tv0, tv1);
4750
4751 auto tv3 = sum(tv2, {-1});
4752
4753 fusion.addOutput(tv3);
4754
4755 tv3->split(-1, 128 * 4);
4756 tv3->split(-1, 4);
4757 // Reduce outer dim first
4758 auto tv4 = tv3->rFactor({-3, -1});
4759 // tv3 will do the cross-thread (TIDx) reduction
4760
4761 auto tv6 = tv0->cacheAfter();
4762 auto tv7 = tv1->cacheAfter();
4763
4764 tv0->computeAt(tv3, 1);
4765 tv1->computeAt(tv3, 1);
4766
4767 tv3->axis(0)->parallelize(ParallelType::BIDx);
4768
4769 tv0->computeAt(tv4, -2);
4770 tv1->computeAt(tv4, -2);
4771
4772 tv6->axis(-1)->parallelize(ParallelType::Vectorize);
4773 tv7->axis(-1)->parallelize(ParallelType::Vectorize);
4774
4775 tv4->axis(-2)->parallelize(ParallelType::TIDx);
4776 tv3->axis(1)->parallelize(ParallelType::TIDx);
4777
4778 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4779 const int bx = 128;
4780 const int by = 2048;
4781 at::Tensor t0 = at::randn({bx, by}, options);
4782 at::Tensor t1 = at::randn({bx, by}, options);
4783
4784 std::vector<IValue> aten_inputs = {t0, t1};
4785
4786 FusionExecutor fe;
4787 fe.compileFusion(&fusion, aten_inputs);
4788 auto cg_outputs = fe.runFusion(aten_inputs);
4789
4790 auto aten_output = t0.add(t1).sum(1);
4791 testValidate(
4792 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4797}
4798
4799// Unswitched loops with extent one may omit the else clause.
4800TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) {
4801 Fusion fusion;
4802 FusionGuard fg(&fusion);
4803
4804 // Progressively broadcast tensors
4805 TensorView* tv0 = makeSymbolicTensor(1);
4806 fusion.addInput(tv0);
4807 TensorView* tv1 = makeSymbolicTensor(2);
4808 fusion.addInput(tv1);
4809 TensorView* tv2 = makeSymbolicTensor(3);
4810 fusion.addInput(tv2);
4811
4812 TensorView* tv3 = broadcast(tv0, {false, true});
4813 TensorView* tv4 = add(tv3, tv1);
4814 TensorView* tv5 = add(tv4, tv2);
4815
4816 fusion.addOutput(tv5);
4817
4818 // Split inner dimension
4819 tv5->split(1, 8);
4820 // Merge middle dims with outer dimensions
4821 tv5->merge(2);
4822 tv5->merge(0);
4823
4824 // tv5[I0*I1o, I1i*I2]
4825 // Get a dim of size 1 to unswitch
4826 tv5->split(0, 1, false);
4827
4828 // Compute everything inline
4829 tv0->computeAt(tv5, -1);
4830
4831 tv5->axis(0)->parallelize(ParallelType::Unswitch);
4832 tv5->axis(1)->parallelize(ParallelType::BIDx);
4833 tv5->axis(2)->parallelize(ParallelType::TIDx);
4834
4835 // Make sure the unswitched loop does not have an else clause.
4836 GpuLower gpulw(&fusion);
4837 TORCH_CHECK(!UnswitchInElseChecker::check(gpulw));
4838
4839 const int x = 11;
4840 const int y = 12;
4841 const int z = 13;
4842 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4843 at::Tensor t0 = at::randn({x}, options);
4844 at::Tensor t1 = at::randn({x, y}, options);
4845 at::Tensor t2 = at::randn({z, x, y}, options);
4846 std::vector<IValue> aten_inputs = {t0, t1, t2};
4847
4848 FusionExecutor fe;
4849 fe.compileFusion(&fusion, aten_inputs);
4850 auto cg_outputs = fe.runFusion(aten_inputs);
4851 auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2;
4852
4853 testValidate(&fusion, cg_outputs, aten_inputs, {t6}, __LINE__, __FILE__);
4854}
4855
4856// The unswitched loop has extent one but inner loops don't. The else
4857// part should not be omitted.
4858TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) {
4859 Fusion fusion;
4860 FusionGuard fg(&fusion);
4861
4862 const int x = 15;
4863 auto tv0 = makeConcreteTensor({x});
4864 fusion.addInput(tv0);
4865
4866 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4867 fusion.addOutput(tv1);
4868
4869 tv1->split(-1, 4);
4870 tv1->split(-2, 1);
4871
4872 tv1->axis(-2)->parallelize(ParallelType::Unswitch);
4873
4874 // Make sure the size-one unswitched loop does not omit the else clause.
4875 GpuLower gpulw(&fusion);
4876 TORCH_CHECK(UnswitchInElseChecker::check(gpulw));
4877
4878 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4879 at::Tensor t0 = at::randn({x}, options);
4880 std::vector<IValue> aten_inputs = {t0};
4881
4882 FusionExecutor fe;
4883 fe.compileFusion(&fusion, aten_inputs);
4884 auto cg_outputs = fe.runFusion(aten_inputs);
4885 auto t1 = t0 + 1;
4886
4887 testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__);
4888}
4889
4890TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) {
4891 Fusion fusion;
4892 FusionGuard fg(&fusion);
4893
4894 auto tv0 = makeSymbolicTensor(1);
4895 fusion.addInput(tv0);
4896
4897 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4898 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4899 fusion.addOutput(tv2);
4900
4901 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4902 tv2->axis(-1)->parallelize(ParallelType::TIDy);
4903
4904 // Invalid as tv1 and tv2 do not have the same ParallelType
4905 FusionExecutor fe;
4906 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4907 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4908}
4909
4910TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) {
4911 Fusion fusion;
4912 FusionGuard fg(&fusion);
4913
4914 auto tv0 = makeSymbolicTensor(1);
4915 fusion.addInput(tv0);
4916
4917 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4918 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4919 fusion.addOutput(tv2);
4920
4921 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4922 tv2->axis(-1)->parallelize(ParallelType::TIDy);
4923 tv1->setMemoryType(MemoryType::Shared);
4924
4925 // tv1 and tv2 do not have the same ParallelType, but tv1 is on shared
4926 // memory, so it is valid
4927 FusionExecutor fe;
4928 fe.compileFusion(&fusion);
4929}
4930
4931TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) {
4932 Fusion fusion;
4933 FusionGuard fg(&fusion);
4934
4935 auto tv0 = makeSymbolicTensor(1);
4936 fusion.addInput(tv0);
4937
4938 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4939 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4940 fusion.addOutput(tv2);
4941
4942 tv1->split(-1, 4);
4943 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4944 tv2->split(-1, 4);
4945 tv2->axis(-1)->parallelize(ParallelType::TIDx);
4946
4947 tv1->setMemoryType(MemoryType::Global);
4948
4949 // tv1 and tv2 have the same shape and ParallelType
4950 FusionExecutor fe;
4951 fe.compileFusion(&fusion);
4952}
4953
4954TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) {
4955 Fusion fusion;
4956 FusionGuard fg(&fusion);
4957
4958 auto tv0 = makeSymbolicTensor(1);
4959 fusion.addInput(tv0);
4960
4961 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4962 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4963 fusion.addOutput(tv2);
4964
4965 tv1->split(-1, 4);
4966 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4967 tv2->split(-1, 8);
4968 tv2->axis(-1)->parallelize(ParallelType::TIDx);
4969
4970 tv1->setMemoryType(MemoryType::Global);
4971
4972 // tv1 and tv2 do not have the same split, but tv1 is in global memory, so it is valid
4973 FusionExecutor fe;
4974 fe.compileFusion(&fusion);
4975}
4976
4977TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) {
4978 Fusion fusion;
4979 FusionGuard fg(&fusion);
4980
4981 auto tv0 = makeSymbolicTensor(1);
4982 fusion.addInput(tv0);
4983
4984 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4985 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4986 fusion.addOutput(tv2);
4987
4988 tv1->split(-1, 4);
4989 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4990 tv1->setMemoryType(MemoryType::Shared);
4991
4992 tv2->split(-1, 8);
4993 tv2->axis(-1)->parallelize(ParallelType::TIDx);
4994
4995 // tv1 and tv2 do not have the same shape, but tv1 is on shared
4996 // memory, so it is valid
4997 FusionExecutor fe;
4998 fe.compileFusion(&fusion);
4999}
5000
5001// See issue #995
5002TEST_F(NVFuserTest, FusionValidateParallelize6_CUDA) {
5003 Fusion fusion;
5004 FusionGuard fg(&fusion);
5005
5006 int64_t W = 5, X = 6, Y = 7, Z = 8;
5007
5008 auto tv0 = makeConcreteTensor({X, Y, Z});
5009 auto tv1 = makeConcreteTensor({W, X, Y, Z});
5010 fusion.addInput(tv0);
5011 fusion.addInput(tv1);
5012
5013 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
5014 auto tv3 = broadcast(tv2, {true, false, false, false});
5015 auto tv4 = add(tv3, tv1);
5016 fusion.addOutput(tv4);
5017
5018 tv4->merge(0);
5019 tv4->merge(0);
5020 tv4->merge(0);
5021 tv4->split(0, 4);
5022 tv4->split(0, 3);
5023 tv4->split(0, 2);
5024
5025 TransformPropagatorWithCheck propagator(tv4);
5026 MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
5027
5028 tv0->computeAt(tv2, 2);
5029 tv3->computeAt(tv4, 2);
5030
5031 tv4->axis(0)->parallelize(ParallelType::BIDx);
5032 tv4->axis(-1)->parallelize(ParallelType::TIDx);
5033 tv2->axis(0)->parallelize(ParallelType::BIDx);
5034 tv2->axis(-1)->parallelize(ParallelType::TIDx);
5035 tv3->axis(-1)->parallelize(ParallelType::TIDx);
5036
5037 // Validation should throw an exception saying the first axes of tv2
5038 // and tv3 have incompatible parallelization. See also issue #995.
5039 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
5040 ASSERT_ANY_THROW(fusion.printKernel());
5041}
5042
5043// Repro of #2046
5044TEST_F(NVFuserTest, FusionValidateParallelize7_CUDA) {
5045 Fusion fusion;
5046 FusionGuard fg(&fusion);
5047
5048 auto tv0 = makeSymbolicTensor(2);
5049 fusion.addInput(tv0);
5050
5051 auto tv1 = set(tv0);
5052 auto tv2 = set(tv1);
5053 auto tv3 = set(tv1);
5054 fusion.addOutput(tv2);
5055 fusion.addOutput(tv3);
5056
5057 tv1->setMemoryType(MemoryType::Global);
5058
5059 tv1->axis(0)->parallelize(ParallelType::BIDx);
5060 tv1->axis(1)->parallelize(ParallelType::TIDx);
5061
5062 tv2->axis(1)->parallelize(ParallelType::TIDy);
5063 tv3->axis(0)->parallelize(ParallelType::BIDx);
5064
5065 // tv2 uses tv1 but is not parallelized with BIDx, so a grid sync is
5066 // required. It should be placed as a top-level expression.
5067
5068 GpuLower gpulw(&fusion);
5069 TORCH_CHECK(
5070 std::any_of(
5071 gpulw.kernel()->topLevelExprs().begin(),
5072 gpulw.kernel()->topLevelExprs().end(),
5073 [](Expr* expr) { return expr->isA<kir::GridSync>(); }),
5074 "Grid sync not found");
5075}
5076
5077TEST_F(NVFuserTest, FusionDAGMerging_CUDA) {
5078 Fusion fusion;
5079 FusionGuard fg(&fusion);
5080
5081 auto tv0 = makeSymbolicTensor(5);
5082 auto tv1 = makeSymbolicTensor(1);
5083 fusion.addInput(tv0);
5084 fusion.addInput(tv1);
5085
5086 // Branch 0
5087 auto tv2 = sum(tv0, {0}); // 0
5088 auto tv3 = sum(tv2, {0}); // 1
5089 auto tv4 = sum(tv3, {0}); // 2
5090 auto tv5 = sum(tv4, {0}); // 3
5091
5092 // Branch 1
5093 auto tv6 = add(tv1, IrBuilder::create<Double>(1)); // 4
5094
5095 // Merge
5096 auto tv7 = add(tv6, tv5); // 5
5097
5098 // Maximum expected output groups (can improve over time):
5099 // {0}, {1}, {2}, {3,4,5}
5100 // without final merge would have been {0}, {1}, {2}, {3,4}, {5}
5101
5102 fusion.addOutput(tv7);
5103
5104 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5105 at::Tensor t0 = at::randn({2, 2, 2, 2, 2}, options);
5106 at::Tensor t1 = at::randn({2}, options);
5107
5108 std::vector<at::Tensor> aten_inputs = {t0, t1};
5109
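  // Wrap the ATen inputs in a KernelArgumentHolder so the segmenter can
  // query runtime sizes and the device.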
5110 KernelArgumentHolder args(KernelIndexMode::INT32);
5111 args.setDeviceIndex(0);
5112 args.push(aten_inputs);
5113
5114 auto fusion_segments = fusion.segment(args);
5115 TORCH_CHECK(fusion_segments->groups().size() <= 4);
5116}
5117
5118TEST_F(NVFuserTest, FusionDAGScalarMerging_CUDA) {
5119 auto fusion = std::make_unique<Fusion>();
5120 FusionGuard fg(fusion.get());
5121
5122 auto tv0 = makeSymbolicTensor(3);
5123 auto i0 = IrBuilder::create<Double>();
5124
5125 fusion->addInput(tv0);
5126 fusion->addInput(i0);
5127
5128 auto i1 = add(i0, IrBuilder::create<Double>(1.0));
5129 auto i2 = mul(i1, i1);
5130 auto i3 = add(i2, i1);
5131
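  // i1/i2/i3 are scalar expressions consumed on both sides of the segment
  // boundary; the segmenter should duplicate them into each kernel rather
  // than let them prevent the split, so two groups are still expected.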
5132 // Branch 0
5133 auto tv1 = sum(tv0, {0}); // 0
5134 auto tv2 = add(tv1, i2);
5135 // Branch 1
5136 auto tv3 = sum(tv2, {0}); // 1
5137 auto tv4 = add(tv3, i3);
5138
5139 auto tv5 = add(tv4, i0);
5140
5141 fusion->addOutput(tv5);
5142
5143 FusionExecutorCache executor_cache(std::move(fusion));
5144
5145 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5146 at::Tensor t0 = at::randn({16, 16, 16}, options);
5147 double s0 = 0.5;
5148
5149 auto s1 = s0 + 1.0;
5150 auto s2 = s1 * s1;
5151 auto s3 = s2 + s1;
5152 auto t1 = t0.sum({0});
5153 auto t2 = t1 + s2;
5154 auto t3 = sum(t2, {0});
5155 auto t4 = t3 + s3;
5156 auto t5 = t4 + s0;
5157
5158 auto outputs = executor_cache.runFusionWithInputs({t0, s0});
5159
5160 TORCH_CHECK(
5161 executor_cache.getMostRecentKernelRuntime()->isSegmented(),
5162 "segmentation didn't happen");
5163 TORCH_CHECK(
5164 executor_cache.getMostRecentKernelRuntime()
5165 ->fusionSegments()
5166 ->groups()
5167 .size() == 2,
5168 "segmentation didn't happen as expected");
5169
5170 testValidate(
5171 executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__);
5172}
5173
5174TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) {
5175 Fusion fusion;
5176 FusionGuard fg(&fusion);
5177
5178 constexpr int M = 10;
5179 constexpr int N = 20;
5180 constexpr int K = 20;
5181
5182 auto tv0 = makeSymbolicTensor(3);
5183 auto tv1 = sum(tv0, {1, 2});
5184 fusion.addInput(tv0);
5185 fusion.addOutput(tv1);
5186
5187 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5188 tv1->axis(0)->parallelize(ParallelType::BIDx);
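  // tv1 is [I0, R1, R2]: only the innermost reduction axis is bound to TIDx
  // and the middle reduction axis stays serial, so the block reduction runs
  // inside a serial loop, as the test name says.
  // Roughly: for (i0 = BIDx) { for (r1 = 0..N) { blockReduce over TIDx } }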
5189
5190 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5191 at::manual_seed(0);
5192 at::Tensor t0 = at::randn({M, N, K}, options);
5193 std::vector<IValue> aten_inputs = {t0};
5194
5195 FusionExecutor fe;
5196 fe.compileFusion(&fusion, aten_inputs);
5197 auto outputs = fe.runFusion(aten_inputs);
5198 at::Tensor aten_output = t0.sum({1, 2});
5199 testValidate(
5200 &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
5201}
5202
5203TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) {
5204 Fusion fusion;
5205 FusionGuard fg(&fusion);
5206
5207 constexpr int M = 10;
5208 constexpr int N = 20;
5209 constexpr int K = 20;
5210
5211 auto tv0 = makeSymbolicTensor(3);
5212 auto tvs = Welford(tv0, {1, 2});
5213 fusion.addInput(tv0);
5214 auto tv_avg = tvs.avg;
5215 auto tv_M2 = tvs.var_sum;
5216 auto tv_N = tvs.n;
5217 fusion.addOutput(tv_avg);
5218 fusion.addOutput(tv_M2);
5219
5220 tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
5221 tv_avg->axis(0)->parallelize(ParallelType::BIDx);
5222
5223 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5224 at::manual_seed(0);
5225 at::Tensor t0 = at::randn({M, N, K}, options);
5226 std::vector<IValue> aten_inputs = {t0};
5227
5228 FusionExecutor fe;
5229 fe.compileFusion(&fusion, aten_inputs);
5230 auto outputs = fe.runFusion(aten_inputs);
5231 at::Tensor aten_avg = t0.mean({1, 2});
5232 at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K;
5233 testValidate(
5234 &fusion, outputs, aten_inputs, {aten_avg, aten_M2}, __LINE__, __FILE__);
5235}
5236
5237// See Issue #716
5238TEST_F(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) {
5239 Fusion fusion;
5240 FusionGuard fg(&fusion);
5241
5242 constexpr int M = 10;
5243 constexpr int N = 11;
5244
5245 auto tv0 = makeSymbolicTensor(1);
5246 fusion.addInput(tv0);
5247
5248 std::vector<int> reduction_axes = {1};
5249 std::vector<bool> broadcast_mask = {false, true};
5250
5251 auto tv0_bcast = broadcast(tv0, broadcast_mask);
5252 auto path1_bcast = add(tv0_bcast, IrBuilder::create<Double>(1.0));
5253 auto path1 = sum(path1_bcast, reduction_axes);
5254 fusion.addOutput(path1);
5255
5256 path1->split(1, 1);
5257 path1->rFactor({1});
5258 path1->axis(0)->parallelize(ParallelType::BIDx);
5259 tv0->computeAt(path1, 1);
5260
5261 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5262 at::manual_seed(0);
5263 at::Tensor t0 = at::randn({M}, options);
5264 at::Tensor t0_ref = t0.clone();
5265 std::vector<IValue> aten_inputs = {t0};
5266
5267 FusionExecutor fe;
5268 fe.compileFusion(&fusion, aten_inputs);
5269
5270 // in-place op: t0 is reused as the output buffer, so it becomes t0 + 1
5271 auto outputs = fe.runFusion(aten_inputs, {t0});
5272
5273 TORCH_CHECK(outputs[0].allclose(t0_ref.add(1)));
5274}
5275
5276TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) {
5277 Fusion fusion;
5278 FusionGuard fg(&fusion);
5279
5280 auto tv0 = makeSymbolicTensor(2);
5281 fusion.addInput(tv0);
5282 auto tv1 = sum(tv0, {0});
5283 fusion.addOutput(tv1);
5284
5285 auto tv2 = tv0->cacheAfter();
5286
5287 const int bdimx = 128;
5288 tv1->split(1, bdimx);
5289 tv1->split(1, 4);
5290 tv1->split(1, 1);
5291
5292 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5293 tv1->axis(2)->parallelize(ParallelType::Unroll);
5294 tv1->split(0, 10);
5295 tv0->computeAt(tv1, 4);
5296
5297 tv2->axis(-1)->parallelize(ParallelType::TIDx);
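  // With the unrolled split of the reduction, out-of-bounds iterations must
  // be predicated away so they do not contribute to the partial sums.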
5298
5299 int numel_x = 650;
5300 int numel_y = 102;
5301
5302 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5303 at::Tensor input = at::randn({numel_x, numel_y}, options);
5304 at::Tensor cg_output = at::empty({numel_y}, options);
5305
5306 FusionExecutor fe;
5307 fe.compileFusion(&fusion, {input});
5308 fe.runFusion({input}, {cg_output});
5309
5310 auto aten_output = input.to(at::kDouble).sum({0});
5311
5312 testValidate(
5313 &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
5314}
5315
5316TEST_F(NVFuserTest, FusionIssue728_CUDA) {
5317 Fusion fusion;
5318 FusionGuard fg(&fusion);
5319
5320 auto tv0 = makeSymbolicTensor(1);
5321 fusion.addOutput(tv0);
5322 auto tv1 = makeSymbolicTensor(1);
5323 fusion.addOutput(tv1);
5324 auto tv2 = makeSymbolicTensor(1);
5325 fusion.addOutput(tv2);
5326
5327 auto tv3 = add(tv0, IrBuilder::create<Double>(1));
5328 auto tv4 = add(tv3, tv1);
5329 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
5330 auto tv6 = add(tv2, IrBuilder::create<Double>(1));
5331 fusion.addOutput(tv5);
5332 fusion.addOutput(tv6);
5333
5334 // tv0 -> tv3 -+
5335 // tv1 --------+-> tv4 -> tv5
5336 //
5337 // tv2 -> tv6
5338
5339 auto all_vals_under_tv3 =
5340 DependencyCheck::getAllValsBetween({tv3}, fusion.outputs());
5341 std::unordered_set<Val*> included_tensors({tv3, tv4, tv5});
5342 for (auto tv : included_tensors) {
5343 TORCH_CHECK(
5344 std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) !=
5345 all_vals_under_tv3.end(),
5346 "TV",
5347 tv->name(),
5348 " not found");
5349 }
5350 for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
5351 if (included_tensors.find(tv) == included_tensors.end()) {
5352 TORCH_CHECK(
5353 std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) ==
5354 all_vals_under_tv3.end(),
5355 "TV",
5356 tv->name(),
5357 " should not be found");
5358 }
5359 }
5360
5361 auto no_dependency = DependencyCheck::getAllValsBetween({}, fusion.outputs());
5362 TORCH_CHECK(no_dependency.empty(), "No val should be returned");
5363
5364 auto no_dep_path = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv6});
5365 TORCH_CHECK(no_dep_path.empty(), "No val should be returned");
5366
5367 auto no_dep_path2 = DependencyCheck::getAllValsBetween({tv2}, {tv5});
5368 TORCH_CHECK(no_dep_path2.empty(), "No val should be returned");
5369
5370 auto just_tv3 = DependencyCheck::getAllValsBetween({tv3}, {tv3});
5371 TORCH_CHECK(
5372 just_tv3.size() == 1 && *(just_tv3.begin()) == tv3,
5373 "Only tv3 should be included");
5374}
5375
5376TEST_F(NVFuserTest, FusionIssue757_CUDA) {
5377 Fusion fusion;
5378 FusionGuard fg(&fusion);
5379
5380 auto tv0 = makeSymbolicTensor(2);
5381 fusion.addInput(tv0);
5382 auto tv1 = sum(tv0, {1});
5383 auto tv2 = broadcast(tv1, {false, true});
5384 auto tv3 = makeSymbolicTensor(2);
5385 fusion.addInput(tv3);
5386 auto tv4 = add(tv2, tv3);
5387 fusion.addOutput(tv4);
5388
5389 tv1->computeAt(tv4, -1);
5390
5391 tv2->axis(-1)->parallelize(ParallelType::TIDx);
5392 tv4->axis(-1)->parallelize(ParallelType::TIDx);
5393 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5394
5395 int numel_x = 650;
5396 int numel_y = 102;
5397
5398 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5399 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
5400 at::Tensor t3 = at::randn({numel_x, numel_y}, options);
5401 std::vector<IValue> inputs = {t0, t3};
5402
5403 FusionExecutor fe;
5404 fe.compileFusion(&fusion, inputs);
5405 auto outputs = fe.runFusion(inputs);
5406
5407 auto t1 = t0.sum({1});
5408 auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
5409 auto t4 = t2 + t3;
5410
5411 testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
5412}
5413
5414// See issue #759
5415TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) {
5416 Fusion fusion;
5417 FusionGuard fg(&fusion);
5418
5419 auto tv0 = makeSymbolicTensor(2);
5420 fusion.addInput(tv0);
5421 auto tv1 = sum(tv0, {1});
5422 auto tv2 = broadcast(tv1, {false, true});
5423 auto tv3 = makeSymbolicTensor(2);
5424 fusion.addInput(tv3);
5425 auto tv4 = add(tv2, tv3);
5426 fusion.addOutput(tv4);
5427
5428 tv4->split(0, 4);
5429 tv1->computeAt(tv4, -1);
5430
5431 tv2->axis(-1)->parallelize(ParallelType::TIDx);
5432 tv2->axis(1)->parallelize(ParallelType::TIDy);
5433 tv4->axis(-1)->parallelize(ParallelType::TIDx);
5434 tv4->axis(1)->parallelize(ParallelType::TIDy);
5435 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5436
5437 int numel_x = 100;
5438 int numel_y = 101;
5439
5440 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5441 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
5442 at::Tensor t3 = at::randn({numel_x, numel_y}, options);
5443 std::vector<IValue> inputs = {t0, t3};
5444
5445 FusionExecutor fe;
5446 fe.compileFusion(&fusion, inputs);
5447 auto outputs = fe.runFusion(inputs);
5448
5449 auto t1 = t0.sum({1});
5450 auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
5451 auto t4 = t2 + t3;
5452
5453 testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
5454}
5455
5456TEST_F(NVFuserTest, FusionSegmentVerticalMerge_CUDA) {
5457 auto fusion = std::make_unique<Fusion>();
5458 FusionGuard fg(fusion.get());
5459
5460 auto tv0 = makeSymbolicTensor(3);
5461
5462 fusion->addInput(tv0);
5463 // {first kernel}
5464 auto tv1 = sum(tv0, {0});
5465 auto tv2 = add(tv1, tv0);
5466 auto tv3 = sum(tv2, {0});
5467 auto tv4 = add(tv3, tv0);
5468 auto tv5 = sum(tv4, {0});
5469 auto tv6 = sum(tv5, {0});
5470 // {second kernel}
5471 auto tv7 = add(tv6, tv5);
5472 auto tv8 = add(tv7, tv5);
5473 auto tv9 = sum(tv8, {0});
5474
5475 fusion->addOutput(tv9);
5476
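  // Disable the Herrmann and final merge passes so only basic vertical
  // (producer-consumer) merging runs; the fusion should then segment
  // into exactly the two kernels marked above.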
5477 SegmentCandidateFinderOptions segment_options;
5478 segment_options.run_herrmann_merge = false;
5479 segment_options.run_final_merge = false;
5480
5481 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5482 at::Tensor t0 = at::randn({2, 2, 2}, options);
5483
5484 KernelArgumentHolder args(KernelIndexMode::INT32);
5485 args.setDeviceIndex(0);
5486 args.push(t0);
5487
5488 auto segmented_fusion =
5489 SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
5490
5491 TORCH_CHECK(segmented_fusion->groups().size() == 2);
5492}
5493
5494TEST_F(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) {
5495 auto fusion = std::make_unique<Fusion>();
5496 FusionGuard fg(fusion.get());
5497
5498 auto tv0 = makeSymbolicTensor(3);
5499 auto i0 = IrBuilder::create<Double>();
5500
5501 fusion->addInput(tv0);
5502 fusion->addInput(i0);
5503
5504 // Branch 0 {first kernel}
5505 auto tv1 = sum(tv0, {0});
5506 auto tv2 = add(tv0, i0);
5507 auto tv3 = unaryOp(UnaryOpType::Rsqrt, tv2);
5508 auto tv4 = sum(tv3, {0});
5509
5510 // Branch 1 {first kernel}
5511 auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv3);
5512 auto tv6 = sum(tv5, {0});
5513
5514 // Incompatible {second kernel}
5515 auto tv7 = sum(tv6, {0});
5516
5517 fusion->addOutput(tv1);
5518 fusion->addOutput(tv4);
5519 fusion->addOutput(tv7);
5520
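  // With only basic merging enabled, branches 0 and 1 should merge
  // horizontally into the first kernel, while the incompatible tv7
  // forms the second.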
5521 SegmentCandidateFinderOptions segment_options;
5522 segment_options.run_herrmann_merge = false;
5523 segment_options.run_final_merge = false;
5524
5525 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5526 at::Tensor t0 = at::randn({2, 2, 2}, options);
5527
5528 KernelArgumentHolder args(KernelIndexMode::INT32);
5529 args.setDeviceIndex(0);
5530 args.push(t0);
5531 c10::IValue scalar = 1.0;
5532 args.push(scalar);
5533
5534 auto segmented_fusion =
5535 SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
5536
5537 TORCH_CHECK(segmented_fusion->groups().size() == 2);
5538}
5539
5540TEST_F(NVFuserTest, FusionSegmentMixReduction_CUDA) {
5541 auto fusion = std::make_unique<Fusion>();
5542 FusionGuard fg(fusion.get());
5543
5544 auto tv0 = makeSymbolicTensor(3);
5545
5546 fusion->addInput(tv0);
5547
  // tv1 is defined in kernel 1, joined via horizontal merging
5549 auto tv1 = sum(tv0, {0, 1});
5550 // kernel 2
5551 auto tv2 = sum(tv0, {2});
5552 auto tv3 = broadcast(tv2, {false, false, true});
5553 auto tv4 = add(tv0, tv3);
5554 auto tv5 = sum(tv4, {2});
5555 // end of kernel 2
5556 // kernel 1
5557 auto tv6 = unaryOp(UnaryOpType::Rsqrt, tv0);
5558 auto tv7 = sum(tv6, {0, 1});
5559 auto tv8 = sum(tv6, {0, 1});
5560
5561 fusion->addOutput(tv1);
5562 fusion->addOutput(tv5);
5563 fusion->addOutput(tv7);
5564 fusion->addOutput(tv8);
5565
5566 SegmentCandidateFinderOptions segment_options;
5567 segment_options.run_herrmann_merge = false;
5568 segment_options.run_final_merge = false;
5569
5570 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5571 at::Tensor t0 = at::randn({2, 2, 2}, options);
5572
5573 KernelArgumentHolder args(KernelIndexMode::INT32);
5574 args.setDeviceIndex(0);
5575 args.push(t0);
5576
5577 auto segmented_fusion =
5578 SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
5579
5580 TORCH_CHECK(segmented_fusion->groups().size() <= 2);
5581}
5582
5583TEST_F(NVFuserTest, FusionSBAR_CUDA) {
5584 Fusion fusion;
5585 FusionGuard fg(&fusion);
5586
5587 // N, H, W, C format
5588 std::vector<int64_t> input_shape{656, 7, 7, 64};
5589
5590 auto x = makeContigTensor(4);
5591 auto y = makeContigTensor(4);
5592 auto weight = makeContigTensor(1);
5593 auto bias = makeContigTensor(1);
5594
5595 fusion.addInput(x);
5596 fusion.addInput(y);
5597 fusion.addInput(weight);
5598 fusion.addInput(bias);
5599
5600 const size_t kNumberOfDims = x->nDims();
5601 std::vector<bool> broadcast_mask(kNumberOfDims, false);
5602 for (const auto axis : c10::irange(kNumberOfDims - 1)) {
5603 broadcast_mask[axis] = true;
5604 }
5605
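  // weight and bias are 1-D over channels; the mask broadcasts them
  // across the leading N, H, W dimensions.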
5606 auto weight_bcast = broadcast(weight, broadcast_mask);
5607 auto scale = mul(x, weight_bcast);
5608 auto bias_bcast = broadcast(bias, broadcast_mask);
5609 auto scale_bias = add(scale, bias_bcast);
5610 auto scale_bias_add = add(scale_bias, y);
5611 auto scale_bias_add_relu = unaryOp(UnaryOpType::Relu, scale_bias_add);
5612
5613 fusion.addOutput(scale_bias_add_relu);
5614
5615 // inputs
5616 at::manual_seed(0);
5617 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5618 at::Tensor at_x = at::randn(input_shape, options);
5619 at::Tensor at_y = at::randn(input_shape, options);
5620 at::Tensor at_weight = at::ones({input_shape[3]}, options);
5621 at::Tensor at_bias = at::zeros({input_shape[3]}, options);
5622
5623 // inputs
5624 std::vector<c10::IValue> inputs = {at_x, at_y, at_weight, at_bias};
5625
5626 // outputs
5627 std::vector<at::Tensor> outputs;
5628
5629 auto lparams = schedulePointwise(&fusion, inputs);
5630
5631 FusionExecutor executor;
5632 executor.compileFusion(&fusion, inputs, lparams);
5633 outputs = executor.runFusion(inputs, lparams);
5634
5635 auto at_scale = at::mul(at_x, at_weight);
5636 auto at_scale_bias = at::add(at_scale, at_bias);
5637 auto pwise_add = at::add(at_scale_bias, at_y);
5638 auto output = at::relu(pwise_add);
5639
5640 testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__);
5641}
5642
5643TEST_F(NVFuserTest, FusionSingleElement_CUDA) {
5644 Fusion fusion;
5645 FusionGuard fg(&fusion);
5646
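  // A 0-dim (scalar) tensor input; this checks that the pointwise
  // scheduler handles rank-0 tensors.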
5647 auto tv0 = makeSymbolicTensor(0);
5648 fusion.addInput(tv0);
5649
5650 auto tv1 = add(tv0, IrBuilder::create<Double>(2.5));
5651
5652 auto tv2 = add(tv1, IrBuilder::create<Double>(3.5));
5653 fusion.addOutput(tv2);
5654
5655 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5656 at::Tensor input = at::randn({}, options);
5657
5658 at::Tensor cg_output = at::empty({}, options);
5659
5660 auto lparams = schedulePointwise(&fusion, {input});
5661
5662 FusionExecutor fe;
5663 fe.compileFusion(&fusion, {input}, lparams);
5664 fe.runFusion({input}, {cg_output}, lparams);
5665
5666 auto aten_output = input.add(2.5).add(3.5);
5667
5668 testValidate(
5669 &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
5670}
5671
5672TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) {
5673 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5674 Fusion& fusion = *fusion_ptr.get();
5675 FusionGuard fg(&fusion);
5676
5677 int batch = 4;
5678 int c = 4;
5679 int h = 4;
5680 int w = 4;
5681 int numDims = 4;
5682
5683 auto input = makeSymbolicTensor(numDims);
5684 fusion.addInput(input);
5685 auto weight = makeSymbolicTensor(1);
5686 fusion.addInput(weight);
5687 auto running_mean = makeSymbolicTensor(1);
5688 fusion.addInput(running_mean);
5689 auto running_var = makeSymbolicTensor(1);
5690 fusion.addInput(running_var);
5691 auto save_mean = makeSymbolicTensor(1);
5692 fusion.addInput(save_mean);
5693 auto save_invstd = makeSymbolicTensor(1);
5694 fusion.addInput(save_invstd);
5695
5696 auto grad_out_prev = makeSymbolicTensor(numDims);
5697 fusion.addInput(grad_out_prev);
5698 auto gt_0 =
5699 makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
5700 fusion.addInput(gt_0);
5701
5702 auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
5703 auto gt_float = castOp(DataType::Float, gt_bool);
5704
5705 auto grad_out = mul(grad_out_prev, gt_float);
5706
5707 Val* eps_ptr = IrBuilder::create<Double>(1e-5);
5708
5709 auto grads = batch_norm_backward(
5710 input,
5711 grad_out,
5712 weight,
5713 running_mean,
5714 running_var,
5715 save_mean,
5716 save_invstd,
5717 true,
5718 eps_ptr,
5719 {true, true, true});
5720
5721 fusion.addOutput(grads.grad_input);
5722 fusion.addOutput(grads.grad_weight);
5723 fusion.addOutput(grads.grad_bias);
5724
5725 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5726 at::Tensor input0 = at::randn({batch, c, h, w}, options);
5727 at::Tensor input1 = at::randn({c}, options);
5728 at::Tensor input2 = at::randn_like(input1);
5729 at::Tensor input3 = at::randn_like(input1);
5730 at::Tensor input4 = at::randn_like(input1);
5731 at::Tensor input5 = at::randn_like(input1);
5732 at::Tensor input6 = at::randn_like(input0);
5733 at::Tensor input7 = at::randn_like(input0);
5734
5735 FusionExecutorCache fec(std::move(fusion_ptr));
5736 std::vector<IValue> inputs = {
5737 input0, input1, input2, input3, input4, input5, input6, input7};
5738 auto outputs = fec.runFusionWithInputs(inputs);
5739}
5740
5741// TODO: We only changed inputs, merge this with the test above.
5742TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) {
5743 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5744 Fusion& fusion = *fusion_ptr.get();
5745 FusionGuard fg(&fusion);
5746
5747 int batch = 2;
5748 int c = 81;
5749 int h = 1;
5750 int w = 1;
5751 int numDims = 4;
5752
5753 // auto input = makeSymbolicTensor(numDims);
5754 auto input = makeConcreteTensor({-1, -1, 1, 1});
5755 fusion.addInput(input);
5756 auto weight = makeSymbolicTensor(1);
5757 fusion.addInput(weight);
5758 auto running_mean = makeSymbolicTensor(1);
5759 fusion.addInput(running_mean);
5760 auto running_var = makeSymbolicTensor(1);
5761 fusion.addInput(running_var);
5762 auto save_mean = makeSymbolicTensor(1);
5763 fusion.addInput(save_mean);
5764 auto save_invstd = makeSymbolicTensor(1);
5765 fusion.addInput(save_invstd);
5766
5767 // auto grad_out_prev = makeSymbolicTensor(numDims);
5768 auto grad_out_prev = makeConcreteTensor({-1, -1, 1, 1});
5769 fusion.addInput(grad_out_prev);
5770 // auto gt_0 =
5771 // makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
5772 auto gt_0 = makeConcreteTensor({-1, -1, 1, 1});
5773 fusion.addInput(gt_0);
5774
5775 auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
5776 auto gt_float = castOp(DataType::Float, gt_bool);
5777
5778 auto grad_out = mul(grad_out_prev, gt_float);
5779
5780 Val* eps_ptr = IrBuilder::create<Double>(1e-5);
5781
5782 auto grads = batch_norm_backward(
5783 input,
5784 grad_out,
5785 weight,
5786 running_mean,
5787 running_var,
5788 save_mean,
5789 save_invstd,
5790 true,
5791 eps_ptr,
5792 {true, true, true});
5793
5794 fusion.addOutput(grads.grad_input);
5795 fusion.addOutput(grads.grad_weight);
5796 fusion.addOutput(grads.grad_bias);
5797
5798 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5799 at::Tensor input0 = at::randn({batch, c, h, w}, options);
5800 at::Tensor input1 = at::randn({c}, options);
5801 at::Tensor input2 = at::randn_like(input1);
5802 at::Tensor input3 = at::randn_like(input1);
5803 at::Tensor input4 = at::randn_like(input1);
5804 at::Tensor input5 = at::randn_like(input1);
5805 at::Tensor input6 = at::randn_like(input0);
5806 at::Tensor input7 = at::randn_like(input0);
5807
5808 FusionExecutorCache fec(std::move(fusion_ptr));
5809 std::vector<IValue> inputs = {
5810 input0, input1, input2, input3, input4, input5, input6, input7};
5811 auto outputs = fec.runFusionWithInputs(inputs);
5812}
5813
5814TEST_F(NVFuserTest, FusionBNRepro_CUDA) {
5815 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5816 Fusion& fusion = *fusion_ptr.get();
5817 FusionGuard fg(&fusion);
5818
5819 const bool kTraining = true;
5820 const float kMomentum = 0.1;
5821 const float kEps = 1e-5;
5822
5823 int batch = 14;
5824 int c = 65;
5825 int h = 7;
5826 int w = 7;
5827 int numDims = 4;
5828
5829 auto input = makeSymbolicTensor(numDims);
5830 fusion.addInput(input);
5831 auto weight = makeSymbolicTensor(1);
5832 fusion.addInput(weight);
5833 auto bias = makeSymbolicTensor(1);
5834 fusion.addInput(bias);
5835 auto running_mean = makeSymbolicTensor(1);
5836 fusion.addInput(running_mean);
5837 auto running_var = makeSymbolicTensor(1);
5838 fusion.addInput(running_var);
5839
5840 auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
5841 auto eps_ptr = IrBuilder::create<Double>(kEps);
5842
5843 auto result = batch_norm(
5844 input,
5845 weight,
5846 bias,
5847 running_mean,
5848 running_var,
5849 kTraining,
5850 momentum_ptr,
5851 eps_ptr);
5852
5853 fusion.addOutput(result.output);
5854 fusion.addOutput(result.mean);
5855 fusion.addOutput(result.invstd);
5856
5857 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5858 at::Tensor input1 = at::randn({batch, c, h, w}, options);
5859 at::Tensor input2 = at::randn({c}, options);
5860 at::Tensor input3 = at::randn_like(input2);
5861 at::Tensor input4 = at::randn_like(input2);
5862 at::Tensor input5 = at::randn_like(input2);
5863
5864 auto input1_ref = input1.clone();
5865 auto input2_ref = input2.clone();
5866 auto input3_ref = input3.clone();
5867 auto input4_ref = input4.clone();
5868 auto input5_ref = input5.clone();
5869
5870 FusionExecutorCache fec(std::move(fusion_ptr));
5871 std::vector<IValue> aten_inputs = {input1, input2, input3, input4, input5};
5872 auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
5873
5874 auto at_results = at::native_batch_norm(
5875 input1_ref,
5876 input2_ref,
5877 input3_ref,
5878 input4_ref,
5879 input5_ref,
5880 kTraining,
5881 kMomentum,
5882 kEps);
5883
5884 auto at_output = std::get<0>(at_results);
5885 auto at_mean = std::get<1>(at_results);
5886 auto at_invstd = std::get<2>(at_results);
5887
5888 std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
5889
5890 testValidate(
5891 &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
5892}
5893
5894TEST_F(NVFuserTest, FusionBNRepro2_CUDA) {
5895 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5896 Fusion& fusion = *fusion_ptr.get();
5897 FusionGuard fg(&fusion);
5898
5899 const bool kTraining = true;
5900 const float kMomentum = 0.1;
5901 const float kEps = 1e-5;
5902
5903 int batch = 2;
5904 int c = 4;
5905 int h = 17;
5906 int w = 17;
5907 int numDims = 4;
5908
5909 auto input = makeSymbolicTensor(numDims);
5910 fusion.addInput(input);
5911
5912 Val* momentum_ptr = IrBuilder::create<Double>(kMomentum);
5913 Val* eps_ptr = IrBuilder::create<Double>(kEps);
5914
5915 auto result = batch_norm(
5916 input,
5917 nullptr,
5918 nullptr,
5919 nullptr,
5920 nullptr,
5921 kTraining,
5922 momentum_ptr,
5923 eps_ptr);
5924
5925 fusion.addOutput(result.output);
5926 fusion.addOutput(result.mean);
5927 fusion.addOutput(result.invstd);
5928
5929 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5930 at::Tensor input1 = at::randn({batch, c, h, w}, options);
5931
5932 auto input1_ref = input1.clone();
5933 at::Tensor r_m;
5934 at::Tensor r_v;
5935 at::Tensor weight;
5936 at::Tensor bias;
5937
5938 FusionExecutorCache fec(std::move(fusion_ptr));
5939 std::vector<IValue> aten_inputs = {input1};
5940 auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
5941
5942 auto at_results = at::native_batch_norm(
5943 input1_ref, r_m, r_v, weight, bias, kTraining, kMomentum, kEps);
5944
5945 auto at_output = std::get<0>(at_results);
5946 auto at_mean = std::get<1>(at_results);
5947 auto at_invstd = std::get<2>(at_results);
5948
5949 std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
5950
5951 testValidate(
5952 &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
5953}
5954
5955TEST_F(NVFuserTest, FusionZeroSizeTensorPW_CUDA) {
5956 Fusion fusion;
5957 FusionGuard fg(&fusion);
5958
5959 auto tv0 = makeSymbolicTensor(1);
5960 fusion.addInput(tv0);
5961
5962 auto tv1 = makeConcreteTensor({0});
5963 fusion.addInput(tv1);
5964
5965 auto tv2 = add(tv0, IrBuilder::create<Double>(2.5));
5966 fusion.addOutput(tv2);
5967
  // This test used to just have:
  //   auto tv3 = makeConcreteTensor({0});
  // and somehow that ran through our system fine, but size-0 tensors
  // are not supported, so we make sure this fails.
5972 auto tv3 = set(tv1);
5973 fusion.addOutput(tv3);
5974
5975 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5976
5977 at::Tensor input0 = at::randn({2}, options);
5978 at::Tensor input1 = at::randn({0}, options);
5979 at::Tensor cg_output2 = at::empty({2}, options);
5980 at::Tensor cg_output3 = at::empty({0}, options);
5981
  // Fails at pointwise scheduling because our (maybe only) size-0 check
  // is in input size binding, which the scheduler ends up calling.
5984 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
5985 ASSERT_ANY_THROW(schedulePointwise(&fusion, {input0, input1}));
5986}
5987
5988TEST_F(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) {
5989 Fusion fusion;
5990 FusionGuard fg(&fusion);
5991
5992 auto tv0 = makeSymbolicTensor(2);
5993 fusion.addInput(tv0);
5994
5995 auto tv1 = makeConcreteTensor({0});
5996 fusion.addInput(tv1);
5997
5998 auto tv2 = sum(tv0, {1});
5999 fusion.addOutput(tv2);
6000
6001 auto tv3 = makeConcreteTensor({0});
6002 fusion.addOutput(tv3);
6003
6004 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6005
6006 at::Tensor input0 = at::randn({2, 4}, options);
6007 at::Tensor input1 = at::randn({0}, options);
6008 at::Tensor cg_output2 = at::empty({2}, options);
6009 at::Tensor cg_output3 = at::empty({0}, options);
6010
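  // The zero-size input and output should not prevent scheduling and
  // running the reduction on tv0.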
6011 auto reduction_params = getReductionHeuristics(&fusion, {input0, input1});
6012 TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
6013 scheduleReduction(&fusion, *reduction_params);
6015
6016 auto lparams = reduction_params->lparams;
6017 FusionExecutor fe;
6018 fe.compileFusion(&fusion, {input0, input1}, lparams);
6019 auto cg_outputs = fe.runFusion({input0, input1}, lparams);
6020 auto aten_output2 = input0.sum({1});
6021 at::Tensor aten_output3 = at::empty({0}, options);
6022
6023 testValidate(
6024 &fusion,
6025 cg_outputs,
6026 {input0, input1},
6027 {aten_output2, aten_output3},
6028 __LINE__,
6029 __FILE__,
6030 "",
6031 lparams);
6032}
6033
6034TEST_F(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) {
6035 Fusion fusion;
6036 FusionGuard fg(&fusion);
6037
6038 auto tv0 = makeSymbolicTensor(2);
6039 fusion.addInput(tv0);
6040
6041 auto tv1 = makeConcreteTensor({0});
6042 fusion.addInput(tv1);
6043
6044 auto tv2 = sum(tv0, {0});
6045 auto tv3 = broadcast(tv2, {true, false});
6046 auto tv4 = add(tv0, tv3);
6047 fusion.addOutput(tv4);
6048
6049 auto tv5 = makeConcreteTensor({0});
6050 fusion.addOutput(tv5);
6051
6052 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6053
6054 at::Tensor input0 = at::randn({2, 4}, options);
6055 at::Tensor input1 = at::randn({0}, options);
6056 at::Tensor cg_output2 = at::empty({2, 4}, options);
6057 at::Tensor cg_output3 = at::empty({0}, options);
6058
6059 auto reduction_params = getPersistentHeuristics(&fusion, {input0, input1});
6060 TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
6061 schedulePersistentKernel(&fusion, *reduction_params);
6062
6063 auto lparams = reduction_params->lparams;
6064 FusionExecutor fe;
6065 fe.compileFusion(&fusion, {input0, input1}, lparams);
6066 auto cg_outputs = fe.runFusion({input0, input1}, lparams);
6067 auto aten_output2 = input0.sum({0}).add(input0);
6068 at::Tensor aten_output3 = at::empty({0}, options);
6069
6070 testValidate(
6071 &fusion,
6072 cg_outputs,
6073 {input0, input1},
6074 {aten_output2, aten_output3},
6075 __LINE__,
6076 __FILE__,
6077 "",
6078 lparams);
6079}
6080
6081TEST_F(NVFuserTest, FusionSegmentIoAlias_CUDA) {
6082 auto fusion = std::make_unique<Fusion>();
6083 FusionGuard fg(fusion.get());
6084
6085 TensorView* tv0 = makeSymbolicTensor(2);
6086 TensorView* tv1 = makeSymbolicTensor(1);
6087 TensorView* tv2 = makeSymbolicTensor(2);
6088
6089 fusion->addInput(tv0);
6090 fusion->addInput(tv1);
6091 fusion->addInput(tv2);
6092
6093 TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
6094 TensorView* tv4 =
6095 max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
6096 TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce,
6097 // keeps normalization scheduler away)
6098 TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce)
6099
  // Note: tests output-to-input aliasing.
6101 fusion->aliasOutputToInput(tv6, tv0);
6102 // TODO: support output on aliased fusion #1488
6103 // remove tv7 after #1488
6104 // fusion->addOutput(tv6);
6105 TensorView* tv7 = add(tv6, IrBuilder::create<Double>(1)); // Group 0
6106 fusion->addOutput(tv7);
6107
6108 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6109 at::Tensor t0 = at::randn({128, 65}, options);
6110 at::Tensor t1 = at::randn({65}, options);
6111 at::Tensor t2 = at::randn({128, 65}, options);
6112
6113 auto t3 = t0.add(1.0);
6114 auto t4 = std::get<0>(at::max(t3, 0));
6115 auto t5 = t4.add(t1);
6116 auto t6 = t5.add(t2);
6117 auto t7 = t6.add(1.0);
6118
6119 FusionExecutorCache executor_cache(std::move(fusion));
6120
6121 auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
6122
6123 // TODO: support output on aliased fusion #1488
6124 // validating aliasing
6125 // TORCH_INTERNAL_ASSERT(outputs[0].data_ptr() == t0.data_ptr());
6126
6127 TORCH_CHECK(
6128 executor_cache.getMostRecentKernelRuntime()->isSegmented(),
6129 "segmentation didn't happen");
6130 TORCH_CHECK(
6131 executor_cache.getMostRecentKernelRuntime()
6132 ->fusionSegments()
6133 ->groups()
6134 .size() == 2,
6135 "segmentation didn't happen as expected");
6136
6137 testValidate(
6138 executor_cache.fusion(), outputs, {t0, t1, t2}, {t7}, __LINE__, __FILE__);
6139}
6140
6141TEST_F(NVFuserTest, FusionWelford1Output_CUDA) {
6142 auto fusion_ptr = std::make_unique<Fusion>();
6143 auto fusion = fusion_ptr.get();
6144 FusionGuard fg(fusion);
6145
6146 auto tv0 = makeSymbolicTensor(2);
6147 fusion->addInput(tv0);
6148
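  // Welford returns avg, var_sum, and n; var_sum is the running sum of
  // squared deviations (M2), so the reference below is var * N.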
6149 auto tvs = Welford(tv0, {1});
6150 fusion->addOutput(tvs.var_sum);
6151 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6152
6153 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6154 at::Tensor t0 = at::randn({128, 65}, options);
6155 auto outputs = executor_cache.runFusionWithInputs({t0});
6156
6157 auto t1 = t0.var({1}, false) * 65;
6158 testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__);
6159}
6160
6161TEST_F(NVFuserTest, FusionTranslate1Welford_CUDA) {
6162 auto fusion_ptr = std::make_unique<Fusion>();
6163 auto fusion = fusion_ptr.get();
6164 FusionGuard fg(fusion);
6165
6166 auto tv0 = makeSymbolicTensor(2);
6167 fusion->addInput(tv0);
6168
6169 auto tvs = Welford(tv0, {1});
6170 auto tv_out = add(tv0, broadcast(tvs.avg, {false, true}));
6171 fusion->addOutput(tv_out);
6172 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6173
6174 auto run_test = [&executor_cache,
6175 fusion](auto inner_size) -> FusionKernelRuntime* {
6176 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6177 at::Tensor t0 = at::randn({128, inner_size}, options);
6178 auto outputs = executor_cache.runFusionWithInputs({t0});
    // Square sums do not fit well into the testValidate assumptions,
    // so we just compare the mean-based output here.
6181 testValidate(
6182 fusion,
6183 outputs,
6184 {t0},
6185 {t0.add(t0.mean({1}).unsqueeze(1))},
6186 __LINE__,
6187 __FILE__);
6188
6189 return executor_cache.getMostRecentKernelRuntime();
6190 };
6191
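  // The translation decision is size-dependent: for a small inner
  // dimension the Welford should be translated into equivalent
  // non-Welford expressions, while a large one should keep the
  // WelfordOp.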
6192 // Run a translated welford
6193 auto runtime1 = run_test(64);
6194 // Check it was translated
6195 TORCH_CHECK(
6196 runtime1->fusionSegments()->groups().size() == 1 &&
6197 runtime1->fusionSegments()->groups()[0]->exprs().size() > 2);
6198
6199 // Run an un-translated welford
6200 auto runtime2 = run_test(65536);
6201
6202 bool found_welford = false;
6203 for (auto group : runtime2->fusionSegments()->groups()) {
6204 for (auto expr : group->exprs()) {
6205 if (expr->isA<WelfordOp>()) {
6206 found_welford = true;
6207 }
6208 }
6209 }
6210 TORCH_CHECK(found_welford);
6211}
6212
6213TEST_F(NVFuserTest, FusionTranslate2Welford_CUDA) {
6214 auto fusion_ptr = std::make_unique<Fusion>();
6215 auto fusion = fusion_ptr.get();
6216 FusionGuard fg(fusion);
6217
6218 auto tv0 = makeSymbolicTensor(2);
6219 fusion->addInput(tv0);
6220
6221 auto tvs1 = Welford(tv0, {1});
6222 auto tv_out1 = add(tv0, broadcast(tvs1.avg, {false, true}));
6223 fusion->addOutput(tv_out1);
6224
6225 auto tvs2 = Welford(tv0, {1});
6226 auto tv_out2 = add(tv0, broadcast(tvs2.avg, {false, true}));
6227 fusion->addOutput(tv_out2);
6228
6229 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6230
6231 auto run_test = [&executor_cache,
6232 fusion](auto inner_size) -> FusionKernelRuntime* {
6233 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6234 at::Tensor t0 = at::randn({128, inner_size}, options);
6235 auto outputs = executor_cache.runFusionWithInputs({t0});
6236
    // Square sums do not fit well into the testValidate assumptions,
    // so we just compare the mean-based output here.
6239 auto out = t0.add(t0.mean({1}).unsqueeze(1));
6240 testValidate(fusion, outputs, {t0}, {out, out}, __LINE__, __FILE__);
6241
6242 return executor_cache.getMostRecentKernelRuntime();
6243 };
6244
6245 // Run a translated welford
6246 auto runtime1 = run_test(64);
6247 // Check it was translated
6248 TORCH_CHECK(
6249 runtime1->fusionSegments()->groups().size() == 1 &&
6250 runtime1->fusionSegments()->groups()[0]->exprs().size() > 4);
6251
6252 // Run an un-translated welford
6253 auto runtime2 = run_test(65536);
  // Check it was not translated
6255 bool found_welford = false;
6256 for (auto group : runtime2->fusionSegments()->groups()) {
6257 for (auto expr : group->exprs()) {
6258 if (expr->isA<WelfordOp>()) {
6259 found_welford = true;
6260 }
6261 }
6262 }
6263 TORCH_CHECK(found_welford);
6264}
6265
6266TEST_F(NVFuserTest, FusionLargeWelfordNormalization_CUDA) {
6267 auto fusion_ptr = std::make_unique<Fusion>();
6268 auto fusion = fusion_ptr.get();
6269 FusionGuard fg(fusion);
6270
6271 auto tv0 = makeSymbolicTensor(2);
6272 fusion->addInput(tv0);
6273
6274 auto tvs1 = Welford(tv0, {1});
6275 auto sum_of_tv0 = sum(tv0, {1});
6276
6277 fusion->addOutput(tvs1.var_sum);
6278 fusion->addOutput(sum_of_tv0);
6279
6280 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6281
6282 auto run_test = [&executor_cache,
6283 fusion](auto inner_size) -> FusionKernelRuntime* {
6284 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6285 at::Tensor t0 = at::randn({128, inner_size}, options);
6286 auto outputs = executor_cache.runFusionWithInputs({t0});
6287
6288 auto t1 = t0.var({1}, false) * inner_size;
6289 auto t2 = t0.sum({1});
6290 testValidate(fusion, outputs, {t0}, {t1, t2}, __LINE__, __FILE__);
6291
6292 return executor_cache.getMostRecentKernelRuntime();
6293 };
6294
6295 auto runtime = run_test(65536);
6296 TORCH_CHECK(!runtime->isSegmented());
6297}
6298
6299TEST_F(NVFuserTest, FusionWelfordOuterPersistence_CUDA) {
6300 auto fusion_ptr = std::make_unique<Fusion>();
6301 auto fusion = fusion_ptr.get();
6302 FusionGuard fg(fusion);
6303
6304 auto tv0 = makeSymbolicTensor(2);
6305 fusion->addInput(tv0);
6306
6307 auto tvs1 = Welford(tv0, {1});
6308 auto sum_of_tv0 = sum(tv0, {1});
6309 auto sum_bcasted = broadcast(sum_of_tv0, {false, true});
6310 auto avg_bcasted = broadcast(tvs1.avg, {false, true});
6311 auto tv0_plus_sum = add(tv0, sum_bcasted);
6312 auto tv0_plus_avg = add(tv0, avg_bcasted);
6313
6314 fusion->addOutput(tv0_plus_sum);
6315 fusion->addOutput(tv0_plus_avg);
6316
6317 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6318
6319 auto run_test = [&executor_cache,
6320 fusion](auto inner_size) -> FusionKernelRuntime* {
6321 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6322 at::Tensor t0 = at::randn({128, inner_size}, options);
6323 auto outputs = executor_cache.runFusionWithInputs({t0});
6324
6325 auto t1 = t0.to(c10::kDouble).mean({1}).unsqueeze(1) + t0;
6326 auto t2 = t0.to(c10::kDouble).sum({1}).unsqueeze(1) + t0;
6327 testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__);
6328
6329 return executor_cache.getMostRecentKernelRuntime();
6330 };
6331
6332 for (auto inner_size : {4096, 8192, 32768}) {
6333 auto runtime = run_test(inner_size);
6334 TORCH_CHECK(!runtime->isSegmented());
6335 }
6336}
6337
6338TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) {
6339 auto fusion = std::make_unique<Fusion>();
6340 FusionGuard fg(fusion.get());
6341
6342 auto tv0 = makeSymbolicTensor(2);
6343 auto tv1 = makeSymbolicTensor(2);
6344 fusion->addInput(tv0);
6345 fusion->addInput(tv1);
6346
6347 auto tv2 = sum(tv0, {0});
6348 auto tv3 = sum(tv1, {1});
6349 fusion->addOutput(tv2);
6350 fusion->addOutput(tv3);
6351
6352 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6353 at::Tensor t0 = at::randn({16, 16}, options);
6354 at::Tensor t1 = at::randn({16, 16}, options);
6355
6356 FusionExecutorCache fusion_executor_cache(std::move(fusion));
6357 fusion_executor_cache.runFusionWithInputs({t0, t1});
6358}
6359
6360TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) {
6361 auto fusion = std::make_unique<Fusion>();
6362 FusionGuard fg(fusion.get());
6363
6364 auto tv0 = makeSymbolicTensor(1);
6365 auto tv1 = makeSymbolicTensor(2);
6366 auto tv2 = makeSymbolicTensor(4);
6367 fusion->addInput(tv0);
6368 fusion->addInput(tv1);
6369
6370 auto tv3 = broadcast(tv0, {false, true, true, true});
6371 auto tv4 = broadcast(tv1, {false, false, true, true});
6372 auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv2);
6373
6374 auto tv6 = add(tv3, tv5);
6375 auto tv7 = add(tv4, tv5);
6376 auto tv8 = add(tv3, tv4);
6377
6378 auto tv9 = add(tv6, tv7);
6379 auto tv10 = add(tv9, tv8);
6380
6381 fusion->addOutput(tv10);
6382
6383 tv0->computeAt(tv10, -2);
6384 tv1->computeAt(tv10, -2);
6385 tv2->computeAt(tv10, -2);
6386
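  // computeAt backs off at inner broadcast domains: tv3 and tv4 stop at
  // their first broadcast axis, while tv5 (no broadcasts) keeps the
  // requested position.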
6387 TORCH_CHECK(tv3->getComputeAtPosition() == 1);
6388 TORCH_CHECK(tv4->getComputeAtPosition() == 2);
6389 TORCH_CHECK(tv5->getComputeAtPosition() == 3);
6390
6391 TORCH_CHECK(tv6->getMaxProducerPosition() == 3);
6392 TORCH_CHECK(tv7->getMaxProducerPosition() == 3);
6393 TORCH_CHECK(tv8->getMaxProducerPosition() == 2);
6394}
6395
6396TEST_F(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) {
6397 auto fusion = std::make_unique<Fusion>();
6398 FusionGuard fg(fusion.get());
6399
6400 auto tv0 = makeSymbolicTensor(2);
6401 auto tv1 = makeSymbolicTensor(3);
6402 fusion->addInput(tv0);
6403 fusion->addInput(tv1);
6404 auto tv2 = broadcast(tv0, {false, false, true});
6405 auto tv3 = add(tv2, tv1);
6406
6407 fusion->addOutput(tv3);
6408 tv3->split(-2, 4);
6409 tv3->reorder({{-1, -2}});
6410 tv0->computeAt(tv3, -2);
6411 tv1->computeAt(tv3, -2);
6412 TORCH_CHECK(tv2->getComputeAtPosition() == 2);
6413 TORCH_CHECK(tv3->getMaxProducerPosition() == 2);
6414}
6415
6416TEST_F(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) {
6417 auto fusion = std::make_unique<Fusion>();
6418 FusionGuard fg(fusion.get());
6419
6420 auto tv0 = makeSymbolicTensor(2);
6421 auto tv1 = makeSymbolicTensor(4);
6422 fusion->addInput(tv0);
6423 fusion->addInput(tv1);
6424 auto tv2 = broadcast(tv0, {false, false, true});
6425 auto tv3 = broadcast(tv2, {false, true, false, false});
6426 auto tv4 = add(tv3, tv1);
6427
6428 fusion->addOutput(tv4);
6429 tv0->computeAt(tv4, -1);
6430 tv1->computeAt(tv4, -1);
6431 TORCH_CHECK(tv2->getComputeAtPosition() == 2);
6432 TORCH_CHECK(tv3->getMaxProducerPosition() == 3);
6433}
6434
6435TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) {
6436 auto fusion = std::make_unique<Fusion>();
6437 FusionGuard fg(fusion.get());
6438
6439 auto tv0 = makeSymbolicTensor(2);
6440 fusion->addInput(tv0);
6441
6442 auto tv1 = sum(tv0, {1});
6443 auto tv2 = broadcast(tv1, {false, true});
6444 auto tv3 = add(tv2, tv0);
6445
6446 fusion->addOutput(tv3);
6447
6448 tv1->split(1, 32);
6449 auto tv1_rf = tv1->rFactor({1});
6450 TransformPropagatorWithCheck propagator(tv1_rf);
6451 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6452 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6453 tv1->axis(0)->parallelize(ParallelType::BIDx);
6454 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6455 tv2->axis(0)->parallelize(ParallelType::BIDx);
6456 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6457 tv3->axis(0)->parallelize(ParallelType::BIDx);
6458 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6459 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6460
6461 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6462 at::Tensor input1 = at::randn({16, 128}, options);
6463
6464 auto at_output = input1.sum({1}, true).add(input1);
6465
6466 FusionExecutor fe;
6467 fe.compileFusion(fusion.get(), {input1});
6468 auto outputs = fe.runFusion({input1});
6469
6470 testValidate(
6471 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6472}
6473
6474TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) {
6475 auto fusion = std::make_unique<Fusion>();
6476 FusionGuard fg(fusion.get());
6477
6478 auto tv0 = makeSymbolicTensor(2);
6479
6480 fusion->addInput(tv0);
6481
6482 auto tv1 = sum(tv0, {1});
6483 auto tv2 = broadcast(tv1, {false, true});
6484 auto tv3 = add(tv2, tv0);
6485
6486 fusion->addOutput(tv3);
6487
6488 // Schedule a persistent kernel
6489 auto tv0_cache = tv0->cacheAfter();
6490 tv1->split(1, 8, false);
6491 auto tv1_rf = tv1->rFactor({1});
6492 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6493 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
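  // Pad the TIDx extent up to a multiple of 32 so warp-level reduction
  // primitives can be used even when the split leaves a partial warp.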
6494 tv1_rf->axis(-1)->padToMultipleOfWarp(32);
6495 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6496 tv1->axis(-1)->padToMultipleOfWarp(32);
6497 TransformPropagatorWithCheck propagator(tv1_rf);
6498 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6499 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6500 tv0->axis(-1)->padToMultipleOfWarp(32);
6501 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6502 tv0_cache->axis(-1)->padToMultipleOfWarp(32);
6503 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6504 tv2->axis(-1)->padToMultipleOfWarp(32);
6505 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6506 tv3->axis(-1)->padToMultipleOfWarp(32);
6507
6508 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6509
6510 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6511 at::Tensor input1 = at::randn({16, 127}, options);
6512
6513 auto at_output = input1.sum({1}, true).add(input1);
6514
6515 FusionExecutor fe;
6516 fe.compileFusion(fusion.get(), {input1});
6517 auto outputs = fe.runFusion({input1});
6518 testValidate(
6519 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6520}
6521
6522TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) {
6523 auto fusion = std::make_unique<Fusion>();
6524 FusionGuard fg(fusion.get());
6525
6526 auto tv0 = makeSymbolicTensor(3);
6527
6528 fusion->addInput(tv0);
6529
6530 auto tv1 = sum(tv0, {1, 2});
6531 auto tv2 = broadcast(tv1, {false, true, true});
6532 auto tv3 = add(tv2, tv0);
6533
6534 fusion->addOutput(tv3);
6535
6536 // Schedule a persistent kernel
6537 auto tv0_cache = tv0->cacheAfter();
6538 tv1->merge(1);
6539 tv1->split(1, 8, false);
6540
6541 auto tv1_rf = tv1->rFactor({1});
6542 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6543 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6544 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6545 tv1->axis(-1)->padToMultipleOfWarp();
6546 TransformPropagatorWithCheck propagator(tv1_rf);
6547 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6548 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6549 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6550 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6551 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6552
6553 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6554
6555 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6556 at::Tensor input1 = at::randn({16, 17, 128}, options);
6557
6558 auto at_output = input1.sum({1, 2}, true).add(input1);
6559
6560 FusionExecutor fe;
6561 fe.compileFusion(fusion.get(), {input1});
6562 auto outputs = fe.runFusion({input1});
6563 testValidate(
6564 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6565}
6566
6567TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) {
6568 auto fusion = std::make_unique<Fusion>();
6569 FusionGuard fg(fusion.get());
6570
6571 auto tv0 = makeSymbolicTensor(3);
6572
6573 fusion->addInput(tv0);
6574
6575 auto tv1 = sum(tv0, {1, 2});
6576 auto tv2 = broadcast(tv1, {false, true, true});
6577 auto tv3 = add(tv2, tv0);
6578
6579 fusion->addOutput(tv3);
6580
6581 // Schedule a persistent kernel
6582 auto tv0_cache = tv0->cacheAfter();
6583 tv1->merge(1);
6584 tv1->split(1, 8, false);
6585
6586 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6587 tv1->axis(-1)->padToMultipleOfWarp();
6588 TransformPropagatorWithCheck propagator(tv1);
6589 MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
6590 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6591 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6592 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6593 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6594
6595 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6596
6597 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6598 at::Tensor input1 = at::randn({16, 17, 128}, options);
6599
6600 auto at_output = input1.sum({1, 2}, true).add(input1);
6601
6602 FusionExecutor fe;
6603 fe.compileFusion(fusion.get(), {input1});
6604 auto outputs = fe.runFusion({input1});
6605 testValidate(
6606 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6607}
6608
6609TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) {
6610 auto fusion = std::make_unique<Fusion>();
6611 FusionGuard fg(fusion.get());
6612
6613 auto tv0 = makeConcreteTensor({17, 18, 128, 1});
6614
6615 fusion->addInput(tv0);
6616
6617 auto tv1 = sum(tv0, {1, 2, 3});
6618 auto tv2 = broadcast(tv1, {false, true, true, true});
6619 auto tv3 = add(tv2, tv0);
6620
6621 fusion->addOutput(tv3);
6622
6623 // Schedule a persistent kernel
6624 auto tv0_cache = tv0->cacheAfter();
6625 tv1->merge(1);
6626 tv1->split(1, 8, false);
6627
6628 auto tv1_rf = tv1->rFactor({1});
6629 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6630 tv1_rf->axis(-2)->parallelize(ParallelType::TIDx);
6631 tv1->axis(-2)->parallelize(ParallelType::TIDx);
6632 tv1->axis(-2)->padToMultipleOfWarp();
6633 TransformPropagatorWithCheck propagator(tv1_rf);
6634 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6635 tv0->axis(-2)->parallelize(ParallelType::TIDx);
6636 tv0_cache->axis(-2)->parallelize(ParallelType::TIDx);
6637 tv2->axis(-2)->parallelize(ParallelType::TIDx);
6638 tv3->axis(-2)->parallelize(ParallelType::TIDx);
6639
6640 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6641
6642 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6643 at::Tensor input1 = at::randn({17, 18, 128, 1}, options);
6644
6645 auto at_output = input1.sum({1, 2, 3}, true).add(input1);
6646
6647 FusionExecutor fe;
6648 fe.compileFusion(fusion.get(), {input1});
6649 auto outputs = fe.runFusion({input1});
6650 testValidate(
6651 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6652}
6653
6654TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) {
6655 auto fusion = std::make_unique<Fusion>();
6656 FusionGuard fg(fusion.get());
6657
6658 auto tv0 = makeSymbolicTensor(2);
6659 auto tv_add = makeSymbolicTensor(2);
6660
6661 fusion->addInput(tv0);
6662 fusion->addInput(tv_add);
6663
6664 auto tv1 = sum(tv0, {1});
6665 auto tv2 = broadcast(tv1, {false, true});
6666 auto tv3 = add(tv2, tv0);
6667 auto tv4 = add(tv0, tv_add);
6668
6669 fusion->addOutput(tv3);
6670 fusion->addOutput(tv4);
6671
6672 // Schedule a persistent kernel
6673 auto tv0_cache = tv0->cacheAfter();
6674 tv1->split(1, 8, false);
6675 auto tv1_rf = tv1->rFactor({1});
6676 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6677 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6678 tv1_rf->axis(-1)->padToMultipleOfWarp(32);
6679 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6680 tv1->axis(-1)->padToMultipleOfWarp(32);
6681 TransformPropagatorWithCheck propagator(tv1_rf);
6682 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6683 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6684 tv0->axis(-1)->padToMultipleOfWarp(32);
6685 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6686 tv0_cache->axis(-1)->padToMultipleOfWarp(32);
6687 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6688 tv2->axis(-1)->padToMultipleOfWarp(32);
6689 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6690 tv3->axis(-1)->padToMultipleOfWarp(32);
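  // tv4 requests padding to a multiple of 64 while the others request
  // 32, exercising consistent binding of the shared TIDx extent.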
6691 tv4->axis(-1)->parallelize(ParallelType::TIDx);
6692 tv4->axis(-1)->padToMultipleOfWarp(64);
6693
6694 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6695
6696 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6697 at::Tensor input1 = at::randn({16, 128}, options);
6698 at::Tensor input2 = at::randn({16, 128}, options);
6699
6700 auto at_output = input1.sum({1}, true).add(input1);
6701
6702 FusionExecutor fe;
6703 fe.compileFusion(fusion.get(), {input1, input2});
6704 auto outputs = fe.runFusion({input1, input2});
6705 testValidate(
6706 fusion.get(),
6707 outputs,
6708 {input1, input2},
6709 {at_output, input1 + input2},
6710 __LINE__,
6711 __FILE__);
6712}
6713
6714TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) {
6715 auto fusion = std::make_unique<Fusion>();
6716 FusionGuard fg(fusion.get());
6717
6718 auto tv0 = makeSymbolicTensor(2);
6719
6720 fusion->addInput(tv0);
6721
6722 auto tv1 = sum(tv0, {1});
6723 auto tv2 = broadcast(tv1, {false, true});
6724 auto tv3 = add(tv2, tv0);
6725
6726 fusion->addOutput(tv3);
6727
6728 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6729 tv1->axis(-1)->padToMultipleOfWarp();
6730 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6731 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6732
6733 tv1->axis(0)->parallelize(ParallelType::TIDy);
6734 tv2->axis(0)->parallelize(ParallelType::TIDy);
6735 tv3->axis(0)->parallelize(ParallelType::TIDy);
6736
6737 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6738 at::Tensor input1 = at::randn({16, 31}, options);
6739
6740 auto at_output = input1.sum({1}, true).add(input1);
6741
6742 FusionExecutor fe;
6743 fe.compileFusion(fusion.get(), {input1});
6744 auto outputs = fe.runFusion({input1});
6745 testValidate(
6746 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6747}
6748
6749TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) {
6750 auto fusion = std::make_unique<Fusion>();
6751 FusionGuard fg(fusion.get());
6752
6753 auto tv0 = makeSymbolicTensor(2);
6754 fusion->addInput(tv0);
6755 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
6756 auto tv2 = sum(tv1, {1});
6757 fusion->addOutput(tv2);
6758
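  // Use both TIDx (padded for warp reduction) and TIDy to check warp
  // reductions when multiple thread dimensions are bound.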
6759 tv2->split(1, 8);
6760 auto tv2_rf = tv2->rFactor({-1});
6761 tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
6762 tv2_rf->axis(-1)->padToMultipleOfWarp();
6763
6764 TransformPropagatorWithCheck propagator(tv2_rf);
6765 MaxRootDomainInfoSpanningTree(tv2_rf).traverse(&propagator);
6766
6767 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6768 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6769 tv2->axis(0)->parallelize(ParallelType::BIDx);
6770 tv2->axis(1)->parallelize(ParallelType::TIDy);
6771 tv0->computeAt(tv2, 2);
6772
6773 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6774 at::Tensor input1 = at::randn({16, 31}, options);
6775
6776 auto at_output = (input1 + 1).sum({1});
6777
6778 FusionExecutor fe;
6779 fe.compileFusion(fusion.get(), {input1});
6780 auto outputs = fe.runFusion({input1});
6781 testValidate(
6782 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6783}
6784
6785TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) {
6786 auto fusion = std::make_unique<Fusion>();
6787 FusionGuard fg(fusion.get());
6788
6789 auto tv0 = makeSymbolicTensor(2);
6790
6791 fusion->addInput(tv0);
6792
6793 auto tv1 = sum(tv0, {1});
6794 auto tv2 = broadcast(tv1, {false, true});
6795 auto tv3 = add(tv2, tv0);
6796
6797 fusion->addOutput(tv3);
6798
6799 // Schedule a persistent kernel
6800 auto tv0_cache = tv0->cacheAfter();
6801 tv1->split(1, 8, false);
6802 tv1->split(0, 4);
6803 auto tv1_rf = tv1->rFactor({2});
6804
6805 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6806 tv1_rf->axis(1)->parallelize(ParallelType::Unroll);
6807 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6808 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6809 tv1->axis(-1)->padToMultipleOfWarp();
6810 tv1->axis(1)->parallelize(ParallelType::Unroll);
6811 TransformPropagatorWithCheck propagator(tv1_rf);
6812 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6813 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6814 tv0->axis(1)->parallelize(ParallelType::Unroll);
6815 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6816 tv0_cache->axis(1)->parallelize(ParallelType::Unroll);
6817 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6818 tv2->axis(1)->parallelize(ParallelType::Unroll);
6819 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6820 tv3->axis(1)->parallelize(ParallelType::Unroll);
6821
6822 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6823
6824 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6825 at::Tensor input1 = at::randn({16, 128}, options);
6826
6827 auto at_output = input1.sum({1}, true).add(input1);
6828
6829 FusionExecutor fe;
6830 fe.compileFusion(fusion.get(), {input1});
6831 auto outputs = fe.runFusion({input1});
6832 testValidate(
6833 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6834}
6835
6836// Repro of issue #1579
6837TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) {
6838 Fusion fusion;
6839 FusionGuard fg(&fusion);
6840
6841 std::vector<int64_t> shape1 = {1024};
6842 std::vector<int64_t> shape2 = {50};
6843
6844 auto tv0 = makeConcreteTensor(shape1);
6845 fusion.addInput(tv0);
6846 auto tv1 = sum(tv0, {0});
6847 fusion.addOutput(tv1);
6848
6849 auto tv2 = makeConcreteTensor(shape2);
6850 fusion.addInput(tv2);
6851 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
6852 auto tv4 = sum(tv3, {0});
6853 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
6854 fusion.addOutput(tv5);
6855
  // Just to fill the smem buffer with some values, using a thread block
  // of 1024 threads
6858 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6859
6860 // Make the tv4_rf reduction a warp reduction to trigger the
6861 // bug. Since the smem buffer is filled with some values due to the
6862 // reduction of tv1, those values would be used by predicated-out
6863 // threads.
6864 tv4->split(-1, 10);
6865 auto tv4_rf = tv4->rFactor({-1});
6866 tv4_rf->axis(-1)->parallelize(ParallelType::TIDx);
6867 tv4_rf->axis(-1)->padToMultipleOfWarp();
6868
6869 tv4_rf->computeAt(tv4, 1);
6870
6871 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6872 auto t0 = at::randn(shape1, options);
6873 auto t2 = at::randn(shape2, options);
6874
6875 FusionExecutor fe;
6876 fe.compileFusion(&fusion, {t0, t2});
6877 auto cg_outputs = fe.runFusion({t0, t2});
6878
6879 auto t1 = t0.sum({0});
6880 auto t4 = (t2 + 1).sum({0}) + 1;
6881
6882 testValidate(&fusion, cg_outputs, {t0, t2}, {t1, t4}, __LINE__, __FILE__);
6883}
6884
6885TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) {
6886 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6887 Fusion& fusion = *fusion_ptr.get();
6888 FusionGuard fg(&fusion);
6889
6890 int batch = 2;
6891 int c = 1;
6892 int h = 1;
6893 int w = 1;
6894 int numDims = 4;
6895
6896 auto input = makeConcreteTensor({-1, 1, 1, 1});
6897 fusion.addInput(input);
6898 auto bcast_bias = makeConcreteTensor({-1, 1, 1, 1});
6899 fusion.addInput(bcast_bias);
6900
6901 std::vector<int64_t> at_sum_axes;
6902 std::vector<int> outer_reduction_axes;
6903 std::vector<bool> outer_broadcast_mask(numDims, false);
6904 Val* N = IrBuilder::create<Double>(1);
6905 for (const auto axis : c10::irange(numDims)) {
6906 if (axis != 1) {
6907 outer_reduction_axes.push_back(axis);
6908 at_sum_axes.push_back(axis);
6909 outer_broadcast_mask[axis] = true;
6910 N = mul(N, input->domain()->domain()[axis]->extent());
6911 }
6912 }
6913
6914 auto output0 = mul(input, bcast_bias);
6915 fusion.addOutput(output0);
6916 auto output1 = sum(output0, outer_reduction_axes);
6917 fusion.addOutput(output1);
6918
6919 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6920 at::Tensor input0 = at::randn({batch, c, h, w}, options);
6921 at::Tensor input1 = at::randn({batch, c, h, w}, options);
6922
6923 auto at_output0 = input0.mul(input1);
6924 auto at_output1 = at_output0.sum(at_sum_axes);
6925
6926 FusionExecutorCache fec(std::move(fusion_ptr));
6927 std::vector<IValue> inputs = {input0, input1};
6928 auto outputs = fec.runFusionWithInputs(inputs);
6929
6930 testValidate(
6931 &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__);
6932}
6933
6934TEST_F(NVFuserTest, FusionPredicateElimination1_CUDA) {
6935 Fusion fusion;
6936 FusionGuard fg(&fusion);
6937
6938 auto tv0 = makeSymbolicTensor(1);
6939 fusion.addInput(tv0);
6940
6941 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
6942 auto tv2 = add(tv1, IrBuilder::create<Double>(2));
6943 auto tv3 = add(tv2, IrBuilder::create<Double>(3));
6944
6945 fusion.addOutput(tv3);
6946
6947 tv3->split(0, 32);
6948 tv0->computeAt(tv3, 1);
6949
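  // Unswitch hoists the bounds check out of the loop, so tv2 itself
  // should not need an inline predicate.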
6950 tv2->axis(1)->parallelize(ParallelType::Unswitch);
6951
6952 {
6953 GpuLower gpulw(&fusion);
6954 TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
6955 }
6956
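  // Reverting to a serial axis and re-splitting by a non-divisible
  // factor should force a predicate again.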
6957 tv2->axis(1)->parallelize(ParallelType::Serial);
6958 tv2->split(1, 5);
6959
6960 {
6961 GpuLower gpulw(&fusion);
6962 TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
6963 }
6964}
6965
6966// Repro of issue #1571
6967TEST_F(NVFuserTest, FusionPredicateElimination2_CUDA) {
6968 Fusion fusion;
6969 FusionGuard fg(&fusion);
6970
6971 std::vector<int64_t> shape({10, 11});
6972
6973 auto tv0 = makeConcreteTensor(shape);
6974 fusion.addInput(tv0);
6975
6976 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
6977 auto tv2 = sum(tv1, {1});
6978 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
6979
6980 fusion.addOutput(tv3);
6981
6982 tv1->split(1, 4);
6983 tv1->split(0, 4);
6984 tv2->split(1, 4);
6985 tv2->split(0, 4);
6986
6987 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6988 auto t0 = at::randn(shape, options);
6989
6990 FusionExecutor fe;
6991 fe.compileFusion(&fusion, {t0});
6992 auto cg_outputs = fe.runFusion({t0});
6993
6994 auto ref = (t0 + 1).sum({1}) + 1;
6995
6996 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
6997}
6998
6999TEST_F(NVFuserTest, FusionPredicateElimination3_CUDA) {
7000 Fusion fusion;
7001 FusionGuard fg(&fusion);
7002
7003 auto tv0 = makeSymbolicTensor(1);
7004 fusion.addInput(tv0);
7005
7006 auto tv1 = sum(tv0, {0});
7007 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
7008 fusion.addOutput(tv2);
7009
7010 auto tv3 = tv0->cacheAfter();
7011
7012 tv1->split(0, 10);
7013 tv1->split(0, 33);
7014 TransformPropagatorWithCheck propagator(tv1);
7015 MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
7016
7017 auto tv4 = tv1->rFactor({-1});
7018 auto tv5 = tv1->rFactor({-1});
7019
7020 tv4->axis(0)->parallelize(ParallelType::BIDx);
7021 tv4->axis(1)->parallelize(ParallelType::TIDx);
7022 scheduler_utils::parallelizeAllLike(tv4);
7023
7024 GpuLower gpulw(&fusion);
7025
  // The fusion has three reductions: one within each thread, one
  // within each block, and another across the whole grid. None of them
  // should need to be predicated, as they all use the same init value
  // and the same reduction op.
7030 TORCH_CHECK(!PredicatedChecker::isPredicated(tv4, gpulw));
7031 TORCH_CHECK(!PredicatedChecker::isPredicated(tv5, gpulw));
7032 TORCH_CHECK(!PredicatedChecker::isPredicated(tv1, gpulw));
7033
7034 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7035
7036 for (auto size : {1, 2, 999, 1001, 1234, 10000}) {
7037 auto t0 = at::randn({size}, options);
7038
7039 FusionExecutor fe;
7040 fe.compileFusion(&fusion, {t0});
7041 auto cg_outputs = fe.runFusion({t0});
7042
7043 auto ref = sum(t0) + 1;
7044 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7045 }
7046}
7047
7048TEST_F(NVFuserTest, FusionPredicateElimination4_CUDA) {
7049 Fusion fusion;
7050 FusionGuard fg(&fusion);
7051
7052 auto tv0 = makeSymbolicTensor(2);
7053 fusion.addInput(tv0);
7054
7055 auto tv1 = sum(tv0, {1});
7056
7057 auto tv2 = sum(tv1, {0});
7058 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
7059 fusion.addOutput(tv3);
7060
7061 auto tv4 = max(tv1, {0});
7062 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
7063 fusion.addOutput(tv5);
7064
7065 tv1->split(1, 7);
7066 tv1->split(0, 11);
7067 tv1->reorder({{1, 2}, {2, 1}});
7068 TransformPropagatorWithCheck propagator(tv1);
7069 MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
7070
7071 tv1->axis(0)->parallelize(ParallelType::TIDy);
7072 tv1->axis(1)->parallelize(ParallelType::TIDx);
7073 scheduler_utils::parallelizeAllLike(tv1);
7074
7075 GpuLower gpulw(&fusion);
7076
  // tv2 uses the same op and init value as tv1, so tv2 should be fine
  // without a predicate. However, while tv4 uses tv1 as its input, its
  // reduction op and init value differ from those of tv1, so tv4 needs
  // to be predicated.
7081 TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
7082 TORCH_CHECK(PredicatedChecker::isPredicated(tv4, gpulw));
7083
7084 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7085
7086 std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
7087 for (auto s0 : sizes) {
7088 for (auto s1 : sizes) {
7089 auto t0 = at::randn({s0, s1}, options);
7090
7091 FusionExecutor fe;
7092 fe.compileFusion(&fusion, {t0});
7093 auto cg_outputs = fe.runFusion({t0});
7094
7095 auto t1 = t0.sum({1});
7096 auto t3 = t1.sum({0}) + 1;
7097 auto t5 = std::get<0>(t1.max(0)) + 1;
7098
7099 testValidate(&fusion, cg_outputs, {t0}, {t3, t5}, __LINE__, __FILE__);
7100 }
7101 }
7102}
7103
7104TEST_F(NVFuserTest, FusionPredicateElimination5_CUDA) {
7105 Fusion fusion;
7106 FusionGuard fg(&fusion);
7107
7108 auto tv0 = makeSymbolicTensor(1);
7109 fusion.addInput(tv0);
7110
7111 auto tv1 = set(tv0);
7112 auto tvs2 = Welford(tv1, {0});
7113 auto tv3 = set(tvs2.avg);
7114 fusion.addOutput(tv3);
7115
7116 tvs2.avg->split(0, 4);
7117 TransformPropagatorWithCheck propagator(tvs2.avg);
7118 MaxRootDomainInfoSpanningTree(tvs2.avg).traverse(&propagator);
7119 auto avg_rf = ir_utils::rfactorHelper(tvs2.avg, {1});
7120
7121 avg_rf->axis(0)->parallelize(ParallelType::TIDx);
7122 scheduler_utils::parallelizeAllLike(avg_rf);
7123
7124 GpuLower gpulw(&fusion);
7125
7126 // The first per-thread welford needs to be predicated as the N
7127 // input is different from its init value. The second welford op
7128 // does not need a predicate.
7129 TORCH_CHECK(PredicatedChecker::isPredicated(avg_rf, gpulw));
7130 TORCH_CHECK(!PredicatedChecker::isPredicated(tvs2.avg, gpulw));
7131
7132 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7133
7134 std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
7135 for (auto s0 : sizes) {
7136 auto t0 = at::randn({s0}, options);
7137
7138 FusionExecutor fe;
7139 fe.compileFusion(&fusion, {t0});
7140 auto cg_outputs = fe.runFusion({t0});
7141
7142 auto ref = t0.mean({0});
7143
7144 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7145 }
7146}
7147
7148TEST_F(NVFuserTest, FusionPredicateElimination6_CUDA) {
7149 Fusion fusion;
7150 FusionGuard fg(&fusion);
7151
7152 auto tv0 = makeConcreteTensor({2, 3});
7153 fusion.addInput(tv0);
7154
7155 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7156 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
7157 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
7158 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
7159 fusion.addOutput(tv4);
7160
7161 tv4->split(1, 5);
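  // The second axis has a static extent of 3, so the split by 5 pads it to
  // a single extent-5 tile.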
7162 TransformPropagatorWithCheck propagator(tv4);
7163 MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
7164
7165 tv4->reorder({{0, 1}, {1, 0}});
7166 tv3->computeAt(tv4, 1);
7167
7168 GpuLower gpulw(&fusion);
7169
  // The expression for tv2 is a local-to-local expression. It
  // satisfies all the requirements of predicate elimination, except
  // for the one on split root domains. As the second root axis of tv2
  // is split, its index would exceed its extent (i.e., 3 in this case)
  // without its predicate.
7175 TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
7176
7177 // Unlike tv2, tv3 is computed at tv4, so the second root axis does
7178 // have a zero domain. Its index should look like "i * 5 + j", where
7179 // i comes from the first root domain and j comes from the split
7180 // inner domain.
7181 TORCH_CHECK(!PredicatedChecker::isPredicated(tv3, gpulw));
7182
7183 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7184 auto t0 = at::randn({2, 3}, options);
7185
7186 FusionExecutor fe;
7187 fe.compileFusion(&fusion, {t0});
7188 auto cg_outputs = fe.runFusion({t0});
7189
7190 auto ref = t0 + 4;
7191
7192 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7193}
7194
7195TEST_F(NVFuserTest, FusionPredicateElimination7_CUDA) {
7196 Fusion fusion;
7197 FusionGuard fg(&fusion);
7198
7199 auto tv0 = makeSymbolicTensor(1);
7200 fusion.addInput(tv0);
7201
7202 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7203 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
7204 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
7205 fusion.addOutput(tv3);
7206
7207 tv3->split(-1, 5);
7208 tv3->split(-1, 4);
7209 tv3->split(-1, 3);
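  // The nested splits leave inner extents of 5, 4, and 3; both the 5-by-4
  // and the 4-by-3 splits are non-divisible, so the padded elements must be
  // masked by predicates.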
7210 TransformPropagatorWithCheck propagator(tv3);
7211 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
7212
7213 tv0->computeAt(tv3, 1);
7214
  // The last split of tv2 is a non-divisible split, so omitting its
  // predicate is invalid.
7217 GpuLower gpulw(&fusion);
7218 TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
7219
7220 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7221 auto t0 = at::randn({123}, options);
7222
7223 FusionExecutor fe;
7224 fe.compileFusion(&fusion, {t0});
7225 auto cg_outputs = fe.runFusion({t0});
7226
7227 auto ref = t0 + 3;
7228
7229 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7230}
7231
7232TEST_F(NVFuserTest, FusionForceFp16Simple_CUDA) {
7233 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7234 auto fusion = fusion_ptr.get();
7235 FusionGuard fg(fusion);
7236
7237 auto tv0 = makeSymbolicTensor(2);
7238 auto tv1 = makeSymbolicTensor(2);
7239
7240 fusion->addInput(tv0);
7241 fusion->addInput(tv1);
7242
7243 // Group 1
7244 auto tv2 = sum(tv0, {1});
7245 auto tv3 = broadcast(tv2, {false, true});
7246
7247 // Group 2
7248 auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
7249 auto tv5 = castOp(DataType::Half, tv4);
7250
7251 fusion->addOutput(tv5);
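  // The reduction in group 1 and the pointwise ops in group 2 should be
  // segmented apart, with the segmenter expected to cast the segment edge
  // (tv3) to fp16 to reduce intermediate global-memory traffic.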
7252
7253 FusionExecutorCache fec(std::move(fusion_ptr));
7254
7255 std::vector<int64_t> shape{15, 16};
7256
7257 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7258 auto in0 = at::randn(shape, options);
7259 auto in1 = at::randn(shape, options);
7260 fec.runFusionWithInputs({in0, in1});
7261
7262 // Check the segmented edge is fp16
7263 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7264 for (auto edge : segmented_fusion->edges()) {
7265 auto edge_tv = edge->val->as<TensorView>();
7266 TORCH_CHECK(edge_tv->getDataType() == DataType::Half);
7267 }
7268}
7269
7270TEST_F(NVFuserTest, FusionForceBf16Simple_CUDA) {
7271#if !defined(USE_ROCM)
  // requires Ampere+ GPU
  if (!deviceMajorMinorCheck(8)) {
    GTEST_SKIP() << "skipping tests on pre-Ampere GPUs";
7275 return;
7276 }
7277
7278 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7279 auto fusion = fusion_ptr.get();
7280 FusionGuard fg(fusion);
7281
7282 auto tv0 = makeSymbolicTensor(2);
7283 auto tv1 = makeSymbolicTensor(2);
7284
7285 fusion->addInput(tv0);
7286 fusion->addInput(tv1);
7287
7288 // Group 1
7289 auto tv2 = sum(tv0, {1});
7290 auto tv3 = broadcast(tv2, {false, true});
7291
7292 // Group 2
7293 auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
7294 auto tv5 = castOp(DataType::BFloat16, tv4);
7295
7296 fusion->addOutput(tv5);
7297
7298 FusionExecutorCache fec(std::move(fusion_ptr));
7299
7300 std::vector<int64_t> shape{15, 16};
7301
7302 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7303 auto in0 = at::randn(shape, options);
7304 auto in1 = at::randn(shape, options);
7305 fec.runFusionWithInputs({in0, in1});
7306
7307 // Check the segmented edge is bf16
7308 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7309 for (auto edge : segmented_fusion->edges()) {
7310 auto edge_tv = edge->val->as<TensorView>();
7311 TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16);
7312 }
7313#else
  GTEST_SKIP() << "bfloat16 is not supported on ROCm";
7315#endif
7316}
7317
7318TEST_F(NVFuserTest, FusionForceFp16NotAllCast_CUDA) {
7319 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7320 auto fusion = fusion_ptr.get();
7321 FusionGuard fg(fusion);
7322
7323 auto tv0 = makeSymbolicTensor(3);
7324 auto tv1 = makeSymbolicTensor(3);
7325
7326 fusion->addInput(tv0);
7327 fusion->addInput(tv1);
7328
7329 // Group 1
7330 auto tv3 = sum(tv0, {1});
7331 auto tv4 = broadcast(tv3, {false, true, false});
7332 auto tv5 = sum(tv0, {1});
7333
7334 // Group 2
7335 auto tv6 = add(tv4, tv1); // edge tv4, expect cast
7336 auto tv7 = castOp(DataType::Half, tv6);
7337
7338 // Group 3
7339 auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
7340
7341 fusion->addOutput(tv7);
7342 fusion->addOutput(tv8);
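  // tv5 feeds the reduction in group 3, so that segment edge is expected to
  // stay in fp32, presumably to keep the reduction accumulating at full
  // precision.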
7343
7344 FusionExecutorCache fec(std::move(fusion_ptr));
7345
7346 std::vector<int64_t> shape{16, 16, 16};
7347
7348 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7349 auto in0 = at::randn(shape, options);
7350 auto in1 = at::randn(shape, options);
7351 fec.runFusionWithInputs({in0, in1});
7352
7353 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7354 auto complete_fusion = segmented_fusion->completeFusion();
7355
  // Check that the edge that wasn't fp16 is the producer of the
  // reduction op, i.e. tv8 = sum(tv5, {1}).
7358 for (auto edge : segmented_fusion->edges()) {
7359 auto edge_tv = edge->val->as<TensorView>();
7360 if (edge_tv->getDataType() == DataType::Float) {
7361 auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
7362 TORCH_CHECK(consumer->isA<ReductionOp>());
7363 }
7364 }
7365}
7366
7367TEST_F(NVFuserTest, FusionForceBf16NotAllCast_CUDA) {
7368#if !defined(USE_ROCM)
  // requires Ampere+ GPU
  if (!deviceMajorMinorCheck(8)) {
    GTEST_SKIP() << "skipping tests on pre-Ampere GPUs";
7372 return;
7373 }
7374
7375 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7376 auto fusion = fusion_ptr.get();
7377 FusionGuard fg(fusion);
7378
7379 auto tv0 = makeSymbolicTensor(3);
7380 auto tv1 = makeSymbolicTensor(3);
7381
7382 fusion->addInput(tv0);
7383 fusion->addInput(tv1);
7384
7385 // Group 1
7386 auto tv3 = sum(tv0, {1});
7387 auto tv4 = broadcast(tv3, {false, true, false});
7388 auto tv5 = sum(tv0, {1});
7389
7390 // Group 2
7391 auto tv6 = add(tv4, tv1); // edge tv4, expect cast
7392 auto tv7 = castOp(DataType::BFloat16, tv6);
7393
7394 // Group 3
7395 auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
7396
7397 fusion->addOutput(tv7);
7398 fusion->addOutput(tv8);
7399
7400 FusionExecutorCache fec(std::move(fusion_ptr));
7401
7402 std::vector<int64_t> shape{16, 16, 16};
7403
7404 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7405 auto in0 = at::randn(shape, options);
7406 auto in1 = at::randn(shape, options);
7407 fec.runFusionWithInputs({in0, in1});
7408
7409 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7410 auto complete_fusion = segmented_fusion->completeFusion();
7411
  // Check that the edge that wasn't bf16 is the producer of the
  // reduction op, i.e. tv8 = sum(tv5, {1}).
7414 for (auto edge : segmented_fusion->edges()) {
7415 auto edge_tv = edge->val->as<TensorView>();
7416 if (edge_tv->getDataType() == DataType::Float) {
7417 auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
7418 TORCH_CHECK(consumer->isA<ReductionOp>());
7419 }
7420 }
7421#else
  GTEST_SKIP() << "bfloat16 is not supported on ROCm";
7423#endif
7424}
7425
7426TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) {
7427 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7428 auto fusion = fusion_ptr.get();
7429 FusionGuard fg(fusion);
7430
7431 auto tv0 = makeConcreteTensor({2, 2});
7432 auto tv1 = makeConcreteTensor({2, 2, 2});
7433
7434 fusion->addInput(tv0);
7435 fusion->addInput(tv1);
7436
7437 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7438 auto tv3 = broadcast(tv2, {false, false, true});
7439 auto tv4 = add(tv3, tv1);
7440 auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
7441 fusion->addOutput(tv5);
7442
  // tv4 cannot inner-reuse tv2's buffer, because there's a broadcast
  // between them.
7445 tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
7446 tv3->computeAt(tv5, 2, ComputeAtMode::BestEffort);
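  // ComputeAtMode::BestEffort positions the tensors as deeply as the
  // existing transformations allow instead of erroring out when the exact
  // requested position is infeasible.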
7447
7448 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7449 auto in0 = at::randn({2, 2}, options);
7450 auto in1 = at::randn({2, 2, 2}, options);
7451
7452 auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3;
7453 FusionExecutor fe;
7454 fe.compileFusion(fusion, {in0, in1});
7455 auto outputs = fe.runFusion({in0, in1});
7456
7457 testValidate(fusion, outputs, {in0, in1}, {at_output}, __LINE__, __FILE__);
7458}
7459
7460TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) {
7461 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7462 auto fusion = fusion_ptr.get();
7463 FusionGuard fg(fusion);
7464
7465 auto tv0 = makeConcreteTensor({2, 2});
7466 auto tv1 = makeConcreteTensor({2, 2, 2});
7467
7468 fusion->addInput(tv0);
7469 fusion->addInput(tv1);
7470
7471 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7472 auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
7473 auto tv4 = mul(tv2, tv3);
7474 // Broadcast buffer can be reused through outer sharing
7475 auto tv5 = broadcast(tv4, {true, false, false});
7476 auto tv6 = mul(tv5, IrBuilder::create<Double>(5));
7477 auto tv7 = mul(tv6, tv1);
7478 auto tv8 = mul(tv7, IrBuilder::create<Double>(7));
7479 // tv9 shouldn't alias to avoid buffer over-subscription
7480 auto tv9 = broadcast(tv4, {true, false, false});
7481 auto tv10 = mul(tv9, IrBuilder::create<Double>(9));
7482 auto tv11 = add(tv5, tv9);
7483 fusion->addOutput(tv7);
7484 fusion->addOutput(tv11);
7485
7486 tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
7487 tv0->computeAt(tv9, 1, ComputeAtMode::BestEffort);
7488
7489 tv5->computeAt(tv7, 1, ComputeAtMode::BestEffort);
7490 tv5->computeAt(tv11, 1, ComputeAtMode::BestEffort);
7491 tv9->computeAt(tv11, 1, ComputeAtMode::BestEffort);
7492
7493 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7494 auto in0 = at::randn({2, 2}, options);
7495 auto in1 = at::randn({2, 2, 2}, options);
7496 auto t2 = in0 * 2;
7497 auto t3 = in0 * 3;
7498 auto t4 = t2 * t3;
7499 auto t5 = t4.unsqueeze(0);
7500 auto t6 = t5 * 5;
7501 auto t7 = t6 * in1;
7502 auto t8 = t7 * 7;
7503 auto t9 = t4.unsqueeze(0);
7504 auto t10 = t9 * 9;
7505 auto t11 = t5 + t9;
7506 FusionExecutor fe;
7507 fe.compileFusion(fusion, {in0, in1});
7508
7510 auto outputs = fe.runFusion({in0, in1});
7511
7512 testValidate(fusion, outputs, {in0, in1}, {t7, t11}, __LINE__, __FILE__);
7513}
7514
7515TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) {
7516 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7517 auto fusion = fusion_ptr.get();
7518 FusionGuard fg(fusion);
7519
7520 auto tv0 = makeConcreteTensor({256, 512});
7521
7522 fusion->addInput(tv0);
7523
7524 auto tv1 = mul(tv0, IrBuilder::create<Double>(2));
7525 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
7526 auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
7527 auto tv4 = mul(tv3, IrBuilder::create<Double>(2));
7528 auto tv5 = mul(tv4, IrBuilder::create<Double>(2));
7529 auto tv6 = mul(tv5, IrBuilder::create<Double>(2));
7530
7531 fusion->addOutput(tv6);
7532
7533 tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
7534 tv6->axis(0)->parallelize(ParallelType::TIDx);
7535
7536 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7537 auto in0 = at::randn({256, 512}, options);
7538
7539 FusionExecutor fe;
7540 fe.compileFusion(fusion, {in0});
7541 auto outputs = fe.runFusion({in0});
7542
7543 auto at_out = in0.mul(2).mul(2).mul(2).mul(2).mul(2).mul(2);
7544
7545 testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
7546}
7547
7548TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) {
7549 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7550 auto fusion = fusion_ptr.get();
7551 FusionGuard fg(fusion);
7552
7553 auto tv0 = makeConcreteTensor({2, 2});
7554 auto tv1 = makeConcreteTensor({2, 2, 2});
7555
7556 fusion->addInput(tv0);
7557 fusion->addInput(tv1);
7558
7559 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7560 auto tv3 = broadcast(tv2, {false, false, true});
  auto tv4 = add(tv3, tv1); // tv4 is inner-aliased first and shouldn't
                            // be outer-aliased on top of that
7563 auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
7564 auto tv6 = mul(tv5, IrBuilder::create<Double>(3));
7565 fusion->addOutput(tv6);
7566
7567 tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
7568 tv4->computeAt(tv6, 2, ComputeAtMode::BestEffort);
7569
7570 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7571 auto in0 = at::randn({2, 2}, options);
7572 auto in1 = at::randn({2, 2, 2}, options);
7573 FusionExecutor fe;
7574 fe.compileFusion(fusion, {in0, in1});
7575 auto outputs = fe.runFusion({in0, in1});
7576
7577 auto at_out = (in0.mul(2.0).unsqueeze(2) + in1).mul(3.0).mul(3.0);
7578
7579 testValidate(fusion, outputs, {in0, in1}, {at_out}, __LINE__, __FILE__);
7580}
7581
7582TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) {
7583 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7584 auto fusion = fusion_ptr.get();
7585 FusionGuard fg(fusion);
7586
7587 auto tv0 = makeConcreteTensor({3, 3, 3});
7588
7589 fusion->addInput(tv0);
7590
7591 auto tv1 = sum(tv0, {1});
7592 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
7593 auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
7594
7595 fusion->addOutput(tv3);
7596
  // In this case tv1 "reuses" the allocation of tv2
  // due to the switched allocation order
7599 tv1->computeAt(tv2, 1, ComputeAtMode::BestEffort);
7600
7601 tv0->axis(0)->parallelize(ParallelType::TIDx);
7602 tv1->axis(0)->parallelize(ParallelType::TIDx);
7603 tv2->axis(0)->parallelize(ParallelType::TIDx);
7604 tv3->axis(0)->parallelize(ParallelType::TIDx);
7605
7606 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7607 auto in0 = at::randn({3, 3, 3}, options);
7608
7609 FusionExecutor fe;
7610 fe.compileFusion(fusion, {in0});
7611 auto outputs = fe.runFusion({in0});
7612
7613 auto at_out = in0.sum(1).mul(2).mul(2);
7614
7615 testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
7616}
7617
7618TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) {
7619 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7620 auto fusion = fusion_ptr.get();
7621 FusionGuard fg(fusion);
7622
7623 auto tv0 = makeConcreteTensor({16, 16});
7624
7625 fusion->addInput(tv0);
7626
7627 auto tv1 = mul(tv0, IrBuilder::create<Double>(3));
7628 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
7629 auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
  // tv1 is live until here, so its buffer cannot be reused by tv2 or tv3
7631 auto tv4 = mul(tv3, tv1);
7632
7633 fusion->addOutput(tv4);
7634
7635 tv0->computeAt(tv4, 1);
7636
7637 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7638 auto in0 = at::randn({16, 16}, options);
7639
7640 FusionExecutor fe;
7641 fe.compileFusion(fusion, {in0});
7642 auto cg_outputs = fe.runFusion({in0});
7643
7644 auto at_t0 = in0 * 3.0;
7645 auto at_out = at_t0 * 2.0 * 2.0 * at_t0;
7646
7647 testValidate(fusion, cg_outputs, {in0}, {at_out}, __LINE__, __FILE__);
7648}
7649
7650TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) {
7651 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7652 auto fusion = fusion_ptr.get();
7653 FusionGuard fg(fusion);
7654
7655 auto tv0 = makeConcreteTensor({2, 2});
7656 auto tv1 = makeConcreteTensor({2, 2, 2});
7657
7658 fusion->addInput(tv0);
7659 fusion->addInput(tv1);
7660
7661 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7662 auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
7663 auto tv4 = mul(tv2, tv3);
7664 auto tv5 = broadcast(tv4, {false, false, true});
7665 auto tv6 = mul(tv5, tv1);
7666 auto tv7 = mul(tv6, IrBuilder::create<Double>(7));
7667 fusion->addOutput(tv7);
7668
  // tv6 shouldn't reuse tv2 or tv3 because of
  // the broadcast in between
7671 tv0->computeAt(tv4, 1, ComputeAtMode::BestEffort);
7672 tv4->computeAt(tv7, 2, ComputeAtMode::BestEffort);
7673
7674 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7675 auto in0 = at::randn({2, 2}, options);
7676 auto in1 = at::randn({2, 2, 2}, options);
7677 FusionExecutor fe;
7678 fe.compileFusion(fusion, {in0, in1});
7679 auto outputs = fe.runFusion({in0, in1});
7680
7681 auto t2 = in0 * 2;
7682 auto t3 = in0 * 3;
7683 auto t4 = t2 * t3;
7684 auto t5 = t4.unsqueeze(2);
7685 auto t6 = t5 * in1;
7686 auto t7 = t6 * 7;
7687 testValidate(fusion, outputs, {in0, in1}, {t7}, __LINE__, __FILE__);
7688}
7689
7690TEST_F(NVFuserTest, FusionIssue970_CUDA) {
7691 Fusion fusion;
7692 FusionGuard fg(&fusion);
7693
7694 const int nelm = 10;
7695
7696 // tv3 = tv0 + sum(tv0)
7697 auto tv0 = makeConcreteTensor({nelm, nelm});
7698 fusion.addInput(tv0);
7699 auto tv1 = sum(tv0, {1});
7700 auto tv2 = broadcast(tv1, {false, true});
7701 auto tv3 = add(tv2, tv0);
7702 fusion.addOutput(tv3);
7703
7704 tv1->split(1, 4);
7705
7706 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7708 at::manual_seed(0);
7709 at::Tensor t0 = at::randn({nelm, nelm}, options);
7710
7711 FusionExecutor fe;
7712 fe.compileFusion(&fusion, {t0});
7713 auto outputs = fe.runFusion({t0});
7714
7715 auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0;
7716
7717 testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
7718}
7719
7720// Reproducer of #1016
7721TEST_F(NVFuserTest, FusionIssue1016_CUDA) {
7722 Fusion fusion;
7723 FusionGuard fg(&fusion);
7724
7725 auto tv0 = makeSymbolicTensor(2);
7726 fusion.addInput(tv0);
7727
7728 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7729 auto tv2 = add(tv1, IrBuilder::create<Double>(2));
7730
7731 fusion.addOutput(tv2);
7732
7733 tv1->setMemoryType(MemoryType::Shared);
7734
7735 tv2->split(-1, 8);
7736
7737 int numel_x = 10;
7738 int numel_y = 11;
7739
7740 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7741 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
7742 std::vector<IValue> inputs = {t0};
7743
7744 FusionExecutor fe;
7745 fe.compileFusion(&fusion, inputs);
7746 auto outputs = fe.runFusion(inputs);
7747
7748 auto ref = t0 + 1 + 2;
7749
7750 testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
7751}
7752
7753// Reproducer of #1021
7754TEST_F(NVFuserTest, FusionIssue1021_CUDA) {
7755 Fusion fusion;
7756 FusionGuard fg(&fusion);
7757
7758 auto tv0 = makeSymbolicTensor(1);
7759 fusion.addInput(tv0);
7760 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7761 auto tv2 = broadcast(tv1, {false, true});
7762 fusion.addOutput(tv2);
7763
7764 auto tv3 = tv2->cacheBefore();
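  // cacheBefore interposes tv3 between the broadcast and the output: tv3
  // takes over tv2's definition, and tv2 becomes a copy that can be
  // vectorized below.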
7765
7766 tv2->split(0, 2);
7767
7768 tv1->computeAt(tv2, 1);
7769
7770 tv2->axis(0)->parallelize(ParallelType::TIDx);
7771 tv2->axis(1)->parallelize(ParallelType::Vectorize);
7772
7773 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7774 at::Tensor t0 = at::randn({10}, options);
7775 std::vector<IValue> inputs = {t0};
7776
7777 FusionExecutor fe;
7778 fe.compileFusion(&fusion, inputs);
7779 auto outputs = fe.runFusion(inputs);
7780
7781 auto ref = (t0 + 1).unsqueeze(-1);
7782
7783 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
7784}
7785
7786// Reproducer of issue #1053
7787TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) {
7788 auto fusion = std::make_unique<Fusion>();
7789 FusionGuard fg(fusion.get());
7790
7791 auto tv0 = makeSymbolicTensor(1);
7792 fusion->addInput(tv0);
7793 auto tv1 = sum(tv0, {0});
7794 fusion->addOutput(tv1);
7795
7796 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7797 fusion->addOutput(tv2);
7798
7799 tv1->split(0, 8);
7800 auto tv1_rf = tv1->rFactor({-1});
7801
7802 tv1_rf->computeAt(tv1, 1);
7803
7804 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
7805
7806 tv2->axis(0)->parallelize(ParallelType::TIDx);
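  // TIDx is bound to the extent-8 rfactor domain of tv1_rf but to the full
  // input extent for tv2, exercising the non-unique blockDim.x handling
  // reported in issue #1053.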
7807
7808 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7809 at::Tensor input1 = at::randn({32}, options);
7810
7811 auto at_tv1 = (input1).sum({0});
7812 auto at_tv2 = input1 + 1;
7813
7814 FusionExecutor fe;
7815 fe.compileFusion(fusion.get(), {input1});
7816 auto outputs = fe.runFusion({input1});
7817 testValidate(
7818 fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__);
7819}
7820
7821TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) {
7822 auto fusion = std::make_unique<Fusion>();
7823 FusionGuard fg(fusion.get());
7824
7825 auto tv0 = makeSymbolicTensor(1);
7826 fusion->addInput(tv0);
7827 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7828 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7829 fusion->addOutput(tv1);
7830 fusion->addOutput(tv2);
7831
7832 tv1->split(0, 8, false);
7833 tv1->axis(1)->parallelize(ParallelType::TIDx);
7834 tv2->split(0, 8, false);
7835 tv2->axis(1)->parallelize(ParallelType::TIDx);
7836
7837 // The extents of tv1 and tv2 axes are equal even though their
7838 // actual values are not statically known
7839 GpuLower gpulw(fusion.get());
7840 const auto& pdmap = gpulw.parallelDimensionMap();
7841 for (const auto i : c10::irange(tv1->domain()->domain().size())) {
7842 auto dom1 = tv1->domain()->domain()[i];
7843 auto dom2 = tv2->domain()->domain()[i];
7844 TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent()));
7845 }
7846
7847 TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
7848 TORCH_CHECK(
7849 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7850 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7851
7852 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7853 at::Tensor input1 = at::randn({32}, options);
7854
7855 FusionExecutor fe;
7856 fe.compileFusion(fusion.get(), {input1});
7857 auto outputs = fe.runFusion({input1});
7858
7859 testValidate(
7860 fusion.get(),
7861 outputs,
7862 {input1},
7863 {input1 + 1, input1 + 1},
7864 __LINE__,
7865 __FILE__);
7866}
7867
7868TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) {
7869 auto fusion = std::make_unique<Fusion>();
7870 FusionGuard fg(fusion.get());
7871
7872 auto tv0 = makeSymbolicTensor(1);
7873 fusion->addInput(tv0);
7874 auto tv1 = makeSymbolicTensor(2);
7875 fusion->addInput(tv1);
7876 auto tv2 = broadcast(tv0, {false, true});
7877 auto tv3 = add(tv1, tv2);
7878 fusion->addOutput(tv3);
7879
7880 tv3->split(-1, 8, false);
7881 tv2->computeAt(tv3, -1);
7882
7883 tv3->axis(-1)->parallelize(ParallelType::TIDx);
7884 tv2->axis(-1)->parallelize(ParallelType::TIDx);
7885
7886 GpuLower gpulw(fusion.get());
7887 const auto& pdmap = gpulw.parallelDimensionMap();
7888 TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
7889 TORCH_CHECK(
7890 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7891 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7892
7893 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7894 at::Tensor input1 = at::randn({11}, options);
7895 at::Tensor input2 = at::randn({11, 13}, options);
7896
7897 FusionExecutor fe;
7898 fe.compileFusion(fusion.get(), {input1, input2});
7899 auto outputs = fe.runFusion({input1, input2});
7900
7901 auto ref = input1.unsqueeze(-1) + input2;
7902
7903 testValidate(
7904 fusion.get(), outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
7905}
7906
7907// Mix symbolic and concrete tensors
7908TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) {
7909 auto fusion = std::make_unique<Fusion>();
7910 FusionGuard fg(fusion.get());
7911
7912 auto tv0 = makeSymbolicTensor(1);
7913 fusion->addInput(tv0);
7914
7915 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7916 fusion->addOutput(tv2);
7917 auto tv3 = add(tv0, IrBuilder::create<Double>(1));
7918 fusion->addOutput(tv3);
7919
7920 tv2->split(0, 10);
7921 tv3->split(0, 20);
7922
7923 auto tv4 = add(tv0, IrBuilder::create<Double>(1));
7924 fusion->addOutput(tv4);
7925 auto tv5 = add(tv0, IrBuilder::create<Double>(1));
7926 fusion->addOutput(tv5);
7927
7928 // Not mapped but equal extent
7929 tv4->split(0, 10);
7930 tv5->split(0, 10);
7931
7932 tv2->axis(-1)->parallelize(ParallelType::TIDx);
7933 tv3->axis(-1)->parallelize(ParallelType::TIDx);
7934
7935 tv4->axis(-1)->parallelize(ParallelType::TIDy);
7936 tv5->axis(-1)->parallelize(ParallelType::TIDy);
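  // TIDx is bound to inner extents of 10 and 20, so it is non-exact and is
  // lowered as blockDim.x, while TIDy is bound to extent 10 on both tv4 and
  // tv5 and resolves to the constant 10.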
7937
7938 GpuLower gpulw(fusion.get());
7939 const auto& pdmap = gpulw.parallelDimensionMap();
7940 TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
7941 TORCH_CHECK(
7942 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7943 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7944 TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
7945 TORCH_CHECK(
7946 pdmap.get(ParallelType::TIDy)->isConst() &&
7947 pdmap.get(ParallelType::TIDy)->as<Int>()->value().value() == 10);
7948
7949 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7950 at::Tensor input1 = at::randn({13}, options);
7951
7952 FusionExecutor fe;
7953 fe.compileFusion(fusion.get(), {input1});
7954 auto outputs = fe.runFusion({input1});
7955
7956 testValidate(
7957 fusion.get(),
7958 outputs,
7959 {input1},
7960 {input1 + 1, input1 + 1, input1 + 1, input1 + 1},
7961 __LINE__,
7962 __FILE__);
7963}
7964
7965// Parallelizing merged broadcast domains
7966TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) {
7967 Fusion fusion;
7968 FusionGuard fg(&fusion);
7969
7970 auto tv0 = makeSymbolicTensor(1);
7971 fusion.addInput(tv0);
7972 auto tv1 = makeSymbolicTensor(2);
7973 fusion.addInput(tv1);
7974 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7975 auto tv3 = broadcast(tv2, {true, false});
7976 auto tv4 = add(tv3, tv1);
7977 fusion.addOutput(tv4);
7978
7979 tv4->split(1, 4);
7980 tv4->reorder({{1, 2}, {2, 1}});
7981 tv4->merge(0);
7982 tv0->computeAt(tv4, 1);
7983 tv1->computeAt(tv4, 1);
7984
7985 // TIDx is mapped to tv4.axis(0) as well as tv2.axis(0), so it's not
7986 // exact.
7987 tv4->axis(0)->parallelize(ParallelType::TIDx);
7988
7989 tv2->setMemoryType(MemoryType::Shared);
7990 tv3->setMemoryType(MemoryType::Shared);
7991
7992 GpuLower gpulw(&fusion);
7993 const auto& pdmap = gpulw.parallelDimensionMap();
7994 TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
7995 TORCH_CHECK(
7996 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7997 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7998
7999 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8000 at::Tensor input1 = at::randn({13}, options);
8001 at::Tensor input2 = at::randn({15, 13}, options);
8002
8003 FusionExecutor fe;
8004 fe.compileFusion(&fusion, {input1, input2});
8005 auto outputs = fe.runFusion({input1, input2});
8006
8007 auto ref = (input1 + 1).unsqueeze(0) + input2;
8008
8009 testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
8010}
8011
8012TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) {
8013 Fusion fusion;
8014 FusionGuard fg(&fusion);
8015
8016 auto tv0 = makeSymbolicTensor(1);
8017 fusion.addInput(tv0);
8018 auto tv1 = makeSymbolicTensor(2);
8019 fusion.addInput(tv1);
8020 auto tv3 = broadcast(tv0, {false, true});
8021 auto tv4 = add(tv3, tv1);
8022 fusion.addOutput(tv4);
8023
8024 tv4->split(1, 4);
8025 tv0->computeAt(tv4, -1);
8026 tv1->computeAt(tv4, -1);
8027
8028 tv4->axis(-1)->parallelize(ParallelType::TIDx);
8029 tv3->axis(-1)->parallelize(ParallelType::TIDx);
8030 tv4->axis(-2)->parallelize(ParallelType::TIDy);
8031 tv3->axis(-2)->parallelize(ParallelType::TIDy);
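  // Both bindings of TIDx map to the same extent-4 inner domain, so TIDx is
  // exact and constant; TIDy maps to the same symbolic outer extent on tv3
  // and tv4, so it is exact but only known as blockDim.y at runtime.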
8032
8033 GpuLower gpulw(&fusion);
8034 const auto& pdmap = gpulw.parallelDimensionMap();
8035 TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
8036 TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
8037 TORCH_CHECK(
8038 pdmap.get(ParallelType::TIDx)->isConst() &&
8039 pdmap.get(ParallelType::TIDx)->as<Int>()->value().value() == 4);
8040 TORCH_CHECK(
8041 pdmap.get(ParallelType::TIDy)->isA<NamedScalar>() &&
8042 pdmap.get(ParallelType::TIDy)->as<NamedScalar>()->name() == "blockDim.y");
8043
8044 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8045 at::Tensor input1 = at::randn({13}, options);
8046 at::Tensor input2 = at::randn({13, 15}, options);
8047
8048 FusionExecutor fe;
8049 fe.compileFusion(&fusion, {input1, input2});
8050 auto outputs = fe.runFusion({input1, input2});
8051
8052 auto ref = (input1).unsqueeze(-1) + input2;
8053
8054 testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
8055}
8056
8057TEST_F(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) {
8058 auto fusion_ptr = std::make_unique<Fusion>();
8059 auto& fusion = *fusion_ptr.get();
8060 FusionGuard fg(&fusion);
8061
8062 auto t0 = makeSymbolicTensor(3, DataType::Float);
8063 auto t1 = makeSymbolicTensor(3, DataType::Half);
8064 auto t3 = makeSymbolicTensor(3, DataType::Half);
8065 auto t5 = makeSymbolicTensor(3, DataType::Half);
8066 auto t7 = makeSymbolicTensor(1, DataType::Half);
8067 auto t11 = makeSymbolicTensor(3, DataType::Half);
8068 auto t13 = makeSymbolicTensor(3, DataType::Half);
8069 auto t15 = makeSymbolicTensor(3, DataType::Half);
8070 auto t17 = makeSymbolicTensor(3, DataType::Half);
8071 auto d56 = IrBuilder::create<Double>();
8072
8073 fusion.addInput(t0);
8074 fusion.addInput(t1);
8075 fusion.addInput(t3);
8076 fusion.addInput(t5);
8077 fusion.addInput(t7);
8078 fusion.addInput(t11);
8079 fusion.addInput(t13);
8080 fusion.addInput(t15);
8081 fusion.addInput(t17);
8082 fusion.addInput(d56);
8083
8084 auto t2 = castOp(DataType::Float, t1);
8085 auto t4 = castOp(DataType::Float, t3);
8086 auto t22 = sub(t2, t4);
8087 auto t6 = castOp(DataType::Float, t5);
8088 auto t23 = mul(t22, t6);
8089 auto t16 = castOp(DataType::Float, t15);
8090 auto t18 = castOp(DataType::Float, t17);
8091 auto t19 = add(t16, t18);
8092 auto t14 = castOp(DataType::Float, t13);
8093 auto t20 = add(t19, t14);
8094 auto t12 = castOp(DataType::Float, t11);
8095 auto t21 = add(t20, t12);
8096 auto t8 = castOp(DataType::Float, t7);
8097 auto t24 = broadcast(t8, {true, true, false});
8098 auto t25 = mul(t21, t24);
8099 auto t27 = sum(t25, {2});
8100 auto t28 = broadcast(t27, {false, false, true});
8101 auto t29 = mul(t25, t23);
8102 auto t30 = sum(t29, {2});
8103 auto t31 = broadcast(t30, {false, false, true});
8104 auto d59 =
8105 mul(t1->getRootDomain()[2]->extent(), IrBuilder::create<Double>(1));
8106 auto t26 = mul(d59, t25);
8107 auto txx = mul(t26, IrBuilder::create<Double>(1));
8108 auto t33 = sub(txx, t28);
8109 auto d70 = unaryOp(UnaryOpType::Reciprocal, d59);
8110 auto t35 = mul(d70, t6);
8111 auto t39 = sum(t21, {0, 1});
8112 auto t47 = castOp(DataType::Half, t39);
8113 auto t37 = mul(t21, t23);
8114 auto t38 = sum(t37, {0, 1});
8115 auto t46 = castOp(DataType::Half, t38);
8116 auto t32 = mul(t23, t31);
8117 auto t34 = sub(t33, t32);
8118 auto t36 = mul(t35, t34);
8119 auto t45 = castOp(DataType::Half, t36);
8120 auto t40 = mul(t36, t0);
8121 auto t41 = mul(t40, d56);
8122 auto t44 = castOp(DataType::Half, t41);
8123 auto t42 = sum(t41, {0, 1});
8124 auto t43 = castOp(DataType::Half, t42);
8125
8126 fusion.addOutput(t43);
8127 fusion.addOutput(t44);
8128 fusion.addOutput(t45);
8129 fusion.addOutput(t46);
8130 fusion.addOutput(t47);
8131
8132 auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
8133 auto options_float =
8134 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8135 at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float);
8136 at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half);
8137 at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half);
8138 at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half);
8139 at::Tensor at_t7 = at::randn({1024}, options_half);
8140 at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half);
8141 at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half);
8142 at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half);
8143 at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half);
8144 double at_d56 = 1.1111;
8145
8146 std::vector<at::Tensor> aten_inputs = {
8147 at_t0, at_t1, at_t3, at_t5, at_t7, at_t11, at_t13, at_t15, at_t17};
8148
8149 c10::IValue val = at_d56;
8150
8151 KernelArgumentHolder args(KernelIndexMode::INT32);
8152 args.setDeviceIndex(0);
8153 args.push(aten_inputs);
8154 args.push(val);
8155
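  // Combining reductions while segmenting this fusion used to create a
  // cycle in the segment graph; repeatedly segmenting without hanging or
  // asserting is the regression check, so no outputs are validated here.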
8156 for (auto _ : c10::irange(5)) {
8157 auto segmented_fusion =
8158 SegmentCandidateFinder::segment(fusion_ptr.get(), args);
8159 }
8160}
8161
8162TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) {
8163 Fusion fusion;
8164 FusionGuard fg(&fusion);
8165
8166 auto tv0 = makeSymbolicTensor(1);
8167 fusion.addInput(tv0);
8168
8169 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8170 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8171 fusion.addOutput(tv2);
8172
8173 auto tv3 = add(tv0, IrBuilder::create<Double>(1));
8174 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
8175 fusion.addOutput(tv4);
8176
8177 auto tv5 = add(tv0, IrBuilder::create<Double>(1));
8178 auto tv6 = add(tv5, IrBuilder::create<Double>(1));
8179 fusion.addOutput(tv6);
8180
8181 // Case 1: local memory tensor computed serially and used by
8182 // parallel threads
8183 tv2->split(-1, 4);
8184 tv1->computeAt(tv2, -2);
8185 tv2->axis(-1)->parallelize(ParallelType::TIDx);
8186
8187 // Case 2: shared memory tensor computed serially and used by BID
8188 tv4->split(-1, 4);
8189 tv3->computeAt(tv4, -2);
8190 tv4->axis(-1)->parallelize(ParallelType::BIDx);
8191 tv3->setMemoryType(MemoryType::Shared);
8192
8193 // Case 3: shared memory tensor computed by TID and used by BID
8194 tv6->split(-1, 4);
8195 tv5->computeAt(tv6, -2);
8196 tv6->axis(-1)->parallelize(ParallelType::BIDx);
8197 tv5->axis(-1)->parallelize(ParallelType::TIDx);
8198 tv5->setMemoryType(MemoryType::Shared);
8199
8200 const int nx = 11;
8201 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8202 at::Tensor t0 = at::randn({nx}, options);
8203 std::vector<IValue> aten_inputs = {t0};
8204
8205 FusionExecutor fe;
8206 fe.compileFusion(&fusion, aten_inputs);
8207 auto outputs = fe.runFusion(aten_inputs);
8208
8209 auto ref = t0 + 2;
8210
8211 testValidate(
8212 &fusion, outputs, aten_inputs, {ref, ref, ref}, __LINE__, __FILE__);
8213}
8214
8215// Repro of issue #1105
8216TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) {
8217 Fusion fusion;
8218 FusionGuard fg(&fusion);
8219
8220 auto tv0 = makeSymbolicTensor(1);
8221 fusion.addInput(tv0);
8222
8223 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8224 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8225 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8226
8227 fusion.addOutput(tv3);
8228
8229 tv1->setMemoryType(MemoryType::Shared);
8230 tv2->setMemoryType(MemoryType::Shared);
8231
8232 tv3->split(0, 4);
8233 tv0->computeAt(tv3, 1);
8234
8235 tv1->axis(-1)->parallelize(ParallelType::TIDx);
8236 tv2->axis(-1)->parallelize(ParallelType::TIDy);
8237 tv3->axis(-1)->parallelize(ParallelType::TIDz);
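  // tv1 and tv2 are shared-memory buffers that may be aliased by the
  // buffer-reuse pass while being written and read under different thread
  // dimensions, so each iteration of the outer loop must end with a WAR
  // sync before the buffer is overwritten.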
8238
8239 // Make sure a WAR sync is inserted at the end of the outer loop
8240 GpuLower gpulw(&fusion);
8241 for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
8242 if (auto loop = dynamic_cast<kir::ForLoop*>(kir_node)) {
8243 const auto& body = loop->body().exprs();
8244 TORCH_CHECK(!body.empty());
8245 auto last_expr = dynamic_cast<kir::BlockSync*>(body.back());
8246 TORCH_CHECK(last_expr != nullptr, "Invalid expr found");
8247 TORCH_CHECK(last_expr->isWarHazardSync(), "Not a sync for WAR hazard");
8248 }
8249 }
8250
8251 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8252 at::Tensor t0 = at::randn({17}, options);
8253 std::vector<IValue> aten_inputs = {t0};
8254
8255 FusionExecutor fe;
8256 fe.compileFusion(&fusion, aten_inputs);
8257 auto outputs = fe.runFusion(aten_inputs);
8258
8259 auto ref1 = t0 + 3;
8260
8261 testValidate(&fusion, outputs, aten_inputs, {ref1}, __LINE__, __FILE__);
8262}
8263
8264TEST_F(NVFuserTest, FusionIssue1099_CUDA) {
8265 Fusion fusion;
8266 FusionGuard fg(&fusion);
8267
8268 auto tv0 = makeSymbolicTensor(1);
8269 fusion.addInput(tv0);
8270
8271 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8272 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8273 fusion.addOutput(tv2);
8274
8275 auto tv3 = makeSymbolicTensor(1);
8276 fusion.addInput(tv3);
8277
8278 // Just to make TIDx/y/z non-exact
8279 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
8280 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8281 auto tv6 = add(tv5, IrBuilder::create<Double>(1));
8282 fusion.addOutput(tv6);
8283
8284 tv2->split(0, 4);
8285 tv0->computeAt(tv2, 1);
8286
8287 tv0->axis(-1)->parallelize(ParallelType::TIDx);
8288 tv1->axis(-1)->parallelize(ParallelType::TIDy);
8289 tv2->axis(-1)->parallelize(ParallelType::TIDz);
8290 tv2->axis(0)->parallelize(ParallelType::BIDx);
8291
8292 tv1->setMemoryType(MemoryType::Shared);
8293
8294 tv4->split(0, 5);
8295 tv4->axis(-1)->parallelize(ParallelType::TIDx);
8296 tv4->setMemoryType(MemoryType::Shared);
8297 tv5->split(0, 6);
8298 tv5->axis(-1)->parallelize(ParallelType::TIDy);
8299 tv5->setMemoryType(MemoryType::Shared);
8300 tv6->split(0, 7);
8301 tv6->axis(-1)->parallelize(ParallelType::TIDz);
8302
8303 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8304 at::Tensor t0 = at::randn({17}, options);
8305 at::Tensor t3 = at::randn({19}, options);
8306 std::vector<IValue> aten_inputs = {t0, t3};
8307
8308 FusionExecutor fe;
8309 fe.compileFusion(&fusion, aten_inputs);
8310 auto outputs = fe.runFusion(aten_inputs);
8311
8312 auto ref_t2 = t0 + 2;
8313 auto ref_t3 = t3 + 3;
8314
8315 testValidate(
8316 &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
8317}
8318
8319// Repro of issue #1080
8320TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) {
8321 Fusion fusion;
8322 FusionGuard fg(&fusion);
8323
8324 auto tv0 = makeSymbolicTensor(2);
8325 fusion.addInput(tv0);
8326
8327 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8328 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8329 fusion.addOutput(tv2);
8330
8331 tv2->split(0, 4);
8332 tv0->computeAt(tv2, 2);
8333
8334 tv2->split(-1, 8);
8335 tv1->split(-1, 8);
8336
8337 tv2->axis(1)->parallelize(ParallelType::Unswitch);
8338
8339 tv2->axis(-1)->parallelize(ParallelType::TIDx);
8340 tv2->axis(-2)->parallelize(ParallelType::TIDy);
8341
8342 // swap TIDx and TIDy
8343 tv1->axis(-1)->parallelize(ParallelType::TIDy);
8344 tv1->axis(-2)->parallelize(ParallelType::TIDx);
8345
8346 tv1->setMemoryType(MemoryType::Shared);
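  // tv1 swaps the TIDx/TIDy bindings relative to tv2, so the predicate
  // hoisted out of the Unswitch loop must cover both thread mappings (see
  // issue #1080).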
8347
8348 const int nx = 4;
8349 const int ny = 10;
8350 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8351 at::Tensor t0 = at::randn({nx, ny}, options);
8352 std::vector<IValue> aten_inputs = {t0};
8353
8354 FusionExecutor fe;
8355 fe.compileFusion(&fusion, aten_inputs);
8356 auto outputs = fe.runFusion(aten_inputs);
8357
8358 auto ref = t0 + 2;
8359
8360 testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
8361}
8362
8363TEST_F(NVFuserTest, FusionIssue1189_CUDA) {
8364 Fusion fusion;
8365 FusionGuard fg(&fusion);
8366
8367 auto tv0 = makeConcreteTensor({16, 16});
8368 auto tv1 = makeConcreteTensor({16, 16});
8369
8370 auto tv0b = broadcast(tv0, {false, false, true});
8371 auto tv1b = broadcast(tv1, {false, false, true});
8372
8373 fusion.addInput(tv0b);
8374 fusion.addInput(tv1b);
8375
8376 auto tv2 = add(tv0b, tv1b);
8377 auto tv3 = sum(tv2, {1});
8378 fusion.addOutput(tv3);
8379
8380 auto parallelize = [](auto tv) {
8381 tv->axis(0)->parallelize(ParallelType::TIDx);
8382 tv->axis(1)->parallelize(ParallelType::BIDx);
8383 tv->axis(2)->parallelize(ParallelType::BIDy);
8384 };
8385
8386 parallelize(tv0b);
8387 parallelize(tv1b);
8388 parallelize(tv2);
8389 parallelize(tv3);
8390
8391 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8392 at::Tensor t0 = at::randn({16, 16, 1}, options);
8393 at::Tensor t1 = at::randn({16, 16, 1}, options);
8394
8395 FusionExecutor fe;
8396 fe.compileFusion(&fusion, {t0, t1});
8397 auto outputs = fe.runFusion({t0, t1});
8398
8399 auto ref = (t0 + t1).sum({1});
8400
8401 testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
8402}
8403
8404TEST_F(NVFuserTest, FusionIssue1052_CUDA) {
8405 Fusion fusion;
8406 FusionGuard fg(&fusion);
8407
8408 auto tv0 = makeSymbolicTensor(1);
8409 fusion.addInput(tv0);
8410 auto tv1 = makeSymbolicTensor(1);
8411 fusion.addInput(tv1);
8412
8413 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
8414 fusion.addOutput(tv2);
8415
8416 auto tv3 = add(tv1, IrBuilder::create<Double>(1));
8417 fusion.addOutput(tv3);
8418
8419 tv2->axis(-1)->parallelize(ParallelType::TIDx);
8420 tv3->axis(-1)->parallelize(ParallelType::TIDx);
8421
8422 scheduler_utils::parallelizeAllLike(tv2, {tv0});
8423 scheduler_utils::parallelizeAllLike(tv3, {tv1});
8424
8425 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8426 at::Tensor t0 = at::randn({10}, options);
8427 at::Tensor t1 = at::randn({100}, options);
8428 std::vector<IValue> aten_inputs = {t0, t1};
8429
8430 FusionExecutor fe;
8431 fe.compileFusion(&fusion, aten_inputs);
8432 auto outputs = fe.runFusion(aten_inputs);
8433
8434 auto ref_t2 = t0 + 1;
8435 auto ref_t3 = t1 + 1;
8436
8437 testValidate(
8438 &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
8439}
8440
8441// Repro of issue #1115
8442TEST_F(NVFuserTest, FusionPointwiseBroadcast_CUDA) {
8443 Fusion fusion;
8444 FusionGuard fg(&fusion);
8445
8446 std::vector<int64_t> input_shape{3, 17, 80};
8447 std::vector<int64_t> output_shape{3, 17, 1, 80};
8448
8449 TensorView* x = makeSymbolicTensor(input_shape.size());
8450 TensorView* bias = makeSymbolicTensor(input_shape.size());
8451 fusion.addInput(x);
8452 fusion.addInput(bias);
8453
8454 auto x_add_bias = add(x, bias);
8455 auto x_bcast = broadcast(x_add_bias, {false, false, true, false});
8456 auto y = gelu(x_bcast);
8457 fusion.addOutput(y);
8458
8459 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8460 at::Tensor at_x = at::randn(input_shape, options);
8461 at::Tensor at_bias = at::randn(input_shape, options);
8462 std::vector<IValue> aten_inputs = {at_x, at_bias};
8463
8464 schedulePointwise(&fusion, aten_inputs);
8465
8466 FusionExecutor fe;
8467 fe.compileFusion(&fusion, aten_inputs);
8468 auto outputs = fe.runFusion(aten_inputs);
8469
8470 auto at_x_add_bias = at_x + at_bias;
8471 auto at_x_view = at::native::view(at_x_add_bias, output_shape);
8472 auto aten_y = at::gelu(at_x_view);
8473
8474 testValidate(&fusion, outputs, aten_inputs, {aten_y}, __LINE__, __FILE__);
8475}
8476
8477TEST_F(NVFuserTest, FusionPointwiseVectorize_CUDA) {
8478 Fusion fusion;
8479 FusionGuard fg(&fusion);
8480
8481 const int size = 1024 * 64;
8482
8483 TensorView* x = makeContigTensor(1);
8484 fusion.addInput(x);
8485 auto y = sin(x);
8486 fusion.addOutput(y);
8487
8488 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8489
  // PyTorch's CUDA caching allocator should always return an aligned pointer
  // for a freshly allocated tensor
8492 at::Tensor at_x = at::randn({size}, options);
8493
8494 schedulePointwise(&fusion, {at_x});
8495
8496 for (auto x_consumer : ir_utils::consumerTvsOf(x)) {
8497 bool found_vec_in_input = false;
8498 for (auto id : x_consumer->domain()->domain()) {
8499 if (isParallelTypeVectorize(id->getParallelType())) {
8500 found_vec_in_input = true;
8501 break;
8502 }
8503 }
8504 TORCH_CHECK(found_vec_in_input, "Expect input to be vectorized");
8505 }
8506
8507 for (auto id : y->domain()->domain()) {
8508 if (isParallelTypeVectorize(id->getParallelType())) {
8509 return;
8510 }
8511 }
8512 TORCH_CHECK(false, "Expect output to be vectorized");
8513}
8514
8515TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) {
8516 Fusion fusion;
8517 FusionGuard fg(&fusion);
8518
8519 auto tv0 = makeSymbolicTensor(1);
8520 fusion.addInput(tv0);
8521
8522 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8523 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8524 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8525
8526 fusion.addOutput(tv3);
8527
8528 // Just set the dimension of TIDx
8529 auto tv4 = makeSymbolicTensor(1);
8530 fusion.addInput(tv4);
8531 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8532 fusion.addOutput(tv5);
8533
8534 tv1->setMemoryType(MemoryType::Shared);
8535 tv2->setMemoryType(MemoryType::Shared);
8536
8537 tv5->axis(0)->parallelize(ParallelType::TIDx);
8538
8539 // tv1 and tv2 are on shared memory and are not parallelized with
8540 // TIDx. They should be predicated as they are redundant and can
8541 // interfere with smem aliasing (issue #1100).
8542
8543 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8544 at::Tensor t0 = at::randn({10}, options);
8545 at::Tensor t4 = at::randn({1024}, options);
8546 std::vector<IValue> aten_inputs = {t0, t4};
8547
8548 FusionExecutor fe;
8549 fe.compileFusion(&fusion, aten_inputs);
8550 auto outputs = fe.runFusion(aten_inputs);
8551
8552 auto ref1 = t0 + 3;
8553 auto ref2 = t4 + 1;
8554
8555 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8556}
8557
8558TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) {
8559 Fusion fusion;
8560 FusionGuard fg(&fusion);
8561
8562 auto tv0 = makeSymbolicTensor(1);
8563 fusion.addInput(tv0);
8564
8565 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8566 fusion.addOutput(tv1);
8567
8568 auto tv2 = makeSymbolicTensor(1);
8569 fusion.addInput(tv2);
8570 auto tv3 = sum(tv2, {0});
8571 fusion.addOutput(tv3);
8572
8573 tv1->axis(0)->parallelize(ParallelType::TIDx);
8574 tv3->axis(0)->parallelize(ParallelType::BIDx);
8575
8576 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8577 at::Tensor t0 = at::randn({17}, options);
8578 at::Tensor t2 = at::randn({19}, options);
8579 std::vector<IValue> aten_inputs = {t0, t2};
8580
8581 FusionExecutor fe;
8582 fe.compileFusion(&fusion, aten_inputs);
8583 auto outputs = fe.runFusion(aten_inputs);
8584
8585 auto ref1 = t0 + 1;
8586 auto ref2 = sum(t2);
8587
8588 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8589}
8590
8591TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) {
8592 Fusion fusion;
8593 FusionGuard fg(&fusion);
8594
8595 auto tv0 = makeSymbolicTensor(1);
8596 fusion.addInput(tv0);
8597
8598 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8599 fusion.addOutput(tv1);
8600
8601 auto tv2 = makeSymbolicTensor(1);
8602 fusion.addInput(tv2);
8603 auto tv3 = Welford(tv2, {0}).avg;
8604 fusion.addOutput(tv3);
8605
8606 tv1->axis(0)->parallelize(ParallelType::TIDx);
8607 tv3->axis(0)->parallelize(ParallelType::BIDx);
8608
8609 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8610 at::Tensor t0 = at::randn({17}, options);
8611 at::Tensor t2 = at::randn({19}, options);
8612 std::vector<IValue> aten_inputs = {t0, t2};
8613
8614 FusionExecutor fe;
8615 fe.compileFusion(&fusion, aten_inputs);
8616 auto outputs = fe.runFusion(aten_inputs);
8617
8618 auto ref1 = t0 + 1;
8619 auto ref2 = mean(t2, {0});
8620
8621 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8622}
8623
8624TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) {
8625 Fusion fusion;
8626 FusionGuard fg(&fusion);
8627
8628 auto tv0 = makeSymbolicTensor(2);
8629 fusion.addInput(tv0);
8630
8631 auto tv1 = sum(tv0, {0, 1});
8632 fusion.addOutput(tv1);
8633
8634 auto tv2 = makeSymbolicTensor(3);
8635 fusion.addInput(tv2);
8636 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8637 fusion.addOutput(tv3);
8638
8639 auto tv4 = makeSymbolicTensor(3);
8640 fusion.addInput(tv4);
8641 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8642 fusion.addOutput(tv5);
8643
8644 tv1->axis(0)->parallelize(ParallelType::BIDx);
8645 tv1->axis(1)->parallelize(ParallelType::TIDx);
8646
8647 tv3->axis(0)->parallelize(ParallelType::TIDx);
8648 tv3->axis(1)->parallelize(ParallelType::TIDy);
8649 tv3->axis(2)->parallelize(ParallelType::TIDz);
8650
8651 tv5->axis(0)->parallelize(ParallelType::BIDx);
8652 tv5->axis(1)->parallelize(ParallelType::BIDy);
8653 tv5->axis(2)->parallelize(ParallelType::BIDz);
8654
8655 // TODO: This needs a fix for issue #1102.
8656 // Also, need to allow predicated grid reductions.
8657#if 0
8658 FusionExecutor fe;
8659 fe.compileFusion(&fusion);
8660
8661 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8662 at::Tensor t0 = at::randn({2, 3}, options);
8663 at::Tensor t2 = at::randn({5, 6, 7}, options);
8664 at::Tensor t4 = at::randn({8, 9, 10}, options);
8665 std::vector<IValue> aten_inputs = {t0, t2, t4};
8666 auto outputs = fe.runFusion(aten_inputs);
8667
8668 auto ref1 = t0.sum(at::IntArrayRef{0, 1});
8669 auto ref2 = t2 + 1;
8670 auto ref3 = t4 + 1;
8671
8672 testValidate(
8673 &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
8674#endif
8675}
8676
8677TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) {
8678 Fusion fusion;
8679 FusionGuard fg(&fusion);
8680
8681 auto tv0 = makeSymbolicTensor(2);
8682 fusion.addInput(tv0);
8683
8684 auto tvs = Welford(tv0, {0, 1});
8685 fusion.addOutput(tvs.avg);
8686
8687 auto tv2 = makeSymbolicTensor(3);
8688 fusion.addInput(tv2);
8689 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8690 fusion.addOutput(tv3);
8691
8692 auto tv4 = makeSymbolicTensor(3);
8693 fusion.addInput(tv4);
8694 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8695 fusion.addOutput(tv5);
8696
8697 tvs.avg->axis(0)->parallelize(ParallelType::BIDx);
8698 tvs.avg->axis(1)->parallelize(ParallelType::TIDx);
8699
8700 tv3->axis(0)->parallelize(ParallelType::TIDx);
8701 tv3->axis(1)->parallelize(ParallelType::TIDy);
8702 tv3->axis(2)->parallelize(ParallelType::TIDz);
8703
8704 tv5->axis(0)->parallelize(ParallelType::BIDx);
8705 tv5->axis(1)->parallelize(ParallelType::BIDy);
8706 tv5->axis(2)->parallelize(ParallelType::BIDz);
8707
8708 // TODO: needs a fix for issue #1102
8709 // Also, need to allow predicated grid reductions.
8710#if 0
8711 FusionExecutor fe;
8712 fe.compileFusion(&fusion);
8713
8714 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8715 at::Tensor t0 = at::randn({2, 3}, options);
8716 at::Tensor t2 = at::randn({5, 6, 7}, options);
8717 at::Tensor t4 = at::randn({8, 9, 10}, options);
8718 std::vector<IValue> aten_inputs = {t0, t2, t4};
8719 auto outputs = fe.runFusion(aten_inputs);
8720
8721 auto ref1 = t0.mean(at::IntArrayRef{0, 1});
8722 auto ref2 = t2 + 1;
8723 auto ref3 = t4 + 1;
8724
8725 testValidate(
8726 &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
8727#endif
8728}
8729
8730// Repro of issue #1102
8731TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) {
8732 Fusion fusion;
8733 FusionGuard fg(&fusion);
8734
8735 auto tv0 = makeSymbolicTensor(1);
8736 fusion.addInput(tv0);
8737
8738 // Just to make TIDx/y/z non-exact
8739 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8740 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8741 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8742 fusion.addOutput(tv3);
8743
8744 auto tv4 = makeSymbolicTensor(1);
8745 fusion.addInput(tv4);
8746
8747 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8748 auto tv6 = add(tv5, IrBuilder::create<Double>(1));
8749 auto tv7 = add(tv6, IrBuilder::create<Double>(1));
8750 auto tv8 = add(tv7, IrBuilder::create<Double>(1));
8751 auto tv9 = sum(tv8, {0});
8752 fusion.addOutput(tv9);
8753
8754 tv1->split(0, 5);
8755 tv1->axis(-1)->parallelize(ParallelType::TIDx);
8756 tv1->setMemoryType(MemoryType::Shared);
8757 tv2->split(0, 6);
8758 tv2->axis(-1)->parallelize(ParallelType::TIDy);
8759 tv2->setMemoryType(MemoryType::Shared);
8760 tv3->split(0, 7);
8761 tv3->axis(-1)->parallelize(ParallelType::TIDz);
8762
8763 tv9->split(0, 4);
8764 tv4->computeAt(tv9, 1);
8765
8766 tv4->axis(-1)->parallelize(ParallelType::TIDx);
8767 tv5->axis(-1)->parallelize(ParallelType::TIDy);
8768 tv6->axis(-1)->parallelize(ParallelType::TIDz);
8769 tv7->axis(-1)->parallelize(ParallelType::TIDz);
8770 tv8->axis(-1)->parallelize(ParallelType::TIDz);
8771 tv9->axis(-1)->parallelize(ParallelType::TIDz);
8772 tv9->axis(0)->parallelize(ParallelType::BIDx);
8773
8774 tv5->setMemoryType(MemoryType::Shared);
8775
8776 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8777 at::Tensor t0 = at::randn({17}, options);
8778 at::Tensor t4 = at::randn({19}, options);
8779 std::vector<IValue> aten_inputs = {t0, t4};
8780
8781 FusionExecutor fe;
8782 fe.compileFusion(&fusion, aten_inputs);
8783 auto outputs = fe.runFusion(aten_inputs);
8784
8785 auto ref1 = t0 + 3;
8786 auto ref2 = sum(t4 + 4);
8787
8788 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8789}
8790
8791// Repro of #1102 and #1129
8792TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) {
8793 if (!deviceMajorMinorCheck(7)) {
8794 GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
8795 return;
8796 }
8797 Fusion fusion;
8798 FusionGuard fg(&fusion);
8799
8800 auto tv0 = makeSymbolicTensor(1);
8801 fusion.addInput(tv0);
8802 auto tv1 = makeSymbolicTensor(1);
8803 fusion.addInput(tv1);
8804
8805 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
8806 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8807 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
8808 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8809 fusion.addOutput(tv5);
8810
8811 // Just to make TIDx/y/z non-exact
8812 auto tvx = add(tv1, IrBuilder::create<Double>(1));
8813 auto tvy = add(tvx, IrBuilder::create<Double>(1));
8814 auto tvz = add(tvy, IrBuilder::create<Double>(1));
8815 fusion.addOutput(tvz);
8816
  tv5->split(0, 4);
  tv0->computeAt(tv5, 1);

  tv0->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDy);
  tv3->axis(-1)->parallelize(ParallelType::TIDz);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  tv5->axis(-1)->parallelize(ParallelType::TIDy);
  tv5->axis(0)->parallelize(ParallelType::Unswitch);

  tvx->split(0, 5);
  tvx->axis(-1)->parallelize(ParallelType::TIDx);
  tvy->split(0, 6);
  tvy->axis(-1)->parallelize(ParallelType::TIDy);
  tvz->split(0, 7);
  tvz->axis(-1)->parallelize(ParallelType::TIDz);

  for (auto tv : {tv2, tv3, tv4, tvx, tvy}) {
    tv->setMemoryType(MemoryType::Shared);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({17}, options);
  at::Tensor t1 = at::randn({19}, options);
  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref1 = t0 + 4;
  auto ref2 = t1 + 3;

  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
}

// Repro of issue #1136
TEST_F(NVFuserTest, FusionFloatPow_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(4));
  // To check if pow(tv0, 2) is replaced with tv0 * tv0
  auto tv2 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(2));
  // To check if pow(tv0, 2.0) is replaced with tv0 * tv0
  auto tv3 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(2));
  auto tv4 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(3));
  auto tv5 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(3));
  auto s = binaryOp(
      BinaryOpType::Pow,
      IrBuilder::create<Double>(3),
      IrBuilder::create<Double>(3));
  auto tv6 = add(tv0, s);

  fusion.addOutput(tv1);
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  fusion.addOutput(tv4);
  fusion.addOutput(tv5);
  fusion.addOutput(tv6);

  tv1->split(0, 32);
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::TIDx);

  TransformPropagatorWithCheck propagator(tv1);
  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv1, {tv2, tv3, tv4, tv5, tv6});

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({1000}, options);
  // Negative inputs cause NaNs in the fuser since use_fast_math is enabled
  t0 = abs(t0);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

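  // tv2/tv3 both compute t0^2 and tv4/tv5 both compute t0^3, so the p2 and
  // p3 references are each used twice below.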
  auto p4 = at::pow(t0, 4);
  auto p2 = at::pow(t0, 2);
  auto p3 = at::pow(t0, 3);
  auto t6 = t0 + std::pow(3, 3);

  testValidate(
      &fusion,
      outputs,
      aten_inputs,
      {p4, p2, p2, p3, p3, t6},
      __LINE__,
      __FILE__);
}

TEST_F(NVFuserTest, FusionIssue1127_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int numel = 4;

  auto tv0 = makeConcreteTensor({numel});
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {0});
  auto tv2 = broadcast(tv1, {true});

  auto tv3 = makeConcreteTensor({numel, numel});
  fusion.addInput(tv3);

  auto tv4 = sum(tv3, {1});

  auto tv5 = add(tv2, tv4);
  fusion.addOutput(tv5);

  tv1->axis(0)->parallelize(ParallelType::TIDx);
  tv2->axis(0)->parallelize(ParallelType::TIDx);
  tv4->axis(1)->parallelize(ParallelType::TIDx);
  tv5->axis(0)->parallelize(ParallelType::TIDx);

  // Lowering should fail since tv5 is predicated and parallelized with TIDx.
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(fusion.printKernel());
}

TEST_F(NVFuserTest, FusionChannelsLastParser_CUDA) {
  // This test may not pass when using a custom block sync, as there may
  // be additional sync calls. Skip the test since it is not specifically
  // about block synchronization.
  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
    return;
  }
  auto g = std::make_shared<Graph>();
  const auto graph0_string = R"IR(
  graph(%0 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]),
        %1 : Half(8, 4, 10, 16, strides=[640, 160, 16, 1])):
    %o.1 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::mul(%0, %1) # sum_dyn.py:5:6
    %3 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::relu(%o.1) # sum_dyn.py:6:9
    return (%3))IR";
  parseIR(graph0_string, g.get());

  // Strides are not yet supported in the irparser, so set them explicitly.
  {
    auto val = g->block()->inputs()[0];
    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
        {8, 4, 10, 16}, {640, 1, 64, 4}));
  }

  {
    auto val = g->block()->inputs()[1];
    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
        {8, 4, 10, 16}, {640, 160, 16, 1}));
  }

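  // Give the outputs complete types as well, marking them channels-last to
  // match %o.1 and %3 in the IR string above.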
  for (auto node : g->block()->nodes()) {
    for (auto val : node->outputs()) {
      if (val->isCompleteTensor())
        val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
            {8, 4, 10, 16}, {640, 1, 64, 4}));
    }
  }

  auto fusion = parseJitIR(g);
  FusionGuard fg(fusion.get());
  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor input0 =
      at::randn({2, 2, 2, 16}, options).clone(c10::MemoryFormat::ChannelsLast);
  at::Tensor input1 = at::randn({2, 2, 2, 16}, options);
  auto lparams = schedulePointwise(fusion.get(), {input0, input1});

  // CONSIDER:
  // 1. this can be moved to a dedicated "golden" file
  // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
  const std::string expected_kernel = R"(
__global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, Tensor<__half, 4> T7) {
  int64_t i165;
  i165 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x);
  if ((i165 < (T0.size[0] * (T0.size[1] * (T0.size[2] * T0.size[3]))))) {
    __half T9[1];
    T9[0] = 0;
    T9[0]
      = T2[((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * ((T0.size[2] * T0.size[1]) * T0.size[3])) + ((((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * (T0.size[2] * T0.size[1])) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * T0.size[2]) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3])];
    __half T8[1];
    T8[0] = 0;
    T8[0]
      = T0[i165];
    float T3[1];
    T3[0]
      = __half2float(T9[0]);
    float T4[1];
    T4[0]
      = T3[0];
    float T1[1];
    T1[0]
      = __half2float(T8[0]);
    float T5[1];
    T5[0]
      = T1[0]
      * T4[0];
    float T6[1];
    T6[0]
      = relu(T5[0]);
    __half T10[1];
    T10[0]
      = __float2half(T6[0]);
    T7[i165]
      = T10[0];
  }
}
)";

  const std::string actual_kernel =
      "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel());

  if (expected_kernel.size() != actual_kernel.size() ||
      expected_kernel.compare(actual_kernel) != 0) {
    std::cerr
        << " Codegen mismatch, codegen possibly changed, or is incorrect. "
        << " \n ========= EXPECTED ========= \n"
        << expected_kernel << "\n========= ACTUAL ========== \n"
        << actual_kernel << "\n=================" << std::endl;
    auto it = std::mismatch(
        expected_kernel.begin(),
        expected_kernel.end(),
        actual_kernel.begin(),
        actual_kernel.end());
    std::string actual_mismatched_snippet(it.second, actual_kernel.end());
    actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10);
    std::string expected_mismatched_snippet(it.first, expected_kernel.end());
    expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10);
    std::cerr << "First mismatch found at: " << actual_mismatched_snippet
              << ", expected: " << expected_mismatched_snippet << std::endl;
    TORCH_CHECK(false);
  }

  // TODO: runFusion hits an assertion. I'm probably doing something wrong here.
  // FusionExecutor fe;
  // fe.compileFusion(fusion.get());
  // auto outputs = fe.runFusion({input0, input1}, lparams);
  // at::Tensor output_ref = (input0 * input1).relu();
  // TORCH_CHECK(output_ref.equal(outputs[0]));
}

TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeConcreteTensor({10, 1024});
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1});
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));

  fusion.addOutput(tv3);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->computeAt(tv3, -1);
  tv3->axis(0)->parallelize(ParallelType::Unswitch);
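  // tv2 and tv3 consume the TIDx-parallelized reduction tv1, so they are
  // effectively computed by a single thread; the unswitched loop still
  // needs to carry that thread predicate.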

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({10, 1024}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref = sum(t0, {1}) + 2;

  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  fusion.addOutput(tv1);

  tv1->setContiguity(false);
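  // With the output marked non-contiguous, runFusion can write into the
  // explicitly strided at_output tensor allocated below.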

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_input = at::randn({10}, options);
  at::Tensor at_output = at::empty_strided({10}, {2}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_input});
  auto returned_outputs = fe.runFusion({at_input}, {at_output});

  // Returned outputs should only contain one tensor that is the same
  // as the output tensor given to runFusion
  TORCH_CHECK(returned_outputs.size() == 1);
  TORCH_CHECK(returned_outputs[0].is_same(at_output));
  TORCH_CHECK(!returned_outputs[0].is_contiguous());

  auto at_ref = at_input + 1;

  testValidate(&fusion, {at_output}, {at_input}, {at_ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Setup softmax fusion
  auto input = makeContigTensor(2);
  fusion.addInput(input);
  auto output = softmax(input, 1);
  fusion.addOutput(output);

  // Setup runtime input
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({8, 16 * 197}, options);
  std::vector<c10::IValue> aten_inputs({aten_input});

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, aten_inputs, true);
  TORCH_CHECK(SchedulerEntry::canSchedule(
      ScheduleHeuristic::Persistent, &fusion, runtime_info));
  auto scheduler = SchedulerEntry::makeEntry(
      ScheduleHeuristic::Persistent, &fusion, runtime_info);
  scheduler->schedule(&fusion);

  // Modify the schedule to use warp reduction
  auto used_vals = fusion.usedMathVals();
  for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
    for (IterDomain* id : tv->domain()->domain()) {
      if (id->getParallelType() == ParallelType::TIDx) {
        id->padToMultipleOfWarp();
      }
    }
  }
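  // Padding the TIDx extent to a multiple of the warp size allows the
  // reductions to be lowered with warp primitives.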

  // Test result
  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);
  auto ref_output = at::_softmax(aten_input, 1, false);
  testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionIssue1133_CUDA) {
  if (!deviceMajorMinorCheck(7)) {
    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
    return;
  }
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = sum(tv1, {1});
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));

  fusion.addOutput(tv3);

  tv0->computeAt(tv3, 1);

  const int split_factor = 32;

  tv2->split(-1, split_factor);
  tv1->computeAt(tv2, -2);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  tv3->axis(0)->parallelize(ParallelType::Unswitch);

  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Shared);

  // Both tv1 and tv2 should be allocated at the top-level scope: tv1 with
  // split_factor elements and tv2 with a single element, since their TIDx
  // axes are parallelized away.
  GpuLower gpulw(&fusion);
  bool tv1_validated = false;
  bool tv2_validated = false;
  for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(kir_node)) {
      auto size = alloc->size();
      if (!(alloc->buffer()->name() == 1 || alloc->buffer()->name() == 2)) {
        // There should be no allocation other than those for tv1 and tv2
        TORCH_CHECK(false, "Invalid allocation detected");
      }
      TORCH_CHECK(size->isA<Int>(), "Invalid allocation size");
      TORCH_CHECK(size->as<Int>()->isConst(), "Allocation not constant");
      auto size_int = size->as<Int>()->value().value();
      if (alloc->buffer()->name() == 1) {
        TORCH_CHECK(
            size_int == split_factor,
            "Invalid allocation size: ",
            size->as<Int>()->value().value());
        tv1_validated = true;
      } else {
        TORCH_CHECK(
            size_int == 1,
            "Invalid allocation size: ",
            size->as<Int>()->value().value());
        tv2_validated = true;
      }
    }
  }

  TORCH_CHECK(tv1_validated, "Failed to validate tv1 allocation");
  TORCH_CHECK(tv2_validated, "Failed to validate tv2 allocation");

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({99, 101}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref = (t0 + 1).sum({1}) + 1;

  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1});
  fusion.addOutput(tv1);

  tv1->split(1, 32);

  auto tv2 = tv1->rFactor({1});

  // This merged domain is not contiguous.
  tv2->merge(0, 2);

  tv2->setMemoryType(MemoryType::Shared);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({99, 101}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref = t0.sum({1});

  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = set(tv0);
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true});
  auto tv4 = set(tv1);
  auto tv5 = add(tv3, tv4);
  fusion.addOutput(tv5);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

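  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.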
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());

  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true});
  auto tv4 = set(tv1);
  auto tv5 = add(tv3, tv4);
  auto tv6 = castOp(DataType::Half, tv5);
  fusion.addOutput(tv6);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());

  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

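  // The unprojected buffer is the fp32 cast of the input (4 bytes per
  // element); projecting back to the fp16 input halves that footprint.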
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {1});
  auto tv4 = broadcast(tv3, {false, true});

  auto tv5 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv5);

  auto tv6 = castOp(DataType::Float, tv5);

  auto tv7 = add(tv6, tv4);
  auto tv8 = set(tv1);
  auto tv9 = add(tv7, tv8);
  auto tv10 = sum(tv9, {1});
  auto tv11 = broadcast(tv10, {false, true});
  auto tv12 = set(tv7);
  auto tv13 = add(tv12, tv11);

  fusion.addOutput(tv13);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
  TORCH_INTERNAL_ASSERT(
      resolution.size() == 2 && resolution[0].size() == 1 &&
      resolution[1].size() == 1);
  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv7));
  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(projectable, tv1) && !isTvWithinVec(projectable, tv7));

  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv9));

  auto tv7_resolution_it = tvEntryInVecVec(resolution, buffers, tv7);
  TORCH_INTERNAL_ASSERT(tv7_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv7_resolution_it, tv13));

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);
  at::Tensor aten_t5 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0, aten_t5}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

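  // tv1 can be projected back to the fp16 input, but tv7 cannot, so the
  // projected total per row is one Half buffer plus one Float buffer.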
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(
          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(
          aten_t0.size(1) *
          (dataTypeSize(DataType::Half) + dataTypeSize(DataType::Float))));
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {1});
  auto tv4 = broadcast(tv3, {false, true});
  auto tv5 = set(tv1);
  auto tv6 = add(tv4, tv5);
  auto tv7 = set(tv2);
  auto tv8 = add(tv7, tv6);
  auto tv9 = castOp(DataType::Half, tv8);

  fusion.addOutput(tv9);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
  TORCH_INTERNAL_ASSERT(
      resolution.size() == 2 && resolution[0].size() == 1 &&
      resolution[1].size() == 1);

  TORCH_INTERNAL_ASSERT(projectable.size() == 2);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv2));
  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(projectable, tv1) && isTvWithinVec(projectable, tv2));

  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv6));

  auto tv2_resolution_it = tvEntryInVecVec(resolution, buffers, tv2);
  TORCH_INTERNAL_ASSERT(tv2_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv2_resolution_it, tv8));

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(
          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));

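  // Both buffers project back to the single fp16 input, so the projected
  // footprint is just one Half buffer per row.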
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
}

TEST_F(NVFuserTest, FusionPersistentBufferProjection_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {1});
  auto tv4 = broadcast(tv3, {false, true});
  auto tv5 = set(tv1);
  auto tv6 = add(tv4, tv5);
  auto tv7 = set(tv2);
  auto tv8 = add(tv7, tv6);
  auto tv9 = castOp(DataType::Half, tv8);

  fusion.addOutput(tv9);

  reduction_scheduler_utils::projectPersistentBuffers(&fusion);

  auto tv5_producers = ir_utils::producerTvsOf(tv5);
  auto tv7_producers = ir_utils::producerTvsOf(tv7);

  // Projection should have broken these dependencies: tv5 and tv7 are now
  // recomputed from the fusion input rather than reading tv1 and tv2.

  TORCH_INTERNAL_ASSERT(
      std::find(tv5_producers.begin(), tv5_producers.end(), tv1) ==
      tv5_producers.end());
  TORCH_INTERNAL_ASSERT(
      std::find(tv7_producers.begin(), tv7_producers.end(), tv2) ==
      tv7_producers.end());

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  FusionExecutorCache fec(std::move(fusion_ptr));
  auto cg_outputs = fec.runFusionWithInputs({aten_t0});

  auto aten_t1 = aten_t0.to(c10::kDouble);
  auto aten_t3 = aten_t1.sum({1});
  auto aten_t4 = aten_t3.unsqueeze(1);
  auto aten_t7 = aten_t4.add(aten_t1).add(aten_t1);

  testValidate(&fusion, cg_outputs, {aten_t0}, {aten_t7}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionIssue1223_CUDA) {
  if (!deviceMajorMinorCheck(7)) {
    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
    return;
  }
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = sum(tv1, {0, 1});
  fusion.addOutput(tv2);

  auto tv3 = add(tv0, IrBuilder::create<Double>(0));
  fusion.addOutput(tv3);

  tv2->split(0, 4);
  tv2->split(1, 1, false);
  tv2->split(-1, 4);

  tv2->axis(1)->parallelize(ParallelType::Unswitch);
  tv2->axis(-3)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDy);

  tv1->computeAt(tv2, -1);

  // Make TIDx and TIDy non-exact
  tv3->split(0, 32);
  tv3->split(-1, 32);
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  tv3->axis(3)->parallelize(ParallelType::TIDy);

  // The second axes of tv1 and tv2 are fully unswitched, so they don't
  // need to predicate the parallel type usage of TIDy, whereas the first
  // axis is only partially unswitched, i.e., part of its split output
  // domains is outside the unswitched axis, so the first axis, which uses
  // TIDx, needs to predicate the parallel dimension. Previously, as
  // reported in issue #1223, unswitched expressions didn't predicate
  // parallel dimensions. It should be fixed by PR #1222.

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_t0 = at::ones({11, 10}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_t0});
  auto cg_outputs = fe.runFusion({at_t0});

  auto at_t1 = (at_t0 + 1).sum();

  testValidate(
      &fusion, cg_outputs, {at_t0}, {at_t1, at_t0}, __LINE__, __FILE__);
}

// See #1247 and #1250
TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = min(tv1, {0});

  fusion.addOutput(tv2);

  // Make TIDx non-exact
  auto tv3 = makeContigTensor(1);
  fusion.addInput(tv3);

  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
  fusion.addOutput(tv4);

  tv2->split(0, 4);
  auto tv5 = tv2->rFactor({1});

  tv0->computeAt(tv2, 1);

  tv2->axis(0)->parallelize(ParallelType::TIDx);

  tv4->axis(0)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_t0 = at::randn({9}, options);
  at_t0 = at::abs(at_t0);
  at::Tensor at_t3 = at::randn({128}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_t0, at_t3});
  auto cg_outputs = fe.runFusion({at_t0, at_t3});

  auto at_t2 = (at_t0 + 1).min();
  auto at_t4 = at_t3 + 1;

  testValidate(
      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(1);
  fusion.addInput(tv0);

  auto tv1 = min(tv0, {0});
  fusion.addOutput(tv1);

  // Make TIDx non-exact
  auto tv2 = makeContigTensor(1);
  fusion.addInput(tv2);

  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
  fusion.addOutput(tv3);

  tv1->split(0, 4);
  auto tv4 = tv1->rFactor({0});

  tv1->split(0, 3);

  // tv0->computeAt(tv1, 3);
  tv4->reorder({{0, 1}});
  tv4->split(0, 3);
  tv4->setMemoryType(MemoryType::Shared);

  // tv0: [I]
  // tv4: [4/3, 3, I/4]
  // tv1: [4/3, 3]

  tv1->axis(0)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv1, {tv4});

  tv3->axis(0)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_t0 = at::randn({9}, options);
  at_t0 = at::abs(at_t0);
  at::Tensor at_t3 = at::randn({128}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_t0, at_t3});
  auto cg_outputs = fe.runFusion({at_t0, at_t3});

  auto at_t2 = std::get<0>(at_t0.min(0));
  auto at_t4 = at_t3 + 1;

  testValidate(
      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) {
  // https://github.com/csarofeen/pytorch/issues/1692
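  // Exercises an rfactor where the rfactored domain comes from a merge of
  // split domains, so its root axes are only reachable indirectly through
  // the rfactor transforms.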
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(3);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1, 2});
  fusion.addOutput(tv1);

  tv1->split(2, 4);
  tv1->split(1, 3);
  tv1->merge(2, 3);
  auto rf = tv1->rFactor({-1});

  tv1->split(0, 256);
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::TIDx);
  rf->computeAt(tv1, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);

  auto at_in = at::randn({6, 6, 6}, options);
  auto at_out = at_in.sum({1, 2});

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_in});
  auto cg_outputs = fe.runFusion({at_in});

  testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__);
}

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)