1#if defined(USE_CUDA)
2#include <gtest/gtest.h>
3
4#include <arith.h>
5#include <codegen.h>
6#include <disjoint_set.h>
7#include <executor.h>
8#include <executor_launch_params.h>
9#include <expr_evaluator.h>
10#include <fusion.h>
11#include <fusion_segmenter.h>
12#include <ir_all_nodes.h>
13#include <ir_builder.h>
14#include <ir_graphviz.h>
15#include <ir_iostream.h>
16#include <ir_utils.h>
17#include <iter_visitor.h>
18#include <kernel_cache.h>
19#include <kernel_expr_evaluator.h>
20#include <kernel_ir.h>
21#include <lower2device.h>
22#include <mutator.h>
23#include <ops/all_ops.h>
24#include <register_interface.h>
25#include <root_domain_map.h>
26#include <scheduler/all_schedulers.h>
27#include <scheduler/utils.h>
28#include <test/test_gpu_validator.h>
29#include <test/test_utils.h>
30#include <transform_replay.h>
31#include <transform_rfactor.h>
32
33// fuser and IR parser
34#include <ATen/cuda/CUDAContext.h>
35#include <ATen/cuda/Exceptions.h>
36#include <c10/cuda/CUDAStream.h>
37
38#include <algorithm>
39#include <iostream>
40
41// Tests go in torch::jit
42namespace torch {
43namespace jit {
44
45using namespace torch::jit::fuser::cuda;
46using namespace at::indexing;
47
48namespace {
49
50// Used to signify invalid ranges, i.e., values at offset 0 to
51// start_offset, and values at offset stop_offset to the end of the
52// domain.
53static constexpr int invalid_marker = 1;
54
55// ATen version of tensor shifting
56auto shift(
57 at::Tensor tensor,
58 const std::vector<int>& offsets,
59 std::vector<int> padding = {}) {
60 TORCH_INTERNAL_ASSERT(
61 tensor.ndimension() == static_cast<int64_t>(offsets.size()));
62 if (padding.empty()) {
63 padding = offsets;
64 for (auto& p : padding) {
65 p = std::abs(p);
66 }
67 }
68 at::Tensor t = tensor;
69 for (size_t i = 0; i < offsets.size(); ++i) {
70 auto offset = offsets[i];
71 t = t.roll(offsets[i], i);
72 if (offset == 0) {
73 continue;
74 }
75 // Zero padding
76 std::vector<at::indexing::TensorIndex> indices(
77 tensor.ndimension(), at::indexing::Slice(0, at::indexing::None));
78 if (offset > 0) {
79 indices[i] = at::indexing::Slice(0, offset);
80 } else {
81 indices[i] = at::indexing::Slice(offset, at::indexing::None);
82 }
83 t.index(indices) = 0;
84 // Fill the outside range by the special marker value.
85 const auto pad = padding[i];
86 if (offset > 0) {
87 indices[i] = at::indexing::Slice(0, offset - pad);
88 } else {
89 offset += pad;
90 TORCH_INTERNAL_ASSERT(offset <= 0);
91 if (offset == 0) {
92 continue;
93 }
94 indices[i] = at::indexing::Slice(offset, at::indexing::None);
95 }
96 t.index(indices) = invalid_marker;
97 }
98 return t;
99}
100
101// ATen version of tensor gather
102auto gather(
103 at::Tensor tensor,
104 const std::vector<int>& window_shape,
105 const std::vector<std::vector<int>>& pad_width,
106 std::vector<int> strides = {}) {
107 TORCH_CHECK(
108 tensor.ndimension() == static_cast<int64_t>(window_shape.size()),
109 "Invalid window shape: ",
110 window_shape,
111 ". Size of the window shape is different from the tensor dimension.");
112 TORCH_CHECK(
113 tensor.ndimension() == static_cast<int64_t>(pad_width.size()),
114 "Invalid pad width: ",
115 pad_width,
116 ". Size of the pad width is different from the tensor dimension.");
117 if (strides.empty()) {
118 strides = std::vector<int>(tensor.ndimension(), 1);
119 } else {
120 TORCH_CHECK(
121 tensor.ndimension() == static_cast<int64_t>(strides.size()),
122 "Invalid strides: ",
123 strides,
124 ". Size of strides is different from the tensor dimension.");
125 }
126 at::Tensor t = tensor;
127 for (size_t i = 0; i < window_shape.size(); ++i) {
128 const auto w_size = window_shape[i];
129 TORCH_CHECK(w_size != 0);
130 const auto& pad = pad_width[i];
131 TORCH_CHECK(pad.size() == 2);
132 const auto out_extent_adj = -w_size + 1 + pad[0] + pad[1];
133 TORCH_INTERNAL_ASSERT(out_extent_adj <= 0);
134 const auto stride = strides[i];
135 TORCH_CHECK(stride >= 1);
136
137 at::Tensor concat_tensor;
138
139 for (int w = 0; w < w_size; ++w) {
140 std::vector<int> shift_offsets(t.ndimension(), 0);
141 shift_offsets[i] = pad[0] - w;
142 auto shifted = shift(t, shift_offsets);
143 // Apply stride
144 if (stride != 1) {
145 std::vector<at::indexing::TensorIndex> indices(
146 shifted.ndimension(), at::indexing::Slice(0, at::indexing::None));
147 if (out_extent_adj == 0) {
148 indices[i] = at::indexing::Slice(0, at::indexing::None, strides[i]);
149 } else {
150 indices[i] = at::indexing::Slice(0, out_extent_adj, strides[i]);
151 }
152 shifted = shifted.index(indices);
153 }
154 shifted = shifted.unsqueeze(-1);
155 if (w == 0) {
156 concat_tensor = shifted;
157 } else {
158 concat_tensor = at::cat({concat_tensor, shifted}, -1);
159 }
160 }
161 t = concat_tensor;
162 }
163
164 // Fill invalid regions with the marker. Note that when non-unit
165 // stride is used, it trims invalid regions, so no marking is
166 // necessary.
167 for (size_t i = 0; i < window_shape.size(); ++i) {
168 if (strides[i] != 1) {
169 continue;
170 }
171
172 const auto out_extent_adj =
173 -window_shape[i] + 1 + pad_width[i][0] + pad_width[i][1];
174 if (out_extent_adj < 0) {
175 std::vector<at::indexing::TensorIndex> indices(
176 t.ndimension(), at::indexing::Slice(0, at::indexing::None));
177 indices[i] = at::indexing::Slice(out_extent_adj, at::indexing::None);
178 t.index(indices) = invalid_marker;
179 }
180 }
181
182 return t;
183}
184
185} // namespace
186
187// Shift an input tensor
188TEST_F(NVFuserTest, FusionShift1_CUDA) {
189 Fusion fusion;
190 FusionGuard fg(&fusion);
191
192 auto tv0 = makeSymbolicTensor(2);
193 fusion.addInput(tv0);
194
195 auto tv1 = shift(tv0, {-1, 0});
196 fusion.addOutput(tv1);
197
198 auto tv2 = shift(tv0, {0, 1});
199 fusion.addOutput(tv2);
200
201 auto tv3 = shift(tv0, {2, 2});
202 fusion.addOutput(tv3);
203
204 auto tv4 = shift(tv0, {-2, -2});
205 fusion.addOutput(tv4);
206
207 int numel_x = 9;
208 int numel_y = 11;
209
210 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
211 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
212 std::vector<IValue> inputs = {t0};
213
214 FusionExecutor fe;
215 fe.compileFusion(&fusion, inputs);
216 auto outputs = fe.runFusion(inputs);
217
218 auto t1 = shift(t0, {-1, 0});
219 TORCH_CHECK(t1.equal(outputs[0]));
220
221 auto t2 = shift(t0, {0, 1});
222 TORCH_CHECK(t2.equal(outputs[1]));
223
224 auto t3 = shift(t0, {2, 2});
225 TORCH_CHECK(t3.equal(outputs[2]));
226
227 auto t4 = shift(t0, {-2, -2});
228 TORCH_CHECK(t4.equal(outputs[3]));
229}
230
231// Shifts an intermediate tensor
232TEST_F(NVFuserTest, FusionShift2_CUDA) {
233 Fusion fusion;
234 FusionGuard fg(&fusion);
235
236 auto tv0 = makeSymbolicTensor(2);
237 fusion.addInput(tv0);
238 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
239 auto tv2 = shift(tv1, {-1, 0});
240 fusion.addOutput(tv2);
241
242 // make it a little more complex
243 auto tv3 = add(tv0, IrBuilder::create<Double>(3));
244 auto tv4 = add(tv3, IrBuilder::create<Double>(4));
245 auto tv5 = shift(tv4, {-1, 0});
246 auto tv6 = shift(tv4, {0, -1});
247 auto tv7 = shift(tv4, {1, 0});
248 auto tv8 = shift(tv4, {0, 0});
249 auto tv9 = add(tv5, tv6);
250 auto tv10 = add(tv9, tv7);
251 auto tv11 = add(tv10, tv8);
252 fusion.addOutput(tv11);
253
254 for (auto tv : {tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11}) {
255 tv->setMemoryType(MemoryType::Global);
256 }
257
258 // t1 allocation: (t1.size[0] + 1) * (t1.size[1])
259 // t3 allocation: (t3.size[0] + 2) * (t3.size[1] + 1)
260 // t4 allocation: (t3.size[0] + 2) * (t3.size[1] + 1)
261 GpuLower gpulw(&fusion);
262
263 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
264 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
265 auto tensor_name = alloc->buffer()->name();
266 if (tensor_name == 1 || tensor_name == 3 || tensor_name == 4) {
267 TORCH_CHECK(alloc->shape().size() == 2);
268 for (int i = 0; i < 2; ++i) {
269 if (tensor_name == 1 && i == 1) {
270 TORCH_CHECK(alloc->shape().at(i)->isA<NamedScalar>());
271 continue;
272 }
273 auto def =
274 dynamic_cast<BinaryOp*>(alloc->shape().at(i)->definition());
275 TORCH_CHECK(
276 def != nullptr && def->getBinaryOpType() == BinaryOpType::Add);
277 TORCH_CHECK(def->as<BinaryOp>()->lhs()->isA<NamedScalar>());
278 auto rhs = dynamic_cast<Int*>(def->as<BinaryOp>()->rhs());
279 TORCH_CHECK(rhs != nullptr && rhs->isConst());
280 int rhs_value = *rhs->value();
281 if (tensor_name == 1) {
282 TORCH_CHECK(i == 0);
283 TORCH_CHECK(rhs_value == 1);
284 } else {
285 if (i == 0) {
286 TORCH_CHECK(rhs_value == 2);
287 } else {
288 TORCH_CHECK(rhs_value == 1);
289 }
290 }
291 }
292 }
293 }
294 }
295
296 int numel_x = 9;
297 int numel_y = 11;
298
299 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
300 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
301 std::vector<IValue> inputs = {t0};
302
303 FusionExecutor fe;
304 fe.compileFusion(&fusion, inputs);
305 auto outputs = fe.runFusion(inputs);
306
307 auto t1 = t0 + 1;
308 auto t2 = shift(t1, {-1, 0});
309
310 auto t3 = t0 + 3;
311 auto t4 = t3 + 4;
312 auto t5 = shift(t4, {-1, 0});
313 auto t6 = shift(t4, {0, -1});
314 auto t7 = shift(t4, {1, 0});
315 auto t8 = shift(t4, {0, 0});
316 auto t9 = t5 + t6;
317 auto t10 = t9 + t7;
318 auto t11 = t10 + t8;
319
320 testValidate(&fusion, outputs, inputs, {t2, t11}, __LINE__, __FILE__);
321}
322
323TEST_F(NVFuserTest, FusionShiftRightOfCA_CUDA) {
324 Fusion fusion;
325 FusionGuard fg(&fusion);
326
327 auto tv0 = makeSymbolicTensor(2);
328 fusion.addInput(tv0);
329
330 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
331 auto tv2 = shift(tv1, {0, 1});
332 fusion.addOutput(tv2);
333
334 tv0->computeAt(tv2, -2);
335
336 tv1->setMemoryType(MemoryType::Global);
337
338 int numel_x = 100;
339 int numel_y = 101;
340
341 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
342 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
343 std::vector<IValue> inputs = {t0};
344
345 FusionExecutor fe;
346 fe.compileFusion(&fusion, inputs);
347 auto outputs = fe.runFusion(inputs);
348
349 auto t1 = t0 + 1;
350 auto t2 = shift(t1, {0, 1});
351
352 TORCH_CHECK(t2.allclose(outputs[0]));
353}
354
355TEST_F(NVFuserTest, FusionShiftLeftOfCA_CUDA) {
356 Fusion fusion;
357 FusionGuard fg(&fusion);
358
359 auto tv0 = makeSymbolicTensor(2);
360 fusion.addInput(tv0);
361 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
362 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
363 auto tv3 = shift(tv2, {-1, 0});
364 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
365 fusion.addOutput(tv4);
366
367 tv0->computeAt(tv4, -1);
368
369 // Lowering should trigger an assertion failure as a shifted axis is
370 // found inside an allocation position.
371 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
372 ASSERT_ANY_THROW(fusion.printKernel());
373}
374
375TEST_F(NVFuserTest, FusionShiftSplit1_CUDA) {
376 Fusion fusion;
377 FusionGuard fg(&fusion);
378
379 auto tv0 = makeSymbolicTensor(2);
380 fusion.addInput(tv0);
381 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
382 auto tv2 = shift(tv1, {0, 1});
383 auto tv3 = shift(tv1, {0, -2});
384 fusion.addOutput(tv2);
385 fusion.addOutput(tv3);
386
387 int split_factor = 4;
388 tv2->split(-1, split_factor);
389 tv3->split(-1, split_factor);
390
391 tv0->computeAt(tv2, -2);
392 tv0->computeAt(tv3, -2);
393
394 // t1 allocation: 7
395 GpuLower gpulw(&fusion);
396 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
397 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
398 auto tensor_name = alloc->buffer()->name();
399 if (tensor_name == 1) {
400 TORCH_CHECK(alloc->shape().size() == 1);
401 auto size = dynamic_cast<Int*>(alloc->shape().at(0));
402 TORCH_CHECK(
403 size != nullptr && size->isConst() && size->value().value() == 7);
404 }
405 }
406 }
407
408 int numel_x = 9;
409 int numel_y = 11;
410
411 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
412 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
413 std::vector<IValue> inputs = {t0};
414
415 FusionExecutor fe;
416 fe.compileFusion(&fusion, inputs);
417 auto outputs = fe.runFusion(inputs);
418
419 auto t1 = t0 + 1;
420 auto t2 = shift(t1, {0, 1});
421 auto t3 = shift(t1, {0, -2});
422
423 testValidate(&fusion, outputs, inputs, {t2, t3}, __LINE__, __FILE__);
424}
425
426TEST_F(NVFuserTest, FusionShiftSplit2_CUDA) {
427 Fusion fusion;
428 FusionGuard fg(&fusion);
429
430 auto tv0 = makeSymbolicTensor(2);
431 fusion.addInput(tv0);
432
433 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
434 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
435 auto tv3 = shift(tv2, {0, -1});
436 auto tv4 = shift(tv2, {0, 1});
437 auto tv5 = add(tv3, tv4);
438 fusion.addOutput(tv5);
439
440 auto tv6 = add(tv0, IrBuilder::create<Double>(1));
441 auto tv7 = shift(tv6, {0, 0});
442 auto tv8 = add(tv7, IrBuilder::create<Double>(1));
443 fusion.addOutput(tv8);
444
445 int split_factor = 4;
446
447 tv5->split(-1, split_factor);
448 tv8->split(-1, split_factor);
449
450 tv0->computeAt(tv5, -2);
451 tv0->computeAt(tv8, -2);
452
453 // t1 and t2 allocation: 6
454 // t4 allocation: 4
455 GpuLower gpulw(&fusion);
456 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
457 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
458 auto tensor_name = alloc->buffer()->name();
459 if (tensor_name == 1 || tensor_name == 2) {
460 TORCH_CHECK(alloc->shape().size() == 1);
461 auto size = dynamic_cast<Int*>(alloc->shape().at(0));
462 TORCH_CHECK(
463 size != nullptr && size->isConst() && size->value().value() == 6);
464 } else if (tensor_name == 4) {
465 TORCH_CHECK(alloc->shape().size() == 1);
466 auto size = dynamic_cast<Int*>(alloc->shape().at(0));
467 TORCH_CHECK(size != nullptr && size->isConst());
468 int size_value = *size->value();
469 TORCH_CHECK(size_value == split_factor);
470 }
471 }
472 }
473
474 int numel_x = 9;
475 int numel_y = 11;
476
477 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
478 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
479 std::vector<IValue> inputs = {t0};
480
481 FusionExecutor fe;
482 fe.compileFusion(&fusion, inputs);
483 auto outputs = fe.runFusion(inputs);
484
485 auto t1 = t0 + 2;
486 auto t3 = shift(t1, {0, -1});
487 auto t4 = shift(t1, {0, 1});
488 auto t5 = t3 + t4;
489
490 auto t6 = t0 + 1;
491 auto t7 = t6;
492 auto t8 = t7 + 1;
493
494 testValidate(&fusion, outputs, inputs, {t5, t8}, __LINE__, __FILE__);
495}
496
497TEST_F(NVFuserTest, FusionShiftDoubleSplit_CUDA) {
498 Fusion fusion;
499 FusionGuard fg(&fusion);
500
501 auto tv0 = makeSymbolicTensor(2);
502 fusion.addInput(tv0);
503 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
504 auto tv2 = add(tv1, IrBuilder::create<Double>(2));
505 auto tv3 = shift(tv2, {0, 1});
506 fusion.addOutput(tv3);
507
508 int split_factor1 = 8;
509 int split_factor2 = 4;
510
511 tv3->split(-1, split_factor1);
512
513 tv0->computeAt(tv3, -2);
514
515 tv1->split(-1, split_factor2);
516
517 // t1: [i1, i2/8, 8/4, 4]
518 // t2: [i1, i2/8, 8]
519 // t3: [i1, i2/8, 8]
520
521 // t1 and t2 allocation: (split_factor1 + 1) = 9
522 GpuLower gpulw(&fusion);
523 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
524 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
525 auto tensor_name = alloc->buffer()->name();
526 if (tensor_name == 1 || tensor_name == 2) {
527 TORCH_CHECK(alloc->shape().size() == 1);
528 auto size = dynamic_cast<Int*>(alloc->shape().at(0));
529 TORCH_CHECK(
530 size != nullptr && size->isConst() && size->value().value() == 9);
531 }
532 }
533 }
534
535 int numel_x = 99;
536 int numel_y = 101;
537
538 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
539 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
540 std::vector<IValue> inputs = {t0};
541
542 FusionExecutor fe;
543 fe.compileFusion(&fusion, inputs);
544 auto outputs = fe.runFusion(inputs);
545
546 auto t1 = t0 + 3;
547 auto ref = shift(t1, {0, 1});
548
549 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
550}
551
552TEST_F(NVFuserTest, FusionShift3ptStencil_CUDA) {
553 Fusion fusion;
554 FusionGuard fg(&fusion);
555
556 // 3-pt stencil
557 auto tv0 = makeSymbolicTensor(1);
558 fusion.addInput(tv0);
559
560 std::vector<std::vector<int>> offsets = {{-1}, {1}};
561
562 std::vector<TensorView*> tvs;
563 for (const auto& offset : offsets) {
564 tvs.push_back(shift(tv0, offset));
565 }
566
567 auto tv_out = tv0;
568
569 for (auto tv : tvs) {
570 tv_out = add(tv_out, tv);
571 }
572
573 tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
574
575 fusion.addOutput(tv_out);
576
577 int split_factor = 4;
578
579 tv_out->split(0, split_factor);
580
581 // This seems fine but not verified yet
582 // tv_out->axis(-1)->parallelize(ParallelType::Unswitch);
583
584 auto cache = tv0->cacheAfter();
585
586 tv0->computeAt(tv_out, 1);
587
588 // Inline completely except for the cache
589 for (auto tv : tvs) {
590 tv->computeAt(tv_out, -1);
591 }
592
593 // cache allocation: (split_factor + 2)
594 GpuLower gpulw(&fusion);
595 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
596 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
597 auto tensor_name = alloc->buffer()->name();
598 if (tensor_name == cache->name()) {
599 TORCH_CHECK(alloc->shape().size() == 1);
600 auto size = dynamic_cast<Int*>(alloc->shape().at(0));
601 TORCH_CHECK(
602 size != nullptr && size->isConst() &&
603 size->value().value() == split_factor + 2);
604 }
605 }
606 }
607
608 cache->doubleBuffer();
609
610 int numel_x = 99;
611
612 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
613 at::Tensor t0 = at::randn({numel_x}, options);
614 std::vector<IValue> inputs = {t0};
615
616 FusionExecutor fe;
617 fe.compileFusion(&fusion, inputs);
618 auto outputs = fe.runFusion(inputs);
619
620 auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3;
621
622 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
623}
624
625TEST_F(NVFuserTest, FusionShift5ptStencil_CUDA) {
626 Fusion fusion;
627 FusionGuard fg(&fusion);
628
629 // 5-pt stencil
630 auto tv0 = makeSymbolicTensor(2);
631 fusion.addInput(tv0);
632 std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
633
634 std::vector<TensorView*> tvs;
635 for (const auto& offset : offsets) {
636 tvs.push_back(shift(tv0, offset));
637 }
638
639 auto tv_out = tv0;
640
641 for (auto tv : tvs) {
642 tv_out = add(tv_out, tv);
643 }
644
645 tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
646
647 fusion.addOutput(tv_out);
648
649 std::vector<int> split_factor({4, 8});
650
651 tv_out->split(-1, split_factor[1]);
652 tv_out->split(0, split_factor[0]);
653 tv_out->reorder({{1, 2}, {2, 1}});
654
655 auto cache = tv0->cacheAfter();
656
657 tv0->computeAt(tv_out, 2);
658
659 // Inline completely except for the cache
660 for (auto tv : tvs) {
661 tv->computeAt(tv_out, -1);
662 }
663
664 // cache allocation: (split_factor + 2) * (split_factor + 2)
665 GpuLower gpulw(&fusion);
666 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
667 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
668 auto tensor_name = alloc->buffer()->name();
669 if (tensor_name == cache->name()) {
670 TORCH_CHECK(alloc->shape().size() == 2);
671 for (int i = 0; i < 2; ++i) {
672 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
673 TORCH_CHECK(
674 size != nullptr && size->isConst() &&
675 size->value().value() == split_factor[i] + 2);
676 }
677 }
678 }
679 }
680
681 cache->doubleBuffer();
682
683 int numel_x = 99;
684 int numel_y = 101;
685
686 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
687 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
688 std::vector<IValue> inputs = {t0};
689
690 FusionExecutor fe;
691 fe.compileFusion(&fusion, inputs);
692 auto outputs = fe.runFusion(inputs);
693
694 auto ref = t0;
695 for (const auto& offset : offsets) {
696 ref = ref + shift(t0, offset);
697 }
698 ref = ref / int(offsets.size() + 1);
699
700 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
701}
702
703TEST_F(NVFuserTest, FusionShift9ptStencil_CUDA) {
704 Fusion fusion;
705 FusionGuard fg(&fusion);
706
707 // 9-pt stencil
708 std::vector<std::vector<int>> offsets;
709 for (int i = -1; i < 2; ++i) {
710 for (int j = -1; j < 2; ++j) {
711 if (i == 0 && j == 0) {
712 continue;
713 }
714 offsets.push_back({i, j});
715 }
716 }
717
718 auto tv0 = makeSymbolicTensor(2);
719 fusion.addInput(tv0);
720 std::vector<TensorView*> tvs;
721 for (const auto& offset : offsets) {
722 tvs.push_back(shift(tv0, offset));
723 }
724
725 auto tv_out = tv0;
726
727 for (auto tv : tvs) {
728 tv_out = add(tv_out, tv);
729 }
730
731 tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
732
733 fusion.addOutput(tv_out);
734
735 std::vector<int> split_factor({4, 8});
736 tv_out->split(-1, split_factor[1]);
737 tv_out->split(0, split_factor[0]);
738 tv_out->reorder({{1, 2}, {2, 1}});
739
740 auto cache = tv0->cacheAfter();
741
742 tv0->computeAt(tv_out, 2);
743
744 // Inline completely except for the cache
745 for (auto tv : tvs) {
746 tv->computeAt(tv_out, -1);
747 }
748
749 // This seems fine but not yet verified
750 // tv_out->axis(-1)->parallelize(ParallelType::Unswitch);
751
752 // cache allocation: (split_factor + 2) * (split_factor + 2)
753 GpuLower gpulw(&fusion);
754 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
755 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
756 auto tensor_name = alloc->buffer()->name();
757 if (tensor_name == cache->name()) {
758 TORCH_CHECK(alloc->shape().size() == 2);
759 for (int i = 0; i < 2; ++i) {
760 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
761 TORCH_CHECK(
762 size != nullptr && size->isConst() &&
763 size->value().value() == split_factor[i] + 2);
764 }
765 }
766 }
767 }
768
769 cache->doubleBuffer();
770
771 int numel_x = 99;
772 int numel_y = 101;
773
774 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
775 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
776 std::vector<IValue> inputs = {t0};
777
778 FusionExecutor fe;
779 fe.compileFusion(&fusion, inputs);
780 auto outputs = fe.runFusion(inputs);
781
782 auto ref = t0;
783 for (const auto& offset : offsets) {
784 ref = ref + shift(t0, offset);
785 }
786 ref = ref / int(offsets.size() + 1);
787
788 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
789}
790
791TEST_F(NVFuserTest, FusionShiftSmemBlocking_CUDA) {
792 Fusion fusion;
793 FusionGuard fg(&fusion);
794
795 auto tv0 = makeSymbolicTensor(2);
796 fusion.addInput(tv0);
797 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
798 auto tv2 = shift(tv1, {0, 1});
799 fusion.addOutput(tv2);
800
801 int smem_block_factor = 32;
802
803 tv2->split(-1, smem_block_factor);
804
805 tv0->computeAt(tv2, -2);
806
807 tv1->axis(-1)->parallelize(ParallelType::TIDx);
808 tv2->axis(-1)->parallelize(ParallelType::TIDx);
809
810 tv1->setMemoryType(MemoryType::Shared);
811
812 // tv1 allocation: (split_factor + 1)
813 GpuLower gpulw(&fusion);
814 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
815 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
816 auto tensor_name = alloc->buffer()->name();
817 if (tensor_name == tv1->name()) {
818 TORCH_CHECK(alloc->shape().size() == 1);
819 for (int i = 0; i < 1; ++i) {
820 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
821 TORCH_CHECK(
822 size != nullptr && size->isConst() &&
823 size->value().value() == smem_block_factor + 1);
824 }
825 }
826 }
827 }
828
829 int numel_x = 100;
830 int numel_y = 101;
831
832 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
833 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
834 std::vector<IValue> inputs = {t0};
835
836 FusionExecutor fe;
837 fe.compileFusion(&fusion, inputs);
838 auto outputs = fe.runFusion(inputs);
839
840 auto t1 = t0 + 1;
841 auto t2 = shift(t1, {0, 1});
842 auto ref = t2;
843
844 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
845}
846
847TEST_F(NVFuserTest, FusionShift3ptStencilParallel_CUDA) {
848 Fusion fusion;
849 FusionGuard fg(&fusion);
850
851 // 3-pt stencil
852 auto tv0 = makeSymbolicTensor(1);
853 fusion.addInput(tv0);
854 std::vector<TensorView*> tvs;
855 tvs.push_back(shift(tv0, {-1}));
856 tvs.push_back(shift(tv0, {1}));
857
858 auto tv_out = tv0;
859
860 for (auto tv : tvs) {
861 tv_out = add(tv_out, tv);
862 }
863
864 tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
865
866 fusion.addOutput(tv_out);
867
868 int smem_block_factor = 32;
869
870 tv_out->split(0, smem_block_factor);
871 // tv_out->axis(-1)->parallelize(ParallelType::Unswitch);
872
873 auto tv0_cache = tv0->cacheAfter();
874
875 tv0->computeAt(tv_out, 1);
876
877 for (auto tv : tvs) {
878 tv->computeAt(tv_out, -1);
879 }
880
881 tv0_cache->setMemoryType(MemoryType::Shared);
882 tv_out->axis(-1)->parallelize(ParallelType::TIDx);
883 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
884
885 tv0_cache->doubleBuffer();
886
887 int numel_x = 99;
888
889 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
890 at::Tensor t0 = at::randn({numel_x}, options);
891 std::vector<IValue> inputs = {t0};
892
893 FusionExecutor fe;
894 fe.compileFusion(&fusion, inputs);
895 auto outputs = fe.runFusion(inputs);
896
897 auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3;
898
899 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
900}
901
902TEST_F(NVFuserTest, FusionShift5ptStencilParallel_CUDA) {
903 Fusion fusion;
904 FusionGuard fg(&fusion);
905
906 // 5-pt stencil
907 auto tv0 = makeSymbolicTensor(2);
908 fusion.addInput(tv0);
909 std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
910
911 std::vector<TensorView*> tvs;
912 for (const auto& offset : offsets) {
913 tvs.push_back(shift(tv0, offset));
914 }
915
916 auto tv_out = tv0;
917
918 for (auto tv : tvs) {
919 tv_out = add(tv_out, tv);
920 }
921
922 tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
923
924 fusion.addOutput(tv_out);
925
926 int smem_block_factor = 32;
927
928 tv_out->split(-1, smem_block_factor);
929 tv_out->split(0, smem_block_factor);
930
931 tv_out->reorder({{1, 2}, {2, 1}});
932
933 auto tv0_cache = tv0->cacheAfter();
934
935 tv0->computeAt(tv_out, 2);
936
937 for (auto tv : tvs) {
938 tv->computeAt(tv_out, -1);
939 }
940
941 tv_out->axis(-1)->parallelize(ParallelType::TIDx);
942 tv_out->axis(-2)->parallelize(ParallelType::TIDy);
943 tv_out->axis(-3)->parallelize(ParallelType::BIDx);
944 tv_out->axis(-4)->parallelize(ParallelType::BIDy);
945
946 tv0_cache->setMemoryType(MemoryType::Shared);
947 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
948 tv0_cache->axis(-2)->parallelize(ParallelType::TIDy);
949
950 int numel_x = 99;
951 int numel_y = 101;
952
953 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
954 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
955 std::vector<IValue> inputs = {t0};
956
957 FusionExecutor fe;
958 fe.compileFusion(&fusion, inputs);
959 auto outputs = fe.runFusion(inputs);
960
961 auto ref = t0;
962 for (const auto& offset : offsets) {
963 ref = ref + shift(t0, offset);
964 }
965 ref = ref / int(offsets.size() + 1);
966
967 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
968}
969
970TEST_F(NVFuserTest, FusionShiftMerge1_CUDA) {
971 Fusion fusion;
972 FusionGuard fg(&fusion);
973
974 auto tv0 = makeSymbolicTensor(2);
975 fusion.addInput(tv0);
976 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
977 auto tv2 = shift(tv1, {-1, 1});
978 fusion.addOutput(tv2);
979
980 int split_factor = 4;
981
982 tv2->split(-1, split_factor);
983 tv2->split(0, split_factor);
984 tv2->reorder({{1, 2}, {2, 1}});
985 tv2->merge(2, 3);
986
987 tv0->computeAt(tv2, 2);
988
989 // t1 allocation: (split_factor + 1) * (split_factor + 1)
990 GpuLower gpulw(&fusion);
991 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
992 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
993 auto tensor_name = alloc->buffer()->name();
994 if (tensor_name == 1) {
995 TORCH_CHECK(alloc->shape().size() == 2);
996 for (int i = 0; i < 2; ++i) {
997 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
998 TORCH_CHECK(
999 size != nullptr && size->isConst() &&
1000 size->value().value() == split_factor + 1);
1001 }
1002 }
1003 }
1004 }
1005
1006 int numel_x = 99;
1007 int numel_y = 101;
1008
1009 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1010 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1011 std::vector<IValue> inputs = {t0};
1012
1013 FusionExecutor fe;
1014 fe.compileFusion(&fusion, inputs);
1015 auto outputs = fe.runFusion(inputs);
1016
1017 auto t1 = t0 + 1;
1018 auto t2 = shift(t1, {-1, 1});
1019 auto ref = t2;
1020
1021 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1022}
1023
1024TEST_F(NVFuserTest, FusionShiftMerge2_CUDA) {
1025 Fusion fusion;
1026 FusionGuard fg(&fusion);
1027
1028 auto tv0 = makeSymbolicTensor(2);
1029 fusion.addInput(tv0);
1030 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1031 auto tv2 = shift(tv1, {1, -1});
1032 auto tv3 = shift(tv1, {-1, 1});
1033 auto tv4 = add(tv2, tv3);
1034 fusion.addOutput(tv4);
1035
1036 int split_factor = 4;
1037
1038 tv4->split(-1, split_factor);
1039 tv4->split(0, split_factor);
1040 tv4->reorder({{1, 2}, {2, 1}});
1041 tv4->merge(2, 3);
1042
1043 tv0->computeAt(tv4, -2);
1044
1045 // t1 allocation: (split_factor + 2) * (split_factor + 2)
1046 GpuLower gpulw(&fusion);
1047 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1048 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1049 auto tensor_name = alloc->buffer()->name();
1050 if (tensor_name == 1) {
1051 TORCH_CHECK(alloc->shape().size() == 2);
1052 for (int i = 0; i < 2; ++i) {
1053 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
1054 TORCH_CHECK(
1055 size != nullptr && size->isConst() &&
1056 size->value().value() == split_factor + 2);
1057 }
1058 }
1059 }
1060 }
1061
1062 int numel_x = 99;
1063 int numel_y = 101;
1064
1065 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1066 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1067 std::vector<IValue> inputs = {t0};
1068
1069 FusionExecutor fe;
1070 fe.compileFusion(&fusion, inputs);
1071 auto outputs = fe.runFusion(inputs);
1072
1073 auto t1 = t0 + 1;
1074 auto t2 = shift(t1, {1, -1});
1075 auto t3 = shift(t1, {-1, 1});
1076 auto t4 = t2 + t3;
1077
1078 TORCH_CHECK(t4.allclose(outputs[0]));
1079}
1080
1081TEST_F(NVFuserTest, FusionShiftGlobal_CUDA) {
1082 Fusion fusion;
1083 FusionGuard fg(&fusion);
1084
1085 auto tv0 = makeSymbolicTensor(2);
1086 fusion.addInput(tv0);
1087
1088 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1089 auto tv2 = shift(tv1, {0, 1});
1090 auto tv3 = shift(tv1, {-1, 0});
1091 auto tv4 = add(tv2, tv3);
1092 fusion.addOutput(tv4);
1093
1094 tv1->split(-1, 4);
1095 tv2->split(-1, 8);
1096 tv3->split(-1, 2);
1097 tv4->split(-1, 3);
1098
1099 tv1->merge(-2, -1);
1100
1101 tv1->setMemoryType(MemoryType::Global);
1102 tv2->setMemoryType(MemoryType::Global);
1103 tv3->setMemoryType(MemoryType::Global);
1104
1105 // t1 allocation: (t1.size[0] + 1) * (t1.size[1] + 1)
1106 GpuLower gpulw(&fusion);
1107 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1108 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1109 auto tensor_name = alloc->buffer()->name();
1110 if (tensor_name == 1) {
1111 TORCH_CHECK(alloc->shape().size() == 2);
1112 for (int i = 0; i < 2; ++i) {
1113 auto def =
1114 dynamic_cast<BinaryOp*>(alloc->shape().at(i)->definition());
1115 TORCH_CHECK(
1116 def != nullptr && def->getBinaryOpType() == BinaryOpType::Add);
1117 TORCH_CHECK(def->as<BinaryOp>()->lhs()->isA<NamedScalar>());
1118 auto rhs = dynamic_cast<Int*>(def->as<BinaryOp>()->rhs());
1119 TORCH_CHECK(rhs != nullptr && rhs->isConst());
1120 int rhs_value = *rhs->value();
1121 TORCH_CHECK(rhs_value == 1);
1122 }
1123 }
1124 }
1125 }
1126
1127 int numel_x = 99;
1128 int numel_y = 101;
1129
1130 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1131 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1132 std::vector<IValue> inputs = {t0};
1133
1134 FusionExecutor fe;
1135 fe.compileFusion(&fusion, inputs);
1136 auto outputs = fe.runFusion(inputs);
1137
1138 auto t1 = t0 + 1;
1139 auto t2 = shift(t1, {0, 1});
1140 auto t3 = shift(t1, {-1, 0});
1141 auto t4 = t2 + t3;
1142 auto ref = t4;
1143
1144 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1145}
1146
1147TEST_F(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) {
1148 Fusion fusion;
1149 FusionGuard fg(&fusion);
1150
1151 auto tv0 = makeSymbolicTensor(2);
1152 fusion.addInput(tv0);
1153 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1154 auto tv2 = add(tv1, IrBuilder::create<Double>(2));
1155 auto tv3 = shift(tv2, {0, 1});
1156 fusion.addOutput(tv3);
1157
1158 int split_factor1 = 8;
1159 int split_factor2 = 4;
1160
1161 tv3->split(-1, split_factor1);
1162
1163 tv0->computeAt(tv3, -2);
1164
1165 tv1->split(-1, split_factor2);
1166 tv1->merge(-2, -1);
1167
1168 // t1 and t2 allocation: (split_factor1 + 1)
1169 GpuLower gpulw(&fusion);
1170 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1171 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1172 auto tensor_name = alloc->buffer()->name();
1173 if (tensor_name == 1 || tensor_name == 2) {
1174 auto size = dynamic_cast<Int*>(alloc->shape().at(0));
1175 TORCH_CHECK(
1176 size != nullptr && size->isConst() &&
1177 size->value().value() == split_factor1 + 1);
1178 }
1179 }
1180 }
1181
1182 int numel_x = 99;
1183 int numel_y = 101;
1184
1185 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1186 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1187 std::vector<IValue> inputs = {t0};
1188
1189 FusionExecutor fe;
1190 fe.compileFusion(&fusion, inputs);
1191 auto outputs = fe.runFusion(inputs);
1192
1193 auto t1 = t0 + 3;
1194 auto ref = shift(t1, {0, 1});
1195
1196 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1197}
1198
1199TEST_F(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) {
1200 Fusion fusion;
1201 FusionGuard fg(&fusion);
1202
1203 auto tv0 = makeSymbolicTensor(2);
1204 fusion.addInput(tv0);
1205
1206 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1207 auto tv2 = add(tv1, IrBuilder::create<Double>(2));
1208 auto tv3 = shift(tv2, {1, 1});
1209 fusion.addOutput(tv3);
1210
1211 auto out = tv3;
1212
1213 int split_factor1 = 32;
1214 int split_factor2 = 4;
1215
1216 out->split(-1, split_factor1);
1217 out->split(-1, split_factor2);
1218 out->split(0, split_factor1);
1219 out->split(1, split_factor2);
1220 out->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}});
1221 out->merge(2, 3);
1222 out->merge(2, 3);
1223 out->merge(2, 3);
1224 out->merge(0, 1);
1225
1226 TransformPropagator propagator(out);
1227 MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
1228
1229 tv0->computeAt(out, 1);
1230
1231 out->axis(0)->parallelize(ParallelType::BIDx);
1232 out->axis(1)->parallelize(ParallelType::TIDx);
1233
1234 scheduler_utils::parallelizeAllLike(out, {tv1, tv2});
1235
1236 for (auto tv : {tv1, tv2}) {
1237 tv->setMemoryType(MemoryType::Shared);
1238 }
1239
1240 // t1 and t2 allocation: (split_factor1 + 1) * (split_factor1 + 1)
1241 GpuLower gpulw(&fusion);
1242 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1243 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1244 auto tensor_name = alloc->buffer()->name();
1245 if (tensor_name == 1 || tensor_name == 2) {
1246 TORCH_CHECK(alloc->shape().size() == 2);
1247 for (int i = 0; i < 2; ++i) {
1248 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
1249 TORCH_CHECK(
1250 size != nullptr && size->isConst() &&
1251 size->value().value() == split_factor1 + 1);
1252 }
1253 }
1254 }
1255 }
1256
1257 int numel_x = 99;
1258 int numel_y = 101;
1259
1260 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1261 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1262 std::vector<IValue> inputs = {t0};
1263
1264 FusionExecutor fe;
1265 fe.compileFusion(&fusion, inputs);
1266 auto outputs = fe.runFusion(inputs);
1267
1268 auto ref = shift(t0 + 1 + 2, {1, 1});
1269
1270 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1271}
1272
1273TEST_F(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) {
1274 Fusion fusion;
1275 FusionGuard fg(&fusion);
1276
1277 // 5-pt stencil
1278 auto tv0 = makeSymbolicTensor(2);
1279 fusion.addInput(tv0);
1280 std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
1281
1282 std::vector<TensorView*> tvs;
1283 for (const auto& offset : offsets) {
1284 tvs.push_back(shift(tv0, offset));
1285 }
1286
1287 auto tv_out = tv0;
1288
1289 for (auto tv : tvs) {
1290 tv_out = add(tv_out, tv);
1291 }
1292
1293 tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
1294
1295 fusion.addOutput(tv_out);
1296
1297 std::vector<int> split_factor({4, 32});
1298
1299 tv_out->split(-1, split_factor[1]);
1300 tv_out->split(0, split_factor[0]);
1301 tv_out->reorder({{1, 2}, {2, 1}});
1302
1303 auto tv0_cache = tv0->cacheAfter();
1304
1305 // Merge the inner-most two axes and create
1306 // a 1D thread block of split_factor1*split_factor2 threads
1307 tv_out->merge(-2, -1);
1308
1309 tv0->computeAt(tv_out, 2);
1310
1311 // Inline completely except for the cache
1312 for (auto tv : tvs) {
1313 tv->computeAt(tv_out, -1);
1314 }
1315
1316 tv0_cache->merge(-2, -1);
1317
1318 tv_out->axis(-1)->parallelize(ParallelType::TIDx);
1319 tv_out->axis(1)->parallelize(ParallelType::BIDx);
1320 tv_out->axis(0)->parallelize(ParallelType::BIDy);
1321
1322 tv0_cache->setMemoryType(MemoryType::Shared);
1323 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
1324
1325 // cache allocation: (split_factor1 + 2) * (split_factor2 + 2)
1326 GpuLower gpulw(&fusion);
1327 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1328 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1329 auto tensor_name = alloc->buffer()->name();
1330 if (tensor_name == tv0_cache->name()) {
1331 TORCH_CHECK(alloc->shape().size() == 2);
1332 for (int i = 0; i < 2; ++i) {
1333 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
1334 TORCH_CHECK(
1335 size != nullptr && size->isConst() &&
1336 size->value().value() == split_factor[i] + 2);
1337 }
1338 }
1339 }
1340 }
1341
1342 int numel_x = 99;
1343 int numel_y = 101;
1344
1345 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1346 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1347 std::vector<IValue> inputs = {t0};
1348
1349 FusionExecutor fe;
1350 fe.compileFusion(&fusion, inputs);
1351 auto outputs = fe.runFusion(inputs);
1352
1353 auto ref = t0;
1354 for (const auto& offset : offsets) {
1355 ref = ref + shift(t0, offset);
1356 }
1357 ref = ref / int(offsets.size() + 1);
1358
1359 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1360}
1361
1362TEST_F(NVFuserTest, FusionShiftChain1_CUDA) {
1363 Fusion fusion;
1364 FusionGuard fg(&fusion);
1365
1366 auto tv0 = makeSymbolicTensor(2);
1367 fusion.addInput(tv0);
1368 auto tv1 = shift(tv0, {0, 1});
1369 auto tv2 = shift(tv1, {0, 1});
1370 fusion.addOutput(tv2);
1371
1372 int split_factor = 4;
1373 tv2->split(-1, split_factor);
1374
1375 tv0->computeAt(tv2, -2);
1376
1377 int numel_x = 99;
1378 int numel_y = 101;
1379
1380 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1381 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1382 std::vector<IValue> inputs = {t0};
1383
1384 FusionExecutor fe;
1385 fe.compileFusion(&fusion, inputs);
1386 auto outputs = fe.runFusion(inputs);
1387
1388 auto ref = shift(shift(t0, {0, 1}), {0, 1});
1389
1390 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1391}
1392
1393TEST_F(NVFuserTest, FusionShiftChain2_CUDA) {
1394 Fusion fusion;
1395 FusionGuard fg(&fusion);
1396
1397 auto tv0 = makeSymbolicTensor(2);
1398 fusion.addInput(tv0);
1399 auto tv1 = shift(tv0, {0, 1});
1400 auto tv2 = shift(tv1, {0, -1});
1401 fusion.addOutput(tv2);
1402
1403 tv2->split(-1, 4);
1404
1405 tv0->computeAt(tv2, -2);
1406
1407 int numel_x = 99;
1408 int numel_y = 101;
1409
1410 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1411 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1412 std::vector<IValue> inputs = {t0};
1413
1414 FusionExecutor fe;
1415 fe.compileFusion(&fusion, inputs);
1416 auto outputs = fe.runFusion(inputs);
1417
1418 auto ref = shift(shift(t0, {0, 1}), {0, -1});
1419
1420 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1421}
1422
1423TEST_F(NVFuserTest, FusionShiftChain3_CUDA) {
1424 Fusion fusion;
1425 FusionGuard fg(&fusion);
1426
1427 auto tv0 = makeSymbolicTensor(2);
1428 fusion.addInput(tv0);
1429 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1430 auto tv2 = shift(tv1, {0, 1});
1431 auto tv3 = shift(tv2, {0, 1});
1432 fusion.addOutput(tv3);
1433
1434 int split_factor = 4;
1435 tv3->split(-1, split_factor);
1436
1437 tv0->computeAt(tv3, -2);
1438
1439 // Halo size of tv1 is 2 as it needs to account for both of the two
1440 // shift operations , while that of tv2 is still just 1
1441
1442 // tv1: (split_factor + 2)
1443 // tv2: (split_factor + 1)
1444 GpuLower gpulw(&fusion);
1445 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1446 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1447 auto tensor_name = alloc->buffer()->name();
1448 if (tensor_name == 1 || tensor_name == 2) {
1449 TORCH_CHECK(alloc->shape().size() == 1);
1450 for (int i = 0; i < 1; ++i) {
1451 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
1452 TORCH_CHECK(size != nullptr && size->isConst());
1453 if (tensor_name == 1) {
1454 TORCH_CHECK(size->value().value() == split_factor + 2);
1455 } else if (tensor_name == 2) {
1456 TORCH_CHECK(size->value().value() == split_factor + 1);
1457 }
1458 }
1459 }
1460 }
1461 }
1462
1463 int numel_x = 99;
1464 int numel_y = 101;
1465
1466 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1467 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1468 std::vector<IValue> inputs = {t0};
1469
1470 FusionExecutor fe;
1471 fe.compileFusion(&fusion, inputs);
1472 auto outputs = fe.runFusion(inputs);
1473
1474 auto t1 = t0 + 1;
1475 auto t2 = shift(t1, {0, 1});
1476 auto t3 = shift(t2, {0, 1});
1477 auto ref = t3;
1478
1479 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1480}
1481
1482TEST_F(NVFuserTest, FusionShiftChain4_CUDA) {
1483 Fusion fusion;
1484 FusionGuard fg(&fusion);
1485
1486 auto tv0 = makeSymbolicTensor(2);
1487 fusion.addInput(tv0);
1488 auto tv1 = shift(tv0, {1, -1});
1489 auto tv2 = shift(tv1, {2, -2});
1490 auto tv3 = shift(tv2, {3, -3});
1491 auto tv4 = shift(tv3, {4, -4});
1492 auto tv_out = tv4;
1493
1494 fusion.addOutput(tv_out);
1495
1496 int split_factor = 4;
1497
1498 tv_out->split(-1, split_factor);
1499 tv_out->split(0, split_factor);
1500 tv_out->reorder({{1, 2}, {2, 1}});
1501
1502 tv0->computeAt(tv_out, 2);
1503
1504 tv1->merge(-2, -1);
1505 tv2->merge(-2, -1);
1506 tv3->merge(-2, -1);
1507
1508 // tv1: (split_factor + 9) * (split_factor + 9)
1509 // tv2: (split_factor + 7) * (split_factor + 7)
1510 // tv3: (split_factor + 4) * (split_factor + 4)
1511 GpuLower gpulw(&fusion);
1512 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1513 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1514 auto tensor_name = alloc->buffer()->name();
1515 if (tensor_name == 1 || tensor_name == 2) {
1516 TORCH_CHECK(alloc->shape().size() == 2);
1517 for (int i = 0; i < 2; ++i) {
1518 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
1519 TORCH_CHECK(size != nullptr && size->isConst());
1520 auto size_val = size->value().value();
1521 if (tensor_name == 1) {
1522 TORCH_CHECK(size_val == split_factor + 9);
1523 } else if (tensor_name == 2) {
1524 TORCH_CHECK(size_val == split_factor + 7);
1525 } else if (tensor_name == 3) {
1526 TORCH_CHECK(size_val == split_factor + 4);
1527 }
1528 }
1529 }
1530 }
1531 }
1532
1533 int numel_x = 99;
1534 int numel_y = 101;
1535
1536 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1537 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1538 std::vector<IValue> inputs = {t0};
1539
1540 FusionExecutor fe;
1541 fe.compileFusion(&fusion, inputs);
1542 auto outputs = fe.runFusion(inputs);
1543
1544 auto t1 = shift(t0, {1, -1});
1545 auto t2 = shift(t1, {2, -2});
1546 auto t3 = shift(t2, {3, -3});
1547 auto t4 = shift(t3, {4, -4});
1548 auto ref = t4;
1549
1550 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1551}
1552
1553TEST_F(NVFuserTest, FusionShift5ptStencilChain_CUDA) {
1554 Fusion fusion;
1555 FusionGuard fg(&fusion);
1556
1557 auto tv0 = makeSymbolicTensor(2);
1558 fusion.addInput(tv0);
1559 std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
1560
1561 // First stencil: 5pt stencil
1562 // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5
1563 std::vector<TensorView*> tv_stencil1_shifts;
1564 for (const auto& offset : offsets) {
1565 tv_stencil1_shifts.push_back(shift(tv0, offset));
1566 }
1567
1568 auto tv_stencil1 = tv0;
1569 for (auto tv : tv_stencil1_shifts) {
1570 tv_stencil1 = add(tv_stencil1, tv);
1571 }
1572
1573 tv_stencil1 = div(
1574 tv_stencil1, IrBuilder::create<Double>(tv_stencil1_shifts.size() + 1));
1575
1576 // Second stencil: Same 5pt stencil
1577 std::vector<TensorView*> tv_stencil2_shifts;
1578 for (const auto& offset : offsets) {
1579 tv_stencil2_shifts.push_back(shift(tv_stencil1, offset));
1580 }
1581
1582 auto tv_stencil2 = tv_stencil1;
1583 for (auto tv : tv_stencil2_shifts) {
1584 tv_stencil2 = add(tv_stencil2, tv);
1585 }
1586
1587 tv_stencil2 = div(
1588 tv_stencil2, IrBuilder::create<Double>(tv_stencil2_shifts.size() + 1));
1589
1590 auto tv_out = tv_stencil2;
1591
1592 fusion.addOutput(tv_out);
1593
1594 auto tv0_cache = tv0->cacheAfter();
1595
1596 std::vector<int> split_factor({16, 16});
1597
1598 tv_out->split(-1, split_factor[1]);
1599 tv_out->split(0, split_factor[0]);
1600 tv_out->reorder({{1, 2}, {2, 1}});
1601
1602 tv0->computeAt(tv_out, 2);
1603
1604 // Inline completely all inputs to the first stencil output, except for the
1605 // tv0 cache
1606 for (auto tv : tv_stencil1_shifts) {
1607 tv->computeAt(tv_stencil1, -1);
1608 }
1609
1610 // Inline completely all inputs to the second stencil output, except
1611 // for the first stencil output
1612 for (auto tv : tv_stencil2_shifts) {
1613 tv->computeAt(tv_stencil2, -1);
1614 }
1615
1616 tv_out->axis(1)->parallelize(ParallelType::BIDx);
1617 tv_out->axis(0)->parallelize(ParallelType::BIDy);
1618
1619 auto all_values = DependencyCheck::getAllValsBetween(
1620 {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs());
1621 for (auto tv : ir_utils::filterByType<TensorView>(all_values)) {
1622 tv->axis(-1)->parallelize(ParallelType::TIDx);
1623 tv->axis(-2)->parallelize(ParallelType::TIDy);
1624 }
1625
1626 tv0_cache->setMemoryType(MemoryType::Shared);
1627 tv_stencil1->setMemoryType(MemoryType::Shared);
1628
1629 // tv0_cache: (split_factor + 4) * (split_factor + 4)
1630 // tv_stencil1: (split_factor + 2) * (split_factor + 2)
1631 GpuLower gpulw(&fusion);
1632 for (const auto expr : gpulw.kernel()->unordered_exprs()) {
1633 if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
1634 auto tensor_name = alloc->buffer()->name();
1635 if (tensor_name == tv0_cache->name() ||
1636 tensor_name == tv_stencil1->name()) {
1637 TORCH_CHECK(alloc->shape().size() == 2);
1638 for (int i = 0; i < 2; ++i) {
1639 auto size = dynamic_cast<Int*>(alloc->shape().at(i));
1640 TORCH_CHECK(size != nullptr && size->isConst());
1641 if (tensor_name == tv0_cache->name()) {
1642 TORCH_CHECK(size->value().value() == split_factor[i] + 4);
1643 } else if (tensor_name == tv_stencil1->name()) {
1644 TORCH_CHECK(size->value().value() == split_factor[i] + 2);
1645 }
1646 }
1647 }
1648 }
1649 }
1650
1651 int numel_x = 99;
1652 int numel_y = 101;
1653
1654 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1655 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1656 std::vector<IValue> inputs = {t0};
1657
1658 FusionExecutor fe;
1659 fe.compileFusion(&fusion, inputs);
1660 auto outputs = fe.runFusion(inputs);
1661
1662 auto stencil1 = t0;
1663 for (const auto& offset : offsets) {
1664 stencil1 = stencil1 + shift(t0, offset);
1665 }
1666 stencil1 = stencil1 / int(offsets.size() + 1);
1667 auto stencil2 = stencil1;
1668 for (const auto& offset : offsets) {
1669 stencil2 = stencil2 + shift(stencil1, offset);
1670 }
1671 stencil2 = stencil2 / int(offsets.size() + 1);
1672 auto ref = stencil2;
1673
1674 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1675}
1676
1677// Shift a reduced tensor
1678TEST_F(NVFuserTest, FusionShiftReduction1_CUDA) {
1679 Fusion fusion;
1680 FusionGuard fg(&fusion);
1681
1682 auto tv0 = makeSymbolicTensor(2);
1683 fusion.addInput(tv0);
1684 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1685 auto tv2 = sum(tv1, {1});
1686 auto tv3 = shift(tv2, {1});
1687 fusion.addOutput(tv3);
1688
1689 tv3->split(0, 4);
1690 tv0->computeAt(tv3, 1);
1691 tv0->computeAt(tv2, -1);
1692
1693 const int numel_x = 9;
1694 const int numel_y = 11;
1695
1696 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1697 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1698 std::vector<IValue> inputs = {t0};
1699
1700 FusionExecutor fe;
1701 fe.compileFusion(&fusion, inputs);
1702 auto outputs = fe.runFusion(inputs);
1703
1704 auto t1 = t0 + 1;
1705 auto t2 = sum(t1, {1});
1706 auto t3 = shift(t2, {1});
1707 auto ref = t3;
1708
1709 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1710}
1711
1712// Parallelized version of FusionShiftReduction1
1713TEST_F(NVFuserTest, FusionShiftReduction2_CUDA) {
1714 Fusion fusion;
1715 FusionGuard fg(&fusion);
1716
1717 auto tv0 = makeSymbolicTensor(2);
1718 fusion.addInput(tv0);
1719 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1720 auto tv2 = sum(tv1, {1});
1721 auto tv3 = shift(tv2, {1});
1722 fusion.addOutput(tv3);
1723
1724 tv3->split(0, 4);
1725 tv0->computeAt(tv3, 1);
1726
1727 tv2->split(-1, 32);
1728 tv0->computeAt(tv2, -1);
1729
1730 tv2->axis(-1)->parallelize(ParallelType::TIDx);
1731
1732 tv2->setMemoryType(MemoryType::Shared);
1733
1734 const int numel_x = 201;
1735 const int numel_y = 301;
1736
1737 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1738 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1739 std::vector<IValue> inputs = {t0};
1740
1741 FusionExecutor fe;
1742 fe.compileFusion(&fusion, inputs);
1743 auto outputs = fe.runFusion(inputs);
1744
1745 auto t1 = t0 + 1;
1746 auto t2 = sum(t1, {1});
1747 auto t3 = shift(t2, {1});
1748 auto ref = t3;
1749
1750 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1751}
1752
1753TEST_F(NVFuserTest, FusionShiftRfactor1_CUDA) {
1754 Fusion fusion;
1755 FusionGuard fg(&fusion);
1756
1757 auto tv0 = makeSymbolicTensor(2);
1758 fusion.addInput(tv0);
1759 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1760 auto tv2 = sum(tv1, {1});
1761 auto tv3 = shift(tv2, {1});
1762 fusion.addOutput(tv3);
1763
1764 tv3->split(0, 4);
1765 tv0->computeAt(tv3, 1);
1766
1767 tv2->split(-1, 32);
1768 auto rf = tv2->rFactor({-2});
1769 tv0->computeAt(tv2, -1);
1770 tv0->computeAt(rf, -1);
1771
1772 tv2->axis(-1)->parallelize(ParallelType::TIDx);
1773
1774 tv2->setMemoryType(MemoryType::Shared);
1775
1776 const int numel_x = 201;
1777 const int numel_y = 301;
1778
1779 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1780 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1781 std::vector<IValue> inputs = {t0};
1782
1783 FusionExecutor fe;
1784 fe.compileFusion(&fusion, inputs);
1785 auto outputs = fe.runFusion(inputs);
1786
1787 auto t1 = t0 + 1;
1788 auto t2 = sum(t1, {1});
1789 auto t3 = shift(t2, {1});
1790 auto ref = t3;
1791
1792 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1793}
1794
1795TEST_F(NVFuserTest, FusionShiftBcast1_CUDA) {
1796 Fusion fusion;
1797 FusionGuard fg(&fusion);
1798
1799 auto tv0 = makeSymbolicTensor(1);
1800 fusion.addInput(tv0);
1801 auto tv1 = makeSymbolicTensor(2);
1802 fusion.addInput(tv1);
1803 auto tv2 = broadcast(tv0, {false, true});
1804 auto tv3 = shift(tv2, {0, 1});
1805 auto tv4 = add(tv3, tv1);
1806 fusion.addOutput(tv4);
1807
1808 tv0->computeAt(tv4, -1);
1809 tv1->computeAt(tv4, -1);
1810
1811 const int numel_x = 9;
1812 const int numel_y = 11;
1813
1814 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1815 at::Tensor t0 = at::randn({numel_x}, options);
1816 at::Tensor t1 = at::randn({numel_x, numel_y}, options);
1817 std::vector<IValue> inputs = {t0, t1};
1818
1819 FusionExecutor fe;
1820 fe.compileFusion(&fusion, inputs);
1821 auto outputs = fe.runFusion(inputs);
1822
1823 auto t4 = t0.unsqueeze(-1).expand({numel_x, numel_y}) + t1;
1824 auto ref = t4;
1825
1826 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1827}
1828
1829TEST_F(NVFuserTest, FusionShiftBcast2_CUDA) {
1830 Fusion fusion;
1831 FusionGuard fg(&fusion);
1832
1833 auto tv0 = makeSymbolicTensor(1);
1834 fusion.addInput(tv0);
1835 auto tv1 = makeSymbolicTensor(2);
1836 fusion.addInput(tv1);
1837 auto tv2 = broadcast(tv0, {false, true});
1838 auto tv3 = shift(tv2, {1, 0});
1839 auto tv4 = add(tv3, tv1);
1840 fusion.addOutput(tv4);
1841
1842 tv4->split(0, 4);
1843 tv0->computeAt(tv4, 1);
1844
1845 const int numel_x = 9;
1846 const int numel_y = 11;
1847
1848 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1849 at::Tensor t0 = at::randn({numel_x}, options);
1850 at::Tensor t1 = at::randn({numel_x, numel_y}, options);
1851 std::vector<IValue> inputs = {t0, t1};
1852
1853 FusionExecutor fe;
1854 fe.compileFusion(&fusion, inputs);
1855 auto outputs = fe.runFusion(inputs);
1856
1857 auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y});
1858 auto t3 = shift(t2, {1, 0});
1859 auto ref = t3 + t1;
1860
1861 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1862}
1863
1864// Combine ShiftBcast1 and ShiftBcast2 with parallelization
1865TEST_F(NVFuserTest, FusionShiftBcast3_CUDA) {
1866 Fusion fusion;
1867 FusionGuard fg(&fusion);
1868
1869 auto tv0 = makeSymbolicTensor(1);
1870 fusion.addInput(tv0);
1871 auto tv1 = makeSymbolicTensor(2);
1872 fusion.addInput(tv1);
1873 auto tv2 = broadcast(tv0, {false, true});
1874 auto tv3 = shift(tv2, {1, 0});
1875 auto tv4 = shift(tv2, {0, 1});
1876 auto tv5 = shift(tv2, {-1, -1});
1877 auto tv6 = add(tv3, tv4);
1878 auto tv7 = add(tv6, tv5);
1879 auto tv8 = add(tv7, tv1);
1880 fusion.addOutput(tv8);
1881
1882 tv8->split(0, 4);
1883 tv8->split(-1, 4);
1884 tv0->computeAt(tv8, 1);
1885
1886 tv8->axis(-1)->parallelize(ParallelType::TIDx);
1887 for (auto tv : {tv8, tv7, tv6, tv5, tv4, tv3, tv2}) {
1888 tv->axis(1)->parallelize(ParallelType::TIDy);
1889 }
1890
1891 tv2->setMemoryType(MemoryType::Shared);
1892
1893 const int numel_x = 101;
1894 const int numel_y = 201;
1895
1896 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1897 at::Tensor t0 = at::randn({numel_x}, options);
1898 at::Tensor t1 = at::randn({numel_x, numel_y}, options);
1899 std::vector<IValue> inputs = {t0, t1};
1900
1901 FusionExecutor fe;
1902 fe.compileFusion(&fusion, inputs);
1903 auto outputs = fe.runFusion(inputs);
1904
1905 auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y});
1906 auto t3 = shift(t2, {1, 0});
1907 auto t4 = t2;
1908 auto t5 = shift(t2, {-1, 0});
1909 auto ref = t3 + t4 + t5 + t1;
1910
1911 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
1912}
1913
1914// See issue #893
1915TEST_F(NVFuserTest, FusionShiftSyncPlacement1_CUDA) {
1916 Fusion fusion;
1917 FusionGuard fg(&fusion);
1918
1919 auto tv0 = makeSymbolicTensor(2);
1920 fusion.addInput(tv0);
1921 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1922 auto tv2 = add(tv0, IrBuilder::create<Double>(2));
1923 auto tv3 = add(tv1, tv2);
1924 auto tv4 = shift(tv3, {0, 1});
1925 fusion.addOutput(tv4);
1926
1927 tv4->split(1, 8);
1928 tv0->computeAt(tv4, 2);
1929
1930 tv2->computeAt(tv3, -1);
1931
1932 tv1->setMemoryType(MemoryType::Shared);
1933 tv3->setMemoryType(MemoryType::Shared);
1934
1935 tv1->axis(-1)->parallelize(ParallelType::TIDx);
1936 tv3->axis(-1)->parallelize(ParallelType::TIDx);
1937 tv4->axis(-1)->parallelize(ParallelType::TIDx);
1938
1939 int numel_x = 99;
1940 int numel_y = 101;
1941
1942 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1943 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
1944 std::vector<IValue> inputs = {t0};
1945
1946 FusionExecutor fe;
1947 fe.compileFusion(&fusion, inputs);
1948 auto outputs = fe.runFusion(inputs);
1949
1950 auto t1 = t0 + 1;
1951 auto t2 = t0 + 2;
1952 auto t3 = add(t1, t2);
1953 auto t4 = shift(t3, {0, 1});
1954
1955 testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
1956}
1957
1958// See issue #893. Top-level placement.
1959TEST_F(NVFuserTest, FusionShiftSyncPlacement2_CUDA) {
1960 Fusion fusion;
1961 FusionGuard fg(&fusion);
1962
1963 auto tv0 = makeSymbolicTensor(1);
1964 fusion.addInput(tv0);
1965 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1966 auto tv2 = add(tv0, IrBuilder::create<Double>(2));
1967 auto tv3 = add(tv1, tv2);
1968 auto tv4 = shift(tv3, {1});
1969 fusion.addOutput(tv4);
1970
1971 tv2->computeAt(tv3, -1);
1972
1973 tv1->setMemoryType(MemoryType::Shared);
1974 tv3->setMemoryType(MemoryType::Shared);
1975
1976 tv1->axis(-1)->parallelize(ParallelType::TIDx);
1977 tv3->axis(-1)->parallelize(ParallelType::TIDx);
1978 tv4->axis(-1)->parallelize(ParallelType::TIDx);
1979
1980 int numel_x = 99;
1981
1982 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1983 at::Tensor t0 = at::randn({numel_x}, options);
1984 std::vector<IValue> inputs = {t0};
1985
1986 FusionExecutor fe;
1987 fe.compileFusion(&fusion, inputs);
1988 auto outputs = fe.runFusion(inputs);
1989
1990 auto t1 = t0 + 1;
1991 auto t2 = t0 + 2;
1992 auto t3 = add(t1, t2);
1993 auto t4 = shift(t3, {1});
1994
1995 testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
1996}
1997
1998// Based on original CUDA provided by Vishal Mehta.
1999// Major differences with the original version:
2000// - The original version uses additional 2 warps to load the halos
2001// along the Y dimension. The other 10 warps are used to load a 32x10
2002// tile, and all warps will do coalesced loads. No such optimization
2003// is done in the fuser version.
2004TEST_F(NVFuserTest, FusionHdiff_CUDA) {
2005 Fusion fusion;
2006 FusionGuard fg(&fusion);
2007
2008 auto inp = makeSymbolicTensor(3);
2009 fusion.addInput(inp);
2010 auto coeff = makeSymbolicTensor(3);
2011 fusion.addInput(coeff);
2012
2013 std::vector<std::vector<int>> offsets{
2014 {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}};
2015
2016 // T2, T3, T4, T5
2017 std::vector<TensorView*> inp_neighbors;
2018 for (const auto& offset : offsets) {
2019 inp_neighbors.push_back(shift(inp, offset, false));
2020 }
2021
2022 // T8
2023 TensorView* sum_of_neighbors = nullptr;
2024 for (auto inp_neighbor : inp_neighbors) {
2025 if (sum_of_neighbors == nullptr) {
2026 sum_of_neighbors = inp_neighbor;
2027 } else {
2028 sum_of_neighbors = add(sum_of_neighbors, inp_neighbor);
2029 }
2030 }
2031
2032 // T9 = T0 * 4
2033 // T10 = T9 - T8
2034 auto lap = sub(mul(inp, IrBuilder::create<Double>(4)), sum_of_neighbors);
2035
2036 // T11 = shift(T10)
2037 // T12 = T11 - T10
2038 auto flx = sub(shift(lap, {0, 0, -1}, false), lap);
2039 // T14 = T13 - T0
2040 // T15 = T12 * T14
2041 // T16 = T15 > 0
2042 // T17 = T16 ? 0 : T12
2043 auto flx_cond =
2044 gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)),
2045 IrBuilder::create<Double>(0));
2046 auto flx0 = where(flx_cond, IrBuilder::create<Double>(0), flx);
2047
2048 // T18 = shift(T10)
2049 // T19 = T18 - T10
2050 auto fly = sub(shift(lap, {0, -1, 0}, false), lap);
2051 // T20 = shift(T0)
2052 // T21 = T20 - T0
2053 // T22 = T19 * T21
2054 // T23 = T22 > 0
2055 auto fly_cond =
2056 gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)),
2057 IrBuilder::create<Double>(0));
2058 // T24 = T23 ? 0 : T19
2059 auto fly0 = where(fly_cond, IrBuilder::create<Double>(0), fly);
2060
2061 // T25 = shift(flx0)
2062 // T26 = T17 - T25
2063 // T27 = shift(fly0)
2064 // T28 = T24 - T27
2065 // T29 = T26 + T28
2066 // T30 = T1 * T29
2067 // T31 = T0 - T30
2068 auto out =
2069 sub(inp,
2070 mul(coeff,
2071 add(sub(flx0, shift(flx0, {0, 0, 1}, false)),
2072 sub(fly0, shift(fly0, {0, 1, 0}, false)))));
2073
2074 fusion.addOutput(out);
2075
2076 /////////////////////////////////
2077 // Scheduling
2078 /////////////////////////////////
2079
2080 out->setContiguity(false);
2081
2082 // Step 1: 2D Tiling
2083
2084 const int tile_x = 32;
2085 const int tile_y = 8;
2086
2087 out->split(-1, tile_x);
2088 out->split(-3, tile_y);
2089 out->reorder({{-2, -3}});
2090 inp->computeAt(out, -3);
2091 coeff->computeAt(out, -3);
2092
2093 // Step 2: Inlining
2094
2095 // Inline inputs to lap
2096 auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap});
2097 for (auto val : ir_utils::filterByType<TensorView>(lap_vals)) {
2098 if (val != lap && val != inp) {
2099 val->computeAt(lap, -1);
2100 }
2101 }
2102
2103 // Inline inputs to flx0
2104 auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0});
2105 for (auto val : ir_utils::filterByType<TensorView>(flx0_vals)) {
2106 if (val != lap && val != flx0 && val != inp) {
2107 val->computeAt(flx0, -1);
2108 }
2109 }
2110
2111 // Inline inputs to fly0
2112 auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0});
2113 for (auto val : ir_utils::filterByType<TensorView>(flxy_vals)) {
2114 if (val != lap && val != fly0 && val != inp) {
2115 val->computeAt(fly0, -1);
2116 }
2117 }
2118
2119 // Inline inputs to out
2120 auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out});
2121 for (auto val : ir_utils::filterByType<TensorView>(out_vals)) {
2122 if (val != flx0 && val != fly0 && val != out) {
2123 val->computeAt(out, -1);
2124 }
2125 }
2126
2127 // Step 3: Parallelization
2128
2129 // Block parallelization
2130 out->axis(0)->parallelize(ParallelType::BIDz);
2131 out->axis(1)->parallelize(ParallelType::BIDy);
2132 out->axis(2)->parallelize(ParallelType::BIDx);
2133 // Thread parallelization
2134 out->axis(3)->parallelize(ParallelType::TIDy);
2135 out->axis(4)->parallelize(ParallelType::TIDx);
2136 // Apply the same parallelization to all other tensors
2137 scheduler_utils::parallelizeAllLike(out);
2138
2139 // Store intermediate stencil results on smem so that they can be
2140 // accessed by threads
2141 for (auto tv : {flx0, fly0, lap}) {
2142 tv->setMemoryType(MemoryType::Shared);
2143 }
2144
2145 /////////////////////////////////
2146 int numel_x = 101;
2147 int numel_y = 99;
2148 int numel_z = 10;
2149
2150 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2151 at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options);
2152 at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options);
2153 std::vector<IValue> inputs = {inp_at, coeff_at};
2154
2155 FusionExecutor fe;
2156 fe.compileFusion(&fusion, inputs);
2157 auto fuser_output = fe.runFusion(inputs)[0];
2158
2159 // Trim the outer rim
2160 std::vector<at::indexing::TensorIndex> indices{
2161 at::indexing::Slice(0, at::indexing::None),
2162 at::indexing::Slice(2, -2),
2163 at::indexing::Slice(2, -2)};
2164 fuser_output = fuser_output.index(indices);
2165
2166 {
2167 at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options);
2168 auto lap = inp_at * 4 -
2169 (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) +
2170 shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1}));
2171 auto flx = shift(lap, {0, 0, -1}) - lap;
2172 auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0;
2173 auto flx0 = at::where(flx_cond, zeros, flx);
2174 auto fly = shift(lap, {0, -1, 0}) - lap;
2175 auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0;
2176 auto fly0 = at::where(fly_cond, zeros, fly);
2177
2178 auto ref = inp_at -
2179 coeff_at *
2180 ((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0})));
2181 ref = ref.index(indices);
2182
2183 testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__);
2184 }
2185}
2186
2187TEST_F(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) {
2188 Fusion fusion;
2189 FusionGuard fg(&fusion);
2190
2191 auto inp = makeSymbolicTensor(3);
2192 fusion.addInput(inp);
2193 auto coeff = makeSymbolicTensor(3);
2194 fusion.addInput(coeff);
2195
2196 std::vector<std::vector<int>> offsets{
2197 {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}};
2198
2199 // T2, T3, T4, T5
2200 std::vector<TensorView*> inp_neighbors;
2201 for (const auto& offset : offsets) {
2202 inp_neighbors.push_back(shift(inp, offset, false));
2203 }
2204
2205 // T8
2206 TensorView* sum_of_neighbors = nullptr;
2207 for (auto inp_neighbor : inp_neighbors) {
2208 if (sum_of_neighbors == nullptr) {
2209 sum_of_neighbors = inp_neighbor;
2210 } else {
2211 sum_of_neighbors = add(sum_of_neighbors, inp_neighbor);
2212 }
2213 }
2214
2215 // T9 = T0 * 4
2216 // T10 = T9 - T8
2217 auto lap = sub(mul(inp, IrBuilder::create<Double>(4)), sum_of_neighbors);
2218
2219 // T11 = shift(T10)
2220 // T12 = T11 - T10
2221 auto flx = sub(shift(lap, {0, 0, -1}, false), lap);
2222 // T14 = T13 - T0
2223 // T15 = T12 * T14
2224 // T16 = T15 > 0
2225 // T17 = T16 ? 0 : T12
2226 auto flx_cond =
2227 gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)),
2228 IrBuilder::create<Double>(0));
2229 auto flx0 = where(flx_cond, IrBuilder::create<Double>(0), flx);
2230
2231 // T18 = shift(T10)
2232 // T19 = T18 - T10
2233 auto fly = sub(shift(lap, {0, -1, 0}, false), lap);
2234 // T20 = shift(T0)
2235 // T21 = T20 - T0
2236 // T22 = T19 * T21
2237 // T23 = T22 > 0
2238 auto fly_cond =
2239 gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)),
2240 IrBuilder::create<Double>(0));
2241 // T24 = T23 ? 0 : T19
2242 auto fly0 = where(fly_cond, IrBuilder::create<Double>(0), fly);
2243
2244 // T25 = shift(flx0)
2245 // T26 = T17 - T25
2246 // T27 = shift(fly0)
2247 // T28 = T24 - T27
2248 // T29 = T26 + T28
2249 // T30 = T1 * T29
2250 // T31 = T0 - T30
2251 auto out =
2252 sub(inp,
2253 mul(coeff,
2254 add(sub(flx0, shift(flx0, {0, 0, 1}, false)),
2255 sub(fly0, shift(fly0, {0, 1, 0}, false)))));
2256
2257 fusion.addOutput(out);
2258
2259 out->setContiguity(false);
2260
2261 /////////////////////////////////
2262 // Scheduling
2263 /////////////////////////////////
2264
2265 const auto all_vals = fusion.usedMathVals();
2266 const std::vector<TensorView*> all_tensors(
2267 {ir_utils::filterByType<TensorView>(all_vals).begin(),
2268 ir_utils::filterByType<TensorView>(all_vals).end()});
2269
2270 // Step 1: Blocking
2271 // - Thread block size: (tile_x, tile_y)
2272 // - Each thread computes a vertical column of length tile_z along the Z
2273 // axis.
2274 // - Grid dize: (NX / block_x, NY / block_y, NZ / tile_z)
2275
2276 const int tile_x = 32;
2277 const int tile_y = 8;
2278 const int tile_z = 16;
2279
2280 out->split(0, tile_z);
2281 out->split(-1, tile_x, true, true);
2282 out->split(-3, tile_y, true, true);
2283 // out: [NZ/tz, tz, NY/by, by, NX/bx, bx]
2284 out->reorder({{1, 3}, {2, 1}, {3, 4}, {4, 2}});
2285 // out: [NZ/tz, NY/by, NX/bx, tz, by, bx]
2286
2287 TransformPropagator propagator(out);
2288 MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
2289
2290 inp->computeAt(out, 4);
2291
2292 // Step 2: Inlining
2293
2294 // Inline inputs to lap
2295 auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap});
2296 for (auto val : ir_utils::filterByType<TensorView>(lap_vals)) {
2297 if (val != lap && val != inp) {
2298 val->computeAt(lap, -1);
2299 }
2300 }
2301
2302 // Inline inputs to flx0
2303 auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0});
2304 for (auto val : ir_utils::filterByType<TensorView>(flx0_vals)) {
2305 if (val != lap && val != flx0 && val != inp) {
2306 val->computeAt(flx0, -1);
2307 }
2308 }
2309
2310 // Inline inputs to fly0
2311 auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0});
2312 for (auto val : ir_utils::filterByType<TensorView>(flxy_vals)) {
2313 if (val != lap && val != fly0 && val != inp) {
2314 val->computeAt(fly0, -1);
2315 }
2316 }
2317
2318 // Inline inputs to out
2319 auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out});
2320 for (auto val : ir_utils::filterByType<TensorView>(out_vals)) {
2321 if (val != flx0 && val != fly0 && val != out) {
2322 val->computeAt(out, -1);
2323 }
2324 }
2325
2326 // Step 3: Parallelization
2327
2328 // Block parallelization
2329 out->axis(0)->parallelize(ParallelType::BIDz);
2330 out->axis(1)->parallelize(ParallelType::BIDy);
2331 out->axis(2)->parallelize(ParallelType::BIDx);
2332 out->axis(4)->parallelize(ParallelType::TIDy);
2333 out->axis(5)->parallelize(ParallelType::TIDx);
2334 // Unswitch at the tz axis
2335 out->axis(3)->parallelize(ParallelType::Unswitch);
2336
2337 scheduler_utils::parallelizeAllLike(out, all_tensors);
2338
2339 // These need to be on smem
2340 for (auto tv : {flx0, fly0, lap}) {
2341 tv->setMemoryType(MemoryType::Shared);
2342 }
2343
2344 /////////////////////////////////
2345 const int halo_extent = 2;
2346 const int numel_x = 64 + halo_extent * 2;
2347 const int numel_y = 64 + halo_extent * 2;
2348 const int numel_z = 32;
2349
2350 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2351 at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options);
2352 at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options);
2353 std::vector<IValue> inputs = {inp_at, coeff_at};
2354
2355 FusionExecutor fe;
2356 fe.compileFusion(&fusion, inputs);
2357 auto fuser_output = fe.runFusion(inputs)[0];
2358
2359 // Trim the outer rim
2360 std::vector<at::indexing::TensorIndex> indices{
2361 at::indexing::Slice(0, at::indexing::None),
2362 at::indexing::Slice(2, -2),
2363 at::indexing::Slice(2, -2)};
2364 fuser_output = fuser_output.index(indices);
2365
2366 {
2367 at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options);
2368 auto lap = inp_at * 4 -
2369 (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) +
2370 shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1}));
2371 auto flx = shift(lap, {0, 0, -1}) - lap;
2372 auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0;
2373 auto flx0 = at::where(flx_cond, zeros, flx);
2374 auto fly = shift(lap, {0, -1, 0}) - lap;
2375 auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0;
2376 auto fly0 = at::where(fly_cond, zeros, fly);
2377
2378 auto ref = inp_at -
2379 coeff_at *
2380 ((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0})));
2381 ref = ref.index(indices);
2382
2383 testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__);
2384 }
2385}
2386
2387// 3x3 max pooling
2388TEST_F(NVFuserTest, FusionMaxPooling_CUDA) {
2389 Fusion fusion;
2390 FusionGuard fg(&fusion);
2391
2392 // Format: CHW
2393 auto inp = makeSymbolicTensor(3);
2394 fusion.addInput(inp);
2395
2396 // 3x3 pooling of the HW spatial domain
2397 std::vector<std::vector<int>> offsets;
2398 for (int i = -1; i <= 1; ++i) {
2399 for (int j = -1; j <= 1; ++j) {
2400 if (i == 0 && j == 0) {
2401 continue;
2402 }
2403 offsets.push_back({i, j});
2404 }
2405 }
2406
2407 std::vector<TensorView*> inp_tile({inp});
2408 for (auto offset : offsets) {
2409 offset.insert(offset.begin(), 0);
2410 inp_tile.push_back(shift(inp, offset));
2411 }
2412
2413 TensorView* max_tensor = nullptr;
2414 for (auto tv : inp_tile) {
2415 if (max_tensor == nullptr) {
2416 max_tensor = tv;
2417 } else {
2418 max_tensor = binaryOp(BinaryOpType::Max, max_tensor, tv);
2419 }
2420 }
2421
2422 fusion.addOutput(max_tensor);
2423
2424 ////////////////////////////////////
2425
2426 // Cache the input and weight tensors
2427 auto inp_cache = inp->cacheAfter();
2428
2429 // Tiling the spatial domain
2430 const int tile_x = 32;
2431 const int tile_y = 8;
2432
2433 max_tensor->split(-2, tile_y);
2434 max_tensor->axis(-2)->parallelize(ParallelType::TIDy);
2435 max_tensor->split(-1, tile_x);
2436 max_tensor->axis(-1)->parallelize(ParallelType::TIDx);
2437 max_tensor->reorder({{-3, -2}});
2438
2439 inp_cache->computeAt(max_tensor, 3);
2440 inp_cache->axis(-2)->parallelize(ParallelType::TIDy);
2441 inp_cache->axis(-1)->parallelize(ParallelType::TIDx);
2442 inp_cache->setMemoryType(MemoryType::Shared);
2443
2444 auto max_tensor_dep =
2445 DependencyCheck::getAllValsBetween({inp_cache}, {max_tensor});
2446 for (auto tv : ir_utils::filterByType<TensorView>(max_tensor_dep)) {
2447 if (tv == inp_cache || tv == max_tensor) {
2448 continue;
2449 }
2450 tv->computeAt(max_tensor, -1);
2451 }
2452
2453 max_tensor->axis(0)->parallelize(ParallelType::BIDx);
2454
2455 const int hw = 50;
2456 const int num_channels = 20;
2457 const int pooling_window = 3;
2458
2459 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2460 at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options);
2461 // shift always pads by zero, so if all surrounding values are
2462 // negative, max pooling would pick a padded value, which isn't the
2463 // correct behavior. We need to be able to choose the value of
2464 // padding. In this case, padding by the minimum value would not
2465 // have this problem. For now, avoid the problem by making sure all
2466 // values are not negative.
2467 aten_inp = at::abs(aten_inp);
2468 std::vector<IValue> inputs = {aten_inp};
2469
2470 FusionExecutor fe;
2471 fe.compileFusion(&fusion, inputs);
2472 auto outputs = fe.runFusion(inputs);
2473
2474 auto ref = at::max_pool2d(
2475 aten_inp, {pooling_window, pooling_window}, {1, 1}, {1, 1});
2476
2477 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
2478}
2479
2480TEST_F(NVFuserTest, FusionGather1_CUDA) {
2481 Fusion fusion;
2482 FusionGuard fg(&fusion);
2483
2484 auto tv0 = makeSymbolicTensor(2);
2485 fusion.addInput(tv0);
2486
2487 const std::vector<int> window_shape = {1, 3};
2488 const std::vector<std::vector<int>> padding_width = {{0, 0}, {1, 1}};
2489
2490 auto tv1 = gather(tv0, window_shape, padding_width);
2491
2492 fusion.addOutput(tv1);
2493
2494 const int s1 = 11;
2495 const int s2 = 13;
2496
2497 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2498 at::Tensor t0 = at::randn({s1, s2}, options);
2499
2500 auto ref = gather(t0, window_shape, padding_width);
2501
2502 FusionExecutor fe;
2503 fe.compileFusion(&fusion, {t0});
2504 auto outputs = fe.runFusion({t0});
2505
2506 TORCH_CHECK(ref.equal(outputs[0]));
2507}
2508
2509TEST_F(NVFuserTest, FusionGather2_CUDA) {
2510 Fusion fusion;
2511 FusionGuard fg(&fusion);
2512
2513 const std::vector<int> window_shape = {1, 3};
2514 const std::vector<std::vector<int>> padding_width = {{0, 0}, {1, 1}};
2515
2516 auto tv0 = makeSymbolicTensor(2);
2517 fusion.addInput(tv0);
2518
2519 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
2520
2521 auto tv2 = gather(tv1, window_shape, padding_width);
2522
2523 auto tv3 = sum(tv2, {-1});
2524
2525 fusion.addOutput(tv3);
2526
2527 tv3->split(1, 32);
2528 tv0->computeAt(tv3, 2);
2529 tv2->computeAt(tv3, -1);
2530
2531 tv3->axis(0)->parallelize(ParallelType::BIDy);
2532 tv3->axis(1)->parallelize(ParallelType::BIDx);
2533 tv3->axis(2)->parallelize(ParallelType::TIDx);
2534 tv1->axis(2)->parallelize(ParallelType::TIDx);
2535
2536 tv1->setMemoryType(MemoryType::Shared);
2537
2538 const int s1 = 99;
2539 const int s2 = 101;
2540
2541 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2542 at::Tensor t0 = at::randn({s1, s2}, options);
2543 std::vector<IValue> inputs = {t0};
2544
2545 FusionExecutor fe;
2546 fe.compileFusion(&fusion, inputs);
2547 auto outputs = fe.runFusion(inputs);
2548
2549 auto t1 = t0 + 1;
2550 auto t2 = gather(t1, window_shape, padding_width);
2551 auto ref = sum(t2, {-1});
2552
2553 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
2554}
2555
2556TEST_F(NVFuserTest, FusionGather3_CUDA) {
2557 Fusion fusion;
2558 FusionGuard fg(&fusion);
2559
2560 auto tv0 = makeSymbolicTensor(2);
2561 fusion.addInput(tv0);
2562
2563 const std::vector<int> window_shape = {1, 3};
2564 const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
2565
2566 auto tv1 = gather(tv0, window_shape, padding_width);
2567
2568 fusion.addOutput(tv1);
2569
2570 const int s1 = 11;
2571 const int s2 = 13;
2572
2573 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2574 std::vector<int64_t> size({s1, s2});
2575 at::Tensor t0 = at::randn(size, options);
2576 size.insert(size.end(), window_shape.begin(), window_shape.end());
2577 // Use a pre-allocated output tensor filled with 1 so that invalid
2578 // writes to outside valid ranges can be detected
2579 at::Tensor output = at::ones(size, options);
2580
2581 FusionExecutor fe;
2582 fe.compileFusion(&fusion, {t0});
2583 auto outputs = fe.runFusion({t0}, {output});
2584
2585 auto ref = gather(t0, window_shape, padding_width);
2586 TORCH_CHECK(ref.equal(outputs[0]));
2587}
2588
2589TEST_F(NVFuserTest, FusionGather4_CUDA) {
2590 Fusion fusion;
2591 FusionGuard fg(&fusion);
2592
2593 auto tv0 = makeSymbolicTensor(2);
2594 fusion.addInput(tv0);
2595
2596 const std::vector<int> window_shape = {3, 3};
2597 const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
2598
2599 auto tv1 = gather(tv0, window_shape, padding_width);
2600
2601 fusion.addOutput(tv1);
2602
2603 const int s1 = 11;
2604 const int s2 = 13;
2605
2606 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2607 std::vector<int64_t> size({s1, s2});
2608 at::Tensor t0 = at::randn(size, options);
2609 size.insert(size.end(), window_shape.begin(), window_shape.end());
2610 // Use a pre-allocated output tensor filled with 1 so that invalid
2611 // writes to outside valid ranges can be detected
2612 at::Tensor output = at::ones(size, options);
2613
2614 FusionExecutor fe;
2615 fe.compileFusion(&fusion, {t0});
2616 auto outputs = fe.runFusion({t0}, {output});
2617
2618 auto ref = gather(t0, window_shape, padding_width);
2619
2620 TORCH_CHECK(ref.equal(outputs[0]));
2621}
2622
2623TEST_F(NVFuserTest, FusionGather5_CUDA) {
2624 Fusion fusion;
2625 FusionGuard fg(&fusion);
2626
2627 auto tv0 = makeSymbolicTensor(2);
2628 fusion.addInput(tv0);
2629
2630 const std::vector<int> window_shape = {3, 3};
2631 const std::vector<std::vector<int>> padding_width = {{1, 0}, {0, 1}};
2632
2633 auto tv1 = gather(tv0, window_shape, padding_width);
2634
2635 fusion.addOutput(tv1);
2636
2637 const int s1 = 11;
2638 const int s2 = 13;
2639
2640 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2641 std::vector<int64_t> size({s1, s2});
2642 at::Tensor t0 = at::randn(size, options);
2643 size.insert(size.end(), window_shape.begin(), window_shape.end());
2644 // Use a pre-allocated output tensor filled with 1 so that invalid
2645 // writes to outside valid ranges can be detected
2646 at::Tensor output = at::ones(size, options);
2647
2648 FusionExecutor fe;
2649 fe.compileFusion(&fusion, {t0});
2650 auto outputs = fe.runFusion({t0}, {output});
2651
2652 auto ref = gather(t0, window_shape, padding_width);
2653
2654 TORCH_CHECK(ref.equal(outputs[0]));
2655}
2656
2657// Conv-like pattern with no padding
2658TEST_F(NVFuserTest, FusionGather6_CUDA) {
2659 Fusion fusion;
2660 FusionGuard fg(&fusion);
2661
2662 auto tv0 = makeSymbolicTensor(2);
2663 fusion.addInput(tv0);
2664
2665 const std::vector<int> window_shape = {3, 4};
2666 const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
2667
2668 auto tv1 = gather(tv0, window_shape, padding_width);
2669
2670 fusion.addOutput(tv1);
2671
2672 // Blocking the spatial dimensions
2673 const int block_x = 16;
2674 const int block_y = 8;
2675
2676 auto tv0_cache = tv0->cacheAfter();
2677 auto out = tv1;
2678 auto out_cache = out->cacheBefore();
2679
2680 out->split(1, block_x);
2681 out->split(0, block_y);
2682 out->reorder({{1, 2}, {2, 1}});
2683
2684 TransformPropagator propagator(out);
2685 MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
2686
2687 tv0->computeAt(out, 2);
2688
2689 tv0_cache->setMemoryType(MemoryType::Shared);
2690
2691 out->axis(0)->parallelize(ParallelType::BIDy);
2692 out->axis(1)->parallelize(ParallelType::BIDx);
2693 out->axis(2)->parallelize(ParallelType::TIDy);
2694 out->axis(3)->parallelize(ParallelType::TIDx);
2695 scheduler_utils::parallelizeAllLike(out);
2696
2697 const int s1 = 101;
2698 const int s2 = 99;
2699
2700 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2701 std::vector<int64_t> size({s1, s2});
2702 at::Tensor t0 = at::randn(size, options);
2703 size.insert(size.end(), window_shape.begin(), window_shape.end());
2704 // Use a pre-allocated output tensor filled with 1 so that invalid
2705 // writes to outside valid ranges can be detected
2706 at::Tensor output = at::ones(size, options);
2707
2708 FusionExecutor fe;
2709 fe.compileFusion(&fusion, {t0});
2710 auto outputs = fe.runFusion({t0}, {output});
2711
2712 auto ref = gather(t0, window_shape, padding_width);
2713
2714 TORCH_CHECK(ref.equal(outputs[0]));
2715}
2716
2717// Conv-like pattern with irregular padding
2718TEST_F(NVFuserTest, FusionGather7_CUDA) {
2719 Fusion fusion;
2720 FusionGuard fg(&fusion);
2721
2722 auto tv0 = makeSymbolicTensor(2);
2723 fusion.addInput(tv0);
2724
2725 const std::vector<int> window_shape = {3, 4};
2726 const std::vector<std::vector<int>> padding_width = {{0, 2}, {2, 1}};
2727
2728 auto tv1 = gather(tv0, window_shape, padding_width);
2729
2730 fusion.addOutput(tv1);
2731
2732 // Blocking the spatial dimensions
2733 const int block_x = 16;
2734 const int block_y = 8;
2735
2736 auto tv0_cache = tv0->cacheAfter();
2737 auto out = tv1;
2738 auto out_cache = out->cacheBefore();
2739
2740 out->split(1, block_x);
2741 out->split(0, block_y);
2742 out->reorder({{1, 2}, {2, 1}});
2743
2744 TransformPropagator propagator(out);
2745 MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
2746
2747 tv0->computeAt(out, 2);
2748
2749 tv0_cache->setMemoryType(MemoryType::Shared);
2750
2751 out->axis(0)->parallelize(ParallelType::BIDy);
2752 out->axis(1)->parallelize(ParallelType::BIDx);
2753 out->axis(2)->parallelize(ParallelType::TIDy);
2754 out->axis(3)->parallelize(ParallelType::TIDx);
2755 scheduler_utils::parallelizeAllLike(out);
2756
2757 const int s1 = 101;
2758 const int s2 = 99;
2759
2760 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2761 std::vector<int64_t> size({s1, s2});
2762 at::Tensor t0 = at::randn(size, options);
2763 size.insert(size.end(), window_shape.begin(), window_shape.end());
2764 at::Tensor output = at::ones(size, options);
2765
2766 FusionExecutor fe;
2767 fe.compileFusion(&fusion, {t0});
2768 auto outputs = fe.runFusion({t0}, {output});
2769
2770 auto ref = gather(t0, window_shape, padding_width);
2771
2772 TORCH_CHECK(ref.equal(outputs[0]));
2773}
2774
2775// With no padding but with striding
2776TEST_F(NVFuserTest, FusionGather8_CUDA) {
2777 Fusion fusion;
2778 FusionGuard fg(&fusion);
2779
2780 auto tv0 = makeSymbolicTensor(2);
2781 fusion.addInput(tv0);
2782
2783 const std::vector<int> window_shape = {2, 3};
2784 const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
2785 const std::vector<int> strides = {3, 3};
2786
2787 auto tv1 = gather(tv0, window_shape, padding_width, strides);
2788
2789 fusion.addOutput(tv1);
2790
2791 const int s1 = 11;
2792 const int s2 = 13;
2793
2794 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2795 std::vector<int64_t> size({s1, s2});
2796 at::Tensor t0 = at::randn(size, options);
2797 for (const auto i : c10::irange(size.size())) {
2798 size[i] = ceilDiv(
2799 size[i] - window_shape[i] + 1 + padding_width[i][0] +
2800 padding_width[i][1],
2801 strides[i]);
2802 }
2803 size.insert(size.end(), window_shape.begin(), window_shape.end());
2804 // Use a pre-allocated output tensor filled with 1 so that invalid
2805 // writes to outside valid ranges can be detected
2806 at::Tensor output = at::ones(size, options);
2807
2808 FusionExecutor fe;
2809 fe.compileFusion(&fusion, {t0});
2810 auto outputs = fe.runFusion({t0}, {output});
2811
2812 auto ref = gather(t0, window_shape, padding_width, strides);
2813
2814 TORCH_CHECK(ref.equal(outputs[0]));
2815}
2816
2817// Similar to Gather8 but with splitting and parallelization
2818TEST_F(NVFuserTest, FusionGather9_CUDA) {
2819 Fusion fusion;
2820 FusionGuard fg(&fusion);
2821
2822 auto tv0 = makeSymbolicTensor(2);
2823 fusion.addInput(tv0);
2824
2825 const std::vector<int> window_shape = {3, 4};
2826 const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
2827 const std::vector<int> strides = {2, 2};
2828
2829 auto tv1 = gather(tv0, window_shape, padding_width, strides);
2830
2831 fusion.addOutput(tv1);
2832
2833 // Blocking the spatial dimensions
2834 const int block_x = 16;
2835 const int block_y = 8;
2836
2837 auto tv0_cache = tv0->cacheAfter();
2838 auto out = tv1;
2839 auto out_cache = out->cacheBefore();
2840
2841 out->split(1, block_x);
2842 out->split(0, block_y);
2843 out->reorder({{1, 2}, {2, 1}});
2844
2845 TransformPropagator propagator(out);
2846 MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
2847
2848 tv0->computeAt(out, 2);
2849
2850 tv0_cache->setMemoryType(MemoryType::Shared);
2851
2852 out->axis(0)->parallelize(ParallelType::BIDy);
2853 out->axis(1)->parallelize(ParallelType::BIDx);
2854 out->axis(2)->parallelize(ParallelType::TIDy);
2855 out->axis(3)->parallelize(ParallelType::TIDx);
2856 scheduler_utils::parallelizeAllLike(out);
2857
2858 const int s1 = 101;
2859 const int s2 = 99;
2860
2861 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2862 std::vector<int64_t> size({s1, s2});
2863 at::Tensor t0 = at::randn(size, options);
2864 for (const auto i : c10::irange(size.size())) {
2865 size[i] = ceilDiv(
2866 size[i] - window_shape[i] + 1 + padding_width[i][0] +
2867 padding_width[i][1],
2868 strides[i]);
2869 }
2870 size.insert(size.end(), window_shape.begin(), window_shape.end());
2871 // Use a pre-allocated output tensor filled with 1 so that invalid
2872 // writes to outside valid ranges can be detected
2873 at::Tensor output = at::ones(size, options);
2874
2875 FusionExecutor fe;
2876 fe.compileFusion(&fusion, {t0});
2877 auto outputs = fe.runFusion({t0}, {output});
2878
2879 auto ref = gather(t0, window_shape, padding_width, strides);
2880
2881 TORCH_CHECK(ref.equal(outputs[0]));
2882}
2883
2884TEST_F(NVFuserTest, FusionConv2D_CUDA) {
2885 Fusion fusion;
2886 FusionGuard fg(&fusion);
2887
2888 // Input: [C, H, W]
2889 auto inp = makeSymbolicTensor(3);
2890 fusion.addInput(inp);
2891
2892 // Weights: [K, C, 3, 3]
2893 auto w = makeSymbolicTensor(4);
2894 fusion.addInput(w);
2895
2896 // Gather a neighbor tile of [3, 3] with padding size of 1 for each
2897 // side of the spatial dimensions
2898 auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}});
2899 // inp_tile: [C, H, W, 1, 3, 3]
2900
2901 auto inp_bc =
2902 broadcast(inp_tile, {true, false, false, false, false, false, false});
2903 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
2904
2905 auto inp_times_w = mul(inp_bc, w_bc);
2906
2907 // Reduce the channel and neighbor tile dimensions
2908 auto out = sum(inp_times_w, {1, 4, 5, 6});
2909
2910 fusion.addOutput(out);
2911
2912 ////////////////////////////////////
2913
2914 // Cache the input and weight tensors
2915 auto inp_cache = inp->cacheAfter();
2916
2917 // Blocking the spatial dimensions
2918 const int block_w = 16;
2919 const int block_h = 4;
2920 // Blocking the channel dimension
2921 const int block_c = 8;
2922
2923 out->split(2, block_h);
2924 out->split(4, block_w);
2925 out->reorder({{3, 4}});
2926 // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3]
2927
2928 out->split(1, block_c);
2929 // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
2930
2931 auto out_rf = out->rFactor({1, -3, -2, -1});
2932 // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
2933 // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
2934
2935 // Create a [block_x, block_y] tile on smem
2936 inp_cache->computeAt(out, 4);
2937 // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
2938 inp_cache->setMemoryType(MemoryType::Shared);
2939
2940 // Move Ci forward
2941 out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
2942 inp_cache->computeAt(out_rf, 5);
2943
2944 inp_tile->computeAt(out_rf, -1);
2945 w->computeAt(out_rf, -1);
2946
2947 out->axis(0)->parallelize(ParallelType::BIDx);
2948 out->axis(1)->parallelize(ParallelType::TIDz);
2949 out->axis(4)->parallelize(ParallelType::TIDy);
2950 out->axis(5)->parallelize(ParallelType::TIDx);
2951
2952 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
2953
2954 const int dim_h = 99;
2955 const int dim_w = 101;
2956 const int dim_c = 10;
2957 const int dim_f = 20;
2958
2959 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2960 at::manual_seed(0);
2961 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
2962 at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options);
2963 std::vector<IValue> inputs = {at_inp, at_w};
2964
2965 FusionExecutor fe;
2966 fe.compileFusion(&fusion, inputs);
2967 auto cg_outputs = fe.runFusion(inputs);
2968
2969 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
2970 auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1);
2971 at_out = at_out.squeeze(0); // drop the N axis
2972
2973 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
2974}
2975
2976TEST_F(NVFuserTest, FusionConv2DNoPadding_CUDA) {
2977 Fusion fusion;
2978 FusionGuard fg(&fusion);
2979 ContextCudnnTF32Disabled disabling_tf32_cudnn;
2980
2981 // Input: [C, H, W]
2982 auto inp = makeSymbolicTensor(3);
2983 fusion.addInput(inp);
2984
2985 // Weights: [K, C, 3, 3]
2986 auto w = makeSymbolicTensor(4);
2987 fusion.addInput(w);
2988
2989 // Gather a neighbor tile of [3, 3] with no padding
2990 auto inp_tile =
2991 gather(inp, {1, 3, 3}, {{0, 0}, {0, 0}, {0, 0}}, {1, 1, 1}, true);
2992 // inp_tile: [C, H-2, W-2, 1, 3, 3]
2993
2994 auto inp_bc =
2995 broadcast(inp_tile, {true, false, false, false, false, false, false});
2996 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
2997
2998 auto inp_times_w = mul(inp_bc, w_bc);
2999
3000 // Reduce the channel and neighbor tile dimensions
3001 auto out = sum(inp_times_w, {1, 4, 5, 6});
3002
3003 fusion.addOutput(out);
3004
3005 ////////////////////////////////////
3006
3007 // Cache the input and weight tensors
3008 auto inp_cache = inp->cacheAfter();
3009
3010 // Blocking the spatial dimensions
3011 const int block_w = 16;
3012 const int block_h = 4;
3013 // Blocking the channel dimension
3014 const int block_c = 8;
3015
3016 out->split(2, block_h);
3017 out->split(4, block_w);
3018 out->reorder({{3, 4}});
3019 // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3]
3020
3021 out->split(1, block_c);
3022 // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
3023
3024 auto out_rf = out->rFactor({1, -3, -2, -1});
3025 // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
3026 // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
3027
3028 // Create a [block_x, block_y] tile on smem
3029 inp_cache->computeAt(out, 4);
3030 // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
3031 inp_cache->setMemoryType(MemoryType::Shared);
3032
3033 // Move Ci forward
3034 out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
3035 inp_cache->computeAt(out_rf, 5);
3036
3037 inp_tile->computeAt(out_rf, -1);
3038 w->computeAt(out_rf, -1);
3039
3040 out->axis(0)->parallelize(ParallelType::BIDx);
3041 out->axis(1)->parallelize(ParallelType::TIDz);
3042 out->axis(4)->parallelize(ParallelType::TIDy);
3043 out->axis(5)->parallelize(ParallelType::TIDx);
3044
3045 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
3046
3047 const int dim_h = 99;
3048 const int dim_w = 101;
3049 const int dim_c = 10;
3050 const int dim_f = 20;
3051
3052 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3053 at::manual_seed(0);
3054 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
3055 at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options);
3056 std::vector<IValue> inputs = {at_inp, at_w};
3057
3058 FusionExecutor fe;
3059 fe.compileFusion(&fusion, inputs);
3060 auto cg_outputs = fe.runFusion(inputs);
3061
3062 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
3063 std::vector<int64_t> stride = {1, 1};
3064 std::vector<int64_t> padding = {0, 0};
3065 auto at_out = at::conv2d(at_inp, at_w, {}, stride, padding);
3066 at_out = at_out.squeeze(0); // drop the N axis
3067
3068 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
3069}
3070
3071TEST_F(NVFuserTest, FusionConv2DNoPaddingStrided_CUDA) {
3072 Fusion fusion;
3073 FusionGuard fg(&fusion);
3074
3075 // Input: [C, H, W]
3076 auto inp = makeSymbolicTensor(3);
3077 fusion.addInput(inp);
3078
3079 // Weights: [K, C, 3, 3]
3080 auto w = makeSymbolicTensor(4);
3081 fusion.addInput(w);
3082
3083 // Gather a neighbor tile of [2, 2] with no padding and strides of
3084 // [2, 2]
3085 auto inp_tile = gather(inp, {1, 2, 2}, {{0, 0}, {0, 0}, {0, 0}}, {1, 2, 2});
3086 // inp_tile: [C, H/2, W/2, 1, 2, 2]
3087
3088 auto inp_bc =
3089 broadcast(inp_tile, {true, false, false, false, false, false, false});
3090 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
3091
3092 auto inp_times_w = mul(inp_bc, w_bc);
3093
3094 // Reduce the channel and neighbor tile dimensions
3095 auto out = sum(inp_times_w, {1, 4, 5, 6});
3096
3097 fusion.addOutput(out);
3098
3099 ////////////////////////////////////
3100
3101 // Cache the input and weight tensors
3102 auto inp_cache = inp->cacheAfter();
3103
3104 // Blocking the spatial dimensions
3105 const int block_w = 16;
3106 const int block_h = 4;
3107 // Blocking the channel dimension
3108 const int block_c = 8;
3109
3110 out->split(2, block_h);
3111 out->split(4, block_w);
3112 out->reorder({{3, 4}});
3113 // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3]
3114
3115 out->split(1, block_c);
3116 // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
3117
3118 auto out_rf = out->rFactor({1, -3, -2, -1});
3119 // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
3120 // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
3121
3122 // Create a [block_x, block_y] tile on smem
3123 inp_cache->computeAt(out, 4);
3124 // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
3125 inp_cache->setMemoryType(MemoryType::Shared);
3126
3127 // Move Ci forward
3128 out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
3129 inp_cache->computeAt(out_rf, 5);
3130
3131 inp_tile->computeAt(out_rf, -1);
3132 w->computeAt(out_rf, -1);
3133
3134 out->axis(0)->parallelize(ParallelType::BIDx);
3135 out->axis(1)->parallelize(ParallelType::TIDz);
3136 out->axis(4)->parallelize(ParallelType::TIDy);
3137 out->axis(5)->parallelize(ParallelType::TIDx);
3138
3139 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
3140
3141 const int dim_h = 99;
3142 const int dim_w = 101;
3143 const int dim_c = 10;
3144 const int dim_f = 20;
3145
3146 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3147 at::manual_seed(0);
3148 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
3149 at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options);
3150 std::vector<IValue> inputs = {at_inp, at_w};
3151
3152 FusionExecutor fe;
3153 fe.compileFusion(&fusion, inputs);
3154 auto cg_outputs = fe.runFusion(inputs);
3155
3156 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
3157 std::vector<int64_t> stride = {2, 2};
3158 std::vector<int64_t> padding = {0, 0};
3159 auto at_out = at::conv2d(at_inp, at_w, {}, stride, padding);
3160 at_out = at_out.squeeze(0); // drop the N axis
3161
3162 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
3163}
3164
3165// 5x5 followed by 3x3
3166TEST_F(NVFuserTest, FusionConv2DChain_CUDA) {
3167 const int dim_w1_h = 5;
3168 const int dim_w1_w = 5;
3169 const int dim_pad1_h = (dim_w1_h - 1) / 2;
3170 const int dim_pad1_w = (dim_w1_w - 1) / 2;
3171 const int dim_w2_h = 3;
3172 const int dim_w2_w = 3;
3173 const int dim_pad2_h = (dim_w2_h - 1) / 2;
3174 const int dim_pad2_w = (dim_w2_w - 1) / 2;
3175
3176 Fusion fusion;
3177 FusionGuard fg(&fusion);
3178
3179 // Input: [K1, H, W]
3180 auto inp = makeSymbolicTensor(3);
3181 fusion.addInput(inp);
3182
3183 // Weights: [K2, K1, S1, T1]
3184 auto w1 = makeSymbolicTensor(4);
3185 fusion.addInput(w1);
3186
3187 // Weights: [K3, K2, S2, T2]
3188 auto w2 = makeSymbolicTensor(4);
3189 fusion.addInput(w2);
3190
3191 // Gather a neighbor tile of [w1_h, w1_w] with padding
3192 auto inp_tile = gather(
3193 inp,
3194 {1, dim_w1_h, dim_w1_w},
3195 {{0, 0}, {dim_pad1_h, dim_pad1_h}, {dim_pad1_w, dim_pad1_w}});
3196 // inp_tile: [C, 1, H - w1_h + 1, W - w1_w + 1, w1_h, w1_w]
3197
3198 auto inp_bc =
3199 broadcast(inp_tile, {true, false, false, false, false, false, false});
3200 auto w1_bc = broadcast(w1, {false, false, true, true, true, false, false});
3201
3202 auto inp_times_w1 = mul(inp_bc, w1_bc);
3203
3204 // Reduce the channel and neighbor tile dimensions
3205 auto out1 = sum(inp_times_w1, {1, 4, 5, 6});
3206
3207 // Second conv
3208 auto out1_tile = gather(
3209 out1,
3210 {1, dim_w2_h, dim_w2_w},
3211 {{0, 0}, {dim_pad2_h, dim_pad2_h}, {dim_pad2_w, dim_pad2_w}});
3212
3213 auto out1_bc =
3214 broadcast(out1_tile, {true, false, false, false, false, false, false});
3215 auto w2_bc = broadcast(w2, {false, false, true, true, true, false, false});
3216
3217 auto out1_times_w2 = mul(out1_bc, w2_bc);
3218
3219 auto out2 = sum(out1_times_w2, {1, 4, 5, 6});
3220
3221 fusion.addOutput(out2);
3222
3223 ////////////////////////////////////
3224 // Cache the input and weight tensors
3225 auto inp_cache = inp->cacheAfter();
3226
3227 // Blocking the spatial dimensions
3228 const int block_w = 16;
3229 const int block_h = 4;
3230
3231 out2->split(2, block_h);
3232 out2->split(4, block_w);
3233 out2->reorder({{3, 4}});
3234 // out2: [K3, K2, Ho, Wo, Hi, Wi, 1, 3, 3]
3235
3236 // Create a [block_x, block_y] tile on smem
3237 inp_cache->computeAt(out2, 4);
3238 // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
3239 inp_cache->setMemoryType(MemoryType::Shared);
3240
3241 // Move Ci forward
3242 out1->reorder({{5, 3}, {3, 4}, {4, 5}});
3243 out1->setMemoryType(MemoryType::Shared);
3244
3245 inp_cache->computeAt(out1, 4);
3246
3247 inp_tile->computeAt(out1, -1);
3248 w1->computeAt(out1, -1);
3249
3250 out1_tile->computeAt(out2, -1);
3251 w2->computeAt(out2, -1);
3252
3253 out2->axis(0)->parallelize(ParallelType::BIDx);
3254 out2->axis(4)->parallelize(ParallelType::TIDy);
3255 out2->axis(5)->parallelize(ParallelType::TIDx);
3256
3257 scheduler_utils::parallelizeAllLike(out2, {inp_cache, out1});
3258
3259 const int dim_h = 99;
3260 const int dim_w = 101;
3261 const int dim_k1 = 3;
3262 const int dim_k2 = 5;
3263 const int dim_k3 = 7;
3264
3265 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3266 at::manual_seed(0);
3267 at::Tensor at_inp = at::randn({dim_k1, dim_h, dim_w}, options);
3268 at::Tensor at_w1 = at::randn({dim_k2, dim_k1, dim_w1_h, dim_w1_w}, options);
3269 at::Tensor at_w2 = at::randn({dim_k3, dim_k2, dim_w2_h, dim_w2_w}, options);
3270 std::vector<IValue> inputs = {at_inp, at_w1, at_w2};
3271
3272 FusionExecutor fe;
3273 fe.compileFusion(&fusion, inputs);
3274 auto cg_outputs = fe.runFusion(inputs);
3275
3276 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
3277 auto at_out1 = at::conv2d(at_inp, at_w1, {}, 1, 2);
3278 auto at_out2 = at::conv2d(at_out1, at_w2, {}, 1, 1);
3279 at_out2 = at_out2.squeeze(0); // drop the N axis
3280
3281 testValidate(&fusion, cg_outputs, inputs, {at_out2}, __LINE__, __FILE__);
3282}
3283
3284TEST_F(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) {
3285 Fusion fusion;
3286 FusionGuard fg(&fusion);
3287
3288 // Input: [C, H, W]
3289 auto inp = makeSymbolicTensor(3);
3290 fusion.addInput(inp);
3291
3292 // Weights: [K, C, 2, 2]
3293 auto w = makeSymbolicTensor(4);
3294 fusion.addInput(w);
3295
3296 // Gather a neighbor tile of [2, 2] with padding size of 1 only for
3297 // the right side of the spatial dimensions. The left padding is
3298 // zero so that the output axis stays the same.
3299 auto inp_tile = gather(inp, {1, 2, 2}, {{0, 0}, {0, 1}, {0, 1}});
3300 // inp_tile: [C, H, W, 1, 2, 2]
3301
3302 auto inp_bc =
3303 broadcast(inp_tile, {true, false, false, false, false, false, false});
3304 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
3305
3306 auto inp_times_w = mul(inp_bc, w_bc);
3307
3308 // Reduce the channel and neighbor tile dimensions
3309 auto out = sum(inp_times_w, {1, 4, 5, 6});
3310
3311 fusion.addOutput(out);
3312
3313 ////////////////////////////////////
3314
3315 // Cache the input and weight tensors
3316 auto inp_cache = inp->cacheAfter();
3317
3318 // Blocking the spatial dimensions
3319 const int block_w = 16;
3320 const int block_h = 4;
3321 // Blocking the channel dimension
3322 const int block_c = 8;
3323
3324 out->split(2, block_h);
3325 out->split(4, block_w);
3326 out->reorder({{3, 4}});
3327 // out: [K, C, Ho, Wo, Hi, Wi, 1, 2, 2]
3328
3329 out->split(1, block_c);
3330 // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 2, 2]
3331
3332 auto out_rf = out->rFactor({1, -3, -2, -1});
3333 // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 2, 2]
3334 // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
3335
3336 // Create a [block_x, block_y] tile on smem
3337 inp_cache->computeAt(out, 4);
3338 // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
3339 inp_cache->setMemoryType(MemoryType::Shared);
3340
3341 // Move Ci forward
3342 out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
3343 inp_cache->computeAt(out_rf, 5);
3344
3345 inp_tile->computeAt(out_rf, -1);
3346 w->computeAt(out_rf, -1);
3347
3348 out->axis(0)->parallelize(ParallelType::BIDx);
3349 out->axis(1)->parallelize(ParallelType::TIDz);
3350 out->axis(4)->parallelize(ParallelType::TIDy);
3351 out->axis(5)->parallelize(ParallelType::TIDx);
3352
3353 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
3354
3355 const int dim_h = 99;
3356 const int dim_w = 101;
3357 const int dim_c = 10;
3358 const int dim_f = 20;
3359
3360 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3361 at::manual_seed(0);
3362 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
3363 at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options);
3364 std::vector<IValue> inputs = {at_inp, at_w};
3365
3366 FusionExecutor fe;
3367 fe.compileFusion(&fusion, inputs);
3368 auto cg_outputs = fe.runFusion(inputs);
3369
3370 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
3371 auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1);
3372 at_out = at_out.squeeze(0); // drop the N axis
3373 // The shape of the spatial domain is (dim_h+1)x(dim_w+1), whereas
3374 // the fuser output has dim_h*dim_w. Drop the first elements to make
3375 // it match with the fuser output.
3376 std::vector<at::indexing::TensorIndex> indices{
3377 at::indexing::Slice(0, at::indexing::None),
3378 at::indexing::Slice(1, at::indexing::None),
3379 at::indexing::Slice(1, at::indexing::None)};
3380 at_out = at_out.index(indices);
3381
3382 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
3383}
3384
3385TEST_F(NVFuserTest, FusionConv4x4Pad1x1_CUDA) {
3386 Fusion fusion;
3387 FusionGuard fg(&fusion);
3388
3389 // Input: [C, H, W]
3390 auto inp = makeSymbolicTensor(3);
3391 fusion.addInput(inp);
3392
3393 // Weights: [K, C, 4, 4]
3394 auto w = makeSymbolicTensor(4);
3395 fusion.addInput(w);
3396
3397 // Gather a neighbor tile of [4, 4] with padding size of 1 for both
3398 // sides of the spatial dimensions. The resulting extent is
3399 // decreased by one.
3400 auto inp_tile =
3401 gather(inp, {1, 4, 4}, {{0, 0}, {1, 1}, {1, 1}}, {1, 1, 1}, true);
3402 // inp_tile: [C, H-1, W-1, 1, 4, 4]
3403
3404 auto inp_bc =
3405 broadcast(inp_tile, {true, false, false, false, false, false, false});
3406 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
3407
3408 auto inp_times_w = mul(inp_bc, w_bc);
3409
3410 // Reduce the channel and neighbor tile dimensions
3411 auto out = sum(inp_times_w, {1, 4, 5, 6});
3412
3413 fusion.addOutput(out);
3414
3415 ////////////////////////////////////
3416
3417 // Cache the input and weight tensors
3418 auto inp_cache = inp->cacheAfter();
3419
3420 // Blocking the spatial dimensions
3421 const int block_w = 16;
3422 const int block_h = 4;
3423 // Blocking the channel dimension
3424 const int block_c = 8;
3425
3426 out->split(2, block_h);
3427 out->split(4, block_w);
3428 out->reorder({{3, 4}});
3429 // out: [K, C, Ho, Wo, Hi, Wi, 1, 4, 4]
3430
3431 out->split(1, block_c);
3432 // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 4, 4]
3433
3434 auto out_rf = out->rFactor({1, -3, -2, -1});
3435 // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 4, 4]
3436 // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
3437
3438 // Create a [block_x, block_y] tile on smem
3439 inp_cache->computeAt(out, 4);
3440 // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
3441 inp_cache->setMemoryType(MemoryType::Shared);
3442
3443 // Move Ci forward
3444 out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
3445 inp_cache->computeAt(out_rf, 5);
3446
3447 inp_tile->computeAt(out_rf, -1);
3448 w->computeAt(out_rf, -1);
3449
3450 out->axis(0)->parallelize(ParallelType::BIDx);
3451 out->axis(1)->parallelize(ParallelType::TIDz);
3452 out->axis(4)->parallelize(ParallelType::TIDy);
3453 out->axis(5)->parallelize(ParallelType::TIDx);
3454
3455 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
3456
3457 const int dim_h = 99;
3458 const int dim_w = 101;
3459 const int dim_c = 10;
3460 const int dim_f = 20;
3461
3462 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3463 at::manual_seed(0);
3464 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
3465 at::Tensor at_w = at::randn({dim_f, dim_c, 4, 4}, options);
3466 std::vector<IValue> inputs = {at_inp, at_w};
3467
3468 FusionExecutor fe;
3469 fe.compileFusion(&fusion, inputs);
3470 auto cg_outputs = fe.runFusion(inputs);
3471
3472 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
3473 auto at_out =
3474 at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 1, 1);
3475 at_out = at_out.squeeze(0); // drop the N axis
3476
3477 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
3478}
3479
3480TEST_F(NVFuserTest, FusionConv4x5Pad1x2_CUDA) {
3481 Fusion fusion;
3482 FusionGuard fg(&fusion);
3483
3484 // Input: [C, H, W]
3485 auto inp = makeSymbolicTensor(3);
3486 fusion.addInput(inp);
3487
3488 // Weights: [K, C, 4, 4]
3489 auto w = makeSymbolicTensor(4);
3490 fusion.addInput(w);
3491
3492 // Gather a neighbor tile of [4, 5] with padding size of 1 and 2 for
3493 // each side of the spatial dimensions.
3494 auto inp_tile =
3495 gather(inp, {1, 4, 5}, {{0, 0}, {1, 1}, {2, 2}}, {1, 1, 1}, true);
3496 // inp_tile: [C, H-1, W, 1, 4, 5]
3497
3498 auto inp_bc =
3499 broadcast(inp_tile, {true, false, false, false, false, false, false});
3500 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
3501
3502 auto inp_times_w = mul(inp_bc, w_bc);
3503
3504 // Reduce the channel and neighbor tile dimensions
3505 auto out = sum(inp_times_w, {1, 4, 5, 6});
3506
3507 fusion.addOutput(out);
3508
3509 ////////////////////////////////////
3510
3511 // Cache the input and weight tensors
3512 auto inp_cache = inp->cacheAfter();
3513
3514 // Blocking the spatial dimensions
3515 const int block_w = 16;
3516 const int block_h = 4;
3517 // Blocking the channel dimension
3518 const int block_c = 8;
3519
3520 out->split(2, block_h);
3521 out->split(4, block_w);
3522 out->reorder({{3, 4}});
3523 // out: [K, C, Ho, Wo, Hi, Wi, 1, 4, 5]
3524
3525 out->split(1, block_c);
3526 // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 4, 5]
3527
3528 auto out_rf = out->rFactor({1, -3, -2, -1});
3529 // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 4, 5]
3530 // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
3531
3532 // Create a [block_x, block_y] tile on smem
3533 inp_cache->computeAt(out, 4);
3534 // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
3535 inp_cache->setMemoryType(MemoryType::Shared);
3536
3537 // Move Ci forward
3538 out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
3539 inp_cache->computeAt(out_rf, 5);
3540
3541 inp_tile->computeAt(out_rf, -1);
3542 w->computeAt(out_rf, -1);
3543
3544 out->axis(0)->parallelize(ParallelType::BIDx);
3545 out->axis(1)->parallelize(ParallelType::TIDz);
3546 out->axis(4)->parallelize(ParallelType::TIDy);
3547 out->axis(5)->parallelize(ParallelType::TIDx);
3548
3549 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
3550
3551 const int dim_h = 99;
3552 const int dim_w = 101;
3553 const int dim_c = 10;
3554 const int dim_f = 20;
3555
3556 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3557 at::manual_seed(0);
3558 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
3559 at::Tensor at_w = at::randn({dim_f, dim_c, 4, 5}, options);
3560 std::vector<IValue> inputs = {at_inp, at_w};
3561
3562 FusionExecutor fe;
3563 fe.compileFusion(&fusion, inputs);
3564 auto cg_outputs = fe.runFusion(inputs);
3565
3566 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
3567 auto at_out =
3568 at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 1, {1, 2});
3569 at_out = at_out.squeeze(0); // drop the N axis
3570
3571 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
3572}
3573
3574TEST_F(NVFuserTest, FusionConv4x4Pad1x1Stride4_CUDA) {
3575 Fusion fusion;
3576 FusionGuard fg(&fusion);
3577
3578 // Input: [C, H, W]
3579 auto inp = makeSymbolicTensor(3);
3580 fusion.addInput(inp);
3581
3582 // Weights: [K, C, 3, 3]
3583 auto w = makeSymbolicTensor(4);
3584 fusion.addInput(w);
3585
3586 // Gather a neighbor tile of [4, 4] with padding size of 1 for both
3587 // sides of the spatial dimensions. Set the stride width as 4.
3588 auto inp_tile = gather(inp, {1, 4, 4}, {{0, 0}, {1, 1}, {1, 1}}, {1, 4, 4});
3589 // inp_tile: [C, H/4, s4, W/4, s4, 1, 4, 4]
3590
3591 auto inp_bc =
3592 broadcast(inp_tile, {true, false, false, false, false, false, false});
3593 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
3594
3595 auto inp_times_w = mul(inp_bc, w_bc);
3596
3597 // Reduce the channel and neighbor tile dimensions
3598 auto out = sum(inp_times_w, {1, 4, 5, 6});
3599
3600 fusion.addOutput(out);
3601
3602 ////////////////////////////////////
3603
3604 // Cache the input and weight tensors
3605 auto inp_cache = inp->cacheAfter();
3606
3607 // Blocking the spatial dimensions
3608 const int block_w = 16;
3609 const int block_h = 4;
3610 const int block_c = 2;
3611
3612 // [K, C, H/s, W/s, 1, 4, 4]
3613 out->split(2, block_h);
3614 // [K, C, H/s/block_h, block_h, W/s, 1, 4, 4]
3615 out->split(4, block_w);
3616 // [K, C, H/s/block_h, block_h, W/s/block_w, block_w, 1, 4, 4]
3617 out->reorder({{3, 4}});
3618 // [K, C, H/s/block_h, W/s/block_w, block_h, block_w, 1, 4, 4]
3619 out->split(1, block_c);
3620 // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, block_h, block_w, 1, 4,
3621 // 4]
3622 out->split(4, 1);
3623 // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
3624 // 4, 4]
3625
3626 auto out_rf = out->rFactor({1, -3, -2, -1});
3627 // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
3628 // 4, 4]
3629
3630 // out: [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w]
3631
3632 inp_cache->computeAt(out, 5);
3633 inp_cache->setMemoryType(MemoryType::Shared);
3634 // [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, C/block_c, 1,
3635 // 4, 4]
3636
3637 // Move C/block_c before block_h/2 and share the domain from
3638 // inp_cache to out_rf
3639 out_rf->reorder({{7, 5}, {5, 6}, {6, 7}});
3640 inp_cache->computeAt(out_rf, 6);
3641
3642 inp_tile->computeAt(out_rf, -1);
3643 w->computeAt(out_rf, -1);
3644
3645 out->axis(0)->parallelize(ParallelType::BIDx);
3646 out->axis(1)->parallelize(ParallelType::TIDz);
3647 out->axis(4)->parallelize(ParallelType::Unswitch);
3648 out->axis(5)->parallelize(ParallelType::TIDy);
3649 out->axis(6)->parallelize(ParallelType::TIDx);
3650
3651 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
3652
3653 const int dim_h = 99;
3654 const int dim_w = 101;
3655 const int dim_c = 10;
3656 const int dim_f = 20;
3657
3658 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3659 at::manual_seed(0);
3660 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
3661 at::Tensor at_w = at::randn({dim_f, dim_c, 4, 4}, options);
3662 std::vector<IValue> inputs = {at_inp, at_w};
3663
3664 FusionExecutor fe;
3665 fe.compileFusion(&fusion, inputs);
3666 auto cg_outputs = fe.runFusion(inputs);
3667
3668 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
3669 auto at_out =
3670 at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 4, {1, 1});
3671 at_out = at_out.squeeze(0); // drop the N axis
3672
3673 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
3674}
3675
3676// POC implementation of im2col for 3-by-3 kernels
3677TEST_F(NVFuserTest, FusionIm2Col_CUDA) {
3678 Fusion fusion;
3679 FusionGuard fg(&fusion);
3680
3681 // Input: [N, C, H, W]
3682 auto inp = makeSymbolicTensor(4);
3683 fusion.addInput(inp);
3684
3685 // Gather a neighbor tile of [3, 3] with padding size of 1 for each
3686 // side of the spatial dimensions
3687 auto inp_tile = gather(inp, {1, 1, 3, 3}, {{0, 0}, {0, 0}, {1, 1}, {1, 1}});
3688 // inp_tile: [N, C, H, W, 1, 1, 3, 3]
3689
3690 auto inp_col = permute(inp_tile, {0, 2, 3, 1, 4, 5, 6, 7});
3691 // inp_col: [N, H, W, C, 1, 1, 3, 3]
3692
3693 fusion.addOutput(inp_col);
3694
3695 ////////////////////////////////////
3696
3697 // Cache the input tensor
3698 auto inp_cache = inp->cacheAfter();
3699
3700 // Blocking the spatial dimensions
3701 const int block_w = 16;
3702 const int block_h = 4;
3703
3704 auto out = inp_col;
3705
3706 out->split(1, block_h);
3707 out->split(3, block_w);
3708 out->reorder({{2, 3}});
3709 // out: [N, Ho, Wo, Hi, Wi, C, 1, 1, 3, 3]
3710 // Move the C axis out of Hi*Wi
3711 out->reorder({{5, 3}, {3, 4}, {4, 5}});
3712 // out: [N, Ho, Wo, C, Hi, Wi, 1, 1, 3, 3]
3713
3714 // Create a [block_x, block_y] tile on smem
3715 inp_cache->computeAt(out, 4);
3716 inp_cache->setMemoryType(MemoryType::Shared);
3717 // Fully inline inp_tile
3718 inp_tile->computeAt(out, -1);
3719
3720 out->axis(0)->parallelize(ParallelType::BIDz);
3721 out->axis(1)->parallelize(ParallelType::BIDy);
3722 out->axis(2)->parallelize(ParallelType::BIDx);
3723 out->axis(4)->parallelize(ParallelType::TIDy);
3724 out->axis(5)->parallelize(ParallelType::TIDx);
3725
3726 scheduler_utils::parallelizeAllLike(out, {inp_cache, inp_tile});
3727
3728 const int dim_h = 31;
3729 const int dim_w = 33;
3730 const int dim_c = 5;
3731 const int dim_n = 3;
3732
3733 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3734 at::manual_seed(0);
3735 at::Tensor at_inp = at::randn({dim_n, dim_c, dim_h, dim_w}, options);
3736 std::vector<IValue> inputs = {at_inp};
3737
3738 FusionExecutor fe;
3739 fe.compileFusion(&fusion, inputs);
3740 auto cg_outputs = fe.runFusion(inputs);
3741
3742 auto at_out = at::im2col(at_inp, {3, 3}, {1, 1}, {1, 1}, {1, 1});
3743
3744 // at::im2col outputs [N, C*3*3, N*H]
3745 at_out = at::transpose(at_out, 1, 2);
3746 at_out = at::reshape(at_out, {dim_n, dim_h, dim_w, dim_c, 1, 1, 3, 3});
3747
3748 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
3749}
3750
3751TEST_F(NVFuserTest, FusionShiftNoPadding1_CUDA) {
3752 Fusion fusion;
3753 FusionGuard fg(&fusion);
3754
3755 auto tv0 = makeSymbolicTensor(2);
3756 fusion.addInput(tv0);
3757
3758 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3759 auto tv2 = shift(tv1, {1, -1}, false);
3760 auto tv3 = shift(tv1, {-1, 1}, false);
3761 auto tv4 = add(tv2, tv3);
3762 auto tv5 = sum(tv4, {0, 1});
3763
3764 fusion.addOutput(tv5);
3765
3766 tv1->setMemoryType(MemoryType::Shared);
3767
3768 tv5->split(0, 4);
3769 tv5->split(-1, 8);
3770 tv5->reorder({{1, 2}});
3771
3772 TransformPropagator propagator(tv5);
3773 MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator);
3774
3775 tv2->computeAt(tv5, -1);
3776 tv3->computeAt(tv5, -1);
3777
3778 tv5->axis(-1)->parallelize(ParallelType::TIDx);
3779 tv5->axis(-2)->parallelize(ParallelType::TIDy);
3780 scheduler_utils::parallelizeAllLike(tv5);
3781
3782 int numel_x = 99;
3783 int numel_y = 101;
3784
3785 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3786 at::manual_seed(0);
3787 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
3788 std::vector<IValue> inputs = {t0};
3789
3790 FusionExecutor fe;
3791 fe.compileFusion(&fusion, inputs);
3792 auto outputs = fe.runFusion(inputs);
3793
3794 auto t1 = t0 + 1;
3795 auto t2 = shift(t1, {1, -1});
3796 auto t3 = shift(t1, {-1, 1});
3797 auto t4 = t2 + t3;
3798 std::vector<at::indexing::TensorIndex> indices{
3799 at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
3800 t4 = t4.index(indices);
3801 auto ref = t4.sum(at::ArrayRef<int64_t>{0, 1});
3802
3803 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
3804}
3805
3806// Split and merge
3807TEST_F(NVFuserTest, FusionShiftNoPadding2_CUDA) {
3808 Fusion fusion;
3809 FusionGuard fg(&fusion);
3810
3811 auto tv0 = makeSymbolicTensor(2);
3812 fusion.addInput(tv0);
3813
3814 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3815 auto tv2 = shift(tv1, {1, -1}, false);
3816 auto tv3 = shift(tv1, {-1, 1}, false);
3817 auto tv4 = add(tv2, tv3);
3818 auto tv5 = sum(tv4, {0, 1});
3819
3820 fusion.addOutput(tv5);
3821
3822 tv1->setMemoryType(MemoryType::Shared);
3823
3824 tv5->split(0, 4);
3825 tv5->split(-1, 8);
3826 tv5->reorder({{1, 2}});
3827 tv5->merge(-2, -1);
3828
3829 TransformPropagator propagator(tv5);
3830 MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator);
3831
3832 tv2->computeAt(tv5, -1);
3833 tv3->computeAt(tv5, -1);
3834
3835 tv5->axis(-1)->parallelize(ParallelType::TIDx);
3836 scheduler_utils::parallelizeAllLike(tv5);
3837
3838 int numel_x = 99;
3839 int numel_y = 101;
3840
3841 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3842 at::manual_seed(0);
3843 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
3844 std::vector<IValue> inputs = {t0};
3845
3846 FusionExecutor fe;
3847 fe.compileFusion(&fusion, inputs);
3848 auto outputs = fe.runFusion(inputs);
3849
3850 auto t1 = t0 + 1;
3851 auto t2 = shift(t1, {1, -1});
3852 auto t3 = shift(t1, {-1, 1});
3853 auto t4 = t2 + t3;
3854 std::vector<at::indexing::TensorIndex> indices{
3855 at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
3856 t4 = t4.index(indices);
3857 auto ref = t4.sum(at::ArrayRef<int64_t>{0, 1});
3858
3859 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
3860}
3861
3862// Split and merge, then welford
3863TEST_F(NVFuserTest, FusionShiftNoPadding3_CUDA) {
3864 Fusion fusion;
3865 FusionGuard fg(&fusion);
3866
3867 auto tv0 = makeSymbolicTensor(2);
3868 fusion.addInput(tv0);
3869
3870 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3871 auto tv2 = shift(tv1, {1, -1}, false);
3872 auto tv3 = shift(tv1, {-1, 1}, false);
3873 auto tv4 = add(tv2, tv3);
3874 auto tvs = Welford(tv4, {0, 1});
3875 auto tv_avg = tvs.avg;
3876 auto tv_M2 = tvs.var_sum;
3877 auto tv_N = tvs.n;
3878
3879 fusion.addOutput(tv_avg);
3880 fusion.addOutput(tv_M2);
3881 fusion.addOutput(tv_N);
3882
3883 tv1->setMemoryType(MemoryType::Shared);
3884
3885 tv_avg->split(0, 4);
3886 tv_avg->split(-1, 8);
3887 tv_avg->reorder({{1, 2}});
3888 tv_avg->merge(-2, -1);
3889
3890 TransformPropagator propagator(tv_avg);
3891 MaxRootDomainInfoSpanningTree(tv_avg).traverse(&propagator);
3892
3893 tv2->computeAt(tv_avg, -1);
3894 tv3->computeAt(tv_avg, -1);
3895
3896 tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
3897 scheduler_utils::parallelizeAllLike(tv_avg);
3898
3899 int numel_x = 99;
3900 int numel_y = 101;
3901
3902 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3903 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
3904 at::manual_seed(0);
3905 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
3906 std::vector<IValue> inputs = {t0};
3907
3908 FusionExecutor fe;
3909 fe.compileFusion(&fusion, inputs);
3910 auto outputs = fe.runFusion(inputs);
3911
3912 outputs[1] /= (numel_x - 2) * (numel_y - 2);
3913
3914 auto t1 = t0 + 1;
3915 auto t2 = shift(t1, {1, -1});
3916 auto t3 = shift(t1, {-1, 1});
3917 auto t4 = t2 + t3;
3918 std::vector<at::indexing::TensorIndex> indices{
3919 at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
3920 t4 = t4.index(indices);
3921 auto ref_avg = t4.mean(at::ArrayRef<int64_t>{0, 1});
3922 auto ref_M2 = t4.var(at::ArrayRef<int64_t>{0, 1}, false);
3923 auto ref_N = at::ones({}, options_int) * (numel_x - 2) * (numel_y - 2);
3924
3925 testValidate(
3926 fe.kernel(),
3927 outputs,
3928 inputs,
3929 {ref_avg, ref_M2, ref_N},
3930 __LINE__,
3931 __FILE__);
3932}
3933
3934// Shift indexing and predication with contiguous merge
3935TEST_F(NVFuserTest, FusionShiftNoPaddingContigMerge_CUDA) {
3936 Fusion fusion;
3937 FusionGuard fg(&fusion);
3938
3939 auto tv0 = makeSymbolicTensor(2);
3940 fusion.addInput(tv0);
3941 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3942 auto tv2 = shift(tv1, {1, -1}, true);
3943 auto tv3 = shift(tv1, {-1, 1}, false);
3944 auto tv4 = add(tv2, tv3);
3945 fusion.addOutput(tv4);
3946
3947 tv2->merge(0);
3948 tv3->merge(0);
3949 tv4->merge(0);
3950
3951 tv1->setMemoryType(MemoryType::Global);
3952 tv2->setMemoryType(MemoryType::Global);
3953 tv3->setMemoryType(MemoryType::Global);
3954
3955 int numel_x = 9;
3956 int numel_y = 11;
3957
3958 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3959 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
3960 std::vector<IValue> inputs = {t0};
3961
3962 FusionExecutor fe;
3963 fe.compileFusion(&fusion, inputs);
3964 auto outputs = fe.runFusion(inputs);
3965
3966 std::vector<at::indexing::TensorIndex> indices{
3967 at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
3968
3969 auto fuser_out = outputs[0].index(indices);
3970
3971 auto t1 = t0 + 1;
3972 auto t2 = shift(t1, {1, -1});
3973 auto t3 = shift(t1, {-1, 1});
3974 auto ref = t2 + t3;
3975
3976 ref = ref.index(indices);
3977
3978 testValidate(&fusion, {fuser_out}, inputs, {ref}, __LINE__, __FILE__);
3979}
3980
3981TEST_F(NVFuserTest, FusionShiftNoPaddingChain_CUDA) {
3982 Fusion fusion;
3983 FusionGuard fg(&fusion);
3984
3985 auto tv0 = makeSymbolicTensor(2);
3986 fusion.addInput(tv0);
3987
3988 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3989 auto tv2 = shift(tv1, {1, -1}, false);
3990 auto tv3 = shift(tv2, {1, -1}, false);
3991 auto tv4 = sum(tv3, {0, 1});
3992 fusion.addOutput(tv4);
3993
3994 tv1->setMemoryType(MemoryType::Shared);
3995 tv2->setMemoryType(MemoryType::Shared);
3996
3997 tv4->split(0, 4);
3998 tv4->split(-1, 8);
3999 tv4->reorder({{1, 2}});
4000
4001 tv1->computeAt(tv4, 2);
4002
4003 tv4->axis(-1)->parallelize(ParallelType::TIDx);
4004 tv4->axis(-2)->parallelize(ParallelType::TIDy);
4005
4006 tv4->axis(0)->parallelize(ParallelType::BIDy);
4007 tv4->axis(1)->parallelize(ParallelType::BIDx);
4008
4009 scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});
4010
4011 int numel_x = 99;
4012 int numel_y = 101;
4013
4014 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4015 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
4016 at::manual_seed(0);
4017 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
4018 std::vector<IValue> inputs = {t0};
4019
4020 FusionExecutor fe;
4021 fe.compileFusion(&fusion, inputs);
4022 auto outputs = fe.runFusion(inputs);
4023
4024 auto t1 = t0 + 1;
4025 auto t2 = shift(t1, {1, -1});
4026 auto t3 = shift(t2, {1, -1});
4027 std::vector<at::indexing::TensorIndex> indices{
4028 at::indexing::Slice(2, at::indexing::None), at::indexing::Slice(0, -2)};
4029 t3 = t3.index(indices);
4030 auto ref = t3.sum(at::ArrayRef<int64_t>{0, 1});
4031
4032 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4033}
4034
4035// Rfactor is not allowed with partial domains
4036TEST_F(NVFuserTest, FusionShiftNoPaddingRfactor_CUDA) {
4037 Fusion fusion;
4038 FusionGuard fg(&fusion);
4039
4040 auto tv0 = makeSymbolicTensor(2);
4041 fusion.addInput(tv0);
4042
4043 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4044 auto tv2 = shift(tv1, {1, -1}, false);
4045 auto tv3 = sum(tv2, {0, 1});
4046 fusion.addOutput(tv3);
4047
4048 tv3->split(0, 4);
4049 tv3->split(-1, 8);
4050 tv3->reorder({{1, 2}});
4051
4052 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4053 ASSERT_ANY_THROW(tv3->rFactor({-2}));
4054}
4055
4056TEST_F(NVFuserTest, FusionShiftPadding1_CUDA) {
4057 Fusion fusion;
4058 FusionGuard fg(&fusion);
4059
4060 auto tv0 = makeSymbolicTensor(2);
4061 fusion.addInput(tv0);
4062
4063 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4064 auto tv2 = shift(tv1, {2, -2}, {1, 1});
4065 auto tv3 = shift(tv1, {-3, 2}, {2, 2});
4066 auto tv4 = add(tv2, tv3);
4067 auto tv5 = sum(tv4, {0, 1});
4068
4069 fusion.addOutput(tv5);
4070
4071 tv1->setMemoryType(MemoryType::Shared);
4072
4073 tv5->split(0, 4);
4074 tv5->split(-1, 8);
4075 tv5->reorder({{1, 2}});
4076
4077 TransformPropagator propagator(tv5);
4078 MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator);
4079
4080 tv2->computeAt(tv5, -1);
4081 tv3->computeAt(tv5, -1);
4082
4083 tv5->axis(-1)->parallelize(ParallelType::TIDx);
4084 tv5->axis(-2)->parallelize(ParallelType::TIDy);
4085 scheduler_utils::parallelizeAllLike(tv5);
4086
4087 int numel_x = 99;
4088 int numel_y = 101;
4089
4090 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4091 at::manual_seed(0);
4092 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
4093 std::vector<IValue> inputs = {t0};
4094
4095 FusionExecutor fe;
4096 fe.compileFusion(&fusion, inputs);
4097 auto outputs = fe.runFusion(inputs);
4098
4099 auto t1 = t0 + 1;
4100 auto t2 = shift(t1, {2, -2});
4101 auto t3 = shift(t1, {-3, 2});
4102 auto t4 = t2 + t3;
4103 std::vector<at::indexing::TensorIndex> indices{
4104 at::indexing::Slice(1, -1), at::indexing::Slice(0, -1)};
4105 t4 = t4.index(indices);
4106 auto ref = t4.sum(at::ArrayRef<int64_t>{0, 1});
4107
4108 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4109}
4110
4111TEST_F(NVFuserTest, FusionPartialSplit1_CUDA) {
4112 Fusion fusion;
4113 FusionGuard fg(&fusion);
4114
4115 auto tv0 = makeSymbolicTensor(1);
4116 // [I]
4117 fusion.addInput(tv0);
4118
4119 auto tv1 = add(tv0, IrBuilder::create<Double>(0));
4120 // [I]
4121 auto tv2 = shift(tv1, {1}, false);
4122 // [1:I]
4123 auto tv3 = shift(tv1, {-1}, false);
4124 // [0:I-1]
4125 auto tv4 = add(tv2, tv3);
4126 // [1:I-1]
4127 fusion.addOutput(tv4);
4128
4129 // Partial split of tv4. Split only the valid range, which is
4130 // [1:-1].
4131 tv4->split(0, 8, true, true);
4132 // [(I-2)/8, 8]
4133
4134 // Propagates the partial split back to tv1. This means that all of
4135 // the other tensors are also shaped as [(I-2)/8, 8], which appears
4136 // to mean only the sub region of ((I-2)/8 * 8) is
4137 // computed for tv1, tv2 and tv3. It's fine for the tv2 and tv3
4138 // tensors as only that sub region is used by tv4. It's also fine
4139 // for tv1 since it has halo of size one at each side, so the whole
4140 // region is actually calculated for tv1.
4141 tv1->computeAt(tv4, 1);
4142
4143 tv4->axis(-1)->parallelize(ParallelType::TIDx);
4144 tv4->axis(-2)->parallelize(ParallelType::BIDx);
4145 scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});
4146
4147 tv1->setMemoryType(MemoryType::Shared);
4148
4149 // gridDim.x is ceilDiv(numel_x - 2, 8), not ceilDiv(numel_x, 8),
4150 // so it's going to be just 2 rather than 3.
4151 const int numel_x = 18;
4152
4153 ExpressionEvaluator evaluator(&fusion);
4154 auto root_extent = tv4->getRootDomain()[0]->extent();
4155 evaluator.bind(root_extent, numel_x);
4156 auto extent_eval = evaluator.evaluate(tv4->axis(0)->extent());
4157 TORCH_CHECK(
4158 extent_eval.has_value(),
4159 "Invalid evaluation of outer domain extent of partial split");
4160 TORCH_CHECK(
4161 extent_eval.value() == (numel_x - 2) / 8,
4162 "Invalid extent of outer domain of partial split");
4163
4164 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4165 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
4166 at::manual_seed(0);
4167 at::Tensor t0 = at::randn({numel_x}, options);
4168 std::vector<IValue> inputs = {t0};
4169
4170 FusionExecutor fe;
4171 fe.compileFusion(&fusion, inputs);
4172 auto outputs = fe.runFusion(inputs);
4173
4174 std::vector<at::indexing::TensorIndex> indices{at::indexing::Slice(1, -1)};
4175
4176 outputs[0] = outputs[0].index(indices);
4177
4178 auto ref = (shift(t0, {1}) + shift(t0, {-1})).index(indices);
4179
4180 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4181}
4182
4183TEST_F(NVFuserTest, FusionPartialSplit2_CUDA) {
4184 Fusion fusion;
4185 FusionGuard fg(&fusion);
4186
4187 auto tv0 = makeSymbolicTensor(1);
4188 fusion.addInput(tv0);
4189
4190 auto tv1 = add(tv0, IrBuilder::create<Double>(0));
4191 auto tv2 = shift(tv1, {1}, false);
4192 auto tv3 = shift(tv1, {-1}, false);
4193 auto tv4 = add(tv2, tv3);
4194 fusion.addOutput(tv4);
4195
4196 auto tv5 = add(tv1, IrBuilder::create<Double>(1));
4197 auto tv6 = add(tv5, IrBuilder::create<Double>(1));
4198 fusion.addOutput(tv6);
4199
4200 tv4->split(0, 4, true, true);
4201
4202 // This causes tv5 and tv6 also to be split with the same partial
4203 // offsets, however, since they need to be calculated entirely, the
4204 // resulting code would be invalid. It should be detected as part of
4205 // initial fusion validation during lowering.
4206 tv1->computeAt(tv4, 1);
4207
4208 // Validation should throw an error due to tv5 and tv6.
4209 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4210 ASSERT_ANY_THROW(fusion.printKernel());
4211}
4212
4213// 2D version of PartialSplit1
4214TEST_F(NVFuserTest, FusionPartialSplit3_CUDA) {
4215 Fusion fusion;
4216 FusionGuard fg(&fusion);
4217
4218 auto tv0 = makeSymbolicTensor(2);
4219 fusion.addInput(tv0);
4220
4221 auto tv1 = add(tv0, IrBuilder::create<Double>(0));
4222 auto tv2 = shift(tv1, {1, 2}, false);
4223 auto tv3 = shift(tv1, {-2, -1}, false);
4224 auto tv4 = add(tv2, tv3);
4225 fusion.addOutput(tv4);
4226
4227 tv4->split(1, 8, true, true);
4228 tv4->split(0, 4, true, true);
4229 tv4->reorder({{1, 2}, {2, 1}});
4230
4231 tv1->computeAt(tv4, 2);
4232
4233 tv4->axis(0)->parallelize(ParallelType::BIDy);
4234 tv4->axis(1)->parallelize(ParallelType::BIDx);
4235 tv4->axis(2)->parallelize(ParallelType::TIDy);
4236 tv4->axis(3)->parallelize(ParallelType::TIDx);
4237 scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});
4238
4239 tv1->setMemoryType(MemoryType::Shared);
4240
4241 const int numel_x = 32 + 3;
4242 const int numel_y = 32 + 3;
4243
4244 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4245 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
4246 at::manual_seed(0);
4247 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
4248 std::vector<IValue> inputs = {t0};
4249
4250 FusionExecutor fe;
4251 fe.compileFusion(&fusion, inputs);
4252 auto outputs = fe.runFusion(inputs);
4253
4254 std::vector<at::indexing::TensorIndex> indices{
4255 at::indexing::Slice(1, -2), at::indexing::Slice(2, -1)};
4256
4257 outputs[0] = outputs[0].index(indices);
4258
4259 auto ref = (shift(t0, {1, 2}) + shift(t0, {-2, -1})).index(indices);
4260
4261 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4262}
4263
4264// Almost same fusion with Shift5ptStencilChain but non-padded shift
4265// and partial split.
4266TEST_F(NVFuserTest, FusionPartialSplit4_CUDA) {
4267 Fusion fusion;
4268 FusionGuard fg(&fusion);
4269
4270 auto tv0 = makeSymbolicTensor(2);
4271 fusion.addInput(tv0);
4272
4273 std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
4274
4275 // First stencil: 5pt stencil
4276 // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5
4277 std::vector<TensorView*> tv_stencil1_shifts;
4278 for (const auto& offset : offsets) {
4279 tv_stencil1_shifts.push_back(shift(tv0, offset, false));
4280 }
4281
4282 auto tv_stencil1 = tv0;
4283 for (auto tv : tv_stencil1_shifts) {
4284 tv_stencil1 = add(tv_stencil1, tv);
4285 }
4286
4287 tv_stencil1 = div(
4288 tv_stencil1, IrBuilder::create<Double>(tv_stencil1_shifts.size() + 1));
4289
4290 // Second stencil: Same 5pt stencil
4291 std::vector<TensorView*> tv_stencil2_shifts;
4292 for (const auto& offset : offsets) {
4293 tv_stencil2_shifts.push_back(shift(tv_stencil1, offset, false));
4294 }
4295
4296 auto tv_stencil2 = tv_stencil1;
4297 for (auto tv : tv_stencil2_shifts) {
4298 tv_stencil2 = add(tv_stencil2, tv);
4299 }
4300
4301 tv_stencil2 = div(
4302 tv_stencil2, IrBuilder::create<Double>(tv_stencil2_shifts.size() + 1));
4303
4304 auto tv_out = tv_stencil2;
4305
4306 fusion.addOutput(tv_out);
4307
4308 auto tv0_cache = tv0->cacheAfter();
4309
4310 std::vector<int> split_factor({16, 16});
4311
4312 tv_out->split(-1, split_factor[1], true, true);
4313 tv_out->split(0, split_factor[0], true, true);
4314 tv_out->reorder({{1, 2}, {2, 1}});
4315
4316 tv0->computeAt(tv_out, 2);
4317
4318 // Inline completely all inputs to the first stencil output, except for the
4319 // tv0 cache
4320 for (auto tv : tv_stencil1_shifts) {
4321 tv->computeAt(tv_stencil1, -1);
4322 }
4323
4324 // Inline completely all inputs to the second stencil output, except
4325 // for the first stencil output
4326 for (auto tv : tv_stencil2_shifts) {
4327 tv->computeAt(tv_stencil2, -1);
4328 }
4329
4330 tv_out->axis(0)->parallelize(ParallelType::BIDy);
4331 tv_out->axis(1)->parallelize(ParallelType::BIDx);
4332 tv_out->axis(2)->parallelize(ParallelType::TIDy);
4333 tv_out->axis(3)->parallelize(ParallelType::TIDx);
4334
4335 auto all_values = DependencyCheck::getAllValsBetween(
4336 {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs());
4337 for (auto tv : ir_utils::filterByType<TensorView>(all_values)) {
4338 scheduler_utils::parallelizeAllLike(tv_out, {tv});
4339 }
4340
4341 tv0_cache->setMemoryType(MemoryType::Shared);
4342 tv_stencil1->setMemoryType(MemoryType::Shared);
4343
4344 // Input matrix size is 68x68, and the output is 64x64. Both
4345 // gridDim.x and gridim.y should be ceilDiv(numel - 4,
4346 // split_factor), which is 4. If full split is used, the grid
4347 // dimension would be 5.
4348 const int numel_x = 64 + 4;
4349 const int numel_y = 64 + 4;
4350
4351 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4352 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
4353 std::vector<IValue> inputs = {t0};
4354
4355 FusionExecutor fe;
4356 fe.compileFusion(&fusion, inputs);
4357 auto outputs = fe.runFusion(inputs);
4358
4359 std::vector<at::indexing::TensorIndex> indices{
4360 at::indexing::Slice(2, -2), at::indexing::Slice(2, -2)};
4361
4362 outputs[0] = outputs[0].index(indices);
4363
4364 auto stencil1 = t0;
4365 for (const auto& offset : offsets) {
4366 stencil1 = stencil1 + shift(t0, offset);
4367 }
4368 stencil1 = stencil1 / int(offsets.size() + 1);
4369 auto stencil2 = stencil1;
4370 for (const auto& offset : offsets) {
4371 stencil2 = stencil2 + shift(stencil1, offset);
4372 }
4373 stencil2 = stencil2 / int(offsets.size() + 1);
4374 auto ref = stencil2.index(indices);
4375
4376 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4377}
4378
4379TEST_F(NVFuserTest, FusionPartialSplit5_CUDA) {
4380 Fusion fusion;
4381 FusionGuard fg(&fusion);
4382
4383 const int numel_x = 10;
4384 const int numel_y = 11;
4385
4386 // auto tv0 = makeSymbolicTensor(2);
4387 auto tv0 = makeConcreteTensor({numel_x, numel_y});
4388 fusion.addInput(tv0);
4389
4390 auto tv1 = shift(tv0, {0, 1}, false);
4391 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4392
4393 fusion.addOutput(tv2);
4394
4395 // Partially split tv2 but not tv1. Producer indexing with tv2 as a consumer
4396 // requires adjustment of the index to account for the difference of split
4397 // offsets.
4398 tv2->split(1, 4, true, true);
4399 tv1->split(1, 4);
4400
4401 tv1->computeAt(tv2, 1);
4402
4403 tv2->axis(1)->parallelize(ParallelType::TIDx);
4404 tv1->axis(1)->parallelize(ParallelType::TIDx);
4405
4406 tv1->setMemoryType(MemoryType::Shared);
4407
4408 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4409 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
4410 std::vector<IValue> inputs = {t0};
4411
4412 FusionExecutor fe;
4413 fe.compileFusion(&fusion, inputs);
4414 auto outputs = fe.runFusion(inputs);
4415
4416 std::vector<at::indexing::TensorIndex> indices{
4417 at::indexing::Slice(0, at::indexing::None),
4418 at::indexing::Slice(1, at::indexing::None)};
4419
4420 outputs[0] = outputs[0].index(indices);
4421
4422 auto ref = (shift(t0, {0, 1}) + 1).index(indices);
4423
4424 testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
4425}
4426
4427TEST_F(NVFuserTest, FusionPartialSplit6_CUDA) {
4428 Fusion fusion;
4429 FusionGuard fg(&fusion);
4430
4431 const int numel_x = 9;
4432
4433 auto tv0 = makeConcreteTensor({numel_x});
4434 fusion.addInput(tv0);
4435
4436 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4437 auto tv2 = shift(tv1, {1}, false);
4438 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
4439
4440 fusion.addOutput(tv3);
4441
4442 // Another mix of partial and non-partial split
4443 tv1->split(0, 4);
4444 tv2->split(0, 4, true, true);
4445 tv3->split(0, 4);
4446
4447 // Just make it easier for compute-sanitizer to flag invalid memory accesses
4448 tv1->setMemoryType(MemoryType::Shared);
4449 tv2->setMemoryType(MemoryType::Shared);
4450
4451 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4452 at::Tensor t0 = at::randn({numel_x}, options);
4453 std::vector<IValue> inputs = {t0};
4454
4455 FusionExecutor fe;
4456 fe.compileFusion(&fusion, inputs);
4457 auto outputs = fe.runFusion(inputs);
4458
4459 std::vector<at::indexing::TensorIndex> indices{
4460 at::indexing::Slice(1, at::indexing::None)};
4461
4462 outputs[0] = outputs[0].index(indices);
4463
4464 auto ref = (shift(t0 + 1, {1}) + 1).index(indices);
4465
4466 testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
4467}
4468
4469TEST_F(NVFuserTest, FusionShiftUnswitch1_CUDA) {
4470 Fusion fusion;
4471 FusionGuard fg(&fusion);
4472
4473 auto tv0 = makeSymbolicTensor(2);
4474 fusion.addInput(tv0);
4475
4476 auto tv1 = shift(tv0, {-1, 0});
4477 fusion.addOutput(tv1);
4478
4479 auto tv2 = shift(tv0, {0, 1});
4480 fusion.addOutput(tv2);
4481
4482 auto tv3 = shift(tv0, {2, 2});
4483 fusion.addOutput(tv3);
4484
4485 auto tv4 = shift(tv0, {-2, -2});
4486 fusion.addOutput(tv4);
4487
4488 auto tv5 = add(tv0, IrBuilder::create<Double>(1));
4489 auto tv6 = shift(tv5, {0, -1});
4490 fusion.addOutput(tv6);
4491
4492 tv1->axis(1)->parallelize(ParallelType::Unswitch);
4493 tv2->axis(1)->parallelize(ParallelType::Unswitch);
4494 tv3->axis(0)->parallelize(ParallelType::Unswitch);
4495 tv4->axis(0)->parallelize(ParallelType::Unswitch);
4496
4497 tv5->axis(1)->parallelize(ParallelType::TIDx);
4498 tv6->axis(1)->parallelize(ParallelType::TIDx);
4499 tv5->axis(0)->parallelize(ParallelType::Unswitch);
4500 tv5->setMemoryType(MemoryType::Shared);
4501
4502 int numel_x = 9;
4503 int numel_y = 11;
4504
4505 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4506 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
4507 std::vector<IValue> inputs = {t0};
4508
4509 FusionExecutor fe;
4510 fe.compileFusion(&fusion, inputs);
4511 auto outputs = fe.runFusion(inputs);
4512
4513 auto t1 = shift(t0, {-1, 0});
4514 TORCH_CHECK(t1.equal(outputs[0]));
4515
4516 auto t2 = shift(t0, {0, 1});
4517 TORCH_CHECK(t2.equal(outputs[1]));
4518
4519 auto t3 = shift(t0, {2, 2});
4520 TORCH_CHECK(t3.equal(outputs[2]));
4521
4522 auto t4 = shift(t0, {-2, -2});
4523 TORCH_CHECK(t4.equal(outputs[3]));
4524
4525 auto t6 = shift(t0 + 1, {0, -1});
4526 TORCH_CHECK(t6.equal(outputs[4]));
4527}
4528
4529TEST_F(NVFuserTest, FusionGatherUnswitch1_CUDA) {
4530 const int tv1_gather = 3;
4531 const int tv1_gather_pad = 1;
4532 const int tv2_gather = 5;
4533 const int tv2_gather_pad = 2;
4534
4535 Fusion fusion;
4536 FusionGuard fg(&fusion);
4537
4538 auto tv0 = makeSymbolicTensor(1);
4539 fusion.addInput(tv0);
4540
4541 auto tv1 = gather(tv0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}});
4542 fusion.addOutput(tv1);
4543
4544 auto tv2 = gather(tv0, {tv2_gather}, {{tv2_gather_pad, tv2_gather_pad}});
4545 fusion.addOutput(tv2);
4546
4547 // Static gather
4548 auto tv3 = gather(tv0, {3}, {{1, 1}});
4549 fusion.addOutput(tv3);
4550
4551 // Static gather
4552 auto tv4 = gather(tv0, {5}, {{2, 2}});
4553 fusion.addOutput(tv4);
4554
4555 auto tv0_cache = tv0->cacheAfter();
4556 tv0_cache->setMemoryType(MemoryType::Shared);
4557
4558 tv4->split(0, 32);
4559
4560 tv0->computeAt(tv4, 1);
4561
4562 tv4->axis(0)->parallelize(ParallelType::Unswitch);
4563 tv4->axis(1)->parallelize(ParallelType::TIDx);
4564
4565 const int numel_x = 100;
4566
4567 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4568 at::Tensor t0 = at::randn({numel_x}, options);
4569 std::vector<IValue> inputs = {t0};
4570
4571 FusionExecutor fe;
4572 fe.compileFusion(&fusion, inputs);
4573 auto outputs = fe.runFusion(inputs);
4574
4575 auto t1 = gather(t0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}});
4576 TORCH_CHECK(t1.equal(outputs[0]));
4577
4578 auto t2 = gather(t0, {tv2_gather}, {{tv2_gather_pad, tv2_gather_pad}});
4579 TORCH_CHECK(t2.equal(outputs[1]));
4580
4581 auto t3 = gather(t0, {3}, {{1, 1}});
4582 TORCH_CHECK(t3.equal(outputs[2]));
4583
4584 auto t4 = gather(t0, {5}, {{2, 2}});
4585 TORCH_CHECK(t4.equal(outputs[3]));
4586}
4587
4588TEST_F(NVFuserTest, FusionGatherStrided1_CUDA) {
4589 Fusion fusion;
4590 FusionGuard fg(&fusion);
4591
4592 auto tv0 = makeSymbolicTensor(2);
4593 fusion.addInput(tv0);
4594
4595 const std::vector<int> window_shape = {1, 3};
4596 const std::vector<std::vector<int>> padding_width = {{0, 0}, {1, 1}};
4597
4598 const std::vector<int> strides = {1, 3};
4599
4600 auto tv1 = gather(tv0, window_shape, padding_width, strides);
4601
4602 fusion.addOutput(tv1);
4603
4604 const int s1 = 11;
4605 const int s2 = 13;
4606
4607 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4608 at::Tensor t0 = at::randn({s1, s2}, options);
4609
4610 FusionExecutor fe;
4611 fe.compileFusion(&fusion, {t0});
4612 auto outputs = fe.runFusion({t0});
4613
4614 // tv1 has a stride dimension, so its number of dimensions should be
4615 // input_ndims + window_ndims + stride.
4616 TORCH_CHECK(tv1->nDims() == tv0->nDims() * 2 + 1);
4617
4618 // However, the number of dimensions of the Aten tensor should still
4619 // be just the twice of the number of dimensions of the input
4620 // tensor.
4621 auto fuser_out = outputs[0];
4622 TORCH_CHECK(
4623 fuser_out.ndimension() == static_cast<int64_t>(tv0->nDims()) * 2,
4624 "Invalid dimensionality of output tensor: ",
4625 fuser_out.ndimension());
4626
4627 // Each output dimension should be: ceilDiv(input_size + padding_width -
4628 // window, stride).
4629 for (const auto i : c10::irange(window_shape.size())) {
4630 auto valid_dim = ceilDiv(
4631 t0.size(i) + padding_width[i][0] + padding_width[i][1] -
4632 window_shape[i] + 1,
4633 strides[i]);
4634 auto actual_dim = outputs[0].size(i);
4635 TORCH_CHECK(
4636 valid_dim == actual_dim,
4637 "Invalid output size at dimension ",
4638 i,
4639 ". Expected: ",
4640 valid_dim,
4641 ", actual: ",
4642 actual_dim);
4643 }
4644
4645 auto ref = gather(t0, window_shape, padding_width, strides);
4646
4647 TORCH_CHECK(ref.equal(outputs[0]));
4648}
4649
4650// Split strided domain
4651TEST_F(NVFuserTest, FusionGatherStrided2_CUDA) {
4652 Fusion fusion;
4653 FusionGuard fg(&fusion);
4654
4655 const std::vector<int> window_shape = {3};
4656 const std::vector<std::vector<int>> padding_width = {{1, 1}};
4657 const std::vector<int> strides = {3};
4658
4659 auto tv0 = makeSymbolicTensor(1);
4660 fusion.addInput(tv0);
4661
4662 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4663
4664 auto tv2 = gather(tv1, window_shape, padding_width, strides);
4665
4666 auto tv3 = sum(tv2, {-1});
4667
4668 fusion.addOutput(tv3);
4669
4670 // Split the strided domain
4671 tv3->split(0, 4);
4672
4673 // Propagate the split by 4 of the tv3 domain to pre-stride domains,
4674 // making them split by 4 * 3
4675 tv0->computeAt(tv3, 1);
4676
4677 tv2->computeAt(tv3, -1);
4678
4679 tv3->axis(0)->parallelize(ParallelType::BIDx);
4680 tv3->axis(1)->parallelize(ParallelType::TIDx);
4681 scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
4682
4683 tv1->setMemoryType(MemoryType::Shared);
4684
4685 const int s1 = 100;
4686
4687 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4688 at::Tensor t0 = at::randn({s1}, options);
4689 std::vector<IValue> inputs = {t0};
4690
4691 FusionExecutor fe;
4692 fe.compileFusion(&fusion, inputs);
4693 auto outputs = fe.runFusion(inputs);
4694
4695 auto t1 = t0 + 1;
4696 auto t2 = gather(t1, window_shape, padding_width, strides);
4697 auto ref = sum(t2, {-1});
4698
4699 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4700}
4701
4702// Outer split
4703TEST_F(NVFuserTest, FusionGatherStrided3_CUDA) {
4704 Fusion fusion;
4705 FusionGuard fg(&fusion);
4706
4707 const std::vector<int> window_shape = {3};
4708 const std::vector<std::vector<int>> padding_width = {{1, 1}};
4709 const std::vector<int> strides = {3};
4710
4711 auto tv0 = makeSymbolicTensor(1);
4712 fusion.addInput(tv0);
4713
4714 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4715
4716 auto tv2 = gather(tv1, window_shape, padding_width, strides);
4717
4718 auto tv3 = sum(tv2, {-1});
4719 fusion.addOutput(tv3);
4720
4721 // Outer split
4722 tv3->split(0, 2, false);
4723
4724 tv0->computeAt(tv3, 1);
4725
4726 tv3->axis(0)->parallelize(ParallelType::BIDx);
4727 tv3->axis(1)->parallelize(ParallelType::TIDx);
4728 scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
4729
4730 tv1->setMemoryType(MemoryType::Shared);
4731 tv2->setMemoryType(MemoryType::Shared);
4732
4733 const int s1 = 100;
4734
4735 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4736 at::Tensor t0 = at::randn({s1}, options);
4737 std::vector<IValue> inputs = {t0};
4738
4739 FusionExecutor fe;
4740 fe.compileFusion(&fusion, inputs);
4741 auto outputs = fe.runFusion(inputs);
4742
4743 auto t1 = t0 + 1;
4744 auto t2 = gather(t1, window_shape, padding_width, strides);
4745 auto ref = sum(t2, {-1});
4746
4747 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4748}
4749
4750TEST_F(NVFuserTest, FusionGatherStrided4_CUDA) {
4751 Fusion fusion;
4752 FusionGuard fg(&fusion);
4753
4754 const std::vector<int> window_shape = {3};
4755 const std::vector<std::vector<int>> padding_width = {{1, 1}};
4756 const std::vector<int> strides = {3};
4757
4758 auto tv0 = makeSymbolicTensor(1);
4759 fusion.addInput(tv0);
4760
4761 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4762
4763 // Test propagation of split from one gather output to another
4764 auto tv2 = gather(tv1, window_shape, padding_width, strides);
4765 auto tv3 = gather(tv1, window_shape, padding_width, strides);
4766
4767 auto tv4 = sum(tv2, {-1});
4768 fusion.addOutput(tv4);
4769
4770 auto tv5 = sum(tv3, {-1});
4771 fusion.addOutput(tv5);
4772
4773 tv4->split(0, 2);
4774
4775 // Test forward computeAt propagation from tv1 to tv3
4776 tv0->computeAt(tv4, 1);
4777
4778 const int s1 = 101;
4779
4780 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4781 at::Tensor t0 = at::randn({s1}, options);
4782 std::vector<IValue> inputs = {t0};
4783
4784 FusionExecutor fe;
4785 fe.compileFusion(&fusion, inputs);
4786 auto outputs = fe.runFusion(inputs);
4787
4788 auto t1 = t0 + 1;
4789 auto t2 = gather(t1, window_shape, padding_width, strides);
4790 auto ref = sum(t2, {-1});
4791
4792 testValidate(&fusion, outputs, inputs, {ref, ref}, __LINE__, __FILE__);
4793}
4794
4795// Same as GatherStrided1 but with stride != window
4796TEST_F(NVFuserTest, FusionGatherStrided5_CUDA) {
4797 Fusion fusion;
4798 FusionGuard fg(&fusion);
4799
4800 auto tv0 = makeSymbolicTensor(2);
4801 fusion.addInput(tv0);
4802
4803 const std::vector<int> window_shape = {1, 3};
4804 const std::vector<std::vector<int>> padding_width = {{0, 0}, {1, 1}};
4805
4806 const std::vector<int> strides = {1, 2};
4807
4808 auto tv1 = gather(tv0, window_shape, padding_width, strides);
4809
4810 fusion.addOutput(tv1);
4811
4812 const int s1 = 11;
4813 const int s2 = 13;
4814
4815 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4816 at::Tensor t0 = at::randn({s1, s2}, options);
4817
4818 FusionExecutor fe;
4819 fe.compileFusion(&fusion, {t0});
4820 auto outputs = fe.runFusion({t0});
4821
4822 auto ref = gather(t0, window_shape, padding_width, strides);
4823
4824 TORCH_CHECK(ref.equal(outputs[0]));
4825}
4826
4827// Same as GatherStrided2 but with stride != window
4828TEST_F(NVFuserTest, FusionGatherStrided6_CUDA) {
4829 Fusion fusion;
4830 FusionGuard fg(&fusion);
4831
4832 const std::vector<int> window_shape = {3};
4833 const std::vector<std::vector<int>> padding_width = {{1, 1}};
4834 const std::vector<int> strides = {2};
4835
4836 auto tv0 = makeSymbolicTensor(1);
4837 fusion.addInput(tv0);
4838
4839 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4840
4841 auto tv2 = gather(tv1, window_shape, padding_width, strides);
4842
4843 auto tv3 = sum(tv2, {-1});
4844
4845 fusion.addOutput(tv3);
4846
4847 // Split the strided domain
4848 tv3->split(0, 4);
4849
4850 // Propagate the split by 4 of the tv3 domain to pre-stride domains,
4851 // making them split by 4 * 2
4852 tv0->computeAt(tv3, 1);
4853
4854 tv2->computeAt(tv3, -1);
4855
4856 tv3->axis(0)->parallelize(ParallelType::BIDx);
4857 tv3->axis(1)->parallelize(ParallelType::TIDx);
4858 scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
4859
4860 tv1->setMemoryType(MemoryType::Shared);
4861
4862 const int s1 = 100;
4863
4864 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4865 at::Tensor t0 = at::randn({s1}, options);
4866 std::vector<IValue> inputs = {t0};
4867
4868 FusionExecutor fe;
4869 fe.compileFusion(&fusion, inputs);
4870 auto outputs = fe.runFusion(inputs);
4871
4872 auto t1 = t0 + 1;
4873 auto t2 = gather(t1, window_shape, padding_width, strides);
4874 auto ref = sum(t2, {-1});
4875
4876 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4877}
4878
4879// Same as GatherStrided4 but different strides
4880TEST_F(NVFuserTest, FusionGatherStrided7_CUDA) {
4881 Fusion fusion;
4882 FusionGuard fg(&fusion);
4883
4884 const std::vector<int> window_shape = {3};
4885 const std::vector<std::vector<int>> padding_width = {{1, 1}};
4886
4887 auto tv0 = makeSymbolicTensor(1);
4888 fusion.addInput(tv0);
4889
4890 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4891
4892 // Use different strides
4893 auto tv2 = gather(tv1, window_shape, padding_width, {3});
4894 auto tv3 = gather(tv1, window_shape, padding_width, {2});
4895
4896 auto tv4 = sum(tv2, {-1});
4897 fusion.addOutput(tv4);
4898
4899 auto tv5 = sum(tv3, {-1});
4900 fusion.addOutput(tv5);
4901
4902 tv4->split(0, 2);
4903
4904 // Since tv3 has a different stride factor, this should fail.
4905 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4906 ASSERT_ANY_THROW(tv0->computeAt(tv4, 1));
4907}
4908
4909// Same as GatherStrided2 but with unswitch
4910TEST_F(NVFuserTest, FusionGatherStrided8_CUDA) {
4911 Fusion fusion;
4912 FusionGuard fg(&fusion);
4913
4914 const std::vector<int> window_shape = {3};
4915 const std::vector<std::vector<int>> padding_width = {{1, 1}};
4916 const std::vector<int> strides = {3};
4917
4918 auto tv0 = makeSymbolicTensor(1);
4919 fusion.addInput(tv0);
4920
4921 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4922
4923 auto tv2 = gather(tv1, window_shape, padding_width, strides);
4924
4925 auto tv3 = sum(tv2, {-1});
4926
4927 fusion.addOutput(tv3);
4928
4929 const int tidx = 32;
4930
4931 // Split the strided domain
4932 tv3->split(0, tidx);
4933
4934 // Split for unswitch
4935 tv3->split(0, 1);
4936
4937 tv0->computeAt(tv3, 2);
4938
4939 tv2->computeAt(tv3, -1);
4940
4941 tv3->axis(0)->parallelize(ParallelType::BIDx);
4942 tv3->axis(1)->parallelize(ParallelType::Unswitch);
4943 tv3->axis(2)->parallelize(ParallelType::TIDx);
4944 scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
4945
4946 tv1->setMemoryType(MemoryType::Shared);
4947
4948 const int s1 = 1023;
4949
4950 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4951 at::Tensor t0 = at::randn({s1}, options);
4952 std::vector<IValue> inputs = {t0};
4953
4954 FusionExecutor fe;
4955 fe.compileFusion(&fusion, inputs);
4956 auto outputs = fe.runFusion(inputs);
4957
4958 auto t1 = t0 + 1;
4959 auto t2 = gather(t1, window_shape, padding_width, strides);
4960 auto ref = sum(t2, {-1});
4961
4962 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
4963}
4964
4965// Chained strided gather. Not supported yet.
4966TEST_F(NVFuserTest, FusionGatherStridedChain_CUDA) {
4967 Fusion fusion;
4968 FusionGuard fg(&fusion);
4969
4970 const std::vector<int> window_shape = {3};
4971 const std::vector<std::vector<int>> padding_width = {{1, 1}};
4972 const std::vector<int> strides = {3};
4973 // const std::vector<int> strides = {1};
4974
4975 auto tv0 = makeSymbolicTensor(1);
4976 fusion.addInput(tv0);
4977
4978 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4979
4980 auto tv2 = gather(tv1, window_shape, padding_width, strides);
4981 // Reduce gathered window
4982 auto tv3 = sum(tv2, {-1});
4983
4984 // Repeat
4985 auto tv4 = gather(tv3, window_shape, padding_width, strides);
4986 auto tv5 = sum(tv4, {-1});
4987 auto out = tv5;
4988
4989 fusion.addOutput(out);
4990
4991 // This should throw an error at HaloInfo::build.
4992 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4993 ASSERT_ANY_THROW(GpuLower gpulw(&fusion));
4994}
4995
4996TEST_F(NVFuserTest, FusionMaxPoolingStrided_CUDA) {
4997 Fusion fusion;
4998 FusionGuard fg(&fusion);
4999
5000 // Input: CHW
5001 // Pooling window: 3x3
5002 // Strides: 3
5003 // Padding: 1 at each end of the inner 2 dimensions
5004
5005 // [C, H, W]
5006 auto inp = makeSymbolicTensor(3);
5007 fusion.addInput(inp);
5008
5009 // [C, H/3, W/3, 1, 3, 3]
5010 auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3});
5011
5012 // [C, H/3, W/3]
5013 auto max_tensor = reductionOp(
5014 BinaryOpType::Max,
5015 {-3, -2, -1},
5016 IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
5017 inp_tile);
5018 fusion.addOutput(max_tensor);
5019
5020 ////////////////////////////////////
5021
5022 // Cache the input and weight tensors
5023 auto inp_cache = inp->cacheAfter();
5024
5025 // Tiling the spatial domain
5026 const int tile_x = 32;
5027 const int tile_y = 8;
5028
5029 max_tensor->split(1, tile_y);
5030 max_tensor->split(3, tile_x);
5031 max_tensor->reorder({{2, 3}});
5032 // [C, H/tile_y, W/tile_x, tile_y, tile_x]
5033 max_tensor->split(2, 1);
5034 // [C, H/tile_y, W/tile_x, 1, tile_y, tile_x]
5035
5036 inp->computeAt(max_tensor, 4);
5037
5038 max_tensor->axis(0)->parallelize(ParallelType::BIDx);
5039 max_tensor->axis(3)->parallelize(ParallelType::Unswitch);
5040 max_tensor->axis(4)->parallelize(ParallelType::TIDy);
5041 max_tensor->axis(5)->parallelize(ParallelType::TIDx);
5042
5043 scheduler_utils::parallelizeAllLike(max_tensor);
5044
5045 inp_cache->setMemoryType(MemoryType::Shared);
5046
5047 const int hw = 50;
5048 const int num_channels = 20;
5049 const int pooling_window = 3;
5050
5051 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5052 at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options);
5053 // We always pad inputs by zero, so if all surrounding values are
5054 // negative, max pooling would pick a padded value, which isn't the
5055 // correct behavior. We need to be able to choose the value of
5056 // padding. In this case, padding by the minimum value would not
5057 // have this problem. For now, avoid the problem by making sure all
5058 // values are not negative.
5059 aten_inp = at::abs(aten_inp);
5060 std::vector<IValue> inputs = {aten_inp};
5061
5062 FusionExecutor fe;
5063 fe.compileFusion(&fusion, inputs);
5064 auto outputs = fe.runFusion(inputs);
5065
5066 auto ref = at::max_pool2d(
5067 aten_inp, {pooling_window, pooling_window}, {3, 3}, {1, 1});
5068
5069 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
5070}
5071
5072TEST_F(NVFuserTest, FusionConv2DStaticStrided_CUDA) {
5073 Fusion fusion;
5074 FusionGuard fg(&fusion);
5075
5076 // Input: [C, H, W]
5077 auto inp = makeSymbolicTensor(3);
5078 fusion.addInput(inp);
5079
5080 // Weights: [K, C, 3, 3]
5081 auto w = makeSymbolicTensor(4);
5082 fusion.addInput(w);
5083
5084 // Gather a neighbor tile of [3, 3] with padding size of 1 for each
5085 // side of the spatial dimensions
5086 auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3});
5087 // inp_tile: [C, H/3, s3, W/3, s3, 1, 3, 3]
5088
5089 auto inp_bc =
5090 broadcast(inp_tile, {true, false, false, false, false, false, false});
5091 auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
5092
5093 auto inp_times_w = mul(inp_bc, w_bc);
5094
5095 // Reduce the channel and neighbor tile dimensions
5096 auto out = sum(inp_times_w, {1, 4, 5, 6});
5097
5098 fusion.addOutput(out);
5099
5100 ////////////////////////////////////
5101
5102 // Cache the input and weight tensors
5103 auto inp_cache = inp->cacheAfter();
5104
5105 // Blocking the spatial dimensions
5106 const int block_w = 16;
5107 const int block_h = 4;
5108 const int block_c = 2;
5109
5110 // [K, C, H/s, W/s, 1, 3, 3]
5111 out->split(2, block_h);
5112 // [K, C, H/s/block_h, block_h, W/s, 1, 3, 3]
5113 out->split(4, block_w);
5114 // [K, C, H/s/block_h, block_h, W/s/block_w, block_w, 1, 3, 3]
5115 out->reorder({{3, 4}});
5116 // [K, C, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3, 3]
5117 out->split(1, block_c);
5118 // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3,
5119 // 3]
5120 out->split(4, 1);
5121 // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
5122 // 3, 3]
5123
5124 auto out_rf = out->rFactor({1, -3, -2, -1});
5125 // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
5126 // 3, 3]
5127
5128 // out: [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w]
5129
5130 inp_cache->computeAt(out, 5);
5131 inp_cache->setMemoryType(MemoryType::Shared);
5132 // [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, C/block_c, 1,
5133 // 3, 3]
5134
5135 // Move C/block_c before block_h/2 and share the domain from
5136 // inp_cache to out_rf
5137 out_rf->reorder({{7, 5}, {5, 6}, {6, 7}});
5138 inp_cache->computeAt(out_rf, 6);
5139
5140 inp_tile->computeAt(out_rf, -1);
5141 w->computeAt(out_rf, -1);
5142
5143 out->axis(0)->parallelize(ParallelType::BIDx);
5144 out->axis(1)->parallelize(ParallelType::TIDz);
5145 out->axis(4)->parallelize(ParallelType::Unswitch);
5146 out->axis(5)->parallelize(ParallelType::TIDy);
5147 out->axis(6)->parallelize(ParallelType::TIDx);
5148
5149 scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
5150
5151 const int dim_h = 99;
5152 const int dim_w = 101;
5153 const int dim_c = 10;
5154 const int dim_f = 20;
5155
5156 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5157 at::manual_seed(0);
5158 at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
5159 at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options);
5160 std::vector<IValue> inputs = {at_inp, at_w};
5161
5162 FusionExecutor fe;
5163 fe.compileFusion(&fusion, inputs);
5164 auto cg_outputs = fe.runFusion(inputs);
5165
5166 at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
5167 auto at_out = at::conv2d(at_inp, at_w, {}, 3, 1);
5168 at_out = at_out.squeeze(0); // drop the N axis
5169
5170 testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
5171}
5172
5173TEST_F(NVFuserTest, FusionNonDivisibleHalo1_CUDA) {
5174 Fusion fusion;
5175 FusionGuard fg(&fusion);
5176
5177 auto tv0 = makeSymbolicTensor(1);
5178 fusion.addInput(tv0);
5179
5180 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
5181 auto tv2 = shift(tv1, {-1});
5182 fusion.addOutput(tv2);
5183
5184 // [I]
5185 tv2->split(0, 8);
5186 // [I/8, 8]
5187 tv2->split(1, 3);
5188 // [I/8, 3, 3]
5189
5190 tv0->computeAt(tv2, -2);
5191
5192 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5193 at::Tensor t0 = at::randn({24}, options);
5194
5195 FusionExecutor fe;
5196 fe.compileFusion(&fusion, {t0});
5197 auto cg_outputs = fe.runFusion({t0});
5198
5199 auto ref = shift((t0 + 1), {-1});
5200
5201 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
5202}
5203
5204TEST_F(NVFuserTest, FusionNonDivisibleHalo2_CUDA) {
5205 Fusion fusion;
5206 FusionGuard fg(&fusion);
5207
5208 auto tv0 = makeSymbolicTensor(2);
5209 fusion.addInput(tv0);
5210
5211 auto tv1 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}});
5212 auto tv2 = sum(tv1, {-2, -1});
5213 auto tv3 = add(tv0, tv2);
5214 auto tv4 = sum(tv3, {0, 1});
5215 fusion.addOutput(tv4);
5216
5217 const int gy = 50;
5218 const int gx = 50;
5219 const int by = 8;
5220 const int bx = 16;
5221
5222 auto tv5 = tv0->cacheAfter();
5223
5224 // [I, J]
5225 tv4->split(0, gy);
5226 // [I/gy, gy, J]
5227 tv4->split(1, by);
5228 // [I/gy, gy/by, by, J]
5229 tv4->split(-1, gx);
5230 // [I/gy, gy/by, by, J/gx, gx]
5231 tv4->split(-1, bx);
5232 // [I/gy, gy/by, by, J/gx, gx/bx, bx]
5233 tv4->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}});
5234 // [I/gy, J/gx, gy/by, gx/bx, by, bx]
5235
5236 auto tv6 = tv4->rFactor({2, 3});
5237
5238 tv0->computeAt(tv6, 4);
5239
5240 tv4->axis(0)->parallelize(ParallelType::BIDy);
5241 tv4->axis(1)->parallelize(ParallelType::BIDx);
5242 tv4->axis(2)->parallelize(ParallelType::TIDy);
5243 tv4->axis(3)->parallelize(ParallelType::TIDx);
5244
5245 scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3, tv5, tv6});
5246
5247 tv5->setMemoryType(MemoryType::Shared);
5248
5249 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5250 at::Tensor t0 = at::randn({111, 222}, options);
5251
5252 FusionExecutor fe;
5253 fe.compileFusion(&fusion, {t0});
5254 auto cg_outputs = fe.runFusion({t0});
5255
5256 auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}});
5257 auto t2 = t1.sum({-2, -1});
5258 auto t3 = t0 + t2;
5259 auto t4 = t3.sum({-2, -1});
5260
5261 testValidate(&fusion, cg_outputs, {t0}, {t4}, __LINE__, __FILE__);
5262}
5263
5264TEST_F(NVFuserTest, FusionGather9ptStencilDoubleBuffering_CUDA) {
5265 Fusion fusion;
5266 FusionGuard fg(&fusion);
5267
5268 auto tv0 = makeSymbolicTensor(2);
5269 fusion.addInput(tv0);
5270
5271 auto tv1 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}});
5272 auto tv2 = sum(tv1, {-2, -1});
5273 auto tv3 = div(tv2, IrBuilder::create<Double>(9));
5274
5275 auto out = tv3;
5276
5277 fusion.addOutput(out);
5278
5279 auto tv0_cache = tv0->cacheAfter();
5280
5281 tv0_cache->setMemoryType(MemoryType::Shared);
5282
5283 out->split(-2, 4);
5284 out->split(-1, 32);
5285 out->reorder({{1, 2}, {2, 1}});
5286 TransformPropagator propagator(out);
5287 MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
5288
5289 tv0->computeAt(out, 2);
5290
5291 out->axis(3)->parallelize(ParallelType::TIDx);
5292 out->axis(2)->parallelize(ParallelType::TIDy);
5293 out->axis(0)->parallelize(ParallelType::BIDx);
5294
5295 scheduler_utils::parallelizeAllLike(out);
5296
5297 tv0_cache->doubleBuffer();
5298
5299 int numel_x = 99;
5300 int numel_y = 101;
5301
5302 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5303 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
5304 std::vector<IValue> inputs = {t0};
5305
5306 FusionExecutor fe;
5307 fe.compileFusion(&fusion, inputs);
5308 auto outputs = fe.runFusion(inputs);
5309
5310 auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}});
5311 auto t2 = sum(t1, {-2, -1});
5312 auto t3 = t2 / 9;
5313 auto ref = t3;
5314
5315 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
5316}
5317
5318TEST_F(NVFuserTest, FusionValidateParallelizeShift_CUDA) {
5319 Fusion fusion;
5320 FusionGuard fg(&fusion);
5321
5322 auto tv0 = makeSymbolicTensor(1);
5323 fusion.addInput(tv0);
5324
5325 auto tv1 = set(tv0);
5326 auto tv2 = shift(tv1, {1});
5327 auto tv3 = shift(tv1, {-1});
5328 auto tv4 = add(tv1, tv2);
5329 auto tv5 = add(tv4, tv3);
5330 fusion.addOutput(tv5);
5331
5332 tv1->setMemoryType(MemoryType::Shared);
5333
5334 tv5->split(-1, 1024);
5335 tv5->split(-1, 2);
5336 TransformPropagator propagator(tv5);
5337 MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator);
5338
5339 tv0->computeAt(tv5, 1);
5340
5341 tv5->axis(1)->parallelize(ParallelType::TIDx);
5342
5343 scheduler_utils::parallelizeAllLike(tv5);
5344
5345 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5346 at::Tensor t0 = at::randn({1024 * 32}, options);
5347 std::vector<IValue> inputs = {t0};
5348
5349 FusionExecutor fe;
5350 fe.compileFusion(&fusion, inputs);
5351 auto outputs = fe.runFusion(inputs);
5352
5353 auto ref = t0 + shift(t0, {1}) + shift(t0, {-1});
5354
5355 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
5356}
5357
5358// Test IterType promotion with gather
5359TEST_F(NVFuserTest, FusionGatherIterTypePromotion_CUDA) {
5360 Fusion fusion;
5361 FusionGuard fg(&fusion);
5362
5363 const int s1 = 11;
5364 const int s2 = 3;
5365
5366 auto tv0 = makeConcreteTensor({s1});
5367 fusion.addInput(tv0);
5368 auto tv1 = makeConcreteTensor({s1, s2});
5369 fusion.addInput(tv1);
5370
5371 const std::vector<int> window_shape = {3};
5372 const std::vector<std::vector<int>> padding_width = {{1, 1}};
5373
5374 auto tv2 = gather(tv0, window_shape, padding_width);
5375 auto tv3 = add(tv2, tv1);
5376
5377 fusion.addOutput(tv3);
5378
5379 TORCH_CHECK(
5380 tv3->axis(1)->getIterType() == IterType::Iteration,
5381 "Invalid IterType promotion: ",
5382 tv3->axis(1)->toString());
5383
5384 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5385 at::Tensor t0 = at::randn({s1}, options);
5386 at::Tensor t1 = at::randn({s1, s2}, options);
5387 std::vector<IValue> inputs = {t0, t1};
5388
5389 auto ref = gather(t0, window_shape, padding_width) + t1;
5390
5391 FusionExecutor fe;
5392 fe.compileFusion(&fusion, inputs);
5393 auto outputs = fe.runFusion(inputs);
5394
5395 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
5396}
5397
5398TEST_F(NVFuserTest, FusionContigPredicateShift_CUDA) {
5399 Fusion fusion;
5400 FusionGuard fg(&fusion);
5401
5402 std::vector<int64_t> shape({2, 2});
5403
5404 auto tv0 = makeConcreteTensor(shape);
5405 // [0:I]
5406 fusion.addInput(tv0);
5407
5408 // Below, tv2 and tv3 are mostly the same, except for tv2 is padded
5409 // with 0, whereas tv3 is not, so the valid range of tv3 is [0:I-1]
5410
5411 // [0:I]
5412 auto tv1 = shift(tv0, {-1, 0});
5413
5414 // [0:I-1]
5415 auto tv2 = shift(tv0, {-1, 0}, false);
5416
5417 // tv3 is not an output of shift, but it gets a partial root
5418 // domain from tv2, so it must be predicated at the root domain
5419 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
5420
5421 fusion.addOutput(tv1);
5422 fusion.addOutput(tv3);
5423
5424 // contig merge
5425 tv1->merge(0);
5426 tv1->split(0, 4);
5427 TransformPropagator propagator(tv1);
5428 MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
5429
5430 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5431
5432 // Create 3x2 and trim to 2x2. This would cause the output tensor
5433 // non-zero values if not properly predicated.
5434 at::Tensor t0 = at::randn({3, 2}, options);
5435 t0 = t0.index(
5436 {at::indexing::Slice(0, 2), at::indexing::Slice(0, at::indexing::None)});
5437
5438 // Use random output to detect invalid writes
5439 at::Tensor t1 = at::rand_like(t0, options);
5440 // Use zero-cleared output to detect invalid writes
5441 at::Tensor t3 = at::zeros_like(t0, options);
5442
5443 std::vector<IValue> inputs = {t0};
5444 std::vector<at::Tensor> outputs = {t1, t3};
5445
5446 std::vector<at::indexing::TensorIndex> indices{
5447 at::indexing::Slice(0, -1), at::indexing::Slice(0, at::indexing::None)};
5448
5449 FusionExecutor fe;
5450 fe.compileFusion(&fusion, inputs);
5451 fe.runFusion(inputs, outputs);
5452
5453 // Make sure the padded region is zero filled
5454 TORCH_CHECK(t1[1].equal(at::zeros(2, options)));
5455 // Make sure not touched as the shift is not padded
5456 TORCH_CHECK(t3[1].equal(at::zeros(2, options)));
5457
5458 auto ref = shift(t0, {-1, 0});
5459
5460 TORCH_CHECK(t1.equal(ref));
5461 TORCH_CHECK(t3.index(indices).equal((ref + 1).index(indices)));
5462}
5463
5464} // namespace jit
5465} // namespace torch
5466#endif // #if defined(USE_CUDA)
5467