1#if defined(USE_CUDA)
2#include <gmock/gmock-matchers.h>
3#include <gtest/gtest.h>
4
5#include <arith.h>
6#include <codegen.h>
7#include <disjoint_set.h>
8#include <executor.h>
9#include <executor_launch_params.h>
10#include <expr_evaluator.h>
11#include <fusion.h>
12#include <fusion_segmenter.h>
13#include <inlining.h>
14#include <ir_all_nodes.h>
15#include <ir_builder.h>
16#include <ir_graphviz.h>
17#include <ir_iostream.h>
18#include <ir_utils.h>
19#include <iter_visitor.h>
20#include <kernel_cache.h>
21#include <kernel_expr_evaluator.h>
22#include <kernel_ir.h>
23#include <kernel_ir_dispatch.h>
24#include <lower2device.h>
25#include <lower_divisible_split.h>
26#include <mutator.h>
27#include <ops/all_ops.h>
28#include <register_interface.h>
29#include <root_domain_map.h>
30#include <scheduler/all_schedulers.h>
31#include <scheduler/reduction_utils.h>
32#include <scheduler/utils.h>
33#include <test/test_gpu_validator.h>
34#include <test/test_utils.h>
35#include <transform_replay.h>
36#include <transform_rfactor.h>
37
38// fuser and IR parser
39#include <parser.h>
40#include <torch/csrc/jit/ir/irparser.h>
41
42#include <ATen/cuda/CUDAContext.h>
43#include <ATen/cuda/Exceptions.h>
44#include <c10/cuda/CUDAStream.h>
45
46#include <algorithm>
47#include <iostream>
48
49// Tests go in torch::jit
50namespace torch {
51namespace jit {
52
53using namespace torch::jit::fuser::cuda;
54using namespace at::indexing;
55
// Dtype view with matching element sizes: reinterpret the Float result of a
// pointwise add as Int32 (same byte width), schedule as a pointwise kernel,
// and validate against ATen's bit-reinterpreting view.
TEST_F(NVFuserTest, FusionViewDtypeSameSizeOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> input_shape{2, 10, 40};

  TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Float);
  TensorView* bias = makeSymbolicTensor(input_shape.size());
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  // Float -> Int32 keeps the per-element size, so this view is legal.
  auto x_view = view(x_add_bias, DataType::Int32);
  fusion.addOutput(x_view);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_bias = at::randn(input_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_bias};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto outputs = fe.runFusion(aten_inputs, lparams);

  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at_x_add_bias.view(at::ScalarType::Int);

  testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__);
}
87
// Dtype view with mismatched element sizes must be rejected: viewing a Float
// tensor as a dtype with a different byte width is invalid, so both calls
// below are expected to throw.
TEST_F(NVFuserTest, FusionViewDtypeFailMismatchSize_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> input_shape{2, 10, 40};

  TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Float);
  TensorView* bias = makeSymbolicTensor(input_shape.size());
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(view(x_add_bias, DataType::Int));
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(view(x_add_bias, DataType::Half));
}
105
// view_as_real: a complex tensor {512} is viewed as a real tensor {512, 2}
// and combined with a real input. Scheduling is done manually (TIDx on the
// output plus inlining via computeAt); the computeAt calls must come after
// the parallelization, matching the order below.
TEST_F(NVFuserTest, FusionViewAsRealOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // TODO: We should modify our schedulers to correctly handle
  // view_as_real. And test these schedulers.
  std::vector<int64_t> input_shape{512};
  std::vector<int64_t> output_shape{512, 2};

  TensorView* x =
      makeSymbolicTensor(input_shape.size(), DataType::ComplexFloat);
  TensorView* bias =
      makeSymbolicTensor(input_shape.size(), DataType::ComplexFloat);
  fusion.addInput(x);
  fusion.addInput(bias);

  TensorView* y = makeSymbolicTensor(output_shape.size());
  fusion.addInput(y);

  auto y_plus_1 = add(y, IrBuilder::create<Double>(1));

  auto x_add_bias = add(x, bias);
  auto x_view = view_as_real(x_add_bias);
  auto out = add(y_plus_1, x_view);
  fusion.addOutput(out);

  out->axis(0)->parallelize(ParallelType::TIDx);
  x_add_bias->computeAt(out, -1);
  y->computeAt(out, -1);

  // Complex inputs, real output of the expanded shape.
  auto in_options =
      at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0);
  auto out_options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, in_options);
  at::Tensor at_bias = at::randn(input_shape, in_options);
  at::Tensor at_y = at::randn(output_shape, out_options);
  std::vector<IValue> aten_inputs = {at_x, at_bias, at_y};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at::view_as_real(at_x_add_bias);
  auto at_y_plus_1 = at_y + 1.0;
  auto at_out = at_y_plus_1 + at_x_view;

  testValidate(&fusion, outputs, aten_inputs, {at_out}, __LINE__, __FILE__);
}
155
// View {12, 8} -> {4, 3, 8} followed by a reduction over the new innermost
// axis, then joined with a contiguous {4, 3} input. Runs through
// FusionExecutorCache so the segmenter/schedulers handle the rfactored
// extents produced by the view.
TEST_F(NVFuserTest, FusionViewRfactorExtentReplacement_CUDA) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  auto tv0 = makeSymbolicTensor(2);
  fusion->addInput(tv0);
  auto tv1 = makeContigTensor(2);
  fusion->addInput(tv1);

  auto tv2 = view(tv0, {12, 8}, {4, 3, 8});
  auto tv3 = sum(tv2, {-1});
  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
  auto tv5 = add(tv1, tv4);
  fusion->addOutput(tv5);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  auto t0 = at::randn({12, 8}, options);
  auto t1 = at::randn({4, 3}, options);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1});

  auto ref = at::native::view(t0, {4, 3, 8}).sum({-1}) + 1 + t1;

  testValidate(
      executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
}
184
// Shape view as fusion output: {2, 10, 40} -> {2, 10, 4, 10} (splitting the
// last dimension) after a pointwise add, scheduled with the pointwise
// scheduler and validated against at::native::view.
TEST_F(NVFuserTest, FusionViewOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> input_shape{2, 10, 40};
  std::vector<int64_t> output_shape{2, 10, 4, 10};

  TensorView* x = makeSymbolicTensor(input_shape.size());
  TensorView* bias = makeSymbolicTensor(input_shape.size());
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  auto x_view = view(x_add_bias, input_shape, output_shape);
  fusion.addOutput(x_view);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_bias = at::randn(input_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_bias};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto outputs = fe.runFusion(aten_inputs, lparams);

  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at::native::view(at_x_add_bias, output_shape);

  testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__);
}
217
// A shape view whose input and output element counts differ is invalid and
// must throw at construction time.
TEST_F(NVFuserTest, FusionViewFailMismatchSize_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // The number of elements in input and output shapes do not match,
  // so this view transformation is invalid.
  // 2 * 10 * 40 != 2 * 50 * 4 * 10
  std::vector<int64_t> input_shape{2, 10, 40};
  std::vector<int64_t> output_shape{2, 50, 4, 10};

  TensorView* x = makeSymbolicTensor(input_shape.size());
  TensorView* bias = makeSymbolicTensor(input_shape.size());
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape));
}
238
// At most one output dimension may be -1 (inferred). Two -1 entries leave
// the shape ambiguous, so the view must throw.
TEST_F(NVFuserTest, FusionViewFailMulitDimInference_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Only one dimension can be inferred in the output shape.
  // Otherwise, the size of the dimensions is ambiguous.
  std::vector<int64_t> input_shape{2, 10, 40};
  std::vector<int64_t> output_shape{2, -1, 4, -1};

  TensorView* x = makeSymbolicTensor(input_shape.size());
  TensorView* bias = makeSymbolicTensor(input_shape.size());
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape));
}
257
258void reductionViewAddFusion(
259 std::vector<int64_t>& input_shape,
260 std::vector<int64_t>& output_shape,
261 bool view_before_reduction) {
262 constexpr int kReductionAxis = -1;
263
264 // Drop size for reduction axis from view_shape
265 std::vector<int64_t> view_shape;
266 {
267 const auto kAxis = (kReductionAxis < 0)
268 ? (kReductionAxis + input_shape.size())
269 : kReductionAxis;
270 for (auto i : c10::irange(input_shape.size())) {
271 if (view_before_reduction || i != kAxis) {
272 view_shape.push_back(input_shape[i]);
273 }
274 }
275 }
276
277 auto bias_shape = (view_before_reduction) ? input_shape : output_shape;
278 for (auto has_implicit_broadcast : {false, true}) {
279 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
280 Fusion& fusion = *fusion_ptr.get();
281 FusionGuard fg(&fusion);
282
283 TensorView* x = (has_implicit_broadcast)
284 ? makeConcreteTensor(input_shape)
285 : makeSymbolicTensor(input_shape.size());
286 TensorView* bias = (has_implicit_broadcast)
287 ? makeConcreteTensor(bias_shape)
288 : makeSymbolicTensor(bias_shape.size());
289 fusion.addInput(x);
290 fusion.addInput(bias);
291
292 auto tv1 =
293 (view_before_reduction) ? add(x, bias) : sum(x, {kReductionAxis});
294 auto x_view = view(tv1, view_shape, output_shape);
295 auto y = (view_before_reduction) ? sum(x_view, {kReductionAxis})
296 : add(x_view, bias);
297 fusion.addOutput(y);
298
299 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
300 at::Tensor at_x = at::randn(input_shape, options);
301 at::Tensor at_bias = at::randn(bias_shape, options);
302 std::vector<IValue> aten_inputs = {at_x, at_bias};
303
304 FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
305 auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
306
307 auto at_tv1 = (view_before_reduction) ? (at_x + at_bias)
308 : at::sum(at_x, kReductionAxis);
309 auto at_x_view = at::native::view(at_tv1, output_shape);
310 auto at_y = (view_before_reduction) ? at::sum(at_x_view, kReductionAxis)
311 : at::add(at_x_view, at_bias);
312
313 testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
314 }
315}
316
// A view example maps an input shape to its target output shape
// (output entries may be -1 to be inferred).
using shape = std::vector<int64_t>;
using view_example = std::pair<shape, shape>;
319
// Table of (input shape, output shape) pairs shared by the view "shmoo"
// tests below. Covers splits, merges, size-1 dims, and -1 (inferred) output
// entries.
//
// TODO: View examples with just 333 elements are failing validation in
// normalization. This might just be because our tolerances aren't tuned well
// for small sizes and the parallelization could be limited which could be
// detected as a validation issue, though it might not actually be a correctness
// issue. Using 3333 instead of 333 in those cases but should validate what's
// going on in the 333 case.
std::vector<view_example> all_view_examples = {
    {{1, 19, 1, 3 * 4, 7, 1, 99}, {1, 19, -1, 3, 4 * 7 * 99}},
    {{1, 19, 1, 3 * 4, 7, 1, 99}, {1, 19, 1, 3, 4 * 7 * 99}},
    {{19, 3 * 4, 7, 99}, {19, 3, 4 * 7 * 99}},

    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, -1}},
    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, 10}},
    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 2, 4, 1, 10}},

    {{3, 17, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}},
    {{3, 17, 2 * 4 * 10, 1, 9}, {3 * 17, 1, 2, 4, 10, 9}},
    {{3, 17, 2 * 4 * 10, 1, 9}, {3 * 17, 2, 4, 1, 10, 9}},

    {{2, 3, 2 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}},

    {{22, 11 * 2, 2}, {22, -1, 1, 1, 2 * 2}},
    {{22, 1, 22, 1}, {-1}},
    {{22, 11 * 2, 2}, {22, 11, 1, 1, 2 * 2}},
    {{22, 1, 22, 1}, {22 * 22}},

    {{37, 9, 7, 3 * 2, 5 * 2}, {37 * 9, 2, -1, 3, 7 * 5}},
    {{37, 9, 7, 3 * 2, 5 * 2}, {37 * 9, 2, 2, 3, 7 * 5}},

    {{1, 1, 3333, 1}, {1, 1, -1, 1}},
    // Disabled for now due to non-deterministic nan issue (#1920)
    // {{1, 1111 * 3}, {1, 1, 1, -1, 1, 3}},
    {{1, 3333, 1}, {-1}},
    {{1, 1, 3333, 1}, {1, 1, 3333, 1}},
    {{1, 303 * 11, 1}, {1, 303, -1, 1}},
    {{1, 3333, 1}, {1, 303, 11, 1}},
    // Disabled for now due to non-deterministic nan issue (#1920)
    // {{1, 3333}, {1, 1, 1, 1111, 1, 3}},
    {{1, 3333, 1}, {3333}},

    {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, -1}},
    {{1, 3922 * 2, 1, 7}, {1, -1, 2}},
    {{1, 3922 * 7, 2}, {1, 3922 * 2, 7}},
    {{1, 3922 * 2, 1, 7}, {1, 3922 * 7, 2}},
    {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, 7}},

    {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, -1}},
    {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1, 8}},

    {{2, 3, 2 * 2, 5}, {1, 6, 1, 2, 2, 5, 1}},
};
371
// Sweeps reductionViewAddFusion over many shape pairs, in both orderings
// (view before the reduction, and reduction before the view). The
// view-after-reduce shape table is separate because its view input shapes
// must already have the reduced axis removed.
TEST_F(NVFuserTest, FusionViewReductionShmoo_CUDA) {
  for (auto e : all_view_examples) {
    reductionViewAddFusion(e.first, e.second, true /* view_before_reduction */);
  }
  std::vector<view_example> view_after_reduce_examples = {
      {{19, 12, 7, 99}, {19, 3, 28}},
      {{1, 19, 1, 12, 7, 1, 99}, {1, 19, 1, 3, 28}},
      {{3, 17, 80, 1}, {51, 1, 2, 4, 10}},
      {{3, 17, 80, 1, 9}, {51, 1, 2, 4, 10}},
      {{2, 3, 4, 5}, {1, 6, 1, 2, 2, 1}},
      {{22, 22, 2}, {22, 11, 1, 1, 2}},
      {{37, 9, 7, 6, 10}, {333, 2, 21}},
      {{1, 1, 333, 1}, {1, 1, 333, 1}},
      {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1}},
      {{1, 333, 1}, {1, 37, 9, 1}},
      {{22, 1, 22, 1}, {484}},
      {{1, 333, 1}, {333}},
      {{1, 27454, 1, 2}, {1, 3922, 1, 7}},
      {{1, 7844, 1, 7}, {1, 1961, 4}}};

  for (auto e : view_after_reduce_examples) {
    reductionViewAddFusion(
        e.first, e.second, false /* view_before_reduction */);
  }
}
397
398void persistentViewAddFusion(
399 std::vector<int64_t>& input_shape,
400 std::vector<int64_t>& output_shape,
401 bool view_before_persistent) {
402 constexpr int kAxis = -1;
403
404 // Support -1 sizes in the inputs
405 auto inferred_shapes = inferViewShapes(input_shape, output_shape);
406 auto inferred_input = inferred_shapes.first;
407 auto inferred_output = inferred_shapes.second;
408
409 auto bias_shape = view_before_persistent ? inferred_input : inferred_output;
410 for (auto has_implicit_broadcast : {false, true}) {
411 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
412 Fusion& fusion = *fusion_ptr.get();
413 FusionGuard fg(&fusion);
414
415 TensorView* x = (has_implicit_broadcast)
416 ? makeConcreteTensor(inferred_input)
417 : makeSymbolicTensor(inferred_input.size());
418 TensorView* bias = (has_implicit_broadcast)
419 ? makeConcreteTensor(bias_shape)
420 : makeSymbolicTensor(bias_shape.size());
421 fusion.addInput(x);
422 fusion.addInput(bias);
423
424 auto tv1 = (view_before_persistent) ? add(x, bias) : softmax(x, kAxis);
425 auto x_view = view(tv1, inferred_input, inferred_output);
426 auto y =
427 (view_before_persistent) ? softmax(x_view, kAxis) : add(x_view, bias);
428 fusion.addOutput(y);
429
430 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
431 at::Tensor at_x = at::randn(inferred_input, options);
432 at::Tensor at_bias = at::randn(bias_shape, options);
433 std::vector<IValue> aten_inputs = {at_x, at_bias};
434
435 FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
436 auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
437
438 auto at_tv1 = (view_before_persistent)
439 ? (at_x + at_bias)
440 : at::_softmax(at_x, kAxis, false /* half_to_float */);
441 auto at_x_view = at::native::view(at_tv1, inferred_output);
442 auto at_y = (view_before_persistent)
443 ? at::_softmax(at_x_view, kAxis, false /* half_to_float */)
444 : at::add(at_x_view, at_bias);
445
446 testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
447 }
448}
449
// Sweeps persistentViewAddFusion over the shared shape table in both
// orderings (view before and after the softmax).
TEST_F(NVFuserTest, FusionViewPersistentShmoo_CUDA) {
  for (auto e : all_view_examples) {
    persistentViewAddFusion(
        e.first, e.second, true /* view_before_persistent */);
  }

  for (auto e : all_view_examples) {
    persistentViewAddFusion(
        e.first, e.second, false /* view_before_persistent */);
  }
}
461
462void addViewGeluFusion(
463 std::vector<int64_t>& input_shape,
464 std::vector<int64_t>& output_shape) {
465 for (auto has_implicit_broadcast : {false, true}) {
466 Fusion fusion;
467 FusionGuard fg(&fusion);
468
469 TensorView* x = (has_implicit_broadcast)
470 ? makeConcreteTensor(input_shape)
471 : makeSymbolicTensor(input_shape.size());
472 TensorView* bias = (has_implicit_broadcast)
473 ? makeConcreteTensor(input_shape)
474 : makeSymbolicTensor(input_shape.size());
475 fusion.addInput(x);
476 fusion.addInput(bias);
477
478 auto x_add_bias = add(x, bias);
479 auto x_view = view(x_add_bias, input_shape, output_shape);
480 auto y = gelu(x_view);
481 fusion.addOutput(y);
482
483 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
484 at::Tensor at_x = at::randn(input_shape, options);
485 at::Tensor at_bias = at::randn(input_shape, options);
486 std::vector<IValue> aten_inputs = {at_x, at_bias};
487
488 auto lparams = schedulePointwise(&fusion, aten_inputs);
489
490 FusionExecutor fe;
491 fe.compileFusion(&fusion, aten_inputs, lparams);
492 auto outputs = fe.runFusion(aten_inputs, lparams);
493
494 auto at_x_add_bias = at_x + at_bias;
495 auto at_x_view = at::native::view(at_x_add_bias, output_shape);
496 auto at_y = at::gelu(at_x_view);
497
498 testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
499 }
500}
501
502TEST_F(NVFuserTest, FusionViewSplit_CUDA) {
503 std::vector<int64_t> input_shape{80};
504 std::vector<int64_t> output_shape{2, 4, 10};
505 addViewGeluFusion(input_shape, output_shape);
506}
507
508TEST_F(NVFuserTest, FusionViewBroadcast_CUDA) {
509 std::vector<int64_t> input_shape{80};
510 std::vector<int64_t> output_shape{1, 80};
511 addViewGeluFusion(input_shape, output_shape);
512}
513
514TEST_F(NVFuserTest, FusionViewMerge_CUDA) {
515 std::vector<int64_t> input_shape{2, 40, 7};
516 std::vector<int64_t> output_shape{560};
517 addViewGeluFusion(input_shape, output_shape);
518}
519
520TEST_F(NVFuserTest, FusionViewAllShmoo_CUDA) {
521 for (auto e : all_view_examples) {
522 addViewGeluFusion(e.first, e.second);
523 }
524}
525
526void geluViewAddFusion(
527 std::vector<int64_t> input_shape,
528 std::vector<int64_t> output_shape) {
529 // Support -1 sizes in the inputs
530 auto inferred_shapes = inferViewShapes(input_shape, output_shape);
531 auto inferred_input = inferred_shapes.first;
532 auto inferred_output = inferred_shapes.second;
533
534 for (auto hasImplicitBroadcast : {false, true}) {
535 Fusion fusion;
536 FusionGuard fg(&fusion);
537
538 TensorView* x = (hasImplicitBroadcast)
539 ? makeConcreteTensor(inferred_input)
540 : makeSymbolicTensor(inferred_input.size());
541 TensorView* bias = (hasImplicitBroadcast)
542 ? makeConcreteTensor(inferred_output)
543 : makeSymbolicTensor(inferred_output.size());
544 fusion.addInput(x);
545 fusion.addInput(bias);
546
547 auto x_gelu = gelu(x);
548 auto x_view = view(x_gelu, inferred_input, inferred_output);
549 auto y = add(x_view, bias);
550 fusion.addOutput(y);
551
552 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
553 at::Tensor at_x = at::randn(inferred_input, options);
554 at::Tensor at_bias = at::randn(inferred_output, options);
555 std::vector<IValue> aten_inputs = {at_x, at_bias};
556
557 auto lparams = schedulePointwise(&fusion, aten_inputs);
558
559 FusionExecutor fe;
560 fe.compileFusion(&fusion, aten_inputs, lparams);
561 auto outputs = fe.runFusion(aten_inputs, lparams);
562
563 auto at_x_gelu = at::gelu(at_x);
564 auto at_x_view = at::native::view(at_x_gelu, inferred_output);
565 auto at_y = at_x_view + at_bias;
566
567 testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
568 }
569}
570
// Run the gelu+view+add fusion over the full shared shape table.
TEST_F(NVFuserTest, FusionViewStride_CUDA) {
  for (const auto& e : all_view_examples) {
    geluViewAddFusion(e.first, e.second);
  }
}
576
577void geluViewBinaryAddFusion(
578 std::vector<int64_t> input_shape1,
579 std::vector<int64_t> input_shape2,
580 std::vector<int64_t> output_shape) {
581 for (auto hasImplicitBroadcast : {false, true}) {
582 Fusion fusion;
583 FusionGuard fg(&fusion);
584
585 TensorView* x = (hasImplicitBroadcast)
586 ? makeConcreteTensor(input_shape1)
587 : makeSymbolicTensor(input_shape1.size());
588 TensorView* bias = (hasImplicitBroadcast)
589 ? makeConcreteTensor(input_shape2)
590 : makeSymbolicTensor(input_shape2.size());
591 fusion.addInput(x);
592 fusion.addInput(bias);
593
594 auto x_gelu = gelu(x);
595 auto x_view = view(x_gelu, input_shape1, output_shape);
596 auto bias_view = view(bias, input_shape2, output_shape);
597 auto y = add(x_view, bias_view);
598 fusion.addOutput(y);
599
600 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
601 at::Tensor at_x = at::randn(input_shape1, options);
602 at::Tensor at_bias = at::randn(input_shape2, options);
603 std::vector<IValue> aten_inputs = {at_x, at_bias};
604
605 auto lparams = schedulePointwise(&fusion, aten_inputs);
606
607 FusionExecutor fe;
608 fe.compileFusion(&fusion, aten_inputs, lparams);
609 auto outputs = fe.runFusion(aten_inputs, lparams);
610
611 auto at_x_gelu = at::gelu(at_x);
612 auto at_x_view = at::native::view(at_x_gelu, output_shape);
613 auto at_bias_view = at::native::view(at_bias, output_shape);
614 auto at_y = at_x_view + at_bias_view;
615
616 testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
617 }
618}
619
// Two differently-shaped inputs viewed to one common shape and added
// (27454 * 2 == 54908 == 7844 * 7 elements).
TEST_F(NVFuserTest, FusionViewBinary_CUDA) {
  geluViewBinaryAddFusion({27454, 2}, {54908}, {7844, 7});
}
623
// Repro of issue #1493: concrete-domain resolution when a view output is
// broadcast and merged. Manually scheduled; the merge on tv5 must happen
// before the computeAt calls, matching the order below.
TEST_F(NVFuserTest, FusionViewConcreteDomain_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = makeContigTensor(2);
  fusion.addInput(tv1);

  auto tv2 = view(tv0, {2, 3}, {6});
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
  auto tv4 = broadcast(tv3, {true, false});
  auto tv5 = add(tv4, tv1);

  fusion.addOutput(tv5);

  tv5->merge(0);
  tv0->computeAt(tv5, -1);
  tv1->computeAt(tv5, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  auto t0 = at::randn({2, 3}, options);
  auto t1 = at::randn({1, 6}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t1});
  auto cg_outputs = fe.runFusion({t0, t1});

  auto ref = (at::native::view(t0, {6}) + 1).unsqueeze(0) + t1;

  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
}
658
// Softmax followed by a view ({19, 12, 7, 99} -> {19, 3, 2772}) and an add,
// run through FusionExecutorCache to exercise concrete-domain resolution
// with automatic scheduling.
TEST_F(NVFuserTest, FusionViewConcreteDomain2_CUDA) {
  constexpr int kAxis = -1;
  std::vector<int64_t> input_shape = {19, 12, 7, 99};
  std::vector<int64_t> output_shape = {19, 3, 2772};

  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  TensorView* x = makeSymbolicTensor(input_shape.size());
  TensorView* bias = makeSymbolicTensor(output_shape.size());
  fusion.addInput(x);
  fusion.addInput(bias);

  auto tv1 = softmax(x, kAxis);
  auto x_view = view(tv1, input_shape, output_shape);
  auto y = add(x_view, bias);
  fusion.addOutput(y);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_bias = at::randn(output_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_bias};

  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
  auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);

  auto at_tv1 = at::_softmax(at_x, kAxis, false /* half_to_float */);
  auto at_x_view = at::native::view(at_tv1, output_shape);
  auto at_y = at::add(at_x_view, at_bias);

  testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
}
692
// Repro of issue #1608: two views from different source shapes meeting at a
// common output shape, with a broadcast add feeding one of them.
TEST_F(NVFuserTest, FusionViewConcreteDomain3_CUDA) {
  std::vector<int64_t> input_shape = {14, 12, 8, 100};
  std::vector<int64_t> bcast_shape = {14, 12, 8, 1};
  std::vector<int64_t> other_shape = {14, 100, 96};
  std::vector<int64_t> output_shape = {14, 3, 3200};

  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  TensorView* x = makeSymbolicTensor(input_shape.size());
  // Concrete so the trailing size-1 dim is a broadcast domain.
  TensorView* y = makeConcreteTensor(bcast_shape);
  TensorView* z = makeSymbolicTensor(other_shape.size());
  fusion.addInput(x);
  fusion.addInput(y);
  fusion.addInput(z);

  auto tv1 = add(x, y);
  auto tv2 = view(tv1, input_shape, output_shape);
  auto tv3 = view(z, other_shape, output_shape);
  auto output = add(tv2, tv3);
  fusion.addOutput(output);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_y = at::randn(bcast_shape, options);
  at::Tensor at_z = at::randn(other_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_y, at_z};

  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
  auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);

  auto at_tv1 = at::add(at_x, at_y);
  auto at_tv2 = at::native::view(at_tv1, output_shape);
  auto at_tv3 = at::native::view(at_z, output_shape);
  auto at_output = at::add(at_tv2, at_tv3);

  testValidate(&fusion, outputs, aten_inputs, {at_output}, __LINE__, __FILE__);
}
733
// Checks ComputeAtMap concrete-ID selection when a broadcast feeds a view:
// the chosen concrete ID must exact-map with both the view output and its
// consumer. No kernel is run; this only inspects the compute-at map.
TEST_F(NVFuserTest, FusionViewConcreteDomain4_CUDA) {
  std::vector<int64_t> shape1 = {3, 4, 5};
  std::vector<int64_t> shape2 = {3 * 4 * 5};

  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(shape1.size() - 1);
  fusion.addInput(tv0);

  auto tv1 = makeSymbolicTensor(shape1.size());
  fusion.addInput(tv1);

  auto tv2 = broadcast(tv0, {true, false, false});
  auto tv3 = add(tv1, tv2);
  auto tv4 = view(tv3, shape1, shape2);
  auto tv5 = set(tv4);
  fusion.addOutput(tv5);

  tv0->computeAt(tv5, -1);
  tv1->computeAt(tv5, -1);

  TORCH_CHECK(tv5->nDims() == 1);

  // The concrete domain of tv5, which is 1D, with permissive or loop mapping
  // needs to be either the domain of tv4 or tv5, both of which have the three
  // concrete root domains of tv1. In other words, it must map with tv4 and tv5
  // with the exact mapping.
  ComputeAtMap map(&fusion);
  auto concrete_id =
      map.getConcreteMappedID(tv5->axis(0), IdMappingMode::PERMISSIVE);
  TORCH_CHECK(
      map.areMapped(concrete_id, tv5->axis(0), IdMappingMode::EXACT),
      "Invalid concrete ID: ",
      concrete_id->toString());
  TORCH_CHECK(
      map.areMapped(concrete_id, tv4->axis(0), IdMappingMode::EXACT),
      "Invalid concrete ID: ",
      concrete_id->toString());
}
775
// Checks that ComputeAtMap concrete-ID selection does not depend on the
// order in which two view paths are built from the same cached input. One
// path views to 2D, the other broadcasts, adds, and views to 3D; both orders
// of construction must yield a valid loop-mapped concrete ID. No kernel is
// run; this only inspects the compute-at map.
TEST_F(NVFuserTest, FusionViewConcreteDomain5_CUDA) {
  const std::vector<int64_t> shape1 = {12};
  const std::vector<int64_t> shape2 = {4, 3};
  const std::vector<int64_t> shape3 = {12, 5};
  const std::vector<int64_t> shape4 = {4, 3, 5};

  for (auto order : {true, false}) {
    std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
    Fusion& fusion = *fusion_ptr.get();
    FusionGuard fg(&fusion);

    auto tv0 = makeSymbolicTensor(1);
    fusion.addInput(tv0);

    auto tv1 = makeSymbolicTensor(2);
    fusion.addInput(tv1);

    auto tv0_cache = set(tv0);

    // Path 1: plain 1D -> 2D view of the cached input.
    auto path1 = [&]() {
      auto view_2d = view(tv0_cache, shape1, shape2);
      auto view_2d_copy = set(view_2d);
      fusion.addOutput(view_2d_copy);
      return view_2d_copy;
    };

    // Path 2: broadcast + add with tv1, then a 2D -> 3D view.
    auto path2 = [&]() {
      auto tv0_bc = broadcast(tv0_cache, {false, true});
      auto tv0_bc_plus_tv1 = add(tv0_bc, tv1);
      auto view_3d = view(tv0_bc_plus_tv1, shape3, shape4);
      auto view_3d_copy = set(view_3d);
      fusion.addOutput(view_3d_copy);
      return view_3d_copy;
    };

    TensorView* path1_out = nullptr;
    TensorView* path2_out = nullptr;

    if (order) {
      // Fails before #1544. Concrete ID is picked from path1_out, which
      // doesn't have the second root domain of tv1
      path2_out = path2();
      path1_out = path1();
    } else {
      // Works fine
      path1_out = path1();
      path2_out = path2();
    }

    // Flatten path2's 3D output to 1D so both outputs are 1D.
    path2_out->merge(-2, -1);
    path2_out->merge(-2, -1);

    tv0->computeAt(path2_out, -1);
    tv1->computeAt(path2_out, -1);

    TORCH_CHECK(path1_out->nDims() == 1);
    TORCH_CHECK(path2_out->nDims() == 1);

    ComputeAtMap map(&fusion);

    // Make sure the two output tensors are mapped. Note both are 1D.
    TORCH_CHECK(map.areMapped(
        path1_out->axis(0), path2_out->axis(0), IdMappingMode::LOOP));

    auto concrete_id =
        map.getConcreteMappedID(path2_out->axis(0), IdMappingMode::LOOP);
    TORCH_CHECK(
        path2_out->axis(0) == concrete_id,
        "Incorrect concrete ID: ",
        concrete_id->toString());
  }
}
848
// flatten() applied to an unsqueezed 1D tensor: {512} -> {512, 1} -> {512},
// scheduled manually (split + computeAt + TIDx) and validated against the
// equivalent ATen unsqueeze/flatten.
TEST_F(NVFuserTest, FusionFlattenAfterUnsqueezeOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> input_shape{512};

  TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Double);
  TensorView* bias = makeSymbolicTensor(input_shape.size(), DataType::Double);
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  auto x_unsqueeze = unsqueeze(x_add_bias, -1);
  auto x_view = flatten(x_unsqueeze);
  fusion.addOutput(x_view);

  auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_bias = at::randn(input_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_bias};

  x_view->split(0, 4);
  x_add_bias->computeAt(x_view, 1);
  x_view->axis(0)->parallelize(ParallelType::TIDx);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at_x_add_bias.unsqueeze(-1).flatten();

  testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__);
}
883
// Checks ComputeAtRootDomainMap in the presence of a view: when one path
// reduces+broadcasts an axis and another path views it, the reduced axis of
// tv1 must not be reported as mappable to tv2. No kernel is run.
TEST_F(NVFuserTest, FusionComputeAtRootDomainMapWithView_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int64_t> input_shape1{10, 12};
  const std::vector<int64_t> input_shape2{10, 3, 4};

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));

  // reduction followed by broadcast
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true, true});

  // Path with a view
  auto tv4 = view(tv1, input_shape1, input_shape2);

  // Join the reduction+broadcast and view paths together
  auto tv5 = add(tv3, tv4);
  fusion.addOutput(tv5);

  ComputeAtRootDomainMap map;
  map.build();

  // It's not possible to compute tv1 at the -1 position of
  // t2. ComputeAtRootDomainMap should tell that by not mapping the
  // second axis.
  auto tv1_tv2_mappable_dims =
      map.getMappableDims(tv1->domain(), tv2->domain());
  TORCH_CHECK(
      tv1_tv2_mappable_dims.find(tv1->axis(1)) == tv1_tv2_mappable_dims.end(),
      "Invalid ComputeAtRootDomainMap. Domain should not be mappable: ",
      tv1->axis(1)->toString());
}
920
// expand_as of a {4, 1, 1} tensor to a {4, 3, 2} tensor. Compiles without
// binding inputs, then runs twice with the same arguments — the second run
// exercises the cached output allocation path.
TEST_F(NVFuserTest, FusionExpandRepro_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int64_t> input_shape1{4, 1, 1};
  const std::vector<int64_t> input_shape2{4, 3, 2};

  // Concrete size-1 dims so they are broadcast domains that can be expanded.
  auto tv0 = makeConcreteTensor({-1, 1, 1});
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(3);
  fusion.addInput(tv1);

  auto tv2 = expand_as(tv0, tv1);
  fusion.addOutput(tv2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape1, options);
  at::Tensor at_y = at::randn(input_shape2, options);
  std::vector<IValue> aten_inputs = {at_x, at_y};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  LaunchParams l_params;
  auto outputs = fe.runFusion(aten_inputs, {}, l_params, 0);

  auto out = at_x.expand_as(at_y);

  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);

  // second run to verify cached output allocation
  outputs = fe.runFusion(aten_inputs, {}, l_params, 0);
  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);
}
954
955TEST_F(NVFuserTest, FusionExpandView1_CUDA) {
956 auto fusion = std::make_unique<Fusion>();
957 FusionGuard fg(fusion.get());
958
959 auto tv0 = makeConcreteTensor({4, 1, 8});
960 fusion->addInput(tv0);
961
962 auto tv1 = makeConcreteTensor({12, 8});
963 fusion->addInput(tv1);
964
965 auto tv2 = expand(
966 tv0,
967 {IrBuilder::create<Int>(4),
968 IrBuilder::create<Int>(3),
969 IrBuilder::create<Int>(8)});
970
971 auto tv3 = view(tv2, {4, 3, 8}, {12, 8});
972 auto tv4 = add(tv3, tv1);
973 fusion->addOutput(tv4);
974
975 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
976 at::manual_seed(0);
977 auto t0 = at::randn({4, 1, 8}, options);
978 auto t1 = at::randn({12, 8}, options);
979
980 FusionExecutorCache executor_cache(std::move(fusion));
981 auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1});
982
983 auto ref = at::reshape(t0.expand({4, 3, 8}), {12, 8}) + t1;
984
985 testValidate(
986 executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
987}
988
989TEST_F(NVFuserTest, FusionExpandView2_CUDA) {
990 auto fusion = std::make_unique<Fusion>();
991 FusionGuard fg(fusion.get());
992
993 auto tv0 = makeConcreteTensor({1, 8});
994 fusion->addInput(tv0);
995
996 auto tv1 = makeConcreteTensor({3, 4, 8});
997 fusion->addInput(tv1);
998
999 auto tv2 =
1000 expand(tv0, {IrBuilder::create<Int>(12), IrBuilder::create<Int>(8)});
1001
1002 auto tv3 = view(tv2, {12, 8}, {3, 4, 8});
1003 auto tv4 = add(tv3, tv1);
1004 fusion->addOutput(tv4);
1005
1006 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1007 at::manual_seed(0);
1008 auto t0 = at::randn({1, 8}, options);
1009 auto t1 = at::randn({3, 4, 8}, options);
1010
1011 FusionExecutorCache executor_cache(std::move(fusion));
1012 auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1});
1013
1014 auto ref = at::reshape(t0.expand({12, 8}), {3, 4, 8}) + t1;
1015
1016 testValidate(
1017 executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
1018}
1019
// Exercises analyzeViewConstraint equality, which decides whether a cached
// set of view transformations can be reused for a different concrete shape
// pair. Each case below pins whether two (input shape, output shape) view
// requests produce the same constraint.
TEST_F(NVFuserTest, FusionViewTransformCache_CUDA) {
  // Asserts that two view requests yield identical view constraints
  // (i.e. the cached transforms are reusable between them).
  auto assert_matches = [](view_example example_0, view_example example_1) {
    TORCH_INTERNAL_ASSERT(
        analyzeViewConstraint(example_0.first, example_0.second) ==
            analyzeViewConstraint(example_1.first, example_1.second),
        "View: ",
        example_0.first,
        " -> ",
        example_0.second,
        " Does not match:",
        example_1.first,
        " -> ",
        example_1.second);
  };

  // Asserts that two view requests yield different view constraints
  // (i.e. the cached transforms must NOT be shared between them).
  auto assert_does_not_match = [](view_example example_0,
                                  view_example example_1) {
    TORCH_INTERNAL_ASSERT(
        !(analyzeViewConstraint(example_0.first, example_0.second) ==
          analyzeViewConstraint(example_1.first, example_1.second)),
        "View: ",
        example_0.first,
        " -> ",
        example_0.second,
        " Should not match:",
        example_1.first,
        " -> ",
        example_1.second);
  };

  // Splits are done as splitting out left hand side, so left hand side
  // split changes can't reuse view, but right hand side split changes can.
  // Merges, since they don't bury hard values in can always be reshared.
  // Need to make sure trivial reduction, and broadcast changes don't try to
  // reuse view. What matches and what doesn't is very specific to the
  // implementation of how the splits/merges are generated. This could be
  // changed over time as there isn't a single set of transformations to
  // potentially make a view. For example we could always merge all dimensions,
  // then split out all dimensions. This would always be valid but would not be
  // efficient for indexing.

  // "Same"
  assert_matches(
      {{1, 1, 3333, 1}, {1, 1, 3333, 1}}, {{1, 1, 3333, 1}, {1, 1, -1, 1}});
  assert_matches(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, 8}},
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, -1}});

  // Trivial reduce matching
  assert_matches({{1, 3333, 1}, {-1}}, {{1, 24, 1}, {-1}});

  // Trivial reduce not matching
  assert_does_not_match({{1, 3333, 1}, {-1}}, {{1, 3333}, {-1}});

  // Broadcast matching
  assert_matches({{3333}, {1, -1, 1}}, {{24}, {1, -1, 1}});

  // Broadcast not matching
  assert_does_not_match({{3333}, {1, -1, 1}}, {{24}, {1, -1}});

  // RHS split
  assert_matches(
      {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, -1}},
      {{3, 17, 2 * 4 * 10 * 7, 1}, {3 * 17, 1, 2, 4, -1}});
  assert_matches(
      {{1, 303 * 11, 1}, {1, 303, -1, 1}},
      {{1, 303 * 11 * 4, 1}, {1, 303, -1, 1}});
  assert_matches(
      {{2, 3, 2 * 2 * 3, 5}, {1, 2 * 3, 1, 2, -1, 5, 1}},
      {{2, 3, 2 * 2 * 4, 5}, {1, 2 * 3, 1, 2, -1, 5, 1}});
  assert_matches(
      {{22, 11 * 2, 2}, {22, 11, 1, 1, -1}},
      {{22, 11 * 2 * 4, 2 * 3}, {22, 11, 1, 1, -1}});
  assert_matches(
      {{1, 1111 * 3}, {1, 1, 1, 1111, 1, -1}},
      {{1, 1111 * 3 * 7}, {1, 1, 1, 1111, 1, -1}});
  assert_matches(
      {{1, 303 * 11 * 2, 1}, {1, 303, -1, 1}},
      {{1, 303 * 11 * 3, 1}, {1, 303, -1, 1}});
  assert_matches(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, -1, 1, 8}},
      {{8, 1, 1, 2 * 4 * 6, 1, 8}, {8, 2, -1, 1, 8}});

  // LHS split not matching
  assert_does_not_match(
      {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, -1, 10}},
      {{3, 17, 2 * 4 * 3 * 10, 1}, {3 * 17, 1, 2, -1, 10}});
  assert_does_not_match(
      {{1, 303 * 11, 1}, {1, -1, 11, 1}},
      {{1, 303 * 11 * 2, 1}, {1, -1, 11, 1}});
  assert_does_not_match(
      {{2, 3, 2 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}},
      {{2, 3, 3 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}});
  assert_does_not_match(
      {{22, (11 + 1) * 2, 2}, {22, -1, 1, 1, 2 * 2}},
      {{22, 11 * 2, 2}, {22, -1, 1, 1, 2 * 2}});
  assert_does_not_match(
      {{1, 1111 * 3}, {1, 1, 1, -1, 1, 3}},
      {{1, 1111 * 2 * 3}, {1, 1, 1, -1, 1, 3}});
  assert_does_not_match(
      {{1, 303 * 11, 1}, {1, -1, 11, 1}},
      {{1, (303 + 1) * 11, 1}, {1, -1, 11, 1}});
  assert_does_not_match(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, -1, 4, 1, 8}},
      {{8, 1, 1, 3 * 4, 1, 8}, {8, -1, 4, 1, 8}});

  // Merge matching
  assert_matches(
      {{3, 17, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}},
      {{4, 18, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}});
  assert_matches({{22, 1, 23, 1}, {-1, 1}}, {{23, 1, 22, 1}, {-1, 1}});

  // Merge not matching
  assert_does_not_match({{2, 3, 4}, {-1, 4}}, {{2, 3, 4}, {2, -1}});
  assert_does_not_match(
      {{22, 1, 23, 1, 24}, {-1, 24}}, {{22, 1, 23, 1, 24}, {22, -1}});

  // Split->Merge matching
  assert_matches(
      {{22, 11 * 2, 3}, {22, 11, 1, 1, -1}},
      {{22, 11 * 3, 2}, {22, 11, 1, 1, -1}});
  assert_matches(
      {{1, 3922 * 3 * 7, 1, 2 * 2}, {1, 3922 * 2, 1, -1}},
      {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, -1}});

  // Split->Merge not matching
  assert_does_not_match(
      {{22, 11 * 2, 2}, {22, -1, 1, 1, 4}},
      {{22, 11 * 2 * 3, 2}, {22, -1, 1, 1, 4}});
  assert_does_not_match(
      {{1, 3922 * 7, 1, 2}, {1, -1, 1, 7}},
      {{1, 3922 * 7 * 2, 1, 2}, {1, -1, 1, 7}});

  // Merge->Split matching
  assert_matches(
      {{1, 3922 * 2, 1, 7}, {1, 3922 * 7, -1}},
      {{1, 3922 * 2 * 3, 1, 7}, {1, 3922 * 7, -1}});
  assert_matches(
      {{19, 3 * 4, 7, 99}, {19, 3, -1}}, {{19, 3 * 3, 8, 10}, {19, 3, -1}});

  // Merge->Split not matching
  assert_does_not_match(
      {{1, 3922 * 2, 1, 7}, {1, -1, 2}}, {{1, 3922, 1, 7}, {1, -1, 2}});
  assert_does_not_match(
      {{19, 3 * 4, 7, 99}, {19, -1, 3}}, {{19, 3 * 5, 7, 99}, {19, -1, 3}});
}
1166
1167TEST_F(NVFuserTest, FusionViewIdGraph_CUDA) {
1168 Fusion fusion;
1169 FusionGuard fg(&fusion);
1170
1171 int w = 2, x = 3, y = 4, z = 5;
1172
1173 auto tv0 = makeConcreteTensor({w, x, y, z});
1174 fusion.addInput(tv0);
1175
1176 auto tv1 = sin(tv0);
1177
1178 auto tv2 = view(tv1, {w, x, y, z}, {w, y, x * z});
1179 fusion.addOutput(tv2);
1180
1181 auto tv3 = makeConcreteTensor({w, x, y, z});
1182 fusion.addInput(tv3);
1183
1184 auto tv4 = view(tv3, {w, x, y, z}, {w, y, x * z});
1185 fusion.addOutput(tv4);
1186
1187 // Link 0 and 3 together for view analysis done based on before the views
1188 // actually happened.
1189 auto tv5 = add(tv0, tv3);
1190 fusion.addOutput(tv5);
1191
1192 auto tv6 = makeConcreteTensor({w, x, x, y, z});
1193
1194 auto tv7 = sum(tv6, {2});
1195 auto tv8 = broadcast(tv7, {false, true, false, true, false, false});
1196
1197 auto tv9 = makeConcreteTensor({w, 6, x, 7, y, z});
1198 fusion.addInput(tv9);
1199 auto tv10 = add(tv8, tv9);
1200 fusion.addOutput(tv10);
1201
1202 auto tv12 = view(tv8, {w, 1, x, 1, y, z}, {w, y, x * z});
1203 fusion.addOutput(tv12);
1204
1205 // Link the views after the views happen
1206 auto t13 = add(tv12, tv4);
1207 fusion.addOutput(t13);
1208
1209 // Grab the trivial reduced tensor from t12's view.
1210 auto tv11 = ir_utils::producerTvsOf(tv12)[0];
1211
1212 // Start from the exact iter domain graph of the fusion
1213 IterDomainGraph id_graph(&fusion);
1214 auto disjoint_view_ids = id_graph.exactNodes();
1215
1216 TORCH_CHECK(
1217 id_graph.exactNodes().strictAreMapped(tv2->axis(1), tv4->axis(1)));
1218 TORCH_CHECK(
1219 id_graph.exactNodes().strictAreMapped(tv2->axis(2), tv4->axis(2)));
1220
1221 TORCH_CHECK(id_graph.exactNodes().strictAreMapped(
1222 tv2->getRootDomain()[1], tv12->getRootDomain()[1]));
1223 TORCH_CHECK(id_graph.exactNodes().strictAreMapped(
1224 tv2->getRootDomain()[2], tv12->getRootDomain()[2]));
1225 TORCH_CHECK(id_graph.exactNodes().strictAreMapped(
1226 tv2->getRootDomain()[3], tv12->getRootDomain()[3]));
1227}
1228
1229TEST_F(NVFuserTest, FusionViewVectorize_CUDA) {
1230 Fusion fusion;
1231 FusionGuard fg(&fusion);
1232
1233 auto tv0 = makeContigTensor(3);
1234 fusion.addInput(tv0);
1235 auto tv1 = flatten(tv0, 1, 2);
1236 auto tv2 = flatten(tv0, 1, 2);
1237 auto tv3 = sin(tv1);
1238 auto tv4 = sin(tv2);
1239 fusion.addOutput(tv3);
1240 fusion.addOutput(tv4);
1241
1242 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1243 at::Tensor input = at::randn({256, 1024, 1024}, options);
1244
1245 auto lparams = schedulePointwise(&fusion, {input});
1246
1247 auto hasVectorization = [](TensorView* tv) -> bool {
1248 for (auto i : tv->domain()->domain()) {
1249 if (i->getParallelType() == ParallelType::Vectorize) {
1250 return true;
1251 }
1252 }
1253 return false;
1254 };
1255
1256 for (auto o : fusion.outputs()) {
1257 TORCH_CHECK(hasVectorization(o->as<TensorView>()));
1258 }
1259 for (auto i : fusion.inputs()) {
1260 for (auto c : ir_utils::consumerTvsOf(i->as<TensorView>())) {
1261 TORCH_CHECK(hasVectorization(c));
1262 }
1263 }
1264
1265 FusionExecutor fe;
1266 fe.compileFusion(&fusion, {input}, lparams);
1267 auto outputs = fe.runFusion({input}, lparams);
1268
1269 auto tv_ref = input.flatten(1, 2).sin();
1270
1271 testValidate(&fusion, outputs, {input}, {tv_ref, tv_ref}, __LINE__, __FILE__);
1272}
1273
1274TEST_F(NVFuserTest, FusionExpandFlatten_CUDA) {
1275#ifdef FBCODE_CAFFE2
1276 GTEST_SKIP() << "Fails accuracy on V100 32gb";
1277#endif
1278 auto fusion = std::make_unique<Fusion>();
1279 FusionGuard fg(fusion.get());
1280
1281 auto tv0 = makeConcreteTensor({-1, -1, 1});
1282 fusion->addInput(tv0);
1283 auto tv1 = expand(
1284 tv0,
1285 {tv0->axis(0)->extent(),
1286 tv0->axis(1)->extent(),
1287 IrBuilder::create<Int>(8)});
1288 auto tv2 = flatten(tv1, 1, 2);
1289 auto tv3 = sum(tv2, {1});
1290 fusion->addOutput(tv3);
1291
1292 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1293 at::Tensor input = at::randn({256, 1024, 1}, options);
1294
1295 FusionExecutorCache executor_cache(std::move(fusion));
1296 auto cg_outputs = executor_cache.runFusionWithInputs({input});
1297
1298 auto aten_out = input.expand({256, 1024, 8}).flatten(1, 2).sum(1);
1299
1300 testValidate(
1301 executor_cache.fusion(),
1302 cg_outputs,
1303 {input},
1304 {aten_out},
1305 __LINE__,
1306 __FILE__);
1307}
1308
1309TEST_F(NVFuserTest, FusionIllegalReductionFlatten_CUDA) {
1310 EXPECT_THAT(
1311 []() {
1312 auto fusion = std::make_unique<Fusion>();
1313 FusionGuard fg(fusion.get());
1314
1315 auto tv0 = makeConcreteTensor({2, 3});
1316 fusion->addInput(tv0);
1317
1318 auto tv1 = sum(tv0, {1});
1319 auto tv2 = flatten(tv1, 0, 1);
1320 fusion->addOutput(tv2);
1321 },
1322 testing::ThrowsMessage<c10::Error>(
1323 testing::HasSubstr("Invalid end_dim")));
1324}
1325
1326TEST_F(NVFuserTest, FusionReductionFlatten1_CUDA) {
1327 auto fusion = std::make_unique<Fusion>();
1328 FusionGuard fg(fusion.get());
1329
1330 auto tv0 = makeConcreteTensor({2, 3, 5});
1331 fusion->addInput(tv0);
1332
1333 auto tv1 = sum(tv0, {1});
1334 auto tv2 = flatten(tv1, 0, 1);
1335 fusion->addOutput(tv2);
1336
1337 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1338 at::manual_seed(0);
1339 auto t0 = at::randn({2, 3, 5}, options);
1340 auto ref = t0.sum({1}).flatten(0, 1);
1341
1342 FusionExecutorCache executor_cache(std::move(fusion));
1343 auto cg_outputs = executor_cache.runFusionWithInputs({t0});
1344
1345 testValidate(
1346 executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
1347}
1348
// Manually schedules a fusion with two matching views plus a pre-view link
// between the inputs: propagates the view transforms across the fusion, then
// flattens/splits/parallelizes from tv5 and validates execution.
TEST_F(NVFuserTest, FusionPwiseViewSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
  {
    // First propagate the view transforms of tv4 to the rest of the fusion.
    TransformPropagator propagator(tv4);
    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
  }

  // Flatten tv5 to 1D, then split for BIDx/Unroll/TIDx.
  for (auto i : c10::irange(tv5->nDims() - 1)) {
    tv5->merge(0);
  }
  tv5->split(0, 32);
  tv5->split(0, 4);
  tv5->axis(0)->parallelize(ParallelType::BIDx);
  tv5->axis(1)->parallelize(ParallelType::Unroll);
  tv5->axis(2)->parallelize(ParallelType::TIDx);

  {
    // Propagate tv5's schedule and parallelization everywhere.
    TransformPropagator propagator(tv5);
    MaxRootDomainInfoSpanningTree spanning_tree(tv5);
    spanning_tree.traverse(&propagator);
    scheduler_utils::parallelizeAllLike(tv5);

    // Inline the schedule
    inlineMost();
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
}
1414
// Like FusionPwiseViewSchedule, but the second view feeds a reduction:
// propagates the view transforms, rFactors the reduction, and schedules the
// whole fusion from the rFactor tensor before validating execution.
TEST_F(NVFuserTest, FusionSumViewSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  auto tv5 = sum(tv4, {1});
  fusion.addOutput(tv5);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv6 = add(tv0, tv3);
  fusion.addOutput(tv6);

  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
  {
    // First propagate the view transforms of tv4 to the rest of the fusion.
    TransformPropagator propagator(tv4);
    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
  }

  // Split the reduction axis and rFactor the two outer split factors.
  tv5->split(1, 128);
  tv5->split(1, 4);

  auto tv5_rf = tv5->rFactor({1, 2});
  tv5_rf->axis(0)->parallelize(ParallelType::BIDx);
  tv5_rf->axis(2)->parallelize(ParallelType::Unroll);
  tv5_rf->axis(3)->parallelize(ParallelType::TIDx);

  {
    // Propagate the rFactor tensor's schedule and parallelization everywhere.
    TransformPropagator propagator(tv5_rf);
    MaxRootDomainInfoSpanningTree spanning_tree(tv5_rf);
    spanning_tree.traverse(&propagator);
    scheduler_utils::parallelizeAllLike(tv5_rf);

    // Inline the schedule
    inlineMost();
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t4.sum({1});
  auto t6 = t0 + t3;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__);
}
1481
1482// Make sure matching views are segmented into the same kernel
1483TEST_F(NVFuserTest, FusionViewMagicSchedule1_CUDA) {
1484 auto fusion_ptr = std::make_unique<Fusion>();
1485 Fusion& fusion = *fusion_ptr.get();
1486 FusionGuard fg(&fusion);
1487
1488 int x = 31, y = 65, z = 103;
1489
1490 auto tv0 = makeConcreteTensor({x, y, z});
1491 fusion.addInput(tv0);
1492
1493 auto tv1 = sin(tv0);
1494
1495 auto tv2 = view(tv1, {x, y, z}, {x, y * z});
1496 fusion.addOutput(tv2);
1497
1498 auto tv3 = makeConcreteTensor({x, y, z});
1499 fusion.addInput(tv3);
1500
1501 auto tv4 = view(tv3, {x, y, z}, {x, y * z});
1502 fusion.addOutput(tv4);
1503
1504 // Link 0 and 3 together for view analysis done based on before the views
1505 // actually happened.
1506 auto tv5 = add(tv0, tv3);
1507 fusion.addOutput(tv5);
1508
1509 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1510
1511 at::Tensor t0 = at::randn({x, y, z}, options);
1512 at::Tensor t3 = at::randn({x, y, z}, options);
1513 auto t1 = sin(t0);
1514 auto t2 = at::native::view(t1, {x, y * z});
1515 auto t4 = at::native::view(t3, {x, y * z});
1516 auto t5 = t0 + t3;
1517
1518 FusionExecutorCache executor_cache(std::move(fusion_ptr));
1519 auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3});
1520 TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
1521
1522 testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
1523}
1524
1525// Make sure views of views are correct
1526TEST_F(NVFuserTest, FusionViewMagicSchedule2_CUDA) {
1527 auto fusion_ptr = std::make_unique<Fusion>();
1528 Fusion& fusion = *fusion_ptr.get();
1529 FusionGuard fg(&fusion);
1530
1531 int x = 31, y = 65, z = 103;
1532
1533 auto tv0 = makeConcreteTensor({x, y, z});
1534 fusion.addInput(tv0);
1535
1536 auto tv1 = sin(tv0);
1537
1538 auto tv2 = view(tv1, {x, y, z}, {x, y * z});
1539 auto tv3 = view(tv2, {x, y * z}, {x * y, z});
1540 auto tv4 = view(tv3, {x * y, z}, {y, x * z});
1541 auto tv5 = view(tv4, {y, x * z}, {x, y, z});
1542 fusion.addOutput(tv5);
1543
1544 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1545
1546 at::Tensor t0 = at::randn({x, y, z}, options);
1547 auto aten_out = sin(t0);
1548
1549 // For now pointwise scheduler only accepts a single view at a time, so this
1550 // will be broken up into multiple kernels. This is due to the reference check
1551 // looking for all mappings to all input IDs.
1552 // TODO: Fix the reference check for this case
1553 FusionExecutorCache executor_cache(std::move(fusion_ptr));
1554 auto cg_outputs = executor_cache.runFusionWithInputs({t0});
1555
1556 testValidate(&fusion, cg_outputs, {t0}, {aten_out}, __LINE__, __FILE__);
1557}
1558
1559// Make sure broadcasts not on the view path that don't interfere with view are
1560// segmented in one kernel and correctly trigger 2D pointwise scheduling
TEST_F(NVFuserTest, FusionViewMagicSchedule3_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  // Broadcast on another branch to drive the pointwise reference to not be on
  // the view paths.

  auto tv6 = makeConcreteTensor({w, x, y, z});
  fusion.addInput(tv6);
  auto tv7 = broadcast(tv0, {true, false, false, false});
  auto tv8 = add(tv6, tv7);
  // tv8 should be the reference for the pointwise fusion. This broadcast
  // pattern doesn't interfere with the views, so this should also be scheduled
  // as 2D.
  fusion.addOutput(tv8);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // ATen reference computations mirroring the fusion outputs.
  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;
  at::Tensor t6 = at::randn({w, x, y, z}, options);
  auto t8 = t6.add(t0);

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  // Collect the heuristic params
  executor_cache.profile(true);
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6});

  // Everything must fit in one pointwise kernel, scheduled as 2D
  // (break_point == 1).
  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
  TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
                  .params->isA<PointwiseParams>());
  auto pparams =
      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
  TORCH_CHECK(pparams->break_point == 1);

  testValidate(
      &fusion, cg_outputs, {t0, t3, t6}, {t2, t4, t5, t8}, __LINE__, __FILE__);
}
1625
1626// Make sure broadcasts through views when not conflicting with view are
1627// segmented into one kernel and trigger 2D pointwise scheduler.
1628TEST_F(NVFuserTest, FusionViewMagicSchedule4_CUDA) {
1629 auto fusion_ptr = std::make_unique<Fusion>();
1630 Fusion& fusion = *fusion_ptr.get();
1631 FusionGuard fg(&fusion);
1632
1633 int w = 15, x = 31, y = 49, z = 65;
1634
1635 auto tv0 = makeConcreteTensor({x, y, z});
1636 fusion.addInput(tv0);
1637
1638 auto tv1 = sin(tv0);
1639
1640 auto tv2 = view(tv1, {x, y, z}, {x, y * z});
1641 fusion.addOutput(tv2);
1642
1643 auto tv3 = makeConcreteTensor({x, y, z});
1644 fusion.addInput(tv3);
1645
1646 auto tv4 = makeConcreteTensor({x, 1, 1});
1647 fusion.addInput(tv4);
1648
1649 auto tv5 = add(tv4, tv3);
1650
1651 auto tv6 = view(tv5, {x, y, z}, {x, y * z});
1652 fusion.addOutput(tv6);
1653
1654 // Link 0 and 3 together for view analysis done based on before the views
1655 // actually happened.
1656 auto tv7 = add(tv0, tv3);
1657 fusion.addOutput(tv7);
1658
1659 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1660
1661 at::Tensor t0 = at::randn({x, y, z}, options);
1662 at::Tensor t3 = at::randn({x, y, z}, options);
1663 at::Tensor t4 = at::randn({x, 1, 1}, options);
1664 auto t1 = sin(t0);
1665 auto t2 = at::native::view(t1, {x, y * z});
1666 auto t5 = t4 + t3;
1667 auto t6 = at::native::view(t5, {x, y * z});
1668 auto t7 = t0 + t3;
1669
1670 FusionExecutorCache executor_cache(std::move(fusion_ptr));
1671 // Collect the heuristic params
1672 executor_cache.profile(true);
1673 auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t4});
1674
1675 TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
1676 TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
1677 .params->isA<PointwiseParams>());
1678 auto pparams =
1679 executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
1680 TORCH_CHECK(pparams->break_point == 1);
1681
1682 testValidate(
1683 &fusion, cg_outputs, {t0, t3, t4}, {t2, t6, t7}, __LINE__, __FILE__);
1684}
1685
1686// Make sure different views that are consumed by the reference are segmented
1687// into a single kernel.
1688TEST_F(NVFuserTest, FusionViewMagicSchedule5_CUDA) {
1689 auto fusion_ptr = std::make_unique<Fusion>();
1690 Fusion& fusion = *fusion_ptr.get();
1691 FusionGuard fg(&fusion);
1692
1693 int w = 15, x = 31, y = 49, z = 65;
1694
1695 auto tv0 = makeConcreteTensor({w, x, y * z});
1696 fusion.addInput(tv0);
1697 auto tv1 = sin(tv0);
1698 auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w});
1699
1700 auto tv3 = makeConcreteTensor({w, x * y, z});
1701 fusion.addInput(tv3);
1702 auto tv4 = cos(tv3);
1703 auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w});
1704
1705 auto tv6 = add(tv2, tv5);
1706 fusion.addOutput(tv6);
1707
1708 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1709
1710 at::Tensor t0 = at::randn({w, x, y * z}, options);
1711 auto t1 = sin(t0);
1712 auto t2 = at::native::view(t1, {z, y, x, w});
1713 at::Tensor t3 = at::randn({w, x * y, z}, options);
1714 auto t4 = cos(t3);
1715 auto t5 = at::native::view(t4, {z, y, x, w});
1716 auto t6 = add(t2, t5);
1717
1718 FusionExecutorCache executor_cache(std::move(fusion_ptr));
1719 // Collect the heuristic params
1720 executor_cache.profile(true);
1721 auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3});
1722
1723 TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
1724 TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
1725 .params->isA<PointwiseParams>());
1726
1727 testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__);
1728}
1729
// Manually schedule two different views feeding a common consumer: flatten
// and parallelize the consumer, propagate the transforms through both view
// paths, and make sure compilation and execution produce correct results.
TEST_F(NVFuserTest, FusionViewMapping_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  // Two inputs with different shapes, both viewed into {z, y, x, w}.
  auto tv0 = makeConcreteTensor({w, x, y * z});
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w});

  auto tv3 = makeConcreteTensor({w, x * y, z});
  fusion.addInput(tv3);
  auto tv4 = cos(tv3);
  auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w});

  auto tv6 = add(tv2, tv5);
  fusion.addOutput(tv6);

  // Flatten the output to 1D, then split for BIDx/Unroll/TIDx.
  tv6->merge(0);
  tv6->merge(0);
  tv6->merge(0);
  tv6->split(0, 128);
  tv6->split(0, 4);
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(1)->parallelize(ParallelType::Unroll);
  tv6->axis(2)->parallelize(ParallelType::TIDx);

  // Propagate tv6's schedule and parallelization through both view paths.
  TransformPropagator propagator(tv6);
  MaxRootDomainInfoSpanningTree spanning_tree(tv6);
  spanning_tree.traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv6);

  // Inline the schedule
  inlineMost();

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // ATen reference computation mirroring the fusion.
  at::Tensor t0 = at::randn({w, x, y * z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {z, y, x, w});
  at::Tensor t3 = at::randn({w, x * y, z}, options);
  auto t4 = cos(t3);
  auto t5 = at::native::view(t4, {z, y, x, w});
  auto t6 = add(t2, t5);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__);
}
1785
// Schedules a view-based fusion and then checks that getAllDivisibleSplits
// identifies exactly the splits introduced by the view's transformations
// (which are divisible by construction) on every tensor in the fusion.
TEST_F(NVFuserTest, FusionLowerDivisibleSplits_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  auto tv0 = makeContigTensor(4);
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  auto tv2 = view(tv1, {w, x, y, z}, {z, y, x, w});

  fusion.addOutput(tv2);

  // Flatten to 1D, then split twice (the second with outer-split factor 8).
  tv2->merge(0)->merge(0)->merge(0)->split(0, 4)->split(0, 8, false);

  TransformPropagator propagator(tv2);
  MaxRootDomainInfoSpanningTree spanning_tree(tv2);
  spanning_tree.traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv2);

  // Inline the schedule
  inlineMost();

  auto divisible_splits = getAllDivisibleSplits(&fusion);

  // Operations on all tensors are basically:
  // [10] merge(0) [9]->outer->definition
  // [9] merge(0) [8]->outer->definition
  // [8] merge(0) [7]->in->definition
  // [7] split(0, z, false) [6]->in->definition
  // [6] split(1, y, false) [5]->in->definition
  // [5] split(2, x, false) [3]->inner->definition
  // RFactor of tv2
  // [4] merge(0) [3]->outer->definition
  // [3] merge(0) [2]->outer->definition
  // [2] merge(0) [1]->in->definition
  // [1] split(0, 4) [0]->in->definition
  // [0] split(0, 8, false) tv->axis(0)->definition

  // Walk the transform chain backwards from axis(0) of each tensor, following
  // the numbering in the comment above, and check that the three view splits
  // ([5], [6], [7]) are all reported as divisible.
  for (auto tv : std::vector<TensorView*>({tv2, tv1, tv0})) {
    auto transform_0 = tv->axis(0)->definition()->as<Split>();
    auto transform_1 = transform_0->in()->definition()->as<Split>();
    auto transform_2 = transform_1->in()->definition()->as<Merge>();
    auto transform_3 = transform_2->outer()->definition()->as<Merge>();

    auto transform_5 = transform_3->inner()->definition()->as<Split>();
    auto transform_6 = transform_5->in()->definition()->as<Split>();
    auto transform_7 = transform_6->in()->definition()->as<Split>();

    TORCH_CHECK(
        divisible_splits.find(transform_5) != divisible_splits.end(),
        "Expecting: ",
        transform_5->toString(),
        "\nFrom TV: ",
        tv,
        "\nTo be a divisible split.");
    TORCH_CHECK(
        divisible_splits.find(transform_6) != divisible_splits.end(),
        "Expecting: ",
        transform_6->toString(),
        "\nFrom TV: ",
        tv,
        "\nTo be a divisible split.");
    TORCH_CHECK(
        divisible_splits.find(transform_7) != divisible_splits.end(),
        "Expecting: ",
        transform_7->toString(),
        "\nFrom TV: ",
        tv,
        "\nTo be a divisible split.");
  }
}
1859
1860} // namespace jit
1861} // namespace torch
1862#endif // #if defined(USE_CUDA)
1863