1 | #if defined(USE_CUDA) |
2 | #include <gmock/gmock-matchers.h> |
3 | #include <gtest/gtest.h> |
4 | |
5 | #include <arith.h> |
6 | #include <codegen.h> |
7 | #include <disjoint_set.h> |
8 | #include <executor.h> |
9 | #include <executor_launch_params.h> |
10 | #include <expr_evaluator.h> |
11 | #include <fusion.h> |
12 | #include <fusion_segmenter.h> |
13 | #include <grouped_reduction.h> |
14 | #include <inlining.h> |
15 | #include <ir_all_nodes.h> |
16 | #include <ir_builder.h> |
17 | #include <ir_graphviz.h> |
18 | #include <ir_iostream.h> |
19 | #include <ir_utils.h> |
20 | #include <iter_visitor.h> |
21 | #include <kernel_cache.h> |
22 | #include <kernel_expr_evaluator.h> |
23 | #include <kernel_ir.h> |
24 | #include <kernel_ir_dispatch.h> |
25 | #include <lower2device.h> |
26 | #include <lower_magic_zero.h> |
27 | #include <mutator.h> |
28 | #include <ops/all_ops.h> |
29 | #include <parser.h> |
30 | #include <register_interface.h> |
31 | #include <root_domain_map.h> |
32 | #include <scheduler/all_schedulers.h> |
33 | #include <scheduler/reduction_utils.h> |
34 | #include <scheduler/utils.h> |
35 | #include <test/test_gpu_validator.h> |
36 | #include <test/test_utils.h> |
37 | #include <transform_replay.h> |
38 | #include <transform_rfactor.h> |
39 | |
40 | #include <test/cpp/jit/test_utils.h> |
41 | #include <torch/csrc/jit/api/function_impl.h> |
42 | #include <torch/csrc/jit/codegen/cuda/interface.h> |
43 | #include <torch/csrc/jit/ir/irparser.h> |
44 | #include <torch/torch.h> |
45 | |
46 | #include <ATen/cuda/CUDAContext.h> |
47 | #include <ATen/cuda/Exceptions.h> |
48 | #include <c10/cuda/CUDAStream.h> |
49 | |
50 | #include <algorithm> |
51 | #include <iostream> |
52 | #include <sstream> |
53 | #include <thread> |
54 | |
55 | // Tests go in torch::jit |
56 | namespace torch { |
57 | namespace jit { |
58 | |
59 | using namespace torch::jit::fuser::cuda; |
60 | using namespace at::indexing; |
61 | |
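// A split by a factor N is "divisible" when the extent of the split domain is
// a multiple of N. A non-divisible split produces a ceilDiv outer domain whose
// total iteration space is larger than the original extent, so the extra
// iterations must either be predicated out or shown not to exist by a run-time
// validation. The tests below exercise both paths.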
62 | TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { |
63 | Fusion fusion; |
64 | FusionGuard fg(&fusion); |
65 | |
66 | auto tv0 = makeSymbolicTensor(1); |
67 | fusion.addInput(tv0); |
68 | |
69 | auto tv1 = sum(tv0, {0}); |
70 | fusion.addOutput(tv1); |
71 | |
72 | // [I] |
73 | tv1->split(0, 5); |
74 | // [ceilDiv(I, 5), 5] |
75 | |
76 | // This second split is non-divisible. The split domain must be predicated. |
77 | tv1->split(1, 3); |
78 | // [ceilDiv(I, 5), 2, 3] |
79 | |
80 | auto tv2 = sum(tv0, {0}); |
81 | fusion.addOutput(tv2); |
82 | |
  // tv2 shouldn't need another non-divisible split predicate
84 | tv2->split(0, 4); |
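  // [ceilDiv(I, 4), 4]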
85 | tv2->split(1, 2); |
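  // [ceilDiv(I, 4), 2, 2]: 4 is evenly divided by 2, so this split is divisible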
86 | |
87 | GpuLower gpulw(&fusion); |
88 | TORCH_CHECK( |
89 | gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), |
90 | "There must be no split to validate" ); |
91 | TORCH_CHECK( |
92 | gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, |
93 | "Only tv1 should have a non-divisible predicate." ); |
94 | for (auto tv : {loweredTv(tv1, gpulw)}) { |
95 | auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); |
96 | TORCH_CHECK( |
97 | it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), |
98 | "No info found for " , |
99 | tv); |
100 | const auto& splits_to_predicate = it->second; |
101 | TORCH_CHECK( |
102 | splits_to_predicate.size() == 1, |
103 | "There must be one split to predicate" ); |
104 | } |
105 | |
106 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
107 | at::manual_seed(0); |
108 | at::Tensor t0 = at::randn({24}, options); |
109 | |
110 | FusionExecutor fe; |
111 | fe.compileFusion(&fusion, {t0}); |
112 | auto cg_outputs = fe.runFusion({t0}); |
113 | |
114 | auto ref = t0.sum(); |
115 | |
116 | testValidate(&fusion, cg_outputs, {t0}, {ref, ref}, __LINE__, __FILE__); |
117 | } |
118 | |
119 | // Repro of issue #1074 |
120 | TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { |
121 | Fusion fusion; |
122 | FusionGuard fg(&fusion); |
123 | |
124 | auto tv0 = makeSymbolicTensor(2); |
125 | fusion.addInput(tv0); |
126 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
127 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
128 | fusion.addOutput(tv2); |
129 | |
130 | tv2->split(0, 2); |
131 | tv2->split(-1, 4); |
132 | tv2->reorder({{1, 2}, {2, 1}}); |
133 | tv0->computeAt(tv2, 2); |
134 | |
135 | tv2->split(-1, 3); |
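  // Splitting the innermost domain (extent 4 after the earlier split) by 3 is
  // non-divisible. The split is applied after computeAt, so it is not replayed
  // on tv1, and only tv2 needs the predicate.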
136 | |
  // Use shared memory so that the sanitizer catches the invalid
  // accesses. This is not necessary to expose the bug.
139 | tv1->setMemoryType(MemoryType::Shared); |
140 | |
141 | GpuLower gpulw(&fusion); |
142 | TORCH_CHECK( |
143 | gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), |
144 | "There must be no split to validate" ); |
145 | TORCH_CHECK( |
146 | gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, |
147 | "Only tv2 should have a non-divisible predicate." ); |
148 | for (auto tv : {loweredTv(tv2, gpulw)}) { |
149 | auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); |
150 | TORCH_CHECK( |
151 | it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), |
152 | "No info found for " , |
153 | tv); |
154 | const auto& splits_to_predicate = it->second; |
155 | TORCH_CHECK( |
156 | splits_to_predicate.size() == 1, |
157 | "There must be one split to predicate" ); |
158 | } |
159 | |
160 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
161 | at::manual_seed(0); |
162 | at::Tensor t0 = at::randn({13, 17}, options); |
163 | |
164 | FusionExecutor fe; |
165 | fe.compileFusion(&fusion, {t0}); |
166 | auto cg_outputs = fe.runFusion({t0}); |
167 | |
168 | auto ref = t0 + 2; |
169 | |
170 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
171 | } |
172 | |
173 | // Similar to FusionNonDivisibleSplit1 but with unswitch |
174 | TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { |
175 | Fusion fusion; |
176 | FusionGuard fg(&fusion); |
177 | |
178 | auto tv0 = makeSymbolicTensor(1); |
179 | fusion.addInput(tv0); |
180 | |
181 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
182 | auto tv2 = sum(tv1, {0}); |
183 | fusion.addOutput(tv2); |
184 | |
185 | tv2->split(0, 5); |
186 | tv2->split(1, 3); |
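  // [ceilDiv(I, 5), ceilDiv(5, 3), 3]: the inner split of 5 by 3 is
  // non-divisible. The computeAt below replays it on tv1, so both tv1 and tv2
  // need a predicate.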
187 | |
188 | tv0->computeAt(tv2, -1); |
189 | |
190 | tv2->axis(0)->parallelize(ParallelType::Unswitch); |
191 | |
192 | GpuLower gpulw(&fusion); |
193 | TORCH_CHECK( |
194 | gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), |
195 | "There must be no split to validate" ); |
196 | TORCH_CHECK( |
197 | gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, |
198 | "Both tv1 and tv2 should have a non-divisible predicate." ); |
199 | for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) { |
200 | auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); |
201 | TORCH_CHECK( |
202 | it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), |
203 | "No info found for " , |
204 | tv); |
205 | const auto& splits_to_predicate = it->second; |
206 | TORCH_CHECK( |
207 | splits_to_predicate.size() == 1, |
208 | "There must be one split to predicate" ); |
209 | } |
210 | |
211 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
212 | at::manual_seed(0); |
213 | at::Tensor t0 = at::randn({24}, options); |
214 | |
215 | FusionExecutor fe; |
216 | fe.compileFusion(&fusion, {t0}); |
217 | auto cg_outputs = fe.runFusion({t0}); |
218 | |
219 | auto ref = (t0 + 1).sum(); |
220 | |
221 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
222 | } |
223 | |
224 | // Non-divisible split through merge |
225 | TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { |
226 | Fusion fusion; |
227 | FusionGuard fg(&fusion); |
228 | |
229 | auto tv0 = makeSymbolicTensor(2); |
230 | fusion.addInput(tv0); |
231 | |
232 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
233 | auto tv2 = sum(tv1, {0, 1}); |
234 | fusion.addOutput(tv2); |
235 | |
236 | tv2->split(0, 5); |
237 | tv2->merge(1, 2); |
238 | tv2->split(1, 3); |
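  // [I0, I1] -> split(0, 5) -> [ceilDiv(I0, 5), 5, I1]
  //          -> merge(1, 2) -> [ceilDiv(I0, 5), 5 * I1]
  //          -> split(1, 3) -> [ceilDiv(I0, 5), ceilDiv(5 * I1, 3), 3]
  // The last split goes through a merged domain and is non-divisible unless
  // 5 * I1 happens to be a multiple of 3.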
239 | |
240 | tv0->computeAt(tv2, -1); |
241 | |
242 | GpuLower gpulw(&fusion); |
243 | TORCH_CHECK( |
244 | gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), |
245 | "There must be no split to validate" ); |
246 | TORCH_CHECK( |
247 | gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, |
248 | "Both tv1 and tv2 should have a non-divisible predicate." ); |
249 | for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) { |
250 | auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); |
251 | TORCH_CHECK( |
252 | it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), |
253 | "No info found for " , |
254 | tv); |
255 | const auto& splits_to_predicate = it->second; |
256 | TORCH_CHECK( |
257 | splits_to_predicate.size() == 1, |
258 | "There must be one split to predicate" ); |
259 | } |
260 | |
261 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
262 | at::manual_seed(0); |
263 | at::Tensor t0 = at::randn({24, 2}, options); |
264 | |
265 | FusionExecutor fe; |
266 | fe.compileFusion(&fusion, {t0}); |
267 | auto cg_outputs = fe.runFusion({t0}); |
268 | |
269 | auto ref = (t0 + 1).sum(); |
270 | |
271 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
272 | } |
273 | |
274 | // Nested splits |
275 | TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { |
276 | Fusion fusion; |
277 | FusionGuard fg(&fusion); |
278 | |
279 | auto tv0 = makeSymbolicTensor(1); |
280 | fusion.addInput(tv0); |
281 | |
282 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
283 | auto tv2 = sum(tv1, {0}); |
284 | fusion.addOutput(tv2); |
285 | |
286 | // [I] |
287 | tv2->split(0, 8); |
288 | // [I/8, 8] |
289 | tv2->split(1, 2); |
290 | // [I/8, 4, 2] |
291 | tv2->split(1, 3); // non-divisible split of outer output |
292 | // [I/8, 2, 3, 2] |
293 | |
294 | tv0->computeAt(tv2, -1); |
295 | |
296 | GpuLower gpulw(&fusion); |
297 | TORCH_CHECK( |
298 | gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), |
299 | "There must be no split to validate" ); |
300 | TORCH_CHECK( |
301 | gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, |
302 | "Both tv1 and tv2 should have a non-divisible predicate." ); |
303 | for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) { |
304 | auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); |
305 | TORCH_CHECK( |
306 | it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), |
307 | "No info found for " , |
308 | tv); |
309 | const auto& splits_to_predicate = it->second; |
310 | TORCH_CHECK( |
311 | splits_to_predicate.size() == 1, |
312 | "There must be one split to predicate" ); |
313 | } |
314 | |
315 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
316 | at::manual_seed(0); |
317 | at::Tensor t0 = at::randn({24}, options); |
318 | |
319 | FusionExecutor fe; |
320 | fe.compileFusion(&fusion, {t0}); |
321 | auto cg_outputs = fe.runFusion({t0}); |
322 | |
323 | auto ref = (t0 + 1).sum(); |
324 | |
325 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
326 | } |
327 | |
328 | // Vectorized non-divisible split. Must be validated at run time |
329 | TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { |
330 | Fusion fusion; |
331 | FusionGuard fg(&fusion); |
332 | |
333 | auto tv0 = makeContigTensor(1); |
334 | fusion.addInput(tv0); |
335 | |
336 | auto tv1 = set(tv0); |
337 | fusion.addOutput(tv1); |
338 | |
339 | tv1->split(0, 8, false); |
340 | tv1->split(1, 4); |
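  // split(0, 8, false) is an outer split: [8, ceilDiv(I, 8)]
  // split(1, 4):                          [8, ceilDiv(ceilDiv(I, 8), 4), 4]
  // The innermost domain is vectorized, so the possibly non-divisible split
  // cannot simply be predicated; it is instead validated against the actual
  // extent at run time.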
341 | |
342 | tv1->axis(-1)->parallelize(ParallelType::Vectorize); |
343 | |
344 | GpuLower gpulw(&fusion); |
345 | TORCH_CHECK( |
346 | gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1, |
347 | "There should be one split to validate" ); |
348 | for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) { |
349 | const auto& splits_to_predicate = kv.second; |
350 | TORCH_CHECK( |
351 | splits_to_predicate.empty(), |
352 | "There must be no split to predicate, but tensor t" , |
353 | kv.first->name(), |
354 | " has:" , |
355 | splits_to_predicate); |
356 | } |
357 | |
358 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
359 | at::manual_seed(0); |
360 | auto t0 = at::randn({32}, options); |
361 | |
362 | FusionExecutor fe; |
363 | fe.compileFusion(&fusion, {t0}); |
364 | auto cg_outputs = fe.runFusion({t0}); |
365 | |
366 | auto ref = t0; |
367 | |
368 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
369 | |
370 | auto t0_non_divisible = at::randn({8}, options); |
371 | // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is |
372 | // illegal. The run-time validation of vectorization should throw an error. |
373 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
374 | ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible})); |
375 | } |
376 | |
// If a split is validated at run time, it does not need to be predicated.
378 | TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { |
379 | Fusion fusion; |
380 | FusionGuard fg(&fusion); |
381 | |
382 | auto tv0 = makeContigTensor(1); |
383 | fusion.addInput(tv0); |
384 | |
385 | auto tv1 = set(tv0); |
386 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
387 | auto tv3 = sum(tv2, {0}); |
388 | fusion.addOutput(tv3); |
389 | |
390 | tv3->split(0, 8, false); |
391 | tv3->split(1, 4); |
392 | TransformPropagatorWithCheck propagator(tv3); |
393 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
394 | |
395 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
396 | scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); |
397 | |
398 | tv1->axis(2)->parallelize(ParallelType::Vectorize); |
399 | |
400 | GpuLower gpulw(&fusion); |
401 | TORCH_CHECK( |
402 | gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1, |
403 | "There should be one split to validate" ); |
404 | for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) { |
405 | const auto& splits_to_predicate = kv.second; |
406 | TORCH_CHECK( |
407 | splits_to_predicate.empty(), |
408 | "There must be no split to predicate, but tensor t" , |
409 | kv.first->name(), |
410 | " has:" , |
411 | splits_to_predicate); |
412 | } |
413 | |
414 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
415 | at::manual_seed(0); |
416 | |
417 | auto t0 = at::randn({1024}, options); |
418 | |
419 | FusionExecutor fe; |
420 | fe.compileFusion(&fusion, {t0}); |
421 | auto cg_outputs = fe.runFusion({t0}); |
422 | |
423 | auto ref = (t0 + 1).sum(); |
424 | |
425 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
426 | } |
427 | |
428 | TEST_F(NVFuserTest, FusionIssue1284Repro_CUDA) { |
429 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
430 | Fusion& fusion = *fusion_ptr.get(); |
431 | FusionGuard fg(&fusion); |
432 | |
433 | std::vector<int64_t> input_shape_0 = {10, 20}; |
434 | std::vector<int64_t> input_shape_1 = {15}; |
435 | |
436 | TensorView* in_0 = makeSymbolicTensor(input_shape_0.size()); |
437 | TensorView* in_1 = makeSymbolicTensor(input_shape_1.size()); |
438 | fusion.addInput(in_0); |
439 | fusion.addInput(in_1); |
440 | |
441 | TensorView* out_0 = add(in_0, IrBuilder::create<Double>(0.f)); |
442 | TensorView* out_1 = add(in_1, IrBuilder::create<Double>(2.f)); |
443 | |
444 | fusion.addOutput(out_0); |
445 | fusion.addOutput(out_1); |
446 | |
447 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
448 | at::Tensor at_in_0 = at::randn(input_shape_0, options); |
449 | at::Tensor at_in_1 = at::randn(input_shape_1, options); |
450 | std::vector<IValue> aten_inputs = {at_in_0, at_in_1}; |
451 | |
452 | FusionExecutorCache fec(std::move(fusion_ptr)); |
453 | auto outputs = fec.runFusionWithInputs(aten_inputs); |
454 | |
455 | auto t1 = at_in_1 + 2; |
456 | |
457 | auto runtime = fec.getMostRecentKernelRuntime(); |
458 | TORCH_INTERNAL_ASSERT(runtime->isSegmented()); |
459 | TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2); |
460 | |
461 | testValidate( |
462 | &fusion, outputs, {at_in_0, at_in_1}, {at_in_0, t1}, __LINE__, __FILE__); |
463 | } |
464 | |
465 | TEST_F(NVFuserTest, FusionIssue1284Repro2_CUDA) { |
466 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
467 | Fusion& fusion = *fusion_ptr.get(); |
468 | FusionGuard fg(&fusion); |
469 | |
470 | std::vector<int64_t> input_shape_0 = {4, 4}; |
471 | std::vector<int64_t> input_shape_1 = {3, 4, 4}; |
472 | std::vector<int64_t> input_shape_2 = {2, 8, 4, 4}; |
473 | |
474 | TensorView* in_0 = makeSymbolicTensor(input_shape_0.size()); |
475 | TensorView* in_1 = makeSymbolicTensor(input_shape_1.size()); |
476 | TensorView* in_2 = makeSymbolicTensor(input_shape_2.size()); |
477 | |
478 | fusion.addInput(in_0); |
479 | fusion.addInput(in_1); |
480 | fusion.addInput(in_2); |
481 | |
482 | TensorView* out_0 = add(in_0, in_1); |
483 | TensorView* out_1 = add(in_0, in_2); |
484 | |
485 | fusion.addOutput(out_0); |
486 | fusion.addOutput(out_1); |
487 | |
488 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
489 | at::Tensor at_in_0 = at::randn(input_shape_0, options); |
490 | at::Tensor at_in_1 = at::randn(input_shape_1, options); |
491 | at::Tensor at_in_2 = at::randn(input_shape_2, options); |
492 | |
493 | std::vector<IValue> aten_inputs = {at_in_0, at_in_1, at_in_2}; |
494 | |
495 | FusionExecutorCache fec(std::move(fusion_ptr)); |
496 | auto outputs = fec.runFusionWithInputs(aten_inputs); |
497 | |
498 | auto t0 = at_in_0 + at_in_1; |
499 | auto t1 = at_in_0 + at_in_2; |
500 | |
501 | auto runtime = fec.getMostRecentKernelRuntime(); |
502 | TORCH_INTERNAL_ASSERT(runtime->isSegmented()); |
503 | TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2); |
504 | |
505 | testValidate( |
506 | &fusion, |
507 | outputs, |
508 | {at_in_0, at_in_1, at_in_2}, |
509 | {t0, t1}, |
510 | __LINE__, |
511 | __FILE__); |
512 | } |
513 | |
514 | TEST_F(NVFuserTest, FusionIssue1305Repro_CUDA) { |
515 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
516 | Fusion& fusion = *fusion_ptr.get(); |
517 | FusionGuard fg(&fusion); |
518 | |
519 | auto t0 = makeContigTensor(1); |
520 | auto t1 = makeContigTensor(2); |
521 | |
522 | fusion.addInput(t0); |
523 | fusion.addInput(t1); |
524 | |
525 | auto t2 = broadcast(t0, {true, false}); |
526 | auto t3 = add(t1, t2); |
527 | auto t4 = add(t3, t2); |
528 | auto t5 = sum(t4, {1}); |
529 | auto t6 = broadcast(t5, {false, true}); |
530 | auto t7 = add(t3, t6); |
531 | |
532 | fusion.addOutput(t7); |
533 | |
534 | t3->computeAt(t7, -1, ComputeAtMode::MostInlined); |
535 | |
536 | TORCH_INTERNAL_ASSERT(t3->getComputeAtPosition() == 1); |
537 | } |
538 | |
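// Double buffering: doubleBuffer() conceptually allocates two copies of the
// buffered tensor so that the copy for iteration i+1 can be issued while
// iteration i is being consumed, overlapping memory transfers with compute.
// The tests below double-buffer gmem->smem and smem->register copies under
// various computeAt, unswitch, unroll, and vectorize configurations.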
539 | TEST_F(NVFuserTest, FusionDoubleBuffering1_CUDA) { |
540 | Fusion fusion; |
541 | FusionGuard fg(&fusion); |
542 | |
543 | auto tv0 = makeContigTensor(1); |
544 | fusion.addInput(tv0); |
545 | |
546 | auto tv1 = set(tv0); |
547 | auto tv2 = add(tv1, IrBuilder::create<Double>(1.0)); |
548 | auto tv3 = set(tv2); |
549 | fusion.addOutput(tv3); |
550 | |
551 | tv1->setMemoryType(MemoryType::Shared); |
552 | |
553 | tv3->split(-1, 128); |
554 | tv3->split(-1, 32); |
555 | TransformPropagatorWithCheck propagator(tv3); |
556 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
557 | |
558 | tv0->computeAt(tv3, 1); |
559 | |
560 | tv3->axis(-2)->parallelize(ParallelType::BIDx); |
561 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
562 | scheduler_utils::parallelizeAllLike(tv3); |
563 | |
564 | tv1->doubleBuffer(); |
565 | |
566 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
567 | at::manual_seed(0); |
568 | auto t0 = at::randn({1000}, options); |
569 | |
570 | FusionExecutor fe; |
571 | fe.compileFusion(&fusion, {t0}); |
572 | auto cg_outputs = fe.runFusion({t0}); |
573 | |
574 | auto ref = t0 + 1; |
575 | |
576 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
577 | } |
578 | |
579 | TEST_F(NVFuserTest, FusionDoubleBuffering2_CUDA) { |
580 | Fusion fusion; |
581 | FusionGuard fg(&fusion); |
582 | |
583 | auto tv0 = makeContigTensor(1); |
584 | fusion.addInput(tv0); |
585 | |
586 | auto tv1 = set(tv0); |
587 | auto tv2 = add(tv1, IrBuilder::create<Double>(1.0)); |
588 | auto tv3 = set(tv2); |
589 | fusion.addOutput(tv3); |
590 | |
591 | tv3->split(-1, 128); |
592 | tv3->split(-1, 32); |
593 | TransformPropagatorWithCheck propagator(tv3); |
594 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
595 | |
596 | tv0->computeAt(tv3, -1); |
597 | |
598 | tv3->axis(-2)->parallelize(ParallelType::BIDx); |
599 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
600 | scheduler_utils::parallelizeAllLike(tv3); |
601 | |
602 | tv1->doubleBuffer(); |
603 | |
604 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
605 | at::manual_seed(0); |
606 | auto t0 = at::randn({1000}, options); |
607 | |
608 | FusionExecutor fe; |
609 | fe.compileFusion(&fusion, {t0}); |
610 | auto cg_outputs = fe.runFusion({t0}); |
611 | |
612 | auto ref = t0 + 1; |
613 | |
614 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
615 | } |
616 | |
617 | TEST_F(NVFuserTest, FusionDoubleBuffering3_CUDA) { |
618 | Fusion fusion; |
619 | FusionGuard fg(&fusion); |
620 | |
621 | auto tv0 = makeContigTensor(1); |
622 | fusion.addInput(tv0); |
623 | |
624 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
625 | auto tv2 = set(tv1); |
626 | auto tv3 = add(tv2, IrBuilder::create<Double>(1.0)); |
627 | fusion.addOutput(tv3); |
628 | |
629 | tv1->setMemoryType(MemoryType::Shared); |
630 | |
631 | tv3->split(-1, 128); |
632 | tv3->split(-1, 32); |
633 | TransformPropagatorWithCheck propagator(tv3); |
634 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
635 | |
636 | tv0->computeAt(tv3, 1); |
637 | |
  // tv2 cannot be double-buffered as its producer, tv1, is computed
  // inside the would-be double-buffering loop.
640 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
641 | ASSERT_ANY_THROW(tv2->doubleBuffer()); |
642 | |
  // Moving tv2's computeAt position inward makes tv1 large enough for tv2 to
  // be double-buffered
644 | tv2->computeAt(tv3, 2); |
645 | |
646 | tv2->doubleBuffer(); |
647 | |
648 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
649 | scheduler_utils::parallelizeAllLike(tv3); |
650 | |
651 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
652 | at::manual_seed(0); |
653 | auto t0 = at::randn({1000}, options); |
654 | |
655 | FusionExecutor fe; |
656 | fe.compileFusion(&fusion, {t0}); |
657 | auto cg_outputs = fe.runFusion({t0}); |
658 | |
659 | auto ref = t0 + 2; |
660 | |
661 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
662 | } |
663 | |
664 | // Double buffering smem to local and unswitch |
665 | TEST_F(NVFuserTest, FusionDoubleBuffering4_CUDA) { |
666 | Fusion fusion; |
667 | FusionGuard fg(&fusion); |
668 | |
669 | auto tv0 = makeContigTensor(1); |
670 | fusion.addInput(tv0); |
671 | |
672 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
673 | auto tv2 = set(tv1); |
674 | auto tv3 = add(tv2, IrBuilder::create<Double>(1.0)); |
675 | fusion.addOutput(tv3); |
676 | |
677 | tv1->setMemoryType(MemoryType::Shared); |
678 | |
679 | tv3->split(-1, 128); |
680 | tv3->split(-1, 32); |
681 | tv3->split(-1, 8); |
682 | TransformPropagatorWithCheck propagator(tv3); |
683 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
684 | |
685 | tv0->computeAt(tv3, 2); |
686 | tv2->computeAt(tv3, -1); |
687 | |
688 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
689 | tv3->axis(1)->parallelize(ParallelType::Unswitch); |
690 | scheduler_utils::parallelizeAllLike(tv3); |
691 | |
692 | tv2->doubleBuffer(); |
693 | |
694 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
695 | at::manual_seed(0); |
696 | auto t0 = at::randn({1000}, options); |
697 | |
698 | FusionExecutor fe; |
699 | fe.compileFusion(&fusion, {t0}); |
700 | auto cg_outputs = fe.runFusion({t0}); |
701 | |
702 | auto ref = t0 + 2; |
703 | |
704 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
705 | } |
706 | |
707 | // Double buffering gmem to shared and unswitch |
708 | TEST_F(NVFuserTest, FusionDoubleBuffering5_CUDA) { |
709 | Fusion fusion; |
710 | FusionGuard fg(&fusion); |
711 | |
712 | auto tv0 = makeContigTensor(1); |
713 | fusion.addInput(tv0); |
714 | |
715 | auto tv1 = set(tv0); |
716 | auto tv2 = add(tv1, IrBuilder::create<Double>(1.0)); |
717 | fusion.addOutput(tv2); |
718 | |
719 | tv1->setMemoryType(MemoryType::Shared); |
720 | |
721 | tv2->split(-1, 128); |
722 | tv2->split(-1, 32); |
723 | tv2->split(-1, 8); |
724 | TransformPropagatorWithCheck propagator(tv2); |
725 | MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); |
726 | |
727 | tv0->computeAt(tv2, 2); |
728 | tv1->computeAt(tv2, -1); |
729 | |
730 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
731 | tv2->axis(1)->parallelize(ParallelType::Unswitch); |
732 | scheduler_utils::parallelizeAllLike(tv2); |
733 | |
734 | tv1->doubleBuffer(); |
735 | |
736 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
737 | at::manual_seed(0); |
738 | auto t0 = at::randn({1000}, options); |
739 | |
740 | FusionExecutor fe; |
741 | fe.compileFusion(&fusion, {t0}); |
742 | auto cg_outputs = fe.runFusion({t0}); |
743 | |
744 | auto ref = t0 + 1; |
745 | |
746 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
747 | } |
748 | |
749 | // Double buffering smem to local and unroll |
750 | TEST_F(NVFuserTest, FusionDoubleBuffering6_CUDA) { |
751 | Fusion fusion; |
752 | FusionGuard fg(&fusion); |
753 | |
754 | auto tv0 = makeContigTensor(1); |
755 | fusion.addInput(tv0); |
756 | |
757 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
758 | auto tv2 = set(tv1); |
759 | auto tv3 = add(tv2, IrBuilder::create<Double>(1.0)); |
760 | fusion.addOutput(tv3); |
761 | |
762 | tv1->setMemoryType(MemoryType::Shared); |
763 | |
764 | tv3->split(-1, 128); |
765 | tv3->split(-1, 16); |
766 | tv3->split(-2, 4); |
767 | tv3->split(-2, 2); |
768 | TransformPropagatorWithCheck propagator(tv3); |
769 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
770 | |
771 | tv0->computeAt(tv3, 1); |
772 | tv2->computeAt(tv3, -1); |
773 | |
774 | tv3->axis(2)->parallelize(ParallelType::Unroll); |
775 | tv3->axis(4)->parallelize(ParallelType::TIDx); |
776 | |
777 | tv2->doubleBuffer(); |
778 | |
779 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
780 | at::manual_seed(0); |
781 | auto t0 = at::randn({199}, options); |
782 | |
783 | FusionExecutor fe; |
784 | fe.compileFusion(&fusion, {t0}); |
785 | auto cg_outputs = fe.runFusion({t0}); |
786 | |
787 | auto ref = t0 + 2; |
788 | |
789 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
790 | } |
791 | |
792 | // Double buffering and vectorize |
793 | TEST_F(NVFuserTest, FusionDoubleBuffering7_CUDA) { |
794 | Fusion fusion; |
795 | FusionGuard fg(&fusion); |
796 | |
797 | auto tv0 = makeContigTensor(1); |
798 | fusion.addInput(tv0); |
799 | |
800 | auto tv1 = set(tv0); |
801 | auto tv2 = add(tv1, IrBuilder::create<Double>(1.0)); |
802 | fusion.addOutput(tv2); |
803 | |
804 | tv2->split(-1, 128); |
805 | tv2->split(-1, 4); |
806 | TransformPropagatorWithCheck propagator(tv2); |
807 | MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); |
808 | |
809 | tv1->computeAt(tv2, 2); |
810 | |
811 | tv2->axis(-2)->parallelize(ParallelType::TIDx); |
812 | |
813 | tv1->axis(-1)->parallelize(ParallelType::Vectorize); |
814 | |
815 | tv1->doubleBuffer(); |
816 | |
817 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
818 | at::manual_seed(0); |
819 | auto t0 = at::randn({200}, options); |
820 | |
821 | FusionExecutor fe; |
822 | fe.compileFusion(&fusion, {t0}); |
823 | auto cg_outputs = fe.runFusion({t0}); |
824 | |
825 | auto ref = t0 + 1; |
826 | |
827 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
828 | } |
829 | |
830 | // Multiple tensors to double-buffer |
831 | TEST_F(NVFuserTest, FusionDoubleBuffering8_CUDA) { |
832 | Fusion fusion; |
833 | FusionGuard fg(&fusion); |
834 | |
835 | auto tv0 = makeContigTensor(1); |
836 | fusion.addInput(tv0); |
837 | auto tv1 = makeContigTensor(1); |
838 | fusion.addInput(tv1); |
839 | |
840 | auto tv2 = set(tv0); |
841 | auto tv3 = set(tv1); |
842 | auto tv4 = add(tv2, tv3); |
843 | fusion.addOutput(tv4); |
844 | |
845 | tv4->split(0, 32); |
846 | tv4->split(0, 4); |
847 | TransformPropagatorWithCheck propagator(tv4); |
848 | MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); |
849 | |
850 | tv0->computeAt(tv4, 1); |
851 | tv1->computeAt(tv4, 1); |
852 | |
853 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
854 | scheduler_utils::parallelizeAllLike(tv4); |
855 | |
856 | tv2->doubleBuffer(); |
857 | tv3->doubleBuffer(); |
858 | |
859 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
860 | at::manual_seed(0); |
861 | auto t0 = at::randn({100}, options); |
862 | auto t1 = at::randn({100}, options); |
863 | |
864 | FusionExecutor fe; |
865 | fe.compileFusion(&fusion, {t0, t1}); |
866 | auto cg_outputs = fe.runFusion({t0, t1}); |
867 | |
868 | auto ref = t0 + t1; |
869 | |
870 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
871 | } |
872 | |
873 | // Nested double buffering from gmem to smem and smem to register |
874 | TEST_F(NVFuserTest, FusionDoubleBuffering9_CUDA) { |
875 | Fusion fusion; |
876 | FusionGuard fg(&fusion); |
877 | |
878 | auto tv0 = makeContigTensor(1); |
879 | fusion.addInput(tv0); |
880 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
881 | auto out = tv1; |
882 | fusion.addOutput(out); |
883 | |
884 | auto tv2 = tv0->cacheAfter(); |
885 | auto tv3 = tv2->cacheAfter(); |
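  // tv2 stages tv0 in shared memory (set below); tv3 then stages tv2 in
  // registers. Both stages are double-buffered.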
886 | |
887 | out->split(0, 32); |
888 | out->split(0, 4); |
889 | TransformPropagatorWithCheck propagator(out); |
890 | MaxRootDomainInfoSpanningTree(out).traverse(&propagator); |
891 | |
892 | tv2->setMemoryType(MemoryType::Shared); |
893 | |
894 | tv2->computeAt(out, 1); |
895 | tv3->computeAt(out, -1); |
896 | |
897 | out->axis(-1)->parallelize(ParallelType::TIDx); |
898 | scheduler_utils::parallelizeAllLike(out); |
899 | |
900 | tv2->doubleBuffer(); |
901 | tv3->doubleBuffer(); |
902 | |
903 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
904 | at::manual_seed(0); |
905 | auto t0 = at::randn({1001}, options); |
906 | |
907 | FusionExecutor fe; |
908 | fe.compileFusion(&fusion, {t0}); |
909 | auto cg_outputs = fe.runFusion({t0}); |
910 | |
911 | auto ref = t0 + 1; |
912 | |
913 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
914 | } |
915 | |
916 | // FusionSmemBlockGemmCache + double buffering at both smem and local |
917 | TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) { |
918 | Fusion fusion; |
919 | FusionGuard fg(&fusion); |
920 | |
921 | // Algorithm |
922 | TensorView* tv0 = makeSymbolicTensor(2); // (M, K) |
923 | TensorView* tv1 = makeSymbolicTensor(2); // (K, N) |
924 | TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) |
925 | TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) |
926 | TensorView* tv4 = mul(tv2, tv3); // M, K, N |
927 | TensorView* tv5 = sum(tv4, {1}); // M, R, N |
928 | fusion.addInput(tv0); |
929 | fusion.addInput(tv1); |
930 | fusion.addOutput(tv5); |
931 | |
932 | TensorView* tv6 = tv5->cacheBefore(); |
933 | |
  // For register double buffering
935 | auto tv0_cache_local = tv0->cacheAfter(); |
936 | auto tv1_cache_local = tv1->cacheAfter(); |
937 | |
  // For smem double buffering
939 | auto tv0_cache_smem = tv0->cacheAfter(); |
940 | auto tv1_cache_smem = tv1->cacheAfter(); |
941 | |
942 | const int BSX = 32; |
943 | const int TSX = 8; |
944 | |
945 | // [M, K, N] |
946 | tv6->split(-1, BSX); |
947 | tv6->split(-1, TSX); |
948 | tv6->split(1, BSX); |
949 | tv6->split(0, BSX); |
950 | tv6->split(1, TSX); |
951 | // [M/BSX, BSX/TSX, TSX, K/BSX, BSX, N/BSX, BSX/TSX, TSX] |
952 | tv6->reorder( |
953 | {{4, 7}, {7, 6}, {6, 5}, {2, 4}, {1, 3}, {3, 2}, {5, 1}, {0, 0}}); |
954 | // [M/BSX, N/BSX, K/BSX, BSX/TSX, BSX/TSX, TSX, TSX, BSX] |
955 | |
956 | auto tv6_rf = tv6->rFactor({-1}); |
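  // tv6_rf reduces only the innermost BSX domain coming from the K split;
  // tv6 then reduces the remaining K/BSX domain.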
957 | |
958 | TransformPropagatorWithCheck propagator(tv6_rf); |
959 | MaxRootDomainInfoSpanningTree(tv6_rf).traverse(&propagator); |
960 | |
961 | tv0->computeAt(tv6, 3); |
962 | tv1->computeAt(tv6, 3); |
963 | |
964 | tv6_rf->computeAt(tv6, -1); |
965 | tv0_cache_local->computeAt(tv6_rf, -1); |
966 | tv1_cache_local->computeAt(tv6_rf, -1); |
967 | |
968 | tv0_cache_smem->setMemoryType(MemoryType::Shared); |
969 | tv1_cache_smem->setMemoryType(MemoryType::Shared); |
970 | |
971 | tv5->axis(0)->parallelize(ParallelType::BIDx); |
972 | tv5->axis(1)->parallelize(ParallelType::BIDy); |
973 | tv5->axis(-3)->parallelize(ParallelType::TIDy); |
974 | tv5->axis(-1)->parallelize(ParallelType::TIDx); |
975 | |
976 | scheduler_utils::parallelizeAllLike(tv5); |
977 | |
978 | tv0_cache_local->doubleBuffer(); |
979 | tv1_cache_local->doubleBuffer(); |
980 | |
981 | tv0_cache_smem->doubleBuffer(); |
982 | tv1_cache_smem->doubleBuffer(); |
983 | |
984 | constexpr int M = 154, K = 45, N = 1524; |
985 | |
986 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
987 | at::Tensor t0 = at::randn({M, K}, options); |
988 | at::Tensor t1 = at::randn({K, N}, options); |
989 | at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble)); |
990 | |
991 | std::vector<IValue> aten_inputs = {t0, t1}; |
992 | |
993 | FusionExecutor fe; |
994 | fe.compileFusion(&fusion, aten_inputs); |
995 | auto cg_outputs = fe.runFusion(aten_inputs); |
996 | |
997 | testValidate( |
998 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
  // The smem cache write in this test case is redundantly predicated
  // and also double-buffered. Currently we rely on WAR sync insertion
  // to ensure the ordering of accesses to the double-buffered tensor.
  // The check below makes sure that the sync is inserted so that the
  // test does not rely on a race condition.
1004 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0); |
1005 | } |
1006 | |
1007 | TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) { |
1008 | std::vector<MemoryType> mem_types = {MemoryType::Shared, MemoryType::Local}; |
1009 | |
1010 | for (auto mem_type : mem_types) { |
1011 | Fusion fusion; |
1012 | FusionGuard fg(&fusion); |
1013 | |
1014 | auto tv0 = makeContigTensor(1); |
1015 | fusion.addInput(tv0); |
1016 | |
1017 | auto tv1 = set(tv0); |
1018 | auto tv2 = set(tv1); |
1019 | auto tv3 = set(tv2); |
1020 | fusion.addOutput(tv3); |
1021 | |
1022 | tv1->setMemoryType(mem_type); |
1023 | |
1024 | tv3->split(-1, 4); |
1025 | TransformPropagatorWithCheck propagator(tv3); |
1026 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
1027 | |
1028 | tv1->computeAt(tv3, -2); |
1029 | |
1030 | tv2->axis(-1)->parallelize(ParallelType::Vectorize); |
1031 | |
1032 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1033 | at::manual_seed(0); |
1034 | auto t0 = at::randn({15}, options); |
1035 | FusionExecutor fe; |
1036 | fe.compileFusion(&fusion); |
1037 | |
1038 | // This should throw an exception as the extent of t0 is not |
1039 | // divisible by the vector width |
1040 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
1041 | ASSERT_ANY_THROW(fe.runFusion({t0})); |
1042 | |
1043 | auto t1 = at::randn({16}, options); |
1044 | auto cg_outputs = fe.runFusion({t1}); |
1045 | |
1046 | auto ref = t1; |
1047 | |
1048 | testValidate(&fusion, cg_outputs, {t1}, {ref}, __LINE__, __FILE__); |
1049 | } |
1050 | } |
1051 | |
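// A broadcast domain is "concretized" when it is resolved against a
// non-broadcast domain of a consumer tensor. Only concretized broadcast
// domains that are parallelized over threads or blocks require an actual
// parallel (block/grid) broadcast.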
1052 | TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) { |
1053 | Fusion fusion; |
1054 | FusionGuard fg(&fusion); |
1055 | |
1056 | auto tv0 = makeConcreteTensor({10, 1}); |
1057 | fusion.addInput(tv0); |
1058 | auto tv1 = makeConcreteTensor({10, 20}); |
1059 | fusion.addInput(tv1); |
1060 | auto tv2 = makeConcreteTensor({10, 10}); |
1061 | fusion.addInput(tv2); |
1062 | |
1063 | // Not concretized |
1064 | auto tv3 = sum(tv2, {1}); |
1065 | auto tv4 = broadcast(tv3, {false, true}); |
1066 | auto tv5 = add(tv0, tv4); |
1067 | fusion.addOutput(tv5); |
1068 | |
1069 | // Concretized |
1070 | auto tv6 = sum(tv2, {1}); |
1071 | auto tv7 = broadcast(tv6, {false, true}); |
1072 | auto tv8 = add(tv1, tv7); |
1073 | fusion.addOutput(tv8); |
1074 | |
1075 | for (auto tv : {tv3, tv4, tv5, tv6, tv7, tv8}) { |
1076 | tv->axis(1)->parallelize(ParallelType::TIDx); |
1077 | } |
1078 | |
1079 | GpuLower gpulw(&fusion); |
1080 | TORCH_CHECK(!gpulw.concretizedBroadcastDomains()->isConcretized( |
1081 | loweredTv(tv4, gpulw)->axis(1))); |
1082 | TORCH_CHECK(gpulw.concretizedBroadcastDomains()->isConcretized( |
1083 | loweredTv(tv7, gpulw)->axis(1))); |
1084 | |
1085 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1086 | at::manual_seed(0); |
1087 | auto t0 = at::randn({10, 1}, options); |
1088 | auto t1 = at::randn({10, 20}, options); |
1089 | auto t2 = at::randn({10, 10}, options); |
1090 | std::vector<IValue> aten_inputs = {t0, t1, t2}; |
1091 | |
1092 | FusionExecutor fe; |
1093 | fe.compileFusion(&fusion, aten_inputs); |
1094 | auto outputs = fe.runFusion(aten_inputs); |
1095 | |
1096 | auto t5 = t0 + t2.sum({1}).unsqueeze(-1); |
1097 | auto t8 = t1 + t2.sum({1}).unsqueeze(-1); |
1098 | |
1099 | testValidate(&fusion, outputs, aten_inputs, {t5, t8}, __LINE__, __FILE__); |
1100 | } |
1101 | |
1102 | TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) { |
1103 | Fusion fusion; |
1104 | FusionGuard fg(&fusion); |
1105 | |
1106 | auto tv0 = makeSymbolicTensor(2); |
1107 | fusion.addInput(tv0); |
1108 | |
1109 | auto tv1 = sum(tv0, {0, 1}); |
1110 | auto tv2 = broadcast(tv1, {true}); |
1111 | auto tv3 = broadcast(tv2, {false, true}); |
1112 | fusion.addOutput(tv3); |
1113 | |
1114 | // tv1 is thread-predicated with TIDx and TIDy |
1115 | tv1->axis(0)->parallelize(ParallelType::TIDx); |
1116 | tv1->axis(1)->parallelize(ParallelType::TIDy); |
1117 | // tv2 broadcasts along TIDx |
1118 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
1119 | // tv3 broadcasts along TIDy |
1120 | tv3->axis(0)->parallelize(ParallelType::TIDx); |
1121 | tv3->axis(1)->parallelize(ParallelType::TIDy); |
1122 | |
1123 | // Both tv2 and tv3 broadcast along predicated TID dimensions, but |
1124 | // since the broadcast domains are not concretized, there should be |
1125 | // no actual parallel broadcast |
1126 | |
1127 | GpuLower gpulw(&fusion); |
1128 | TORCH_CHECK( |
1129 | !gpulw.kernel()->summary().has_block_broadcasts && |
1130 | !gpulw.kernel()->summary().has_grid_broadcasts, |
1131 | "There must be no parallel broadcast in this fusion" ); |
1132 | |
1133 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1134 | at::manual_seed(0); |
1135 | auto t0 = at::randn({10, 11}, options); |
1136 | std::vector<IValue> aten_inputs = {t0}; |
1137 | |
1138 | FusionExecutor fe; |
1139 | fe.compileFusion(&fusion, aten_inputs); |
1140 | auto outputs = fe.runFusion(aten_inputs); |
1141 | |
1142 | auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1); |
1143 | |
1144 | testValidate(&fusion, outputs, aten_inputs, {t3}, __LINE__, __FILE__); |
1145 | } |
1146 | |
1147 | TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) { |
1148 | Fusion fusion; |
1149 | FusionGuard fg(&fusion); |
1150 | |
1151 | std::vector<int64_t> input_shape({10, 4, 8}); |
1152 | std::vector<int64_t> output_shape({8, 4, 1}); |
1153 | |
1154 | auto tv0 = makeConcreteTensor(input_shape); |
1155 | fusion.addInput(tv0); |
1156 | |
1157 | auto tv2 = sum(tv0, {0}); |
1158 | auto tv3 = set(tv2); |
1159 | auto tv4 = |
1160 | view(tv3, {input_shape.begin() + 1, input_shape.end()}, output_shape); |
1161 | auto tv5 = add(tv4, IrBuilder::create<Double>(1)); |
1162 | fusion.addOutput(tv5); |
1163 | |
1164 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
1165 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
1166 | tv5->axis(-1)->parallelize(ParallelType::TIDx); |
1167 | |
1168 | // The view op adds a broadcast domain in tv4, which is |
  // parallelized. However, it is never concretized, so there should
1170 | // be no parallel broadcast. |
1171 | |
1172 | GpuLower gpulw(&fusion); |
1173 | TORCH_CHECK( |
1174 | !gpulw.kernel()->summary().has_block_broadcasts && |
1175 | !gpulw.kernel()->summary().has_grid_broadcasts, |
1176 | "There must be no parallel broadcast in this fusion" ); |
1177 | |
1178 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1179 | at::manual_seed(0); |
1180 | auto t0 = at::randn(input_shape, options); |
1181 | std::vector<IValue> aten_inputs = {t0}; |
1182 | |
1183 | FusionExecutor fe; |
1184 | fe.compileFusion(&fusion, aten_inputs); |
1185 | auto outputs = fe.runFusion(aten_inputs); |
1186 | |
1187 | auto t5 = at::native::view(t0.sum(0), output_shape) + 1; |
1188 | |
1189 | testValidate(&fusion, outputs, aten_inputs, {t5}, __LINE__, __FILE__); |
1190 | } |
1191 | |
1192 | // Merging non-broadcast and broadcast domains |
// TODO: Fix this use case; see https://github.com/csarofeen/pytorch/issues/1418.
// validateParallelize does not pass. Even if it's skipped, the
// generated code is invalid as blockBroadcast is not used.
1196 | #if 0 |
1197 | TEST_F(NVFuserTest, FusionBroadcastConcretization4_CUDA) { |
1198 | Fusion fusion; |
1199 | FusionGuard fg(&fusion); |
1200 | |
1201 | auto tv0 = makeSymbolicTensor(2); |
1202 | fusion.addInput(tv0); |
1203 | |
1204 | auto tv1 = sum(tv0, {1}); |
1205 | auto tv2 = broadcast(tv1, {false, true}); |
1206 | auto tv3 = add(tv2, tv0); |
1207 | fusion.addOutput(tv3); |
1208 | |
1209 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
1210 | |
1211 | tv2->merge(0, 1); |
1212 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
1213 | // TODO: When set to shared memory, this kernel should be correct, but fails |
1214 | // validation and when skipped produces incorrect code |
1215 | tv2->setMemoryType(MemoryType::Shared); |
1216 | |
1217 | tv3->merge(0, 1); |
1218 | tv3->axis(0)->parallelize(ParallelType::TIDx); |
1219 | |
1220 | fusion.printMath(); |
1221 | fusion.printKernel(); |
1222 | } |
1223 | #endif |
1224 | |
1225 | TEST_F(NVFuserTest, FusionBroadcastConcretization5_CUDA) { |
1226 | Fusion fusion; |
1227 | FusionGuard fg(&fusion); |
1228 | |
1229 | auto tv0 = makeSymbolicTensor(1); |
1230 | fusion.addInput(tv0); |
1231 | auto tv1 = makeSymbolicTensor(1); |
1232 | fusion.addInput(tv1); |
1233 | auto tv2 = makeSymbolicTensor(1); |
1234 | fusion.addInput(tv2); |
1235 | auto tv3 = makeSymbolicTensor(1); |
1236 | fusion.addInput(tv3); |
1237 | |
1238 | // Assert tv2 and tv3 have the same shape |
1239 | auto tv4 = add(tv2, tv3); |
1240 | fusion.addOutput(tv4); |
1241 | |
1242 | // Concretize a broadcast domain to multiple non-concrete domains |
1243 | // through a multi-output expression. It should be considered to be |
1244 | // non-uniquely concretized. |
1245 | auto tv5 = broadcast(tv0, {false, true}); |
1246 | // Reduce only the non-broadcast domain. |
1247 | auto tvs = Welford(tv5, {0}); |
1248 | auto tv9 = add(tvs.avg, tv1); |
1249 | auto tv10 = add(tvs.var_sum, tv2); |
1250 | fusion.addOutput(tv9); |
1251 | fusion.addOutput(tv10); |
1252 | |
1253 | // Same pattern as the above, but concretize the broadcast domain |
  // with tv2 and tv3, which have exactly the same shape, so the
1255 | // broadcast should be considered uniquely concretized. |
1256 | auto tv11 = broadcast(tv0, {false, true}); |
1257 | // Reduce only the non-broadcast domain. |
1258 | auto tvs2 = Welford(tv11, {0}); |
1259 | auto tv15 = add(tvs2.avg, tv2); |
1260 | auto tv16 = add(tvs2.var_sum, tv3); |
1261 | fusion.addOutput(tv15); |
1262 | fusion.addOutput(tv16); |
1263 | |
1264 | // Reduce only the broadcast domain. Since it's reduced, it should |
1265 | // not be considered to be concretized. |
1266 | auto tv17 = broadcast(tv0, {false, true}); |
1267 | auto tvs3 = Welford(tv17, {1}); |
1268 | fusion.addOutput(tvs3.avg); |
1269 | |
1270 | ConcretizedBroadcastDomains bcast_concretization_info(&fusion); |
1271 | |
1272 | TORCH_CHECK( |
1273 | bcast_concretization_info.maybeNonUniquelyConcretized(tv5->axis(1)), |
1274 | "Failed to detect non-unique concretization of " , |
1275 | tv5->toString()); |
1276 | |
1277 | TORCH_CHECK( |
1278 | bcast_concretization_info.isUniquelyConcretized(tv11->axis(1)), |
1279 | "Failed to detect unique concretization of " , |
1280 | tv11->toString()); |
1281 | |
1282 | TORCH_CHECK( |
1283 | !bcast_concretization_info.isConcretized(tv17->axis(1)), |
1284 | "Failed to detect non-concretization of " , |
1285 | tv17->toString()); |
1286 | } |
1287 | |
1288 | TEST_F(NVFuserTest, FusionIssue1430_CUDA) { |
  // Derived from an expression-sorting issue when the loop map was used;
  // expression sorting now uses the parallel map.
1291 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
1292 | Fusion& fusion = *fusion_ptr.get(); |
1293 | FusionGuard fg(&fusion); |
1294 | |
1295 | int V = 2, W = 3, X = 4, Y = 5, Z = 6; |
1296 | |
1297 | // setup fusion |
1298 | auto tv0 = TensorViewBuilder() |
1299 | .ndims(5) |
1300 | .dtype(DataType::Half) |
1301 | .contiguity(std::vector<bool>(5, true)) |
1302 | .shape({V, W, X, Y, Z}) |
1303 | .build(); |
1304 | |
1305 | fusion.addInput(tv0); |
1306 | auto tv1 = set(tv0); |
1307 | auto tv2 = castOp(DataType::Float, tv1); |
1308 | |
1309 | auto tvs = Welford(tv2, {1, 2, 3, 4}); |
1310 | auto tv3 = tvs.avg; |
1311 | auto tv4 = tvs.var_sum; |
1312 | auto tv5 = tvs.n; |
1313 | |
1314 | // avg |
1315 | auto tv6 = broadcast(tvs.avg, {false, true, true, true, true}); |
1316 | |
1317 | // var |
1318 | auto tv7 = mul(tv4, IrBuilder::create<Double>(1. / (W * X * Y * Z))); |
1319 | auto tv8 = add(tv7, IrBuilder::create<Double>(1.e-6)); |
1320 | auto tv9 = broadcast(tv8, {false, true, true, true, true}); |
1321 | auto tv10 = rsqrt(tv9); |
1322 | |
1323 | auto tv11 = castOp(DataType::Float, tv1); |
1324 | auto tv12 = sub(tv11, tv6); |
1325 | auto tv13 = mul(tv12, tv10); |
1326 | |
1327 | auto tv14 = set(tv13); |
1328 | fusion.addOutput(tv14); |
1329 | |
1330 | tv3->axis(0)->parallelize(ParallelType::BIDy); |
1331 | tv3->axis(2)->parallelize(ParallelType::BIDx); |
1332 | tv3->axis(3)->parallelize(ParallelType::TIDx); |
1333 | tv3->axis(4)->parallelize(ParallelType::Vectorize); |
1334 | |
1335 | // tv3->reorder({{1, -2}}); |
1336 | |
1337 | auto rfactor = ir_utils::rfactorHelper(tv3, {1, 4}); |
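  // ir_utils::rfactorHelper handles this multi-output Welford reduction,
  // rfactoring avg, var_sum, and n together along axes 1 and 4, and returns
  // the rfactored counterpart of the tensor passed in (tvs.avg here).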
1338 | |
1339 | scheduler_utils::parallelizeAllLike(rfactor); |
1340 | |
1341 | for (auto tv : ir_utils::allTvs(&fusion)) { |
1342 | if (tv != tv1 || tv != tv3) { |
1343 | for (auto i : c10::irange(tv->nDims())) { |
1344 | if (isParallelTypeVectorize(tv->axis(i)->getParallelType())) { |
1345 | tv->axis(i)->parallelize(ParallelType::Serial); |
1346 | } |
1347 | } |
1348 | } |
1349 | } |
1350 | |
1351 | tv0->computeAt(tv14, 1); |
1352 | tv13->computeAt(tv14, -2); |
1353 | tv2->computeAt(tv14, -1, ComputeAtMode::MostInlined); |
1354 | tv11->computeAt(tv14, -1, ComputeAtMode::MostInlined); |
1355 | |
1356 | auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); |
1357 | at::Tensor t0 = at::randn({V, W, X, Y, Z}, options); |
1358 | |
1359 | FusionExecutor fe; |
1360 | fe.compileFusion(&fusion); |
1361 | auto cg_outputs = fe.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1)); |
1362 | |
1363 | auto t0_double = t0.to(at::kDouble); |
1364 | |
1365 | auto at_mu = at::mean(t0_double, {1, 2, 3, 4}) |
1366 | .unsqueeze(-1) |
1367 | .unsqueeze(-1) |
1368 | .unsqueeze(-1) |
1369 | .unsqueeze(-1); |
1370 | auto at_var = at::var(t0_double, {1, 2, 3, 4}, false) |
1371 | .unsqueeze(-1) |
1372 | .unsqueeze(-1) |
1373 | .unsqueeze(-1) |
1374 | .unsqueeze(-1); |
1375 | |
1376 | auto at_out = t0_double.sub(at_mu).div(at_var.add(1.e-6).sqrt()); |
1377 | |
1378 | testValidate( |
1379 | &fusion, |
1380 | cg_outputs, |
1381 | {t0}, |
1382 | {at_out}, |
1383 | __LINE__, |
1384 | __FILE__, |
1385 | "" , |
1386 | LaunchParams(X, V, -1, Y, -1, -1)); |
1387 | } |
1388 | |
1389 | // Test code generation of allocated scalars |
1390 | TEST_F(NVFuserTest, FusionCodegenAllocatedScalars_CUDA) { |
1391 | Fusion fusion; |
1392 | FusionGuard fg(&fusion); |
1393 | |
  // Fusion is just a dummy container in this test, used only to
  // obtain a Kernel container
1396 | auto tv0 = makeSymbolicTensor(0); |
1397 | fusion.addInput(tv0); |
1398 | auto tv1 = set(tv0); |
1399 | fusion.addOutput(tv1); |
1400 | |
1401 | GpuLower gpulw(&fusion); |
1402 | auto kernel = gpulw.kernel(); |
1403 | |
1404 | // Set the kernel as the current fusion |
1405 | FusionGuard kg(kernel); |
1406 | |
  // Create allocated scalars
1408 | auto ks0 = add(kernel->zeroVal(), kernel->oneVal()); |
1409 | auto ks0_alloc = IrBuilder::create<kir::Allocate>( |
1410 | ks0, MemoryType::Local, kernel->oneVal()); |
1411 | |
1412 | auto ks1 = add(ks0, kernel->oneVal()); |
1413 | auto ks1_alloc = IrBuilder::create<kir::Allocate>( |
1414 | ks1, MemoryType::Local, kernel->oneVal()); |
1415 | |
1416 | auto tk0 = kernel->inputs()[0]->as<TensorView>(); |
1417 | auto tki0 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks0}); |
1418 | auto tki1 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks1}); |
1419 | auto tk0_expr = IrBuilder::create<UnaryOp>(UnaryOpType::Set, tki0, tki1); |
1420 | |
  // Insert the scalar expression and the allocation of the
  // output directly into the kernel
1423 | auto proxy = kir::KernelInternalProxy(kernel); |
1424 | |
1425 | const auto indent = " " ; |
1426 | const auto ks0_name = "i" + std::to_string(ks0->name()); |
1427 | const auto ks1_name = "i" + std::to_string(ks1->name()); |
1428 | const auto tk0_name = "T" + std::to_string(tk0->name()); |
1429 | |
1430 | auto& exprs = proxy.topLevelExprs(); |
1431 | exprs.push_back(tk0_expr); |
1432 | |
1433 | // Invalid code gen |
1434 | const auto no_alloc_code = codegen::generateCudaKernel(kernel); |
1435 | |
  // Without the allocations, Int vals are just inlined, resulting in:
  // T0[(0 + 1)] = T0[((0 + 1) + 1)]
1438 | std::stringstream no_alloc_ref; |
1439 | no_alloc_ref << "\n" |
1440 | << indent << tk0_name << "[(0 + 1)]\n" |
1441 | << indent << indent << " = " << tk0_name << "[((0 + 1) + 1)];\n" ; |
1442 | |
1443 | TORCH_CHECK( |
1444 | no_alloc_code.find(no_alloc_ref.str()) != std::string::npos, |
1445 | "Invalid code generation. Expected:" , |
1446 | no_alloc_ref.str(), |
1447 | "Actual:\n" , |
1448 | no_alloc_code); |
1449 | |
1450 | // Insert proper allocations and definitions |
1451 | exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks0_alloc); |
1452 | exprs.insert( |
1453 | std::find(exprs.begin(), exprs.end(), tk0_expr), ks0->definition()); |
1454 | exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks1_alloc); |
1455 | exprs.insert( |
1456 | std::find(exprs.begin(), exprs.end(), tk0_expr), ks1->definition()); |
1457 | |
1458 | const auto valid_code = codegen::generateCudaKernel(kernel); |
1459 | |
1460 | std::stringstream valid_ref; |
1461 | valid_ref << "\n" |
1462 | << indent << tk0_name << "[" << ks0_name << "]\n" |
1463 | << indent << indent << " = " << tk0_name << "[" << ks1_name |
1464 | << "];\n" ; |
1465 | |
1466 | TORCH_CHECK( |
1467 | valid_code.find(valid_ref.str()) != std::string::npos, |
1468 | "Invalid code generation. Expected:" , |
1469 | valid_ref.str(), |
1470 | "Actual:\n" , |
1471 | valid_code); |
1472 | } |
1473 | |
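// Index hoisting: index (and predicate) arithmetic that is common to multiple
// expressions in a loop nest is, roughly, computed once into a scalar
// allocated in that loop nest and then reused, instead of being recomputed
// inline in every expression.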
1474 | TEST_F(NVFuserTest, FusionIndexHoist1_CUDA) { |
1475 | if (isOptionDisabled(DisableOption::IndexHoist)) { |
1476 | GTEST_SKIP() << "Index hoisting disabled" ; |
1477 | } |
1478 | |
1479 | Fusion fusion; |
1480 | FusionGuard fg(&fusion); |
1481 | |
1482 | auto tv0 = makeSymbolicTensor(2); |
1483 | fusion.addInput(tv0); |
1484 | |
1485 | auto tv1 = set(tv0); |
1486 | auto tv2 = set(tv1); |
1487 | auto tv3 = set(tv2); |
1488 | auto tv4 = set(tv3); |
1489 | auto tv5 = set(tv4); |
1490 | fusion.addOutput(tv5); |
1491 | |
1492 | tv1->split(-1, 4); |
1493 | tv2->split(-1, 4); |
1494 | tv3->merge(0, 1); |
1495 | tv3->split(0, 8); |
1496 | tv5->merge(0, 1); |
1497 | tv5->split(0, 8); |
1498 | tv4->computeAt(tv5, -1); |
1499 | |
1500 | tv1->setMemoryType(MemoryType::Global); |
1501 | tv2->setMemoryType(MemoryType::Global); |
1502 | tv3->setMemoryType(MemoryType::Global); |
1503 | |
1504 | // Use Int32 as the index type to verify Int32 is used as the type |
1505 | // of hoisted indices |
1506 | GpuLower gpulw(&fusion, DataType::Int32); |
1507 | auto kernel = gpulw.kernel(); |
1508 | |
1509 | auto is_index_times_ns = [](Val* val, Val* index, std::string name) -> bool { |
1510 | auto def = dynamic_cast<BinaryOp*>(val->definition()); |
1511 | if (def == nullptr) { |
1512 | return false; |
1513 | } |
1514 | return def->getBinaryOpType() == BinaryOpType::Mul && |
1515 | def->rhs()->isA<NamedScalar>() && |
1516 | def->rhs()->as<NamedScalar>()->name() == name && def->lhs() == index; |
1517 | }; |
1518 | |
  // Validate that indices in the kernel are hoisted as intended.
  // Validation could also be done by string comparison, as in the
  // parser tests, but updating such tests would be tedious.
1522 | for (auto top_level_loop : |
1523 | ir_utils::filterByType<kir::ForLoop>(kernel->topLevelExprs())) { |
1524 | auto innermost_loop = top_level_loop; |
1525 | while (auto first_expr_loop = dynamic_cast<kir::ForLoop*>( |
1526 | innermost_loop->body().exprs().at(0))) { |
1527 | innermost_loop = first_expr_loop; |
1528 | } |
1529 | const auto& exprs = innermost_loop->body().exprs(); |
1530 | TORCH_CHECK(!exprs.empty(), "No expression found" ); |
1531 | TORCH_CHECK( |
1532 | exprs.at(0)->isA<kir::Allocate>(), |
1533 | "Invalid expression: " , |
1534 | exprs.at(0)->toString()); |
1535 | auto hoisted_index = exprs.at(0)->as<kir::Allocate>()->buffer(); |
1536 | TORCH_CHECK( |
1537 | hoisted_index->dtype() == DataType::Int32, |
1538 | "Invalid data type of hoisted indices. Should be Int32 but: " , |
1539 | hoisted_index->dtype()); |
1540 | kir::Predicate* pred = nullptr; |
1541 | for (auto expr : exprs) { |
1542 | if (expr->isA<kir::IfThenElse>()) { |
1543 | pred = expr->as<kir::IfThenElse>()->predicate(); |
1544 | auto arith_expr = expr->as<kir::IfThenElse>()->thenBody().exprs().at(0); |
1545 | auto out_ti = arith_expr->outputs()[0]->as<kir::TensorIndex>(); |
1546 | if (out_ti->view()->name() == 1) { |
1547 | // Ref: T1[*, hoisted_index] = T0[*, hoisted_index * T0.stride]; |
1548 | auto t1_index = out_ti->index(1); |
1549 | TORCH_CHECK( |
1550 | t1_index == hoisted_index, |
1551 | "Invalid index: " , |
1552 | t1_index->toInlineString()); |
1553 | // Pred: hoisted_index < T0.size[1] |
1554 | TORCH_CHECK( |
1555 | pred->value()->definition()->as<BinaryOp>()->lhs() == |
1556 | hoisted_index, |
1557 | "Invalid predicate: " , |
1558 | pred->value()->toInlineString(), |
1559 | ", " , |
1560 | expr->toString()); |
1561 | TORCH_CHECK(arith_expr->inputs().size() == 1); |
1562 | auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>(); |
1563 | TORCH_CHECK(in0->view()->name() == 0); |
1564 | // hoisted_index * T0.stride[1] |
1565 | auto t0_index = in0->index(1); |
1566 | TORCH_CHECK( |
1567 | is_index_times_ns(t0_index, hoisted_index, "T0.stride[1]" ), |
1568 | "Invalid index: " , |
1569 | t0_index->toInlineString(), |
1570 | ", " , |
1571 | expr->toString()); |
1572 | } else if (out_ti->view()->name() == 2) { |
1573 | // Ref: T3[*, hoisted_index] = T2[*, hoisted_index]; |
1574 | auto out_index = out_ti->index(1); |
1575 | TORCH_CHECK( |
1576 | out_index == hoisted_index, |
1577 | "Invalid index: " , |
1578 | out_index->toInlineString(), |
1579 | ", " , |
1580 | expr->toString()); |
1581 | TORCH_CHECK( |
1582 | pred->value()->definition()->as<BinaryOp>()->lhs() == |
1583 | hoisted_index, |
1584 | "Invalid predicate: " , |
1585 | pred->value()->toInlineString(), |
1586 | ", " , |
1587 | expr->toString()); |
1588 | TORCH_CHECK(arith_expr->inputs().size() == 1); |
1589 | auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>(); |
1590 | TORCH_CHECK(in0->view()->name() == 1); |
1591 | auto in0_index = in0->index(1); |
1592 | TORCH_CHECK( |
1593 | in0_index == hoisted_index, |
1594 | "Invalid index: " , |
1595 | in0_index->toInlineString(), |
1596 | ", " , |
1597 | expr->toString()); |
1598 | } else if (out_ti->view()->name() == 3) { |
1599 | // Ref: T3[hoisted_index] = T2[hoisted_index]; |
1600 | auto out_index = out_ti->index(0); |
1601 | TORCH_CHECK( |
1602 | out_index == hoisted_index, |
1603 | "Invalid index: " , |
1604 | out_index->toInlineString(), |
1605 | ", " , |
1606 | expr->toString()); |
1607 | TORCH_CHECK( |
1608 | pred->value()->definition()->as<BinaryOp>()->lhs() == |
1609 | hoisted_index, |
1610 | "Invalid predicate: " , |
1611 | pred->value()->toInlineString(), |
1612 | ", " , |
1613 | expr->toString()); |
1614 | TORCH_CHECK(arith_expr->inputs().size() == 1); |
1615 | auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>(); |
1616 | TORCH_CHECK(in0->view()->name() == 2); |
1617 | auto in0_index = in0->index(0); |
1618 | TORCH_CHECK( |
1619 | in0_index == hoisted_index, |
1620 | "Invalid index: " , |
1621 | in0_index->toInlineString(), |
1622 | ", " , |
1623 | expr->toString()); |
1624 | } else if (out_ti->view()->name() == 4) { |
1625 | // Ref: T4[0] = T3[hoisted_index]; |
1626 | TORCH_CHECK( |
1627 | pred->value()->definition()->as<BinaryOp>()->lhs() == |
1628 | hoisted_index, |
1629 | "Invalid predicate: " , |
1630 | pred->value()->toInlineString(), |
1631 | ", " , |
1632 | expr->toString()); |
1633 | TORCH_CHECK(arith_expr->inputs().size() == 1); |
1634 | auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>(); |
1635 | TORCH_CHECK(in0->view()->name() == 3); |
1636 | auto in0_index = in0->index(0); |
1637 | TORCH_CHECK( |
1638 | in0_index == hoisted_index, |
1639 | "Invalid index: " , |
1640 | in0_index->toInlineString(), |
1641 | ", " , |
1642 | expr->toString()); |
1643 | } else if (out_ti->view()->name() == 5) { |
1644 | // Ref: T5[hoisted_index] = T4[0] |
1645 | auto out_index = out_ti->index(0); |
1646 | TORCH_CHECK( |
1647 | out_index == hoisted_index, |
1648 | "Invalid index: " , |
1649 | out_index->toInlineString(), |
1650 | ", " , |
1651 | expr->toString()); |
1652 | TORCH_CHECK( |
1653 | pred->value()->definition()->as<BinaryOp>()->lhs() == |
1654 | hoisted_index, |
1655 | "Invalid predicate: " , |
1656 | pred->value()->toInlineString(), |
1657 | ", " , |
1658 | expr->toString()); |
1659 | } |
1660 | } |
1661 | } |
1662 | } |
1663 | |
1664 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1665 | at::manual_seed(0); |
1666 | auto t0 = at::randn({15, 17}, options); |
1667 | |
1668 | FusionExecutor fe; |
1669 | fe.compileFusion(&fusion, {t0}); |
1670 | auto cg_outputs = fe.runFusion({t0}); |
1671 | |
1672 | auto ref = t0; |
1673 | |
1674 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
1675 | } |
1676 | |
1677 | // Hoist indices for vectorized tensors |
1678 | TEST_F(NVFuserTest, FusionIndexHoist2_CUDA) { |
1679 | if (isOptionDisabled(DisableOption::IndexHoist)) { |
1680 | GTEST_SKIP() << "Index hoisting disabled" ; |
1681 | } |
1682 | |
1683 | Fusion fusion; |
1684 | FusionGuard fg(&fusion); |
1685 | |
1686 | auto tv0 = makeContigTensor(1); |
1687 | fusion.addInput(tv0); |
1688 | auto tv1 = makeContigTensor(1); |
1689 | fusion.addInput(tv1); |
1690 | |
1691 | auto tv2 = set(tv0); |
1692 | auto tv3 = set(tv1); |
1693 | auto tv4 = add(tv2, tv3); |
1694 | auto tv5 = set(tv4); |
1695 | fusion.addOutput(tv5); |
1696 | |
1697 | tv5->split(-1, 4); |
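  // tv5: [I] -> [ceilDiv(I, 4), 4]; the propagator below replays this
  // split on the other tensors.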
1698 | TransformPropagatorWithCheck propagator(tv5); |
1699 | MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); |
1700 | |
1701 | tv4->split(-1, 3); |
1702 | |
1703 | tv0->computeAt(tv5, 1); |
1704 | tv1->computeAt(tv5, 1); |
1705 | |
1706 | tv2->axis(-1)->parallelize(ParallelType::Vectorize); |
1707 | tv3->axis(-1)->parallelize(ParallelType::Vectorize); |
1708 | tv5->axis(-1)->parallelize(ParallelType::Vectorize); |
1709 | |
1710 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1711 | at::manual_seed(0); |
1712 | auto t0 = at::randn({16}, options); |
1713 | auto t1 = at::randn({16}, options); |
1714 | |
1715 | FusionExecutor fe; |
1716 | fe.compileFusion(&fusion, {t0, t1}); |
1717 | auto cg_outputs = fe.runFusion({t0, t1}); |
1718 | |
1719 | auto ref = t0 + t1; |
1720 | |
1721 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
1722 | } |
1723 | |
1724 | TEST_F(NVFuserTest, FusionTestGridComm_CUDA) { |
1725 | Fusion fusion; |
1726 | FusionGuard fg(&fusion); |
1727 | int X = 3, Y = 4, Z = 2; |
1728 | auto tv0 = makeConcreteTensor({X, Y, Z}); |
1729 | fusion.addInput(tv0); |
1730 | auto tv1 = makeConcreteTensor({X, Y, Z}); |
1731 | fusion.addInput(tv1); |
1732 | |
1733 | auto tv2 = set(tv0); |
1734 | auto tv3 = add(tv2, tv1); |
1735 | auto tv4 = set(tv3); |
1736 | auto tv5 = set(tv4); |
1737 | fusion.addOutput(tv5); |
1738 | |
1739 | tv2->setMemoryType(MemoryType::Global); |
1740 | tv3->setMemoryType(MemoryType::Global); |
1741 | tv4->setMemoryType(MemoryType::Global); |
1742 | |
1743 | tv2->axis(0)->parallelize(ParallelType::BIDy); |
1744 | tv2->axis(1)->parallelize(ParallelType::BIDx); |
1745 | tv2->axis(2)->parallelize(ParallelType::Vectorize); |
1746 | |
1747 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1748 | tv3->axis(1)->parallelize(ParallelType::BIDy); |
1749 | |
1750 | tv4->axis(0)->parallelize(ParallelType::BIDy); |
1751 | tv4->axis(1)->parallelize(ParallelType::BIDx); |
1752 | |
1753 | tv5->axis(0)->parallelize(ParallelType::BIDy); |
1754 | tv5->axis(1)->parallelize(ParallelType::BIDx); |
1755 | tv5->axis(2)->parallelize(ParallelType::Vectorize); |
1756 | |
1757 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1758 | at::manual_seed(0); |
1759 | auto t0 = at::randn({X, Y, Z}, options); |
1760 | auto t1 = at::randn({X, Y, Z}, options); |
1761 | |
1762 | FusionExecutor fe; |
1763 | fe.compileFusion(&fusion, {t0, t1}); |
1764 | auto cg_outputs = fe.runFusion({t0, t1}); |
1765 | |
1766 | auto ref = t0 + t1; |
1767 | |
1768 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
1769 | } |
1770 | |
1771 | // See issue https://github.com/csarofeen/pytorch/issues/1497 |
1772 | TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) { |
1773 | Fusion fusion; |
1774 | FusionGuard fg(&fusion); |
1775 | |
1776 | int64_t W = 3, X = 4; |
1777 | |
1778 | auto tv0 = makeConcreteTensor({X}); |
1779 | auto tv1 = makeConcreteTensor({W, X}); |
1780 | fusion.addInput(tv0); |
1781 | fusion.addInput(tv1); |
1782 | |
1783 | auto tv2 = add(tv0, IrBuilder::create<Double>(1)); |
1784 | auto tv3 = broadcast(tv2, {true, false}); |
1785 | auto tv4 = add(tv3, tv1); |
1786 | fusion.addOutput(tv4); |
1787 | |
1788 | tv4->merge(0); |
1789 | tv4->split(0, 2); |
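  // tv4: [W, X] -> merge(0) -> [W*X] -> split(0, 2) -> [ceilDiv(W*X, 2), 2]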
1790 | |
1791 | TransformPropagatorWithCheck propagator(tv4); |
1792 | MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); |
1793 | |
1794 | tv3->computeAt(tv4, 1); |
1795 | |
1796 | tv4->axis(0)->parallelize(ParallelType::BIDx); |
1797 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
1798 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
1799 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
1800 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
1801 | |
1802 | tv2->setMemoryType(MemoryType::Global); |
1803 | |
1804 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1805 | at::manual_seed(0); |
1806 | auto t0 = at::randn({X}, options); |
1807 | auto t1 = at::randn({W, X}, options); |
1808 | |
1809 | FusionExecutor fe; |
1810 | fe.compileFusion(&fusion, {t0, t1}); |
1811 | auto cg_outputs = fe.runFusion({t0, t1}); |
1812 | |
1813 | auto ref = t0 + t1 + 1; |
1814 | |
1815 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
1816 | } |
1817 | |
1818 | // Vectorized reset test for double buffered registers |
1819 | TEST_F(NVFuserTest, FusionDoubleBufferVector_CUDA) { |
1820 | Fusion fusion; |
1821 | FusionGuard fg(&fusion); |
1822 | |
1823 | auto tv0 = makeContigTensor(1); |
1824 | fusion.addInput(tv0); |
1825 | |
1826 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
1827 | auto tv2 = sum(tv1, {0}); |
1828 | auto tv2c = tv2->cacheBefore(); |
1829 | |
1830 | fusion.addOutput(tv2); |
1831 | |
1832 | auto tv1cw = tv1->cacheAfter(); |
1833 | auto tv1cr = tv1cw->cacheAfter(); |
1834 | |
1835 | tv1cw->split(-1, 32); |
1836 | tv1cr->split(-1, 32); |
1837 | tv1cr->split(-1, 4); |
1838 | tv1cr->axis(-1)->parallelize(ParallelType::Vectorize); |
1839 | |
1840 | tv1cw->computeAt(tv1cr, 1); |
1841 | tv0->computeAt(tv1cw, -1); |
1842 | tv2c->split(-1, 32); |
1843 | tv2c->split(-1, 4); |
1844 | tv1cr->computeAt(tv2c, 2); |
1845 | |
1846 | tv1cw->setMemoryType(MemoryType::Shared); |
1847 | tv1cr->doubleBuffer(); |
1848 | |
1849 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1850 | |
1851 | at::manual_seed(0); |
1852 | auto t0 = at::randn({200}, options); |
1853 | FusionExecutor fe; |
1854 | fe.compileFusion(&fusion, {t0}); |
1855 | auto cg_outputs = fe.runFusion({t0}); |
1856 | auto ref = (t0 + 1).sum({0}); |
1857 | |
1858 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
1859 | } |
1860 | |
// Request 48KB of data in shared mem. This should be large enough
// not to fit in static allocations, but small enough to fit on
// supported devices (sm70+).
1865 | TEST_F(NVFuserTest, FusionLargeSmem_CUDA) { |
1866 | Fusion fusion; |
1867 | FusionGuard fg(&fusion); |
1868 | |
1869 | auto tv0 = makeContigTensor(1); |
1870 | fusion.addInput(tv0); |
1871 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
1872 | auto tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
1873 | fusion.addOutput(tv2); |
1874 | |
1875 | tv2->split(0, 12288); |
1876 | tv2->split(1, 128); |
1877 | tv1->computeAt(tv2, 1); |
1878 | tv1->split(1, 128); |
1879 | tv0->computeAt(tv1, -1); |
1880 | tv1->setMemoryType(MemoryType::Shared); |
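  // tv1 is computed at position 1 of tv2, so a tile of 12288 floats
  // (48KB) should be allocated for it in shared memory.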
1881 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
1882 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
1883 | |
1884 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1885 | |
1886 | at::manual_seed(0); |
1887 | auto t0 = at::randn({12288 * 4}, options); |
1888 | FusionExecutor fe; |
1889 | fe.compileFusion(&fusion, {t0}); |
1890 | auto cg_outputs = fe.runFusion({t0}); |
1891 | auto ref = t0 + 1 + 2; |
1892 | |
1893 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
1894 | } |
1895 | |
1896 | // Request a smem allocation that is equal to the device limit |
1897 | TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) { |
1898 | Fusion fusion; |
1899 | FusionGuard fg(&fusion); |
1900 | |
1901 | auto properties = at::cuda::getDeviceProperties( |
1902 | c10::Device(c10::DeviceType::CUDA, 0).index()); |
1903 | int device_limit = properties->sharedMemPerBlockOptin; |
1904 | |
1905 | auto tv0 = makeContigTensor(1); |
1906 | fusion.addInput(tv0); |
1907 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
1908 | auto tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
1909 | fusion.addOutput(tv2); |
1910 | |
  // 4 bytes per float
1912 | tv2->split(0, device_limit / 4); |
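  // With a split factor of device_limit / 4 floats, the shared memory
  // tile of tv1 should come out to exactly device_limit bytes.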
1913 | tv2->split(1, 128); |
1914 | tv1->computeAt(tv2, 1); |
1915 | tv1->split(1, 128); |
1916 | tv0->computeAt(tv1, -1); |
1917 | tv1->setMemoryType(MemoryType::Shared); |
1918 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
1919 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
1920 | |
1921 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1922 | |
1923 | at::manual_seed(0); |
1924 | auto t0 = at::randn({12288 * 4}, options); |
1925 | FusionExecutor fe; |
1926 | |
  // The first compilation should succeed and produce a kernel
1928 | fe.compileFusion(&fusion, {t0}); |
1929 | |
  // Should throw at launch because the kernel
  // requests the absolute device limit of shared memory
1932 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
1933 | ASSERT_ANY_THROW(fe.runFusion({t0})); |
1934 | } |
1935 | |
// Test alignment when multiple tensors are
// placed in shared mem.
1938 | TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) { |
1939 | Fusion fusion; |
1940 | FusionGuard fg(&fusion); |
1941 | |
1942 | auto tv0 = makeConcreteTensor({3, 4, 7, 2, 5}); |
1943 | fusion.addInput(tv0); |
1944 | auto tv1 = sum(tv0, {4}); |
1945 | auto tv2 = sum(tv1, {3}); |
1946 | auto tv3 = sum(tv2, {2}); |
1947 | auto tv4 = sum(tv3, {1}); |
1948 | fusion.addOutput(tv4); |
1949 | |
1950 | auto tv0c = tv0->cacheAfter(); |
1951 | auto tv1bc = tv1->cacheBefore(); |
1952 | auto tv2bc = tv2->cacheBefore(); |
1953 | auto tv3bc = tv3->cacheBefore(); |
1954 | auto tv4bc = tv4->cacheBefore(); |
1955 | |
1956 | tv0c->setMemoryType(MemoryType::Shared); |
1957 | tv1bc->setMemoryType(MemoryType::Shared); |
1958 | tv2bc->setMemoryType(MemoryType::Shared); |
1959 | tv3bc->setMemoryType(MemoryType::Shared); |
1960 | tv4bc->setMemoryType(MemoryType::Shared); |
1961 | |
1962 | tv1->axis(-1)->parallelize(ParallelType::Vectorize); |
1963 | tv3->axis(-1)->parallelize(ParallelType::Vectorize); |
1964 | tv0->computeAt(tv4, 0); |
1965 | tv0->computeAt(tv2, 2); |
1966 | |
1967 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1968 | |
1969 | at::manual_seed(0); |
1970 | auto t0 = at::randn({3, 4, 7, 2, 5}, options); |
1971 | FusionExecutor fe; |
1972 | |
1973 | fe.compileFusion(&fusion, {t0}); |
1974 | auto cg_outputs = fe.runFusion({t0}); |
1975 | auto tref = t0.sum({1, 2, 3, 4}); |
1976 | |
1977 | testValidate(&fusion, cg_outputs, {t0}, {tref}, __LINE__, __FILE__); |
1978 | } |
1979 | |
1980 | // Repro of #1521 |
1981 | TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) { |
1982 | Fusion fusion; |
1983 | FusionGuard fg(&fusion); |
1984 | |
1985 | auto tv0 = makeSymbolicTensor(1); |
1986 | fusion.addInput(tv0); |
1987 | |
  auto immediate_scalar = IrBuilder::create<Double>(0.1);
1989 | // Adding an immediate scalar value as an input is not allowed |
1990 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
  ASSERT_ANY_THROW(fusion.addInput(immediate_scalar));
1992 | |
1993 | // Instead, use a symbolic value |
1994 | auto symbolic_scalar = IrBuilder::create<Double>(); |
1995 | fusion.addInput(symbolic_scalar); |
1996 | |
1997 | auto tv1 = add(tv0, symbolic_scalar); |
1998 | fusion.addOutput(tv1); |
1999 | |
2000 | // Make sure the kernel is compiled. |
2001 | FusionExecutor fe; |
2002 | fe.compileFusion(&fusion); |
2003 | } |
2004 | |
2005 | // Repro of #1506 |
2006 | TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) { |
2007 | std::vector<int64_t> shape{14, 14}; |
2008 | |
2009 | Fusion fusion; |
2010 | FusionGuard fg(&fusion); |
2011 | |
2012 | auto tv0 = makeContigTensor(2); |
2013 | fusion.addInput(tv0); |
2014 | auto tv1 = set(tv0); |
2015 | auto tv2 = set(tv1); |
2016 | fusion.addOutput(tv2); |
2017 | |
2018 | tv2->merge(0); |
2019 | |
2020 | // Vectorize by 4 should be allowed |
2021 | tv2->split(0, 4); |
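  // tv2: [14, 14] -> merge(0) -> [196] -> split(0, 4) -> [49, 4].
  // The merged extent 196 is divisible by 4, so vectorizing by 4 over the
  // contiguous merged domain should be valid.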
2022 | |
2023 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
2024 | tv0->computeAt(tv2, 1); |
2025 | |
2026 | tv1->axis(1)->parallelize(ParallelType::Vectorize); |
2027 | tv2->axis(1)->parallelize(ParallelType::Vectorize); |
2028 | |
2029 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2030 | auto t0 = at::randn(shape, options); |
2031 | |
2032 | FusionExecutor fe; |
2033 | fe.compileFusion(&fusion, {t0}); |
2034 | auto cg_outputs = fe.runFusion({t0}); |
2035 | |
2036 | TORCH_CHECK(t0.equal(cg_outputs[0])); |
2037 | } |
2038 | |
2039 | // Make sure the same fusion as FusionVectorizeContigIndex fails if |
2040 | // not contig. |
2041 | TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) { |
2042 | std::vector<int64_t> shape{14, 14}; |
2043 | |
2044 | Fusion fusion; |
2045 | FusionGuard fg(&fusion); |
2046 | |
2047 | auto tv0 = makeSymbolicTensor(2); |
2048 | fusion.addInput(tv0); |
2049 | auto tv1 = set(tv0); |
2050 | auto tv2 = set(tv1); |
2051 | fusion.addOutput(tv2); |
2052 | |
2053 | tv2->merge(0); |
2054 | |
2055 | tv2->split(0, 4); |
2056 | |
2057 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
2058 | tv0->computeAt(tv2, 1); |
2059 | |
2060 | tv1->axis(1)->parallelize(ParallelType::Vectorize); |
2061 | tv2->axis(1)->parallelize(ParallelType::Vectorize); |
2062 | |
2063 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2064 | auto t0 = at::randn(shape, options); |
2065 | |
2066 | FusionExecutor fe; |
2067 | fe.compileFusion(&fusion, {t0}); |
2068 | |
  // This should fail at launch time because 14 is not divisible by the
  // vector word size. The two domains are merged, but they are not
  // contiguous, so contiguous indexing is not involved in this case.
2072 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
2073 | ASSERT_ANY_THROW(fe.runFusion({t0})); |
2074 | } |
2075 | |
2076 | TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) { |
2077 | Fusion fusion; |
2078 | FusionGuard fg(&fusion); |
2079 | |
2080 | auto tv0 = makeSymbolicTensor(1); |
2081 | fusion.addInput(tv0); |
2082 | auto tv1 = set(tv0); |
2083 | fusion.addOutput(tv1); |
2084 | |
2085 | tv1->split(0, 4); |
2086 | |
2087 | tv1->axis(-1)->parallelize(ParallelType::Vectorize); |
2088 | |
2089 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2090 | at::manual_seed(0); |
2091 | |
2092 | const int n = 12; |
2093 | auto t0 = at::randn({n}, options); |
2094 | // Shift by one to make it non-aligned |
2095 | auto t0_misaligned = at::randn({n + 1}, options).index({Slice(1)}); |
2096 | auto t1_misaligned = at::empty({n + 1}, options).index({Slice(1)}); |
2097 | |
2098 | FusionExecutor fe; |
2099 | fe.compileFusion(&fusion, {t0}); |
2100 | auto cg_outputs = fe.runFusion({t0}); |
2101 | TORCH_CHECK(t0.equal(cg_outputs[0])); |
2102 | |
2103 | // Pass misaligned input. This must fail. |
2104 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
2105 | ASSERT_ANY_THROW(fe.runFusion({t0_misaligned})); |
2106 | |
2107 | // Pass misaligned output. This must fail too. |
2108 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
2109 | ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned})); |
2110 | } |
2111 | |
2112 | // Repro of issue #1530 |
2113 | TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) { |
2114 | std::vector<int64_t> shape{1, 2, 1}; |
2115 | |
2116 | Fusion fusion; |
2117 | FusionGuard fg(&fusion); |
2118 | |
2119 | auto tv0 = makeContigTensor(shape.size()); |
2120 | fusion.addInput(tv0); |
2121 | auto tv1 = set(tv0); |
2122 | fusion.addOutput(tv1); |
2123 | |
2124 | tv1->merge(1); |
2125 | tv1->merge(0); |
2126 | |
2127 | auto invalid_vec_size = shape[0] * shape[1] * shape[2]; |
2128 | invalid_vec_size *= invalid_vec_size; |
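  // invalid_vec_size = (1 * 2 * 1)^2 = 4, larger than the 2 elements of the
  // tensor, so the vectorization validation should reject it at run time.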
2129 | |
2130 | tv1->split(0, invalid_vec_size); |
2131 | |
2132 | tv1->axis(1)->parallelize(ParallelType::Vectorize); |
2133 | |
2134 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2135 | auto t0 = at::randn(shape, options); |
2136 | |
2137 | FusionExecutor fe; |
2138 | fe.compileFusion(&fusion, {t0}); |
2139 | |
2140 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
2141 | ASSERT_ANY_THROW(fe.runFusion({t0})); |
2142 | } |
2143 | |
2144 | TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { |
2145 | Fusion fusion; |
2146 | FusionGuard fg(&fusion); |
2147 | |
2148 | auto tv0 = makeConcreteTensor({4}); |
2149 | fusion.addInput(tv0); |
2150 | auto tv1 = makeConcreteTensor({3, 4}); |
2151 | fusion.addInput(tv1); |
2152 | |
2153 | auto tv2 = broadcast(tv0, {true, false}); |
2154 | auto tv3 = add(tv2, tv1); |
2155 | fusion.addOutput(tv3); |
2156 | |
2157 | tv3->merge(0); |
2158 | TransformPropagatorWithCheck propagator(tv3); |
2159 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
2160 | |
2161 | tv2->setMemoryType(MemoryType::Local); |
2162 | |
2163 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2164 | auto t0 = at::randn({4}, options); |
2165 | auto t1 = at::randn({3, 4}, options); |
2166 | |
2167 | auto t3 = t0.unsqueeze(0).add(t1); |
2168 | { |
2169 | FusionExecutor fe; |
2170 | fe.compileFusion(&fusion, {t0, t1}); |
2171 | auto cg_outputs = fe.runFusion({t0, t1}); |
2172 | |
2173 | testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__); |
2174 | } |
2175 | |
2176 | // Make sure tv2 indexing also works when it's stored in global memory |
2177 | tv2->setMemoryType(MemoryType::Global); |
2178 | { |
2179 | FusionExecutor fe; |
2180 | fe.compileFusion(&fusion, {t0, t1}); |
2181 | auto cg_outputs = fe.runFusion({t0, t1}); |
2182 | |
2183 | testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__); |
2184 | } |
2185 | } |
2186 | |
2187 | // Repro of #1534. Validation should detect invalid vectorization. |
2188 | TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) { |
2189 | std::vector<int64_t> shape1{2, 3, 2}; |
2190 | std::vector<int64_t> shape2{2, 2}; |
2191 | |
2192 | Fusion fusion; |
2193 | FusionGuard fg(&fusion); |
2194 | |
2195 | auto tv0 = makeContigConcreteTensor(shape1); |
2196 | fusion.addInput(tv0); |
2197 | auto tv1 = makeContigConcreteTensor(shape2); |
2198 | fusion.addInput(tv1); |
2199 | |
2200 | auto tv2 = set(tv1); |
2201 | auto tv3 = broadcast(tv2, {false, true, false}); |
2202 | auto tv4 = add(tv0, tv3); |
2203 | fusion.addOutput(tv4); |
2204 | |
2205 | tv4->merge(1, 2); |
2206 | tv4->merge(0, 1); |
2207 | tv4->split(0, 4); |
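  // tv4: [2, 3, 2] -> merge(1, 2) -> [2, 6] -> merge(0, 1) -> [12]
  //   -> split(0, 4) -> [3, 4]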
2208 | TransformPropagatorWithCheck propagator(tv4); |
2209 | MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); |
2210 | |
2211 | tv0->computeAt(tv4, -2); |
2212 | tv1->computeAt(tv4, -2); |
2213 | |
2214 | tv2->axis(-1)->parallelize(ParallelType::Vectorize); |
2215 | |
2216 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2217 | auto t0 = at::randn(shape1, options); |
2218 | auto t1 = at::randn(shape2, options); |
2219 | |
2220 | FusionExecutor fe; |
2221 | fe.compileFusion(&fusion, {t0, t1}); |
2222 | |
2223 | // Vectorization of tv2 should be detected as invalid. |
2224 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
2225 | ASSERT_ANY_THROW(fe.runFusion({t0, t1})); |
2226 | } |
2227 | |
2228 | TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) { |
2229 | std::vector<int64_t> shape1{2, 2, 2}; |
2230 | std::vector<int64_t> shape2{1, 2, 2}; |
2231 | |
2232 | Fusion fusion; |
2233 | FusionGuard fg(&fusion); |
2234 | |
2235 | // [I0, I1, I2] |
2236 | auto tv0 = makeContigTensor(shape1.size()); |
2237 | fusion.addInput(tv0); |
2238 | |
2239 | // [B3, I1, I2] |
2240 | auto tv1 = makeContigConcreteTensor(shape2); |
2241 | fusion.addInput(tv1); |
2242 | |
2243 | auto tv2 = set(tv1); |
2244 | auto tv3 = add(tv0, tv2); |
2245 | fusion.addOutput(tv3); |
2246 | |
2247 | tv3->merge(1, 2); |
2248 | tv3->merge(0, 1); |
2249 | tv3->split(0, 4); |
2250 | |
  // Don't modify tv1 so that it's replayed as tv2 with actual
  // transformations. That creates temporary IterDomains, and the
  // validation should still be able to detect that vectorization by 4 is valid.
2254 | // TransformPropagatorWithCheck propagator(tv3); |
2255 | // MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
2256 | |
2257 | tv2->merge(1, 2); |
2258 | tv2->merge(0, 1); |
2259 | tv2->split(0, 4); |
2260 | |
2261 | tv2->computeAt(tv3, -2); |
2262 | |
2263 | tv2->axis(-1)->parallelize(ParallelType::Vectorize); |
2264 | |
2265 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2266 | auto t0 = at::randn(shape1, options); |
2267 | auto t1 = at::randn(shape2, options); |
2268 | |
2269 | FusionExecutor fe; |
2270 | fe.compileFusion(&fusion, {t0, t1}); |
2271 | auto cg_outputs = fe.runFusion({t0, t1}); |
2272 | |
2273 | auto ref = t0 + t1; |
2274 | |
2275 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2276 | } |
2277 | |
2278 | TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) { |
2279 | std::vector<int64_t> shape0{100, 14, 2, 14}; |
2280 | std::vector<int64_t> shape1{100, 2, 14}; |
2281 | |
2282 | Fusion fusion; |
2283 | FusionGuard fg(&fusion); |
2284 | |
2285 | auto tv0 = makeContigTensor(shape0.size()); |
2286 | fusion.addInput(tv0); |
2287 | auto tv1 = makeContigTensor(shape1.size()); |
2288 | fusion.addInput(tv1); |
2289 | |
2290 | auto tv2 = broadcast(tv1, {false, true, false, false}); |
2291 | auto tv3 = add(tv0, tv2); |
2292 | fusion.addOutput(tv3); |
2293 | |
2294 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2295 | auto t0 = at::randn(shape0, options); |
2296 | auto t1 = at::randn(shape1, options); |
2297 | |
2298 | auto lparams = schedulePointwise(&fusion, {t0, t1}); |
2299 | |
2300 | GpuLower gpulw(&fusion); |
2301 | auto kernel = gpulw.kernel(); |
2302 | |
  // The innermost two dimensions are merged and contiguous, so
  // vectorization can be done against 2*14=28 rather than 14, and the
  // vector word size should be 4. Broadcasting of tv1 should not
  // matter.
2307 | for (const auto& vec_info : kernel->summary().vectorized_set_info) { |
2308 | TORCH_CHECK( |
2309 | vec_info.word_size == 4, |
2310 | "Invalid vector word size: " , |
2311 | vec_info.word_size); |
2312 | } |
2313 | |
2314 | FusionExecutor fe; |
2315 | fe.compileFusion(&fusion, {t0, t1}, lparams); |
2316 | auto cg_outputs = fe.runFusion({t0, t1}); |
2317 | |
2318 | auto ref = t0 + t1.unsqueeze(-3); |
2319 | |
2320 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2321 | } |
2322 | |
2323 | // Repro of issue #1539. |
2324 | TEST_F(NVFuserTest, FusionTrivialReductionForwarding1_CUDA) { |
2325 | Fusion fusion; |
2326 | FusionGuard fg(&fusion); |
2327 | |
2328 | auto tv0 = makeSymbolicTensor(1); |
2329 | fusion.addInput(tv0); |
2330 | |
2331 | auto tv1 = broadcast(tv0, {true, false}); |
2332 | auto tv2 = sum(tv1, {0}); |
2333 | auto tv3 = set(tv2); |
2334 | fusion.addOutput(tv3); |
2335 | |
2336 | tv2->merge(0); |
2337 | tv2->split(0, 4); |
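  // tv2: [r(B), I] -> merge(0) -> [r(B)*I] -> split(0, 4) -> [ceilDiv(r(B)*I, 4), 4];
  // the propagator below replays this on all tensors.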
2338 | |
2339 | TransformPropagatorWithCheck propagator(tv2); |
2340 | MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); |
2341 | |
  // All tensors must be transformed into 2D tensors with corresponding
  // axes mapped to each other in the LOOP map.
2344 | ComputeAtMap ca_map(&fusion); |
2345 | for (auto tv : ir_utils::allTvs(&fusion)) { |
2346 | TORCH_CHECK( |
2347 | tv->nDims() == 2, "Expected to be a 2D tensor but: " , tv->toString()); |
2348 | for (const auto i : c10::irange(2)) { |
2349 | TORCH_CHECK(ca_map.areMapped( |
2350 | tv->axis(i), tv3->axis(i), IdMappingMode::PERMISSIVE)); |
2351 | } |
2352 | } |
2353 | } |
2354 | |
2355 | TEST_F(NVFuserTest, FusionTrivialReductionForwarding2_CUDA) { |
2356 | Fusion fusion; |
2357 | FusionGuard fg(&fusion); |
2358 | |
2359 | auto tv0 = makeSymbolicTensor(1); |
2360 | fusion.addInput(tv0); |
2361 | |
2362 | auto tv1 = broadcast(tv0, {true, false}); |
2363 | auto tv2 = sum(tv1, {0}); |
2364 | auto tv3 = add(tv2, IrBuilder::create<Double>(1)); |
2365 | |
2366 | fusion.addOutput(tv3); |
2367 | |
2368 | // Merging a trivial reduction with a non-reduction domain |
2369 | tv2->merge(0, 1); |
2370 | tv2->split(0, 4); |
2371 | |
2372 | tv3->split(0, 4); |
2373 | |
  // tv2 and tv3 are different as tv3 lacks the trivial reduction, but
  // they are mapped with each other by BestEffortReplay as the merge
  // of the trivial reduction dim is forwarded.
2377 | |
2378 | PairwiseRootDomainMap root_map(tv2, tv3); |
2379 | |
2380 | auto p2c = BestEffortReplay::replayCasP(tv3, tv2, 2, root_map).getReplay(); |
2381 | for (const auto i : c10::irange(tv2->nDims())) { |
2382 | auto tv2_id = tv2->axis(i); |
2383 | auto it = p2c.find(tv2_id); |
2384 | TORCH_CHECK( |
2385 | it != p2c.end(), |
2386 | "Expected mapped consumer ID but not found: " , |
2387 | tv2_id->toString()); |
2388 | auto tv3_mapped_id = it->second; |
2389 | TORCH_CHECK( |
2390 | tv3_mapped_id == tv3->axis(i), |
2391 | "Unexpected mapped consumer ID: " , |
2392 | tv3_mapped_id->toString()); |
2393 | } |
2394 | |
2395 | auto c2p = BestEffortReplay::replayPasC(tv2, tv3, 2, root_map).getReplay(); |
2396 | for (const auto i : c10::irange(tv3->nDims())) { |
2397 | auto tv3_id = tv3->axis(i); |
2398 | auto it = c2p.find(tv3_id); |
2399 | TORCH_CHECK( |
2400 | it != c2p.end(), |
2401 | "Expected mapped producer ID but not found: " , |
2402 | tv3_id->toString()); |
2403 | auto tv2_mapped_id = it->second; |
2404 | TORCH_CHECK( |
2405 | tv2_mapped_id == tv2->axis(i), |
2406 | "Unexpected mapped consumer ID: " , |
2407 | tv2_mapped_id->toString()); |
2408 | } |
2409 | } |
2410 | |
2411 | TEST_F(NVFuserTest, FusionTrivialReductionForwarding3_CUDA) { |
2412 | Fusion fusion; |
2413 | FusionGuard fg(&fusion); |
2414 | |
2415 | auto tv0 = makeSymbolicTensor(2); |
2416 | fusion.addInput(tv0); |
2417 | |
2418 | auto tv1 = sum(tv0, {1}); |
2419 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
2420 | fusion.addOutput(tv2); |
2421 | |
  // Similar pattern to FusionTrivialReductionForwarding2, but the trivial
  // reduction is at a non-root domain
2424 | |
2425 | // Create a trivial reduction by splitting with a factor of 1 |
2426 | tv1->split(1, 1, false); |
2427 | // Merging with a trivial reduction |
2428 | tv1->merge(0, 1); |
2429 | auto tv1_merge_out_id = tv1->axis(0); |
2430 | tv1->split(0, 5); |
2431 | |
2432 | tv2->split(0, 5); |
2433 | |
  // The merge of tv1 is done with a non-root trivial
  // reduction. BestEffortReplay should forward the merge.
2436 | |
2437 | PairwiseRootDomainMap root_map(tv1, tv2); |
2438 | auto p2c = BestEffortReplay::replayCasP(tv2, tv1, 2, root_map).getReplay(); |
2439 | |
2440 | // The two tensors should look like: |
2441 | // tv1: [I1*1//5, 5, I2//1] |
2442 | // tv2: [I1//5, 5] |
2443 | // |
  // BestEffortReplay should forward the merge of (I1 * 1) and create
2445 | // mappings of: |
2446 | // I1*1//5 -> I1//5 |
2447 | // 5 -> 5 |
2448 | // I1*1 -> I1 |
2449 | |
2450 | TORCH_CHECK(p2c.size() == 3, "Unexpected number of mappings" ); |
2451 | TORCH_CHECK(p2c.count(tv1->axis(0)) && p2c[tv1->axis(0)] == tv2->axis(0)); |
2452 | TORCH_CHECK(p2c.count(tv1->axis(1)) && p2c[tv1->axis(1)] == tv2->axis(1)); |
2453 | TORCH_CHECK( |
2454 | p2c.count(tv1_merge_out_id) && |
2455 | p2c[tv1_merge_out_id] == tv2->getRootDomain()[0]); |
2456 | } |
2457 | |
2458 | TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) { |
2459 | Fusion fusion; |
2460 | FusionGuard fg(&fusion); |
2461 | |
2462 | auto tv0 = makeSymbolicTensor(1); |
2463 | fusion.addInput(tv0); |
2464 | |
2465 | auto tv1 = makeSymbolicTensor(2); |
2466 | fusion.addInput(tv1); |
2467 | |
2468 | auto tv2 = broadcast(tv0, {true, false}); |
2469 | auto tv3 = add(tv1, tv2); |
2470 | fusion.addOutput(tv3); |
2471 | |
2472 | // tv4 has a trivial reduction axis |
2473 | auto tv4 = sum(tv2, {0}); |
2474 | auto tv5 = add(tv4, IrBuilder::create<Double>(1)); |
2475 | fusion.addOutput(tv5); |
2476 | |
2477 | tv3->merge(0, 1); |
2478 | tv3->split(0, 32); |
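  // tv3: [I0, I1] -> merge(0, 1) -> [I0*I1] -> split(0, 32) -> [ceilDiv(I0*I1, 32), 32]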
2479 | |
2480 | // This causes the trivial reduction of tv4 to be merged with |
2481 | // another axis of tv4, and then forward computeAt is done from tv4 |
2482 | // to tv5. The split of the merged id of tv4 should be done on tv5 |
2483 | // by forwarding the merge of the trivial reduction. |
2484 | tv0->computeAt(tv3, -1); |
2485 | |
2486 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
2487 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
2488 | |
2489 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2490 | auto t0 = at::randn({111}, options); |
2491 | auto t1 = at::randn({123, 111}, options); |
2492 | |
2493 | FusionExecutor fe; |
2494 | fe.compileFusion(&fusion, {t0, t1}); |
2495 | auto cg_outputs = fe.runFusion({t0, t1}); |
2496 | |
2497 | auto t2 = t0.unsqueeze(0); |
2498 | auto t3 = t1 + t2; |
2499 | auto t5 = sum(t2, {0}) + 1; |
2500 | |
2501 | testValidate(&fusion, cg_outputs, {t0, t1}, {t3, t5}, __LINE__, __FILE__); |
2502 | } |
2503 | |
2504 | // See issue #1598 |
2505 | TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) { |
2506 | Fusion fusion; |
2507 | FusionGuard fg(&fusion); |
2508 | |
2509 | auto tv0 = makeSymbolicTensor(2); |
2510 | auto tv1 = makeSymbolicTensor(2); |
2511 | fusion.addInput(tv0); |
2512 | fusion.addInput(tv1); |
2513 | |
2514 | auto tv2 = set(tv0); |
2515 | auto tv3 = set(tv1); |
2516 | auto tv4 = add(tv2, tv3); |
2517 | fusion.addOutput(tv4); |
2518 | |
2519 | // Place tv2 on shared memory |
2520 | tv2->split(0, 2); |
2521 | tv2->split(-1, 4); |
2522 | tv2->setMemoryType(MemoryType::Shared); |
2523 | tv2->axis(-2)->parallelize(ParallelType::TIDy); |
2524 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
2525 | |
2526 | tv3->split(0, 2); |
2527 | tv3->split(-1, 4); |
2528 | // swap tidx and tidy |
2529 | tv3->axis(-2)->parallelize(ParallelType::TIDx); |
2530 | tv3->axis(-1)->parallelize(ParallelType::TIDy); |
2531 | |
2532 | tv4->split(0, 2); |
2533 | tv4->split(-1, 4); |
2534 | tv4->axis(-2)->parallelize(ParallelType::TIDx); |
2535 | tv4->axis(-1)->parallelize(ParallelType::TIDy); |
2536 | |
2537 | tv0->computeAt(tv4, 1); |
2538 | tv3->computeAt(tv4, -1); |
2539 | |
2540 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2541 | auto t0 = at::randn({10, 64}, options); |
2542 | auto t1 = at::randn({10, 64}, options); |
2543 | |
2544 | FusionExecutor fe; |
2545 | fe.compileFusion(&fusion, {t0, t1}); |
2546 | auto cg_outputs = fe.runFusion({t0, t1}); |
2547 | |
2548 | auto ref = t0 + t1; |
2549 | |
2550 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2551 | } |
2552 | |
2553 | // See issue #1598 |
2554 | TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) { |
2555 | Fusion fusion; |
2556 | FusionGuard fg(&fusion); |
2557 | |
2558 | auto tv0 = makeSymbolicTensor(2); |
2559 | auto tv1 = makeSymbolicTensor(2); |
2560 | fusion.addInput(tv0); |
2561 | fusion.addInput(tv1); |
2562 | |
2563 | auto tv2 = set(tv0); |
2564 | auto tv3 = set(tv1); |
2565 | auto tv4 = add(tv2, tv3); |
2566 | fusion.addOutput(tv4); |
2567 | |
2568 | tv2->split(0, 2); |
2569 | tv2->split(-1, 4); |
2570 | tv2->setMemoryType(MemoryType::Shared); |
2571 | |
2572 | tv2->axis(-2)->parallelize(ParallelType::TIDy); |
2573 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
2574 | |
2575 | tv4->split(0, 2); |
2576 | tv4->split(-1, 4); |
2577 | // Also do unroll for tv3 and tv4 |
2578 | tv4->split(-2, 8, false); |
2579 | tv4->axis(-3)->parallelize(ParallelType::Unroll); |
2580 | // swap tidx and tidy |
2581 | tv4->axis(-2)->parallelize(ParallelType::TIDx); |
2582 | tv4->axis(-1)->parallelize(ParallelType::TIDy); |
2583 | |
2584 | tv0->computeAt(tv4, 1); |
2585 | tv3->computeAt(tv4, -1); |
2586 | |
2587 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2588 | auto t0 = at::randn({10, 64}, options); |
2589 | auto t1 = at::randn({10, 64}, options); |
2590 | |
2591 | FusionExecutor fe; |
2592 | fe.compileFusion(&fusion, {t0, t1}); |
2593 | auto cg_outputs = fe.runFusion({t0, t1}); |
2594 | |
2595 | auto ref = t0 + t1; |
2596 | |
2597 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2598 | } |
2599 | |
2600 | // See issue #1599 |
2601 | TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) { |
2602 | Fusion fusion; |
2603 | FusionGuard fg(&fusion); |
2604 | |
2605 | auto tv0 = makeSymbolicTensor(2); |
2606 | auto tv1 = makeSymbolicTensor(2); |
2607 | fusion.addInput(tv0); |
2608 | fusion.addInput(tv1); |
2609 | |
2610 | auto tv2 = set(tv0); |
2611 | auto tv3 = set(tv1); |
2612 | auto tv4 = add(tv2, tv3); |
2613 | fusion.addOutput(tv4); |
2614 | |
2615 | // Use unroll where a RAW-sync tensor is stored |
2616 | |
2617 | tv4->split(0, 2); |
2618 | tv4->split(0, 3); |
2619 | tv4->split(-1, 4); |
2620 | tv4->axis(1)->parallelize(ParallelType::Unroll); |
2621 | tv4->axis(-2)->parallelize(ParallelType::TIDx); |
2622 | tv4->axis(-1)->parallelize(ParallelType::TIDy); |
2623 | |
2624 | tv0->computeAt(tv4, 3); |
2625 | tv3->computeAt(tv4, -1); |
2626 | |
2627 | tv2->split(-1, 4); |
2628 | tv2->axis(-2)->parallelize(ParallelType::TIDy); |
2629 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
2630 | tv2->setMemoryType(MemoryType::Shared); |
2631 | |
2632 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2633 | auto t0 = at::randn({50, 64}, options); |
2634 | auto t1 = at::randn({50, 64}, options); |
2635 | |
2636 | FusionExecutor fe; |
2637 | fe.compileFusion(&fusion, {t0, t1}); |
2638 | auto cg_outputs = fe.runFusion({t0, t1}); |
2639 | |
2640 | auto ref = t0 + t1; |
2641 | |
2642 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2643 | } |
2644 | |
2645 | // See #1618 |
2646 | TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace4_CUDA) { |
2647 | Fusion fusion; |
2648 | FusionGuard fg(&fusion); |
2649 | |
2650 | auto tv0 = makeConcreteTensor({16, 128}); |
2651 | auto tv1 = makeConcreteTensor({16, 128}); |
2652 | fusion.addInput(tv0); |
2653 | fusion.addInput(tv1); |
2654 | |
2655 | auto tv2 = set(tv0); |
2656 | auto tv3 = set(tv1); |
2657 | auto tv4 = set(tv2); |
2658 | auto tv5 = set(tv3); |
2659 | auto tv6 = add(tv4, tv5); |
2660 | fusion.addOutput(tv6); |
2661 | |
2662 | tv2->setMemoryType(MemoryType::Shared); |
2663 | tv3->setMemoryType(MemoryType::Shared); |
2664 | |
2665 | tv2->computeAt(tv6, 0); |
2666 | tv3->computeAt(tv6, 1); |
2667 | tv4->computeAt(tv6, 1); |
2668 | tv5->computeAt(tv6, -1); |
2669 | tv2->split(1, 64); |
2670 | tv3->split(1, 64); |
2671 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
2672 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
2673 | tv6->axis(-1)->parallelize(ParallelType::TIDx); |
2674 | |
  // Check that the block sync is inserted at the correct location.
  // There is exactly one block sync needed in this test case,
  // and the sync needs to be after the 2 expressions
  // that modify shared memory.
2679 | class SyncInsertionPointChecker : public kir::IrVisitor { |
2680 | public: |
2681 | using kir::IrVisitor::handle; |
2682 | |
2683 | private: |
2684 | void handle(UnaryOp* uop) final { |
2685 | // Record number of unary ops that modifies shared memory. |
2686 | if (uop->out()->isA<kir::TensorIndex>() && |
2687 | uop->out()->as<kir::TensorIndex>()->view()->getMemoryType() == |
2688 | MemoryType::Shared && |
2689 | // Filter out initialization expressions |
2690 | uop->in()->isA<kir::TensorIndex>()) { |
2691 | number_of_writes_++; |
2692 | } |
2693 | } |
2694 | void handle(kir::BlockSync* bsync) final { |
2695 | // Make sure both shared memory modifying expressions |
2696 | // have been observed at the sync insertion point. |
2697 | TORCH_INTERNAL_ASSERT( |
2698 | number_of_writes_ == 2, |
2699 | "FusionRAWSyncInsertionPlace4 test fail:" , |
2700 | "only 1 sync after the 2 shared mem writes is needed in this test," |
2701 | "either a redundant sync has been inserted or the block sync is not inserted at the right place" ); |
2702 | } |
2703 | |
2704 | private: |
2705 | int number_of_writes_ = 0; |
2706 | } sync_insertion_checker; |
2707 | GpuLower gpulw(&fusion); |
2708 | sync_insertion_checker.handle(gpulw.kernel()->topLevelExprs()); |
2709 | } |
2710 | |
2711 | // Test serial write and parallel read of shared mem: mapped case |
2712 | TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) { |
2713 | Fusion fusion; |
2714 | FusionGuard fg(&fusion); |
2715 | |
2716 | TensorView* tv0 = makeConcreteTensor({128, 6}); |
2717 | TensorView* tv1 = makeConcreteTensor({128, 6}); |
2718 | TensorView* tv2 = makeConcreteTensor({128, 6}); |
2719 | fusion.addInput(tv0); |
2720 | fusion.addInput(tv1); |
2721 | fusion.addInput(tv2); |
2722 | |
2723 | TensorView* tv3 = add(tv0, tv1); |
2724 | TensorView* tv4 = add(tv3, tv2); |
2725 | |
2726 | fusion.addOutput(tv4); |
2727 | |
2728 | // Use shared memory |
2729 | tv3->setMemoryType(MemoryType::Shared); |
2730 | |
  // Parallelize tv4. In this case dim 0 of tv3 will
  // not be parallelized but dim 0 of tv4 will be.
  // We need to make sure a sync is inserted
  // even if these dimensions are mapped.
2735 | tv4->axis(0)->parallelize(ParallelType::TIDx); |
2736 | |
2737 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2738 | |
2739 | at::Tensor t0 = at::randn({128, 6}, options); |
2740 | at::Tensor t1 = at::randn({128, 6}, options); |
2741 | at::Tensor t2 = at::randn({128, 6}, options); |
2742 | |
2743 | FusionExecutor fe; |
2744 | fe.compileFusion(&fusion, {t0, t1, t2}); |
2745 | auto cg_outputs = fe.runFusion({t0, t1, t2}); |
2746 | |
2747 | auto ref = t0 + t1 + t2; |
2748 | |
2749 | testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__); |
2750 | } |
2751 | |
2752 | // Test serial write and parallel read of shared mem: un-mapped case |
2753 | TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) { |
2754 | Fusion fusion; |
2755 | FusionGuard fg(&fusion); |
2756 | |
2757 | TensorView* tv0 = makeConcreteTensor({128, 6}); |
2758 | TensorView* tv1 = makeConcreteTensor({128, 6}); |
2759 | TensorView* tv2 = makeConcreteTensor({128, 6}); |
2760 | fusion.addInput(tv0); |
2761 | fusion.addInput(tv1); |
2762 | fusion.addInput(tv2); |
2763 | |
2764 | TensorView* tv3 = add(tv0, tv1); |
2765 | TensorView* tv4 = add(tv3, tv2); |
2766 | |
2767 | fusion.addOutput(tv4); |
2768 | |
2769 | // Use shared memory |
2770 | tv3->setMemoryType(MemoryType::Shared); |
2771 | |
  // Split and parallelize tv4.
  // The parallelized dimension of tv4 will not
  // map across to the shared mem tensor, tv3, so
  // a sync is needed before tv3 is used.
2776 | tv4->split(0, 2); |
2777 | tv4->axis(0)->parallelize(ParallelType::TIDx); |
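  // tv4: [128, 6] -> [64 (TIDx), 2, 6], while tv3 remains a serial [128, 6].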
2778 | |
2779 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2780 | |
2781 | at::Tensor t0 = at::randn({128, 6}, options); |
2782 | at::Tensor t1 = at::randn({128, 6}, options); |
2783 | at::Tensor t2 = at::randn({128, 6}, options); |
2784 | |
2785 | FusionExecutor fe; |
2786 | fe.compileFusion(&fusion, {t0, t1, t2}); |
2787 | auto cg_outputs = fe.runFusion({t0, t1, t2}); |
2788 | |
2789 | auto ref = t0 + t1 + t2; |
2790 | |
2791 | testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__); |
2792 | } |
2793 | |
2794 | // Simple test of async copy primitive |
2795 | TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) { |
2796 | Fusion fusion; |
2797 | FusionGuard fg(&fusion); |
2798 | |
2799 | int m = 33, n = 31; |
2800 | |
2801 | TensorView* tv0 = makeConcreteTensor({m, n}); |
2802 | TensorView* tv1 = makeConcreteTensor({m, n}); |
2803 | |
2804 | fusion.addInput(tv0); |
2805 | fusion.addInput(tv1); |
2806 | |
2807 | TensorView* tv2 = add(tv0, tv1); |
2808 | |
2809 | fusion.addOutput(tv2); |
2810 | |
2811 | auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync); |
2812 | tv0_shared->setMemoryType(MemoryType::Shared); |
2813 | |
2814 | tv0->computeAt(tv2, 1); |
2815 | tv0_shared->axis(1)->parallelize(ParallelType::TIDx); |
2816 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
2817 | |
2818 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2819 | at::Tensor t0 = at::randn({m, n}, options); |
2820 | at::Tensor t1 = at::randn({m, n}, options); |
2821 | |
2822 | FusionExecutor fe; |
2823 | |
2824 | // requires ampere+ GPU |
2825 | if (!deviceMajorMinorCheck(8)) { |
2826 | ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); |
2827 | GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs" ; |
2828 | } |
2829 | fe.compileFusion(&fusion, {t0, t1}); |
2830 | auto cg_outputs = fe.runFusion({t0, t1}); |
2831 | |
2832 | auto ref = t0 + t1; |
2833 | |
2834 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2835 | } |
2836 | |
// Simple test of the async copy primitive, double buffered.
// Double buffer case 1: both block sync and async wait
// are needed.
2840 | TEST_F(NVFuserTest, FusionDoubleBufferCpAsync1_CUDA) { |
2841 | Fusion fusion; |
2842 | FusionGuard fg(&fusion); |
2843 | |
  // Using vectorization, so n needs to be kept a multiple of 4.
2845 | int m = 33, n = 48; |
2846 | |
2847 | TensorView* tv0 = makeConcreteTensor({m, n}); |
2848 | TensorView* tv1 = makeConcreteTensor({m, n}); |
2849 | |
2850 | fusion.addInput(tv0); |
2851 | fusion.addInput(tv1); |
2852 | |
2853 | TensorView* tv2 = add(tv0, tv1); |
2854 | |
2855 | fusion.addOutput(tv2); |
2856 | |
2857 | auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync); |
2858 | tv0_shared->setMemoryType(MemoryType::Shared); |
2859 | tv0->computeAt(tv2, 1); |
2860 | |
2861 | // Asynchronously load a tile in one schedule |
2862 | tv0_shared->split(1, 4); |
2863 | tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize); |
2864 | tv0_shared->axis(-2)->parallelize(ParallelType::TIDx); |
2865 | |
2866 | // Consume the loaded tile in another schedule, |
2867 | // triggering the need for a sync. |
2868 | tv2->split(1, 12); |
2869 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
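  // tv0_shared is written as [m, 12 (TIDx), 4 (Vectorize)] while tv2 reads
  // it as [m, 4, 12 (TIDx)], so threads consume elements written by other
  // threads, which is why a block sync is needed on top of the async wait.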
2870 | |
2871 | // Double buffer the shared mem tensor. |
2872 | tv0_shared->doubleBuffer(); |
2873 | |
2874 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2875 | at::Tensor t0 = at::randn({m, n}, options); |
2876 | at::Tensor t1 = at::randn({m, n}, options); |
2877 | |
2878 | FusionExecutor fe; |
2879 | // requires ampere+ GPU |
2880 | if (!deviceMajorMinorCheck(8)) { |
2881 | ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); |
2882 | GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs" ; |
2883 | } |
2884 | fe.compileFusion(&fusion, {t0, t1}); |
2885 | auto cg_outputs = fe.runFusion({t0, t1}); |
2886 | |
2887 | auto ref = t0 + t1; |
2888 | |
2889 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2890 | } |
2891 | |
// Simple test of the async copy primitive, double buffered.
// Double buffer case 2: only the async wait is needed.
2894 | TEST_F(NVFuserTest, FusionDoubleBufferCpAsync2_CUDA) { |
2895 | Fusion fusion; |
2896 | FusionGuard fg(&fusion); |
2897 | |
  // Using vectorization, so n needs to be kept a multiple of 4.
2899 | int m = 33, n = 48; |
2900 | |
2901 | TensorView* tv0 = makeConcreteTensor({m, n}); |
2902 | TensorView* tv1 = makeConcreteTensor({m, n}); |
2903 | |
2904 | fusion.addInput(tv0); |
2905 | fusion.addInput(tv1); |
2906 | |
2907 | TensorView* tv2 = add(tv0, tv1); |
2908 | |
2909 | fusion.addOutput(tv2); |
2910 | |
2911 | auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync); |
2912 | tv0_shared->setMemoryType(MemoryType::Shared); |
2913 | tv0->computeAt(tv2, 1); |
2914 | |
2915 | // Asynchronously load a tile in one schedule |
2916 | tv0_shared->split(1, 4); |
2917 | tv0_shared->axis(-2)->parallelize(ParallelType::TIDx); |
2918 | |
2919 | // Consume the loaded tile in another schedule, |
2920 | // triggering the need for a sync. |
2921 | tv2->split(1, 4); |
2922 | tv2->axis(-2)->parallelize(ParallelType::TIDx); |
2923 | |
2924 | // Double buffer the shared mem tensor. |
2925 | tv0_shared->doubleBuffer(); |
2926 | |
2927 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2928 | at::Tensor t0 = at::randn({m, n}, options); |
2929 | at::Tensor t1 = at::randn({m, n}, options); |
2930 | |
2931 | FusionExecutor fe; |
2932 | // requires ampere+ GPU |
2933 | if (!deviceMajorMinorCheck(8)) { |
2934 | ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); |
2935 | GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs" ; |
2936 | } |
2937 | fe.compileFusion(&fusion, {t0, t1}); |
2938 | auto cg_outputs = fe.runFusion({t0, t1}); |
2939 | |
2940 | auto ref = t0 + t1; |
2941 | |
2942 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
2943 | } |
2944 | |
// Simple test for double buffering in shared mem,
// where no redundant syncs should be inserted
// when they are not needed.
2948 | TEST_F(NVFuserTest, FusionDoubleBufferNoSync_CUDA) { |
2949 | Fusion fusion; |
2950 | FusionGuard fg(&fusion); |
2951 | |
  // Using vectorization, so n needs to be kept a multiple of 4.
2953 | int m = 33, n = 48; |
2954 | |
2955 | TensorView* tv0 = makeConcreteTensor({m, n}); |
2956 | TensorView* tv1 = makeConcreteTensor({m, n}); |
2957 | |
2958 | fusion.addInput(tv0); |
2959 | fusion.addInput(tv1); |
2960 | |
2961 | TensorView* tv2 = add(tv0, tv1); |
2962 | |
2963 | fusion.addOutput(tv2); |
2964 | |
2965 | auto tv0_shared = tv0->cacheAfter(); |
2966 | tv0_shared->setMemoryType(MemoryType::Shared); |
2967 | tv0->computeAt(tv2, 1); |
2968 | |
  // Load a tile in one schedule
2970 | tv0_shared->split(1, 4); |
2971 | tv0_shared->axis(-2)->parallelize(ParallelType::TIDx); |
2972 | |
  // Consume the loaded tile with a matching schedule,
  // so no extra sync should be needed.
2975 | tv2->split(1, 4); |
2976 | tv2->axis(-2)->parallelize(ParallelType::TIDx); |
2977 | |
2978 | // Double buffer the shared mem tensor. |
2979 | tv0_shared->doubleBuffer(); |
2980 | |
2981 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2982 | at::Tensor t0 = at::randn({m, n}, options); |
2983 | at::Tensor t1 = at::randn({m, n}, options); |
2984 | |
2985 | GpuLower gpulw(&fusion); |
2986 | auto flattened_exprs = |
2987 | ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs()); |
2988 | bool sync_inserted = std::any_of( |
2989 | flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) { |
2990 | return expr->isA<kir::BlockSync>(); |
2991 | }); |
  TORCH_INTERNAL_ASSERT(!sync_inserted, "Unexpected block sync inserted");
2993 | |
2994 | FusionExecutor fe; |
2995 | fe.compileFusion(&fusion, {t0, t1}); |
2996 | auto cg_outputs = fe.runFusion({t0, t1}); |
2997 | |
2998 | auto ref = t0 + t1; |
2999 | |
3000 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
3001 | } |
3002 | |
3003 | // Test predicate inversion for cp.async |
3004 | TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) { |
3005 | // requires ampere+ GPU |
3006 | |
3007 | Fusion fusion; |
3008 | FusionGuard fg(&fusion); |
3009 | |
  // Using vectorization, so n needs to be kept a multiple of 4.
3011 | int m = 33, n = 48; |
3012 | |
3013 | TensorView* tv0 = makeConcreteTensor({m, n}); |
3014 | |
3015 | fusion.addInput(tv0); |
3016 | auto tv1 = sum(tv0, {1}); |
3017 | fusion.addOutput(tv1); |
3018 | |
3019 | auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync); |
3020 | auto tv0_reg = tv0_shared->cacheAfter(); |
3021 | tv0_shared->setMemoryType(MemoryType::Shared); |
3022 | tv0->computeAt(tv1, 1); |
3023 | |
3024 | tv0_shared->split(-1, 32); |
3025 | tv0_shared->split(-1, 4); |
3026 | tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize); |
3027 | |
3028 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3029 | at::Tensor t0 = at::randn({m, n}, options); |
3030 | |
3031 | FusionExecutor fe; |
3032 | if (!deviceMajorMinorCheck(8)) { |
3033 | ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0})); |
3034 | GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs" ; |
3035 | } |
3036 | |
3037 | fe.compileFusion(&fusion, {t0}); |
3038 | auto cg_outputs = fe.runFusion({t0}); |
3039 | |
3040 | auto ref = t0.sum({1}); |
3041 | |
3042 | testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
3043 | } |
3044 | |
3045 | // Test predicate removal on reg-to-reg expressions |
3046 | TEST_F(NVFuserTest, FusionPredRemovalCheck_CUDA) { |
3047 | Fusion fusion; |
3048 | FusionGuard fg(&fusion); |
3049 | |
3050 | TensorView* tv0 = makeContigTensor(2); |
3051 | fusion.addInput(tv0); |
3052 | |
3053 | TensorView* tv1 = set(tv0); |
3054 | TensorView* tv2 = set(tv1); |
3055 | TensorView* tv3 = set(tv2); |
3056 | TensorView* tv4 = set(tv3); |
3057 | |
3058 | fusion.addOutput(tv4); |
3059 | tv4->split(1, 4); |
3060 | tv0->computeAt(tv4, -2); |
3061 | tv3->axis(-1)->parallelize(ParallelType::Vectorize); |
3062 | |
3063 | class PredicateRemovalChecker : public kir::IrVisitor { |
3064 | public: |
3065 | using kir::IrVisitor::handle; |
3066 | |
3067 | private: |
3068 | void handle(UnaryOp* uop) final { |
3069 | assertOnLocalToLocal(uop); |
3070 | } |
3071 | |
3072 | // Utility to assert any local-to-local expr is only trivially predicated. |
3073 | void assertOnLocalToLocal(Expr* expr) { |
3074 | bool is_local = true; |
3075 | for (auto in : ir_utils::filterByType<kir::TensorIndex>(expr->inputs())) { |
3076 | if (in->view()->getMemoryType() != MemoryType::Local) { |
3077 | is_local = false; |
3078 | } |
3079 | } |
3080 | for (auto in : |
3081 | ir_utils::filterByType<kir::TensorIndex>(expr->outputs())) { |
3082 | if (in->view()->getMemoryType() != MemoryType::Local) { |
3083 | is_local = false; |
3084 | } |
3085 | } |
3086 | |
3087 | if (is_local) { |
3088 | if (auto ite = dynamic_cast<kir::IfThenElse*>(scope_exprs_.back())) { |
3089 | TORCH_INTERNAL_ASSERT( |
3090 | ite->predicate()->value()->isConst(), |
3091 | "redundant predicate on: " , |
3092 | expr); |
3093 | } |
3094 | } |
3095 | } |
3096 | |
3097 | private: |
3098 | bool within_ite_ = false; |
3099 | } pred_checker; |
3100 | |
3101 | GpuLower gpulw(&fusion); |
3102 | pred_checker.handle(gpulw.kernel()->topLevelExprs()); |
3103 | } |
3104 | |
3105 | TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) { |
3106 | Fusion fusion; |
3107 | FusionGuard fg(&fusion); |
3108 | |
3109 | auto tv0 = makeSymbolicTensor(1); |
3110 | fusion.addInput(tv0); |
3111 | auto tvs = Welford(tv0, {0}); |
3112 | auto tv_avg = tvs.avg; |
3113 | fusion.addOutput(tv_avg); |
3114 | |
3115 | tv_avg->split(0, 128); |
3116 | TransformPropagatorWithCheck propagator(tv_avg); |
3117 | MaxRootDomainInfoSpanningTree(tv_avg).traverse(&propagator); |
3118 | |
3119 | tv_avg->axis(0)->parallelize(ParallelType::BIDx); |
3120 | tv_avg->axis(1)->parallelize(ParallelType::TIDx); |
3121 | |
3122 | // Make sure the parallelization of tv_avg is propagated to the var |
3123 | // and count tensors. |
3124 | GpuLower gpulw(&fusion); |
3125 | for (const auto expr : gpulw.kernel()->exprs()) { |
3126 | auto wop = dynamic_cast<WelfordOp*>(expr); |
3127 | if (wop == nullptr) { |
3128 | continue; |
3129 | } |
3130 | auto ref = wop->outAvg()->as<TensorView>(); |
3131 | for (auto sibling : ir_utils::filterByType<TensorView>(wop->outputs())) { |
3132 | if (ref == sibling) { |
3133 | continue; |
3134 | } |
3135 | TORCH_CHECK( |
3136 | ref->nDims() == sibling->nDims(), |
3137 | "Invalid sibling: " , |
3138 | sibling->toString()); |
3139 | for (const auto i : c10::irange(ref->nDims())) { |
3140 | TORCH_CHECK( |
3141 | ref->axis(i)->getParallelType() == |
3142 | sibling->axis(i)->getParallelType(), |
3143 | "Mismatched parallel types between siblings. " , |
3144 | ref->toString(), |
3145 | ", " , |
3146 | sibling->toString()); |
3147 | } |
3148 | } |
3149 | } |
3150 | |
3151 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3152 | auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); |
3153 | at::manual_seed(0); |
3154 | at::Tensor t0 = at::randn({9999}, options); |
3155 | |
3156 | FusionExecutor fe; |
3157 | fe.compileFusion(&fusion, {t0}); |
3158 | auto outputs = fe.runFusion({t0}); |
3159 | |
3160 | testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__); |
3161 | } |
3162 | |
3163 | // Test ExactRootDomainMap |
3164 | TEST_F(NVFuserTest, FusionExactRootDomainMap_CUDA) { |
3165 | Fusion fusion; |
3166 | FusionGuard fg(&fusion); |
3167 | |
3168 | auto tv0 = makeSymbolicTensor(1); |
3169 | fusion.addInput(tv0); |
3170 | auto tv1 = makeSymbolicTensor(2); |
3171 | fusion.addInput(tv1); |
3172 | |
3173 | auto tv2 = broadcast(tv0, {false, true}); |
3174 | auto tv3 = transpose(tv2); |
3175 | auto tv4 = add(tv2, tv1); |
3176 | auto tv5 = add(tv2, tv3); |
3177 | auto tv6 = add(tv3, tv1); |
3178 | fusion.addOutput(tv4); |
3179 | fusion.addOutput(tv5); |
3180 | fusion.addOutput(tv6); |
3181 | |
3182 | const auto exact_map = ExactRootDomainMap(&fusion); |
3183 | |
  // In the exact mapping, the broadcast domain introduced at tv2 is
  // only mapped with the other one in tv3, which is just the transpose
  // of tv2. Any other domain, including the second domain of tv4,
  // must not be mapped.
3188 | |
3189 | auto tv2_bc = tv2->axis(1); |
3190 | auto tv3_bc = tv3->axis(0); |
3191 | |
3192 | TORCH_CHECK( |
3193 | exact_map.areMapped(tv2_bc, tv3_bc), |
3194 | "Invalid exact root domain map: " , |
3195 | exact_map.toString()); |
3196 | |
3197 | // They must not be mapped with anything else. |
3198 | for (auto tv : ir_utils::allTvs(&fusion)) { |
3199 | for (auto root_id : tv->getRootDomain()) { |
3200 | if (root_id == tv2_bc || root_id == tv3_bc) { |
3201 | continue; |
3202 | } |
3203 | TORCH_CHECK( |
3204 | !exact_map.areMapped(root_id, tv2_bc), |
3205 | "Invalid exact root domain map: " , |
3206 | exact_map.toString()); |
3207 | TORCH_CHECK( |
3208 | !exact_map.areMapped(root_id, tv3_bc), |
3209 | "Invalid exact root domain map: " , |
3210 | exact_map.toString()); |
3211 | } |
3212 | } |
3213 | } |
3214 | |
3215 | class NVFuserMultithreadedTest : public ::testing::Test { |
3216 | protected: |
3217 | bool was_enabled = false; |
3218 | |
3219 | void SetUp() override { |
3220 | was_enabled = fuser::cuda::setEnabled(true); |
3221 | } |
3222 | |
3223 | void TearDown() override { |
3224 | fuser::cuda::setEnabled(was_enabled); |
3225 | } |
3226 | }; |
3227 | |
3228 | TEST_F(NVFuserMultithreadedTest, SingleFunction_CUDA) { |
3229 | std::string ir = R"IR( |
3230 | graph(%x.1 : Tensor, |
3231 | %y.1 : Tensor): |
3232 | %12 : NoneType = prim::Constant() |
3233 | %11 : bool = prim::Constant[value=0]() |
3234 | %9 : int = prim::Constant[value=1]() |
3235 | %3 : Tensor = aten::exp(%x.1) |
3236 | %5 : Tensor = aten::relu(%y.1) |
3237 | %6 : Tensor = aten::sin(%5) |
3238 | %8 : Tensor = aten::add(%3, %6, %9) |
3239 | %10 : int[] = prim::ListConstruct(%9) |
3240 | %13 : Tensor = aten::sum(%8, %10, %11, %12) |
3241 | return (%13) |
3242 | )IR" ; |
3243 | auto g = std::make_shared<Graph>(); |
3244 | torch::jit::parseIR(ir, g.get()); |
3245 | GraphFunction fn("nvfuser_test" , g, nullptr); |
3246 | |
3247 | auto run_kernel = [&fn]() { |
3248 | auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA)); |
3249 | auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA)); |
3250 | std::vector<IValue> results; |
3251 | for (const auto& _ : c10::irange(10)) { |
3252 | auto stack = createStack({x.clone(), y.clone()}); |
3253 | fn.run(stack); |
3254 | results.push_back(stack.back()); |
3255 | } |
3256 | for (const auto& i : c10::irange(1, 10)) { |
3257 | auto t0 = results[0].toTensor(); |
3258 | auto ti = results[i].toTensor(); |
3259 | ASSERT_TRUE(at::allclose(t0, ti)); |
3260 | } |
3261 | }; |
3262 | |
3263 | constexpr size_t kNumThreads = 4; |
3264 | std::vector<std::thread> threads; |
3265 | for (size_t id = 0; id < kNumThreads; ++id) { |
3266 | threads.emplace_back(run_kernel); |
3267 | } |
3268 | for (auto& t : threads) { |
3269 | t.join(); |
3270 | } |
3271 | } |
3272 | |
3273 | TEST_F(NVFuserMultithreadedTest, MultipleFunctions_CUDA) { |
3274 | auto run_kernel = []() { |
3275 | const std::string ir = R"IR( |
3276 | graph(%x.1 : Tensor, |
3277 | %y.1 : Tensor): |
3278 | %12 : NoneType = prim::Constant() |
3279 | %11 : bool = prim::Constant[value=0]() |
3280 | %9 : int = prim::Constant[value=1]() |
3281 | %3 : Tensor = aten::exp(%x.1) |
3282 | %5 : Tensor = aten::relu(%y.1) |
3283 | %6 : Tensor = aten::sin(%5) |
3284 | %8 : Tensor = aten::add(%3, %6, %9) |
3285 | %10 : int[] = prim::ListConstruct(%9) |
3286 | %13 : Tensor = aten::sum(%8, %10, %11, %12) |
3287 | return (%13) |
3288 | )IR" ; |
3289 | auto g = std::make_shared<Graph>(); |
3290 | torch::jit::parseIR(ir, g.get()); |
3291 | GraphFunction fn("nvfuser_test" , g, nullptr); |
3292 | |
3293 | auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA)); |
3294 | auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA)); |
3295 | std::vector<IValue> results; |
3296 | constexpr size_t numRuns = 10; |
3297 | for (const auto& _ : c10::irange(numRuns)) { |
3298 | auto stack = createStack({x.clone(), y.clone()}); |
3299 | fn.run(stack); |
3300 | results.push_back(stack.back()); |
3301 | } |
3302 | for (const auto& i : c10::irange(1, numRuns)) { |
3303 | auto t0 = results[0].toTensor(); |
3304 | auto ti = results[i].toTensor(); |
3305 | ASSERT_TRUE(at::allclose(t0, ti)); |
3306 | } |
3307 | }; |
3308 | |
3309 | constexpr size_t kNumThreads = 4; |
3310 | std::vector<std::thread> threads; |
3311 | for (size_t id = 0; id < kNumThreads; ++id) { |
3312 | threads.emplace_back(run_kernel); |
3313 | } |
3314 | for (auto& t : threads) { |
3315 | t.join(); |
3316 | } |
3317 | } |
3318 | |
3319 | // Repro of issue #1655 |
3320 | TEST_F(NVFuserTest, FusionIncompleteConcreteID_CUDA) { |
3321 | Fusion fusion; |
3322 | FusionGuard fg(&fusion); |
3323 | |
3324 | auto tv0 = makeSymbolicTensor(1); |
3325 | fusion.addInput(tv0); |
3326 | auto tv1 = makeSymbolicTensor(2); |
3327 | fusion.addInput(tv1); |
3328 | auto tv2 = makeSymbolicTensor(2); |
3329 | fusion.addInput(tv2); |
3330 | |
3331 | auto tv3 = broadcast(tv0, {true, true, false}); |
3332 | auto tv4 = broadcast(tv1, {false, true, false}); |
3333 | auto tv5 = broadcast(tv2, {true, false, false}); |
3334 | |
3335 | auto tv6 = add(tv3, tv4); |
3336 | auto tv7 = add(tv3, tv5); |
3337 | |
3338 | fusion.addOutput(tv6); |
3339 | fusion.addOutput(tv7); |
3340 | |
3341 | tv6->merge(0); |
3342 | tv6->merge(0); |
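  // [I0*I1*I2]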
3343 | |
3344 | TransformPropagatorWithCheck propagator(tv6); |
3345 | MaxRootDomainInfoSpanningTree(tv6).traverse(&propagator); |
3346 | |
3347 | tv0->computeAt(tv6, -1, ComputeAtMode::MostInlined); |
3348 | tv1->computeAt(tv6, -1, ComputeAtMode::MostInlined); |
3349 | tv2->computeAt(tv7, -1, ComputeAtMode::MostInlined); |
3350 | |
3351 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
3352 | ASSERT_ANY_THROW(fusion.printKernel()); |
3353 | } |
3354 | |
3355 | TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) { |
3356 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
3357 | Fusion& fusion = *fusion_ptr.get(); |
3358 | FusionGuard fg(&fusion); |
3359 | |
3360 | int X = 256, Y = 7, Z = 2048; |
3361 | |
3362 | // setup fusion |
3363 | auto tv0 = makeContigTensor(4, DataType::Half); |
3364 | fusion.addInput(tv0); |
3365 | auto tv1 = castOp(DataType::Float, tv0); |
3366 | |
3367 | auto tvs = Welford(tv1, {0, 1, 2}); |
3368 | auto tv_avg = tvs.avg; |
3369 | auto tv_M2 = tvs.var_sum; |
3370 | auto tv_N = tvs.n; |
3371 | fusion.addOutput(tv_avg); |
3372 | fusion.addOutput(tv_M2); |
3373 | |
3374 | auto cached_input = tv0->cacheAfter(); |
3375 | auto cached_avg = tv_avg->cacheBefore(); |
3376 | auto cached_M2 = tv_M2->cacheBefore(); |
3377 | |
3378 | auto reduction_tv = scheduler_utils::getReductionTvs(&fusion)[0]; |
3379 | |
3380 | reduction_tv->merge(0); |
3381 | reduction_tv->merge(0); |
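  // [r(I0*I1*I2), iI3]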
3382 | |
3383 | int TIDx = 16; |
3384 | int vec = 4; |
3385 | |
3386 | int TIDy = 16; |
3387 | int outer_tidy_fact = 16; |
3388 | |
3389 | reduction_tv->split(-1, TIDx * vec); |
3390 | reduction_tv->split(-1, vec); |
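  // [r(I0*I1*I2), ceilDiv(I3, TIDx * vec), TIDx, vec]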
3391 | reduction_tv->axis(-2)->parallelize(ParallelType::TIDx); |
3392 | reduction_tv->axis(-1)->parallelize(ParallelType::Vectorize); |
3393 | reduction_tv->axis(-3)->parallelize(ParallelType::BIDx); |
3394 | |
3395 | reduction_tv->split(0, TIDy); |
3396 | reduction_tv->axis(1)->parallelize(ParallelType::TIDy); |
3397 | reduction_tv->split(0, outer_tidy_fact); |
3398 | reduction_tv->axis(0)->parallelize(ParallelType::BIDy); |
3399 | |
3400 | // T2_g[ rblockIdx.y, rS{16}, rthreadIdx.y, iblockIdx.x, ithreadIdx.x24, |
3401 | // iV25{4} ] |
3402 | reduction_tv->reorder({{3, 0}, {4, 1}, {0, 2}, {2, 3}, {1, 4}, {5, 5}}); |
3403 | // T2_g[iblockIdx.x, ithreadIdx.x24, rblockIdx.y, rthreadIdx.y, rS{16}, |
3404 | // iV25{4}] |
3405 | |
3406 | TransformPropagatorWithCheck propagator(reduction_tv); |
3407 | MaxRootDomainInfoSpanningTree(reduction_tv).traverse(&propagator); |
3408 | auto rfactor_tv = ir_utils::rfactorHelper(reduction_tv, {4}); |
3409 | scheduler_utils::parallelizeAllLike(rfactor_tv); |
3410 | |
3411 | tv0->computeAt(tv_avg, 2); |
3412 | tv0->computeAt(cached_input, -2); |
3413 | |
3414 | cached_input->computeAt(rfactor_tv, 4, ComputeAtMode::BestEffort); |
3415 | |
3416 | for (auto tv : ir_utils::allTvs(&fusion)) { |
3417 | if (tv == cached_input || tv == tv_avg || tv == tv_M2) { |
3418 | continue; |
3419 | } |
3420 | tv->axis(-1)->parallelize(ParallelType::Serial); |
3421 | } |
3422 | |
3423 | FusionExecutor fe; |
3424 | fe.compileFusion(&fusion, {}, LaunchParams()); |
3425 | |
3426 | auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); |
3427 | at::Tensor t0 = at::randn({X, Y, Y, Z}, options); |
3428 | |
3429 | auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1)); |
3430 | |
  // By default, Welford outputs the sum of squared differences, so it
  // needs to be divided by the reduction count to get the variance.
3432 | cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y)); |
3433 | |
3434 | auto at_mu = at::mean(t0.to(at::kDouble), {0, 1, 2}); |
3435 | auto at_var = at::var(t0.to(at::kDouble), {0, 1, 2}, false); |
3436 | |
3437 | testValidate( |
3438 | &fusion, |
3439 | cg_outputs, |
3440 | {t0}, |
3441 | {at_mu, at_var}, |
3442 | __LINE__, |
3443 | __FILE__, |
3444 | "" , |
3445 | LaunchParams(-1, -1, -1, -1, -1, -1)); |
3446 | } |
3447 | |
3448 | // Test sync insertion with redundant predicates |
3449 | TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) { |
3450 | Fusion fusion; |
3451 | FusionGuard fg(&fusion); |
3452 | |
3453 | TensorView* tv0 = makeConcreteTensor({32}); |
3454 | TensorView* tv1 = makeConcreteTensor({32, 32}); |
3455 | fusion.addInput(tv0); |
3456 | fusion.addInput(tv1); |
3457 | |
3458 | auto tv2 = broadcast(tv0, {true, false}); |
3459 | auto tv3 = add(tv2, tv1); |
3460 | |
3461 | fusion.addOutput(tv3); |
3462 | |
3463 | auto tv0c = tv0->cacheAfter(); |
3464 | |
3465 | // Make a redundant write through smem |
3466 | tv0c->setMemoryType(MemoryType::Shared); |
3467 | |
3468 | tv0->computeAt(tv3, 0); |
3469 | tv1->computeAt(tv3, 0); |
3470 | |
3471 | tv0c->axis(0)->parallelize(ParallelType::TIDx); |
3472 | tv2->axis(0)->parallelize(ParallelType::TIDy); |
3473 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
3474 | |
3475 | tv3->axis(0)->parallelize(ParallelType::TIDy); |
3476 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
3477 | |
3478 | GpuLower gpulw(&fusion); |
3479 | auto flattened_exprs = |
3480 | ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs()); |
3481 | bool sync_inserted = std::any_of( |
3482 | flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) { |
3483 | return expr->isA<kir::BlockSync>(); |
3484 | }); |
3485 | TORCH_INTERNAL_ASSERT(sync_inserted, "Expected block sync not inserted" ); |
3486 | |
3487 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3488 | |
3489 | at::Tensor t0 = at::randn({32}, options); |
3490 | at::Tensor t1 = at::randn({32, 32}, options); |
3491 | |
3492 | FusionExecutor fe; |
3493 | fe.compileFusion(&fusion, {t0, t1}); |
3494 | auto cg_outputs = fe.runFusion({t0, t1}); |
3495 | |
3496 | auto ref = t0 + t1; |
3497 | |
3498 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
3499 | } |
3500 | |
// Test case for removing syncs on a chain of redundant uses.
3502 | TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) { |
3503 | Fusion fusion; |
3504 | FusionGuard fg(&fusion); |
3505 | |
3506 | TensorView* tv0 = makeConcreteTensor({32}); |
3507 | TensorView* tv1 = makeConcreteTensor({32, 32}); |
3508 | fusion.addInput(tv0); |
3509 | fusion.addInput(tv1); |
3510 | |
3511 | auto tv2 = broadcast(tv0, {true, false}); |
3512 | auto tv3 = add(tv2, tv1); |
3513 | |
3514 | fusion.addOutput(tv3); |
3515 | |
3516 | auto tv0c = tv0->cacheAfter(); |
3517 | |
3518 | // Make a redundant write through smem |
3519 | tv0c->setMemoryType(MemoryType::Shared); |
3520 | tv2->setMemoryType(MemoryType::Shared); |
3521 | |
3522 | tv0->computeAt(tv3, 0); |
3523 | tv1->computeAt(tv3, 0); |
3524 | |
3525 | tv0c->axis(0)->parallelize(ParallelType::TIDx); |
3526 | tv2->axis(0)->parallelize(ParallelType::TIDy); |
3527 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
3528 | |
3529 | tv3->axis(0)->parallelize(ParallelType::TIDy); |
3530 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
3531 | |
  // Utility class to count the block syncs inserted by the RAW
  // pass. At most one is expected in this test.
3534 | class SyncChecker : public kir::IrVisitor { |
3535 | public: |
3536 | using kir::IrVisitor::handle; |
3537 | int result() { |
3538 | return sync_seen_; |
3539 | } |
3540 | |
3541 | private: |
3542 | void handle(kir::BlockSync*) final { |
3543 | sync_seen_++; |
3544 | } |
3545 | |
3547 | int sync_seen_ = 0; |
3548 | } checker; |
3549 | |
3550 | GpuLower gpulw(&fusion); |
3551 | checker.handle(gpulw.kernel()->topLevelExprs()); |
3552 | TORCH_INTERNAL_ASSERT( |
3553 | checker.result() < 2, "More syncs were inserted than expected" ); |
3554 | |
3555 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3556 | |
3557 | at::Tensor t0 = at::randn({32}, options); |
3558 | at::Tensor t1 = at::randn({32, 32}, options); |
3559 | |
3560 | FusionExecutor fe; |
3561 | fe.compileFusion(&fusion, {t0, t1}); |
3562 | auto cg_outputs = fe.runFusion({t0, t1}); |
3563 | |
3564 | auto ref = t0 + t1; |
3565 | |
3566 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
3567 | } |
3568 | |
// Test case for sync insertion after a redundantly predicated smem write.
// Check that syncs are removed only when all use paths are redundant.
3571 | TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) { |
3572 | Fusion fusion; |
3573 | FusionGuard fg(&fusion); |
3574 | |
3575 | TensorView* tv0 = makeConcreteTensor({32}); |
3576 | TensorView* tv1 = makeConcreteTensor({32, 32}); |
3577 | fusion.addInput(tv0); |
3578 | fusion.addInput(tv1); |
3579 | |
3580 | auto tv2 = broadcast(tv0, {true, false}); |
3581 | auto tv3 = set(tv2); |
3582 | auto tv4 = add(tv3, tv1); |
3583 | auto tv5 = add(tv2, tv1); |
3584 | |
3585 | fusion.addOutput(tv4); |
3586 | fusion.addOutput(tv5); |
3587 | |
3588 | auto tv0c = tv0->cacheAfter(); |
3589 | |
  // In this scheduling config,
  //  tv0c -> tv2 -> tv3 is a redundant path for TIDy, while
  //  tv0c -> tv2 -> tv5 is not.
  // So we need a RAW sync between tv0c and tv2 to make sure
  //  tv2 has the correct value to produce tv5.
3595 | tv0c->setMemoryType(MemoryType::Shared); |
3596 | tv3->setMemoryType(MemoryType::Shared); |
3597 | |
3598 | tv0c->axis(0)->parallelize(ParallelType::TIDx); |
3599 | tv2->axis(0)->parallelize(ParallelType::TIDy); |
3600 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
3601 | |
3602 | tv3->axis(0)->parallelize(ParallelType::TIDy); |
3603 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
3604 | |
3605 | tv5->axis(0)->parallelize(ParallelType::TIDy); |
3606 | tv5->axis(1)->parallelize(ParallelType::TIDx); |
3607 | |
  // Utility class to count the RAW (non-WAR) block syncs
  // inserted by the lowering passes.
3610 | class SyncChecker : public kir::IrVisitor { |
3611 | public: |
3612 | using kir::IrVisitor::handle; |
3613 | int result() { |
3614 | return sync_seen_; |
3615 | } |
3616 | |
3617 | private: |
3618 | void handle(kir::BlockSync* sync) final { |
3619 | if (!sync->isWarHazardSync()) { |
3620 | sync_seen_++; |
3621 | } |
3622 | } |
3623 | |
3625 | int sync_seen_ = 0; |
3626 | } checker; |
3627 | |
3628 | GpuLower gpulw(&fusion); |
3629 | checker.handle(gpulw.kernel()->topLevelExprs()); |
3630 | |
  // This is an implicit check. There are exactly 2 places
  // where RAW hazards happen: one producing tv2 and the other
  // producing tv3. This test case expects syncs in both of
  // these places, so we check that 2 RAW syncs are inserted.
3635 | TORCH_INTERNAL_ASSERT( |
3636 | checker.result() == 2, |
3637 | "Exactly 2 RAW sync expected for the two shared memory transfers" ); |
3638 | |
3639 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3640 | |
3641 | at::Tensor t0 = at::randn({32}, options); |
3642 | at::Tensor t1 = at::randn({32, 32}, options); |
3643 | |
3644 | FusionExecutor fe; |
3645 | fe.compileFusion(&fusion, {t0, t1}); |
3646 | auto cg_outputs = fe.runFusion({t0, t1}); |
3647 | |
3648 | auto ref = t0 + t1; |
3649 | |
3650 | testValidate(&fusion, cg_outputs, {t0, t1}, {ref, ref}, __LINE__, __FILE__); |
3651 | } |
3652 | |
// Unit test case for detecting thread-redundant use of shared tensors.
3654 | TEST_F(NVFuserTest, FusionRedundantUseCheck_CUDA) { |
3655 | Fusion fusion; |
3656 | FusionGuard fg(&fusion); |
3657 | |
3658 | TensorView* tv0 = makeConcreteTensor({32, 32}); |
3659 | fusion.addInput(tv0); |
3660 | |
3661 | auto tv1 = set(tv0); |
3662 | auto tv2 = set(tv1); |
3663 | auto tv3 = set(tv2); |
3664 | auto tv4 = set(tv3); |
3665 | |
3666 | auto tv5 = set(tv4); |
3667 | |
3668 | auto tv6 = set(tv4); |
3669 | auto tv7 = set(tv6); |
3670 | |
3671 | fusion.addOutput(tv5); |
3672 | fusion.addOutput(tv7); |
3673 | |
3674 | tv2->setMemoryType(MemoryType::Shared); |
3675 | tv4->setMemoryType(MemoryType::Shared); |
3676 | |
3677 | tv7->axis(-1)->parallelize(ParallelType::TIDx); |
3678 | |
  // The thread pred map cannot be built without an active lowering
  // object, so the whole fusion needs to be lowered for this test.
  // Lowering also keeps a copy of the fusion, so the original
  // pointers cannot be used to query the thread pred map. Instead,
  // traverse the lowered expression list to find the corresponding
  // tensors.
3685 | GpuLower gpulw(&fusion); |
3686 | |
3687 | TensorView *lowered_tv2 = nullptr, *lowered_tv4 = nullptr; |
3688 | auto used_vals = gpulw.kernel()->usedMathVals(); |
3689 | |
3690 | for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) { |
3691 | if (tv->name() == 2) { |
3692 | lowered_tv2 = tv; |
3693 | } |
3694 | if (tv->name() == 4) { |
3695 | lowered_tv4 = tv; |
3696 | } |
3697 | } |
3698 | |
3699 | TORCH_INTERNAL_ASSERT( |
3700 | lowered_tv2 != nullptr && lowered_tv4 != nullptr, |
3701 | "tv2 or tv4 not lowered or mangled" ); |
3702 | |
3703 | auto tv2_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv2); |
3704 | auto tv4_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv4); |
3705 | |
3706 | // tv2 -> tv3 -> tv4 (shared) is the only use chain for tv2, |
3707 | // and tv4 is redundantly written in tidx so tv2 is redundantly |
3708 | // consumed in tidx. |
3709 | TORCH_INTERNAL_ASSERT( |
3710 | tv2_info.redundant_use_types.get(ParallelType::TIDx), |
3711 | "TV2 is redundantly used but not detected." ); |
3712 | |
  // tv4->tv5 (global) is a redundant use chain, but
  // tv4->tv6->tv7 is not, so tv4 should not be detected as
  // a redundantly used tensor in tidx.
  TORCH_INTERNAL_ASSERT(
      !tv4_info.redundant_use_types.get(ParallelType::TIDx),
      "TV4 is not redundantly used but was detected as such.");
3719 | } |
3720 | |
3721 | // Test a basic swizzle pattern |
3722 | TEST_F(NVFuserTest, FusionSimpleSwizzle0_CUDA) { |
3723 | Fusion fusion; |
3724 | FusionGuard fg(&fusion); |
3725 | |
3726 | auto tv0 = makeConcreteTensor({2, 32}); |
3727 | fusion.addInput(tv0); |
3728 | |
3729 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3730 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
3731 | |
3732 | fusion.addOutput(tv2); |
3733 | |
3734 | // Make a 2x8 Zshape tile |
3735 | tv1->split(-1, 16); |
3736 | tv1->split(-1, 8); |
3737 | // [O, 2, 8] |
3738 | |
3739 | tv2->split(-1, 16); |
3740 | tv2->split(-1, 4); |
3741 | //[O, 4, 4] |
3742 | |
3743 | tv1->computeAt(tv2, 1); |
3744 | tv1->swizzle(Swizzle2DType::ZShape, -2, -1); |
3745 | |
3746 | FusionExecutor fe; |
3747 | fe.compileFusion(&fusion); |
3748 | |
3749 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3750 | auto t0 = at::randn({2, 32}, options); |
3751 | auto t2 = t0 + 2.0; |
3752 | auto cg_outputs = fe.runFusion({t0}); |
3753 | |
3754 | testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__); |
3755 | } |
3756 | |
3757 | // Test swizzle inlining |
3758 | TEST_F(NVFuserTest, FusionSimpleSwizzle1_CUDA) { |
3759 | Fusion fusion; |
3760 | FusionGuard fg(&fusion); |
3761 | |
3762 | auto tv0 = makeConcreteTensor({2, 32}); |
3763 | fusion.addInput(tv0); |
3764 | |
3765 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3766 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
3767 | auto tv3 = add(tv2, IrBuilder::create<Double>(1)); |
3768 | |
3769 | fusion.addOutput(tv3); |
3770 | |
3771 | // Make a 2x8 Zshape tile |
3772 | tv2->split(-1, 16); |
3773 | tv2->split(-1, 8); |
3774 | // [O, 2, 8] |
3775 | |
3776 | tv3->split(-1, 16); |
3777 | tv3->split(-1, 4); |
3778 | //[O, 4, 4] |
3779 | |
3780 | tv2->computeAt(tv3, 1); |
3781 | tv2->swizzle(Swizzle2DType::ZShape, -2, -1); |
3782 | |
3783 | // Inlining a producer into a swizzled consumer is ok |
3784 | tv1->computeAt(tv2, -1); |
3785 | |
3786 | FusionExecutor fe; |
3787 | fe.compileFusion(&fusion); |
3788 | |
3789 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3790 | auto t0 = at::randn({2, 32}, options); |
3791 | auto t3 = t0 + 3.0; |
3792 | auto cg_outputs = fe.runFusion({t0}); |
3793 | |
3794 | testValidate(&fusion, cg_outputs, {t0}, {t3}, __LINE__, __FILE__); |
3795 | } |
3796 | |
// Test sync insertion and memory checks with parallelized swizzles.
// In this test, data is written into smem in parallel in a z-curve
// pattern and then read out and written to global memory unswizzled.
3800 | TEST_F(NVFuserTest, FusionSimpleSwizzle2_CUDA) { |
3801 | Fusion fusion; |
3802 | FusionGuard fg(&fusion); |
3803 | |
3804 | auto tv0 = makeConcreteTensor({32, 32}); |
3805 | fusion.addInput(tv0); |
3806 | |
3807 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3808 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
3809 | |
3810 | fusion.addOutput(tv2); |
3811 | |
3812 | tv1->swizzle(Swizzle2DType::ZShape, -2, -1); |
3813 | |
3814 | tv1->axis(0)->parallelize(ParallelType::TIDx); |
3815 | tv1->axis(1)->parallelize(ParallelType::TIDy); |
3816 | |
3817 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
3818 | tv2->axis(1)->parallelize(ParallelType::TIDy); |
3819 | |
3820 | // Validation should fail since TV1 is not in shared |
3821 | // memory as required by sync info pass. |
3822 | ASSERT_ANY_THROW(GpuLower gpulw_throw(&fusion)); |
3823 | |
3824 | tv1->setMemoryType(MemoryType::Shared); |
3825 | |
3826 | // Make sure that a sync is inserted: |
3827 | bool sync_found = false; |
3828 | GpuLower gpu_lw(&fusion); |
3829 | auto flattened_exps = |
3830 | ir_utils::flattenScopedExprs(gpu_lw.kernel()->topLevelExprs()); |
3831 | |
3832 | for (auto expr : flattened_exps) { |
3833 | if (expr->isA<kir::BlockSync>()) { |
3834 | sync_found = true; |
3835 | } |
    // A block sync is required before any shared memory read.
3837 | for (auto inp_tv : ir_utils::filterByType<TensorView>(expr->inputs())) { |
3838 | if (inp_tv->getMemoryType() == MemoryType::Shared) { |
3839 | TORCH_INTERNAL_ASSERT( |
3840 | sync_found, "Block sync required but not inserted" ); |
3841 | } |
3842 | } |
3843 | } |
3844 | |
3845 | FusionExecutor fe; |
3846 | fe.compileFusion(&fusion); |
3847 | |
3848 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3849 | auto t0 = at::randn({32, 32}, options); |
3850 | auto t2 = t0 + 2.0; |
3851 | auto cg_outputs = fe.runFusion({t0}); |
3852 | |
3853 | testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__); |
3854 | } |
3855 | |
3856 | // Test BestEffortReplay behavior with swizzle op |
3857 | TEST_F(NVFuserTest, FusionSwizzleMapping_CUDA) { |
3858 | Fusion fusion; |
3859 | FusionGuard fg(&fusion); |
3860 | |
3861 | auto tv0 = makeConcreteTensor({2, 32}); |
3862 | fusion.addInput(tv0); |
3863 | |
3864 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3865 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
3866 | auto tv3 = add(tv2, IrBuilder::create<Double>(1)); |
3867 | |
3868 | fusion.addOutput(tv3); |
3869 | |
3870 | // Make a 2x8 Zshape tile |
3871 | tv2->split(-1, 16); |
3872 | tv2->split(-1, 8); |
3873 | // [O, 2, 8] |
3874 | |
3875 | tv3->split(-1, 16); |
3876 | tv3->split(-1, 4); |
3877 | //[O, 4, 4] |
3878 | |
3879 | tv2->computeAt(tv3, 1); |
3880 | tv2->swizzle(Swizzle2DType::ZShape, -2, -1); |
3881 | |
3882 | // Inlining a producer into a swizzled consumer is ok |
3883 | tv1->computeAt(tv2, -1); |
3884 | |
  // Check BestEffortReplay behavior with the skip-swizzles option on.
3886 | PairwiseRootDomainMap root_map(tv1, tv2); |
3887 | |
3888 | // Check producer to consumer map, |
3889 | // i.e. unswizzled tensor to swizzled tensor map |
3890 | //---------------------------------------------------------- |
3891 | auto p2c = BestEffortReplay::replayCasP(tv2, tv1, -1, root_map).getReplay(); |
3892 | auto swizzle_x_it0 = p2c.find(tv1->axis(-2)); |
3893 | auto swizzle_y_it0 = p2c.find(tv1->axis(-1)); |
  // The P2C mappings should exist, and both the x and y axes
  // should map to the outputs of the swizzle op.
3896 | TORCH_INTERNAL_ASSERT( |
3897 | swizzle_x_it0 != p2c.end() && swizzle_y_it0 != p2c.end()); |
3898 | TORCH_INTERNAL_ASSERT( |
3899 | swizzle_x_it0->second == tv2->axis(-2) && |
3900 | swizzle_y_it0->second == tv2->axis(-1)); |
3901 | |
3902 | // Check consumer to producer map, |
3903 | // i.e. swizzled tensor to unswizzled tensor map |
3904 | //---------------------------------------------------------- |
3905 | auto c2p = BestEffortReplay::replayPasC(tv1, tv2, -1, root_map).getReplay(); |
3906 | |
3907 | auto swizzle_op = tv2->axis(-1)->definition()->as<Swizzle2D>(); |
3908 | |
3909 | // Find mapping for swizzle inputs |
3910 | auto swizzle_x_it1 = c2p.find(swizzle_op->inX()); |
3911 | auto swizzle_y_it1 = c2p.find(swizzle_op->inY()); |
3912 | |
3913 | // Find mapping for swizzle outputs |
3914 | auto swizzle_x_it2 = c2p.find(swizzle_op->outX()); |
3915 | auto swizzle_y_it2 = c2p.find(swizzle_op->outY()); |
3916 | |
  // Inputs of swizzle ops will not be mapped to anything
  // by BestEffortReplay, as BestEffortReplay has to be
  // one-to-one. IdGraph will further map them together.
3920 | TORCH_INTERNAL_ASSERT( |
3921 | swizzle_x_it1 == c2p.end() && swizzle_y_it1 == c2p.end()); |
3922 | |
  // The swizzle outputs should be mapped, and they should map to
  // the corresponding axes on the unswizzled tensor.
3925 | TORCH_INTERNAL_ASSERT( |
3926 | swizzle_x_it2 != c2p.end() && swizzle_y_it2 != c2p.end()); |
3927 | TORCH_INTERNAL_ASSERT( |
3928 | swizzle_x_it2->second == tv1->axis(-2) && |
3929 | swizzle_y_it2->second == tv1->axis(-1)); |
3930 | |
3931 | // Check id graph behavior |
3932 | //---------------------------------------------------------- |
3933 | ComputeAtMap ca_map(&fusion); |
  // Corresponding inputs and outputs of swizzle ops are mapped
  // through by both the exact and permissive maps.
3936 | TORCH_INTERNAL_ASSERT( |
3937 | ca_map.areMapped(tv1->axis(-2), swizzle_op->inX(), IdMappingMode::EXACT)); |
3938 | TORCH_INTERNAL_ASSERT( |
3939 | ca_map.areMapped(tv1->axis(-1), swizzle_op->inY(), IdMappingMode::EXACT)); |
3940 | TORCH_INTERNAL_ASSERT(ca_map.areMapped( |
3941 | tv1->axis(-2), swizzle_op->outX(), IdMappingMode::EXACT)); |
3942 | TORCH_INTERNAL_ASSERT(ca_map.areMapped( |
3943 | tv1->axis(-1), swizzle_op->outY(), IdMappingMode::EXACT)); |
3944 | |
3945 | TORCH_INTERNAL_ASSERT(ca_map.areMapped( |
3946 | tv1->axis(-2), swizzle_op->inX(), IdMappingMode::PERMISSIVE)); |
3947 | TORCH_INTERNAL_ASSERT(ca_map.areMapped( |
3948 | tv1->axis(-1), swizzle_op->inY(), IdMappingMode::PERMISSIVE)); |
3949 | TORCH_INTERNAL_ASSERT(ca_map.areMapped( |
3950 | tv1->axis(-2), swizzle_op->outX(), IdMappingMode::PERMISSIVE)); |
3951 | TORCH_INTERNAL_ASSERT(ca_map.areMapped( |
3952 | tv1->axis(-1), swizzle_op->outY(), IdMappingMode::PERMISSIVE)); |
3953 | } |
3954 | |
3955 | // Test a basic loop swizzle pattern |
3956 | TEST_F(NVFuserTest, FusionLoopSwizzle0_CUDA) { |
3957 | Fusion fusion; |
3958 | FusionGuard fg(&fusion); |
3959 | |
3960 | auto tv0 = makeConcreteTensor({2, 32}); |
3961 | fusion.addInput(tv0); |
3962 | |
3963 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3964 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
3965 | |
3966 | fusion.addOutput(tv2); |
3967 | |
3968 | tv2->split(-1, 16); |
3969 | tv2->split(-1, 4); |
3970 | //[O, 4, 4] |
3971 | |
3972 | tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop); |
3973 | |
3974 | tv0->computeAt(tv2, -1); |
3975 | |
3976 | FusionExecutor fe; |
3977 | fe.compileFusion(&fusion); |
3978 | |
3979 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3980 | auto t0 = at::randn({2, 32}, options); |
3981 | auto t2 = t0 + 2.0; |
3982 | auto cg_outputs = fe.runFusion({t0}); |
3983 | |
3984 | testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__); |
3985 | } |
3986 | |
3987 | // Outer block zshape pattern |
3988 | TEST_F(NVFuserTest, FusionLoopSwizzle1_CUDA) { |
3989 | Fusion fusion; |
3990 | FusionGuard fg(&fusion); |
3991 | |
3992 | auto tv0 = makeContigTensor(2); |
3993 | fusion.addInput(tv0); |
3994 | |
3995 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3996 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
3997 | |
3998 | fusion.addOutput(tv2); |
3999 | |
4000 | tv2->split(-2, 8); |
4001 | tv2->split(-1, 4); |
4002 | //[I0o, I0i, I1o, I1i] |
4003 | tv2->reorder({{1, 2}, {2, 1}}); |
4004 | //[I0o, I1o, I0i, I1i] |
4005 | |
4006 | tv2->swizzle(Swizzle2DType::ZShape, 0, 1, SwizzleMode::Loop); |
4007 | tv0->computeAt(tv2, -1); |
4008 | |
4009 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
4010 | tv2->axis(1)->parallelize(ParallelType::BIDy); |
4011 | |
4012 | FusionExecutor fe; |
4013 | fe.compileFusion(&fusion); |
4014 | |
4015 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4016 | auto t0 = at::randn({45, 77}, options); |
4017 | auto t2 = t0 + 2.0; |
4018 | auto cg_outputs = fe.runFusion({t0}); |
4019 | |
4020 | testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__); |
4021 | } |
4022 | |
4023 | // Test assertion in unsupported pattern: non-leaf loop swizzle. |
4024 | TEST_F(NVFuserTest, FusionLoopSwizzleCheck0_CUDA) { |
4025 | Fusion fusion; |
4026 | FusionGuard fg(&fusion); |
4027 | |
4028 | auto tv0 = makeConcreteTensor({2, 32}); |
4029 | fusion.addInput(tv0); |
4030 | |
4031 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
4032 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
4033 | |
4034 | fusion.addOutput(tv2); |
4035 | |
4036 | tv2->split(-1, 16); |
4037 | tv2->split(-1, 4); |
4038 | //[O, 4, 4] |
4039 | |
4040 | // Swizzle the inner tile. |
4041 | tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop); |
4042 | |
4043 | // Make swizzle output not a leaf domain. |
4044 | tv2->merge(-2); |
4045 | |
4046 | tv0->computeAt(tv2, -1); |
4047 | |
4048 | FusionExecutor fe; |
4049 | ASSERT_ANY_THROW(fe.compileFusion(&fusion)); |
4050 | } |
4051 | |
4052 | // Test assertion in unsupported pattern: half-inlined loop swizzle. |
4053 | TEST_F(NVFuserTest, FusionLoopSwizzleCheck1_CUDA) { |
4054 | Fusion fusion; |
4055 | FusionGuard fg(&fusion); |
4056 | |
4057 | auto tv0 = makeConcreteTensor({2, 32}); |
4058 | fusion.addInput(tv0); |
4059 | |
4060 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
4061 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
4062 | auto tv3 = add(tv2, IrBuilder::create<Double>(1)); |
4063 | |
4064 | fusion.addOutput(tv3); |
4065 | |
4066 | //[O, 4, 4] |
4067 | tv2->split(-1, 16); |
4068 | tv2->split(-1, 4); |
4069 | |
4070 | //[O, 4, 4] |
4071 | tv3->split(-1, 16); |
4072 | tv3->split(-1, 4); |
4073 | |
4074 | // Swizzle inner tile of tv2 |
4075 | tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop); |
4076 | |
4077 | // Make tv2 swizzled and partially-inlined (unsupported). |
4078 | tv0->computeAt(tv3, -2); |
4079 | |
4080 | FusionExecutor fe; |
4081 | ASSERT_ANY_THROW(fe.compileFusion(&fusion)); |
4082 | } |
4083 | |
4084 | TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) { |
4085 | Fusion fusion; |
4086 | FusionGuard fg(&fusion); |
4087 | |
4088 | std::vector<int64_t> shape({10, 11}); |
4089 | |
4090 | auto tv0 = makeConcreteTensor(shape); |
4091 | fusion.addInput(tv0); |
4092 | |
4093 | // [I, R] |
4094 | auto tv1 = sum(tv0, {1}); |
4095 | // [I, B] |
4096 | auto tv2 = unsqueeze(tv1, -1); |
4097 | fusion.addOutput(tv2); |
4098 | |
4099 | TORCH_CHECK( |
4100 | tv2->nDims() == 2, "Unexpected unsqueeze result: " , tv2->toString()); |
4101 | TORCH_CHECK( |
4102 | tv2->axis(1)->isBroadcast(), |
4103 | "Unexpected unsqueeze result: " , |
4104 | tv2->toString()); |
4105 | |
4106 | // tv1 has only one non-reduction axis. An exception should be |
4107 | // thrown. |
4108 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
4109 | ASSERT_ANY_THROW(unsqueeze(tv1, 2)); |
4110 | |
4111 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4112 | at::Tensor t0 = at::randn({10, 11}, options); |
4113 | std::vector<IValue> aten_inputs = {t0}; |
4114 | |
4115 | FusionExecutor fe; |
4116 | fe.compileFusion(&fusion, aten_inputs); |
4117 | auto cg_outputs = fe.runFusion(aten_inputs); |
4118 | |
4119 | auto ref = t0.sum(1).unsqueeze(-1); |
4120 | |
4121 | testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__); |
4122 | } |
4123 | |
4124 | TEST_F(NVFuserTest, FusionSqueeze1_CUDA) { |
4125 | Fusion fusion; |
4126 | FusionGuard fg(&fusion); |
4127 | |
4128 | std::vector<int64_t> shape({10, 11}); |
4129 | |
4130 | auto tv0 = makeConcreteTensor(shape); |
4131 | fusion.addInput(tv0); |
4132 | |
4133 | // [I, B] |
4134 | auto tv1 = sum(tv0, {1}, true); |
4135 | // [I] |
4136 | auto tv2 = squeeze(tv1, {shape[0], 1}); |
4137 | fusion.addOutput(tv2); |
4138 | |
4139 | TORCH_CHECK( |
4140 | tv2->nDims() == 2, "Unexpected squeeze result: " , tv2->toString()); |
4141 | |
4142 | // [I, R] |
4143 | auto tv3 = sum(tv0, {1}); |
4144 | // tv3 has only one non-reduction axis. The extent of the first axis |
4145 | // is not one, so squeeze should fail. |
4146 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
4147 | ASSERT_ANY_THROW(squeeze(tv3, {shape[0], 1})); |
4148 | |
4149 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4150 | at::Tensor t0 = at::randn({10, 11}, options); |
4151 | std::vector<IValue> aten_inputs = {t0}; |
4152 | |
4153 | FusionExecutor fe; |
4154 | fe.compileFusion(&fusion, aten_inputs); |
4155 | auto cg_outputs = fe.runFusion(aten_inputs); |
4156 | |
4157 | auto ref = t0.sum(1, true).squeeze(-1); |
4158 | |
4159 | testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__); |
4160 | } |
4161 | |
4162 | TEST_F(NVFuserTest, FusionContigPredicate_CUDA) { |
4163 | Fusion fusion; |
4164 | FusionGuard fg(&fusion); |
4165 | |
4166 | auto tv0 = makeSymbolicTensor(2); |
4167 | fusion.addInput(tv0); |
4168 | auto tv1 = set(tv0); |
4169 | auto tv2 = broadcast(tv1, {false, true, false}); |
4170 | fusion.addOutput(tv2); |
4171 | |
4172 | tv2->merge(-2, -1); |
4173 | tv2->merge(-2, -1); |
4174 | tv2->split(-1, 100); |
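  // [ceilDiv(I0*B*I1, 100), 100]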
4175 | tv0->computeAt(tv2, -1); |
4176 | |
4177 | GpuLower gpulw(&fusion); |
4178 | TORCH_CHECK(PredicatedChecker::isPredicated(tv1, gpulw)); |
4179 | |
4180 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4181 | at::Tensor t0 = at::randn({3, 4}, options); |
4182 | |
4183 | FusionExecutor fe; |
4184 | fe.compileFusion(&fusion, {t0}); |
4185 | auto cg_outputs = fe.runFusion({t0}); |
4186 | |
4187 | auto ref = t0.unsqueeze(1); |
4188 | |
4189 | testValidate(fe.kernel(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
4190 | } |
4191 | |
4192 | // Repro of https://github.com/csarofeen/pytorch/issues/1777 |
4193 | TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) { |
4194 | // tv1 = 2.0 / tv0 |
4195 | Fusion fusion; |
4196 | FusionGuard fg(&fusion); |
4197 | |
4198 | TensorView* tv0 = makeSymbolicTensor(2); |
4199 | fusion.addInput(tv0); |
4200 | TensorView* tv1 = div(IrBuilder::create<Double>(2.0), tv0); |
4201 | fusion.addOutput(tv1); |
4202 | |
4203 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4204 | auto t0 = at::randn({3, 3}, options); |
4205 | // There's no overload div(Scalar, Tensor) in ATen |
4206 | auto aten_output = at::div( |
4207 | at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0); |
4208 | |
4209 | FusionExecutor fe; |
4210 | fe.compileFusion(&fusion, {t0}); |
4211 | auto cg_outputs = fe.runFusion({t0}); |
4212 | |
4213 | testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__); |
4214 | } |
4215 | |
// Repro of an issue in the reduction scheduler where a broadcast
// domain is concretized to multiple domains that are not proven to
// have the same extent
4219 | TEST_F(NVFuserTest, FusionRepro1713_CUDA) { |
4220 | auto fusion = std::make_unique<Fusion>(); |
4221 | FusionGuard fg(fusion.get()); |
4222 | |
4223 | auto tv0 = makeSymbolicTensor(2); |
4224 | auto tv1 = makeSymbolicTensor(2); |
4225 | auto tv2 = makeSymbolicTensor(1); |
4226 | fusion->addInput(tv0); |
4227 | fusion->addInput(tv1); |
4228 | fusion->addInput(tv2); |
4229 | auto tv3 = broadcast(tv2, {false, true}); |
4230 | |
4231 | auto tv4 = add(tv3, tv0); |
4232 | |
4233 | auto tv5 = add(tv3, tv1); |
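  // The broadcast domain of tv3 is concretized to the second axis of
  // tv0 through tv4 and to the second axis of tv1 through tv5; these
  // extents are not known to be equal.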
4234 | auto tv6 = sum(tv5, {0}); |
4235 | fusion->addOutput(tv4); |
4236 | fusion->addOutput(tv6); |
4237 | |
4238 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4239 | at::Tensor t0 = at::randn({1024, 204800}, options); |
4240 | // Original repro had the same shape as t0, but this should work |
4241 | // with a different extent at the second axis |
4242 | at::Tensor t1 = at::randn({1024, 123}, options); |
4243 | at::Tensor t2 = at::randn({1024}, options); |
4244 | std::vector<IValue> aten_inputs({t0, t1, t2}); |
4245 | |
4246 | FusionExecutorCache executor_cache(std::move(fusion)); |
4247 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
4248 | |
4249 | auto t3 = t2.unsqueeze(-1); |
4250 | auto t4 = t3 + t0; |
4251 | auto t5 = t3 + t1; |
4252 | auto t6 = sum(t5, {0}); |
4253 | |
4254 | testValidate( |
4255 | executor_cache.fusion(), |
4256 | cg_outputs, |
4257 | {t0, t1, t2}, |
4258 | {t4, t6}, |
4259 | __LINE__, |
4260 | __FILE__); |
4261 | } |
4262 | |
4263 | TEST_F(NVFuserTest, FusionExpand_CUDA) { |
4264 | auto fusion = std::make_unique<Fusion>(); |
4265 | FusionGuard fg(fusion.get()); |
4266 | |
4267 | auto w = 2, x = 3, y = 4, z = 5; |
4268 | |
  // Test:
  //  - a simple expand
  //  - an expand that's propagated
  //  - expand_as
  //  - a symbolic expand
4274 | |
4275 | // x |
4276 | auto tv0 = makeSymbolicTensor(1); |
4277 | fusion->addInput(tv0); |
4278 | |
4279 | auto tv1 = broadcast(tv0, {false, true}); |
4280 | auto tv2 = expand(tv1, {tv0->axis(0)->extent(), IrBuilder::create<Int>(y)}); |
4281 | |
4282 | // x |
4283 | auto tv3 = makeSymbolicTensor(1); |
4284 | fusion->addInput(tv3); |
4285 | auto tv4 = broadcast(tv3, {false, true}); |
4286 | auto tv5 = add(tv4, tv2); |
4287 | // [x, e_y] |
4288 | |
4289 | // [x, y, z] |
4290 | auto tv6 = makeSymbolicTensor(3); |
4291 | fusion->addInput(tv6); |
4292 | |
  // This set op is disjoint from the rest of the fusion and will
  // cause a segmentation for just this op.
4294 | auto tmp_7 = set(tv6); |
4295 | fusion->addOutput(tmp_7); |
4296 | |
4297 | auto tv7 = broadcast(tv5, {false, false, true}); |
4298 | |
4299 | auto tv8 = expand_as(tv7, tv6); |
4300 | // [x, e_y, e_z] |
4301 | |
4302 | auto w_symbolic = IrBuilder::create<Int>(); |
4303 | fusion->addInput(w_symbolic); |
4304 | |
4305 | auto tv9 = broadcast(tv8, {true, false, false, false}); |
4306 | //[1, x, e_y, e_z] |
4307 | |
4308 | auto tv10 = expand( |
4309 | tv9, |
4310 | {w_symbolic, |
4311 | tv9->axis(1)->extent(), |
4312 | tv9->axis(2)->expandedExtent(), |
4313 | tv9->axis(3)->expandedExtent()}); |
4314 | |
4315 | fusion->addOutput(tv10); |
4316 | |
4317 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4318 | at::Tensor t0 = at::randn({x}, options); |
4319 | at::Tensor t3 = at::randn({x}, options); |
4320 | at::Tensor t6 = at::randn({x, y, z}, options); |
4321 | |
4322 | FusionExecutorCache executor_cache(std::move(fusion)); |
4323 | |
4324 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6, w}); |
4325 | auto cg_out = cg_outputs[1]; |
4326 | |
4327 | TORCH_INTERNAL_ASSERT(cg_out.size(0) == w); |
4328 | TORCH_INTERNAL_ASSERT(cg_out.size(1) == x); |
4329 | TORCH_INTERNAL_ASSERT(cg_out.size(2) == y); |
4330 | TORCH_INTERNAL_ASSERT(cg_out.size(3) == z); |
4331 | TORCH_INTERNAL_ASSERT(cg_out.stride(0) == 0); |
4332 | TORCH_INTERNAL_ASSERT(cg_out.stride(1) == 1); |
4333 | TORCH_INTERNAL_ASSERT(cg_out.stride(2) == 0); |
4334 | TORCH_INTERNAL_ASSERT(cg_out.stride(3) == 0); |
4335 | |
4336 | auto t10 = t0.unsqueeze(-1) |
4337 | .expand({x, y}) |
4338 | .add(t3.unsqueeze(-1)) |
4339 | .unsqueeze(-1) |
4340 | .expand_as(t6) |
4341 | .unsqueeze(0) |
4342 | .expand({w, x, y, z}); |
4343 | |
4344 | testValidate( |
4345 | executor_cache.fusion(), |
4346 | cg_outputs, |
4347 | {t0, t3, t6, w}, |
4348 | {t6, t10}, |
4349 | __LINE__, |
4350 | __FILE__); |
4351 | } |
4352 | |
4353 | TEST_F(NVFuserTest, FusionExpandIssue1751_CUDA) { |
4354 | auto fusion = std::make_unique<Fusion>(); |
4355 | FusionGuard fg(fusion.get()); |
4356 | |
4357 | auto x = 3, y = 4, z = 5; |
4358 | |
4359 | // y, z |
4360 | auto tv0 = makeSymbolicTensor(2); |
4361 | fusion->addInput(tv0); |
4362 | |
4363 | auto tv1 = broadcast(tv0, {true, false, false}); |
4364 | |
4365 | // Two ways to propagate extents as is: use -1 or explicitly pass |
4366 | // the extent vals. |
4367 | |
4368 | auto tv2 = expand( |
4369 | tv1, |
4370 | {IrBuilder::create<Int>(x), |
4371 | IrBuilder::create<Int>(-1), |
4372 | IrBuilder::create<Int>(-1)}); |
4373 | |
4374 | auto tv3 = expand( |
4375 | tv1, |
4376 | {IrBuilder::create<Int>(x), |
4377 | tv0->axis(0)->extent(), |
4378 | tv0->axis(1)->extent()}); |
4379 | |
4380 | fusion->addOutput(tv2); |
4381 | fusion->addOutput(tv3); |
4382 | |
4383 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4384 | at::Tensor t0 = at::randn({y, z}, options); |
4385 | |
4386 | FusionExecutorCache executor_cache(std::move(fusion)); |
4387 | |
4388 | auto cg_outputs = executor_cache.runFusionWithInputs({t0}); |
4389 | |
4390 | for (const auto& cg_out : cg_outputs) { |
4391 | TORCH_INTERNAL_ASSERT(cg_out.size(0) == x); |
4392 | TORCH_INTERNAL_ASSERT(cg_out.size(1) == y); |
4393 | TORCH_INTERNAL_ASSERT(cg_out.size(2) == z); |
4394 | } |
4395 | |
4396 | auto t2 = t0.expand({x, y, z}); |
4397 | |
4398 | testValidate( |
4399 | executor_cache.fusion(), cg_outputs, {t0}, {t2, t2}, __LINE__, __FILE__); |
4400 | } |
4401 | |
4402 | // TODO: Make sure the kernel uses the expanded concrete size instead |
4403 | // of the symbolic size |
4404 | TEST_F(NVFuserTest, FusionExpandToConcrete_CUDA) { |
4405 | auto fusion = std::make_unique<Fusion>(); |
4406 | FusionGuard fg(fusion.get()); |
4407 | |
4408 | auto x = 3, y = 4; |
4409 | |
4410 | auto tv0 = makeSymbolicTensor(1); |
4411 | fusion->addInput(tv0); |
4412 | |
4413 | auto tv1 = broadcast(tv0, {true, false}); |
4414 | |
4415 | auto tv2 = |
4416 | expand(tv1, {IrBuilder::create<Int>(x), IrBuilder::create<Int>(y)}); |
4417 | |
4418 | fusion->addOutput(tv2); |
4419 | |
4420 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4421 | at::Tensor t0 = at::randn({y}, options); |
4422 | |
4423 | FusionExecutorCache executor_cache(std::move(fusion)); |
4424 | |
4425 | auto cg_outputs = executor_cache.runFusionWithInputs({t0}); |
4426 | |
4427 | for (const auto& cg_out : cg_outputs) { |
4428 | TORCH_INTERNAL_ASSERT(cg_out.size(0) == x); |
4429 | TORCH_INTERNAL_ASSERT(cg_out.size(1) == y); |
4430 | } |
4431 | |
4432 | auto t2 = t0.expand({x, y}); |
4433 | |
4434 | testValidate( |
4435 | executor_cache.fusion(), cg_outputs, {t0}, {t2}, __LINE__, __FILE__); |
4436 | } |
4437 | |
4438 | TEST_F(NVFuserTest, FusionReproNoncontigBroadcast_CUDA) { |
4439 | auto fusion = std::make_unique<Fusion>(); |
4440 | FusionGuard fg(fusion.get()); |
4441 | |
4442 | auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); |
4443 | at::Tensor t0 = at::randn({4, 32, 16, 112, 112}, options).transpose(-1, -2); |
4444 | at::Tensor t1 = at::randn({32, 1, 112, 1}, options).transpose(-1, -2); |
4445 | |
4446 | auto tv0 = TensorViewBuilder() |
4447 | .ndims(5) |
4448 | .contiguity({true, true, false, false, false}) // ttfff |
4449 | .shape({-1, -1, -1, -1, -1}) |
4450 | .dtype(DataType::Half) |
4451 | .build(); |
4452 | auto tv1 = TensorViewBuilder() |
4453 | .ndims(4) |
4454 | .contiguity({true, false, false, true}) // tfft |
4455 | .shape({-1, 1, 1, -1}) |
4456 | .dtype(DataType::Half) |
4457 | .build(); |
4458 | |
4459 | fusion->addInput(tv0); |
4460 | fusion->addInput(tv1); |
4461 | |
4462 | auto tv2 = add(tv0, tv1); |
4463 | |
4464 | fusion->addOutput(tv2); |
4465 | |
4466 | std::vector<IValue> aten_inputs({t0, t1}); |
4467 | |
4468 | FusionExecutorCache executor_cache(std::move(fusion)); |
4469 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
4470 | |
4471 | auto t2 = t0 + t1; |
4472 | |
4473 | testValidate( |
4474 | executor_cache.fusion(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); |
4475 | } |
4476 | |
4477 | namespace { |
4478 | |
// Check that the resulting siblings are identical.
4480 | void checkSiblingConsistency(TensorView* replay, TensorView* target) { |
4481 | auto replay_root = replay->getRootDomain(); |
4482 | auto replay_dom = replay->domain()->domain(); |
4483 | auto target_root = target->getRootDomain(); |
4484 | auto target_dom = target->domain()->domain(); |
4485 | std::unordered_map<IterDomain*, IterDomain*> target2replay_map; |
4486 | TORCH_CHECK(replay_root.size() == target_root.size()); |
4487 | target2replay_map.reserve(replay_root.size()); |
4488 | std::transform( |
4489 | target_root.begin(), |
4490 | target_root.end(), |
4491 | replay_root.begin(), |
4492 | std::inserter(target2replay_map, target2replay_map.begin()), |
4493 | [](auto a, auto b) { return std::make_pair(a, b); }); |
4494 | BestEffortReplay replay_(replay_dom, target_dom, target2replay_map); |
4495 | auto r = replay_.getReplay(); |
4496 | for (int64_t i = 0; i < (int64_t)replay_dom.size(); i++) { |
4497 | auto target_id = target_dom[i]; |
4498 | auto replay_it = r.find(target_id); |
4499 | TORCH_CHECK(replay_it != r.end()); |
4500 | TORCH_CHECK( |
4501 | replay_it->second == replay_dom[i], |
4502 | "IterDomain mismatch when checking " , |
4503 | replay, |
4504 | " and " , |
4505 | target, |
4506 | " at " , |
4507 | i, |
4508 | ", got " , |
4509 | replay_it->second, |
4510 | " and " , |
4511 | replay_dom[i]); |
4512 | } |
}
4514 | |
4515 | } // namespace |
4516 | |
4517 | TEST_F(NVFuserTest, FusionTransformPropagateSibling_CUDA) { |
4518 | // https://github.com/csarofeen/pytorch/issues/1760 |
4519 | Fusion fusion; |
4520 | FusionGuard fg(&fusion); |
4521 | |
4522 | auto tv0 = makeSymbolicTensor(2); |
4523 | fusion.addInput(tv0); |
4524 | |
4525 | auto tvs = Welford(tv0, {1}); |
4526 | fusion.addOutput(tvs.var_sum); |
4527 | |
4528 | tvs.avg->split(1, 1); |
4529 | tvs.avg->split(1, 2); |
4530 | tvs.avg->split(1, 3); |
4531 | tvs.var_sum->split(1, 1); |
4532 | tvs.var_sum->split(1, 2); |
4533 | tvs.var_sum->split(1, 3); |
4534 | tvs.n->split(1, 1); |
4535 | tvs.n->split(1, 2); |
4536 | tvs.n->split(1, 3); |
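  // All three Welford outputs are split identically so they remain
  // consistent siblings before the rfactor below.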
4537 | |
4538 | auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4}); |
4539 | |
4540 | TransformPropagatorWithCheck propagator(var_sum_rf); |
4541 | MaxRootDomainInfoSpanningTree(var_sum_rf).traverse(&propagator); |
4542 | |
4543 | auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum); |
4544 | |
4545 | std::vector<TensorView*> siblings[] = {{tvs.avg, tvs.var_sum, tvs.n}, rf_tvs}; |
4546 | for (auto tensors : siblings) { |
4547 | for (auto t1 : tensors) { |
4548 | for (auto t2 : tensors) { |
4549 | TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2)); |
4550 | } |
4551 | } |
4552 | } |
4553 | } |
4554 | |
4555 | TEST_F(NVFuserTest, FusionTransformPropagateSelectorSibling_CUDA) { |
4556 | Fusion fusion; |
4557 | FusionGuard fg(&fusion); |
4558 | |
4559 | auto tv0 = makeSymbolicTensor(2); |
4560 | fusion.addInput(tv0); |
4561 | |
4562 | auto tvs = Welford(tv0, {1}); |
4563 | fusion.addOutput(tvs.var_sum); |
4564 | |
4565 | tvs.avg->split(1, 1); |
4566 | tvs.avg->split(1, 2); |
4567 | tvs.avg->split(1, 3); |
4568 | tvs.var_sum->split(1, 1); |
4569 | tvs.var_sum->split(1, 2); |
4570 | tvs.var_sum->split(1, 3); |
4571 | tvs.n->split(1, 1); |
4572 | tvs.n->split(1, 2); |
4573 | tvs.n->split(1, 3); |
4574 | |
4575 | auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4}); |
4576 | |
4577 | struct DisableTv0 : public MaxInfoSpanningTree::Selector { |
4578 | TensorView* tv0; |
4579 | virtual bool allowC2P(TensorView* from, TensorView* to) override { |
4580 | return from != tv0 && to != tv0; |
4581 | }; |
4582 | virtual bool allowP2C(TensorView* from, TensorView* to) override { |
4583 | return from != tv0 && to != tv0; |
4584 | }; |
4585 | virtual bool allowSibling(TensorView* from, TensorView* to) override { |
4586 | return true; |
4587 | } |
4588 | DisableTv0(TensorView* tv0) : tv0(tv0) {} |
4589 | } selector1(tv0); |
4590 | |
4591 | struct DisableTv0AndSibling : public DisableTv0 { |
4592 | virtual bool allowSibling(TensorView* from, TensorView* to) override { |
4593 | return false; |
4594 | } |
4595 | using DisableTv0::DisableTv0; |
4596 | } selector2(tv0); |
4597 | |
4598 | TransformPropagatorWithCheck propagator(var_sum_rf); |
4599 | MaxRootDomainInfoSpanningTree good_path(var_sum_rf, &selector1); |
4600 | MaxRootDomainInfoSpanningTree bad_path(var_sum_rf, &selector2); |
4601 | |
4602 | auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum); |
4603 | |
4604 | auto check = [&]() { |
4605 | std::vector<TensorView*> siblings[] = { |
4606 | {tvs.avg, tvs.var_sum, tvs.n}, rf_tvs}; |
4607 | for (auto tensors : siblings) { |
4608 | for (auto t1 : tensors) { |
4609 | for (auto t2 : tensors) { |
4610 | TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2)); |
4611 | } |
4612 | } |
4613 | } |
4614 | }; |
4615 | |
4616 | bad_path.traverse(&propagator); |
4617 | ASSERT_ANY_THROW(check()); |
4618 | good_path.traverse(&propagator); |
4619 | check(); |
4620 | } |
4621 | |
4622 | TEST_F(NVFuserTest, FusionTransformPropagatePosition_CUDA) { |
4623 | Fusion fusion; |
4624 | FusionGuard fg(&fusion); |
4625 | |
4626 | auto tv0 = makeSymbolicTensor(4); |
4627 | auto tv1 = makeSymbolicTensor(6); |
4628 | fusion.addInput(tv0); |
4629 | |
4630 | auto tv2 = broadcast(tv0, {false, false, true, false, false, true}); |
4631 | auto tv3 = add(tv1, tv2); |
4632 | fusion.addOutput(tv3); |
4633 | |
4634 | tv0->merge(2); |
4635 | tv0->merge(0); |
4636 | TransformPropagatorWithCheck propagator(tv0); |
4637 | MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator); |
4638 | |
4639 | TORCH_CHECK(tv1->nDims() == 4); |
4640 | } |
4641 | |
4642 | TEST_F(NVFuserTest, FusionIgnoreZeroDimReduction_CUDA) { |
4643 | auto fusion = std::make_unique<Fusion>(); |
4644 | FusionGuard fg(fusion.get()); |
4645 | |
4646 | auto tv0 = makeSymbolicTensor(1); |
4647 | fusion->addInput(tv0); |
4648 | auto tv1 = sum(tv0, {0}); |
4649 | // tv1 is effectively a zero-dim tensor as it only has a reduction |
4650 | // axis. |
4651 | // Reducing it further is converted to just a set op. |
4652 | auto tv2 = sum(tv1, {0}); |
4653 | fusion->addOutput(tv2); |
4654 | |
4655 | auto tv2_def = dynamic_cast<UnaryOp*>(tv2->definition()); |
4656 | TORCH_CHECK( |
4657 | tv2_def != nullptr, |
4658 | "Expected UnaryOp but found " , |
4659 | tv2->definition()->toString()); |
4660 | |
4661 | TORCH_CHECK( |
4662 | tv2_def->getUnaryOpType() == UnaryOpType::Set, |
4663 | "Expected UnaryOpType::Set but found " , |
4664 | tv2_def->getUnaryOpType()); |
4665 | |
4666 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4667 | auto t0 = at::randn({12345}, options); |
4668 | std::vector<IValue> aten_inputs({t0}); |
4669 | |
4670 | FusionExecutorCache executor_cache(std::move(fusion)); |
4671 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
4672 | |
4673 | auto ref = sum(t0, {0}); |
4674 | |
4675 | testValidate( |
4676 | executor_cache.fusion(), |
4677 | cg_outputs, |
4678 | aten_inputs, |
4679 | {ref}, |
4680 | __LINE__, |
4681 | __FILE__); |
4682 | } |
4683 | |
4684 | // Repro of issue #1770 |
4685 | TEST_F(NVFuserTest, FusionIssue1770Repro_CUDA) { |
4686 | auto fusion = std::make_unique<Fusion>(); |
4687 | FusionGuard fg(fusion.get()); |
4688 | |
4689 | auto tv0 = makeSymbolicTensor(1); |
4690 | fusion->addInput(tv0); |
4691 | auto tv1 = makeSymbolicTensor(1); |
4692 | fusion->addInput(tv1); |
4693 | |
4694 | auto tv2 = ge(tv0, tv1); |
4695 | auto tv3 = |
4696 | where(tv2, IrBuilder::create<Double>(1), IrBuilder::create<Double>(2)); |
4697 | fusion->addOutput(tv3); |
4698 | |
4699 | std::vector<int64_t> shape({999}); |
4700 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4701 | at::Tensor t0 = at::randn(shape, options); |
4702 | at::Tensor t1 = at::randn(shape, options); |
4703 | std::vector<IValue> aten_inputs({t0, t1}); |
4704 | |
4705 | FusionExecutorCache executor_cache(std::move(fusion)); |
4706 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
4707 | |
4708 | auto ref = where(t0 >= t1, 1.0, 2.0); |
4709 | |
4710 | testValidate( |
4711 | executor_cache.fusion(), |
4712 | cg_outputs, |
4713 | aten_inputs, |
4714 | {ref}, |
4715 | __LINE__, |
4716 | __FILE__); |
4717 | } |
4718 | |
4719 | TEST_F(NVFuserTest, FusionTransformPropagatorSelector_CUDA) { |
4720 | auto fusion = std::make_unique<Fusion>(); |
4721 | FusionGuard fg(fusion.get()); |
4722 | |
4723 | auto tv0 = makeSymbolicTensor(1); |
4724 | fusion->addInput(tv0); |
4725 | auto tv1 = makeSymbolicTensor(1); |
4726 | fusion->addInput(tv1); |
4727 | |
4728 | auto tv2 = add(tv0, tv1); |
4729 | |
4730 | auto tv3 = sin(tv2); |
4731 | auto tv4 = cos(tv2); |
4732 | |
4733 | fusion->addOutput(tv3); |
4734 | fusion->addOutput(tv4); |
4735 | |
4736 | tv2->split(0, 10); |
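  // tv2: [ceilDiv(I0, 10), 10]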
4737 | |
4738 | struct Selector : public MaxInfoSpanningTree::Selector { |
4739 | TensorView* tv0; |
4740 | TensorView* tv3; |
4741 | virtual bool allowC2P(TensorView* from, TensorView* to) override { |
4742 | return to == tv0; |
4743 | } |
4744 | virtual bool allowP2C(TensorView* from, TensorView* to) override { |
4745 | return to == tv3; |
4746 | } |
4747 | virtual bool allowSibling(TensorView* from, TensorView* to) override { |
4748 | return false; |
4749 | } |
4750 | Selector(TensorView* tv0, TensorView* tv3) : tv0(tv0), tv3(tv3) {} |
4751 | } selector(tv0, tv3); |
4752 | |
4753 | TransformPropagatorWithCheck propagator(tv2); |
4754 | MaxRootDomainInfoSpanningTree(tv2, &selector).traverse(&propagator); |
4755 | |
4756 | TORCH_CHECK(tv0->nDims() == 2); |
4757 | TORCH_CHECK(tv1->nDims() == 1); |
4758 | TORCH_CHECK(tv2->nDims() == 2); |
4759 | TORCH_CHECK(tv3->nDims() == 2); |
4760 | TORCH_CHECK(tv4->nDims() == 1); |
4761 | } |
4762 | |
4763 | TEST_F(NVFuserTest, FusionTransformPropagatorPos_CUDA) { |
4764 | auto fusion = std::make_unique<Fusion>(); |
4765 | FusionGuard fg(fusion.get()); |
4766 | |
4767 | auto tv0 = makeConcreteTensor({22, 105}); |
4768 | fusion->addInput(tv0); |
4769 | |
4770 | auto tv1 = sin(tv0); |
4771 | fusion->addOutput(tv1); |
4772 | |
4773 | tv1->split(0, 2); |
4774 | tv1->split(-1, 3); |
4775 | tv1->split(-1, 5); |
4776 | |
4777 | TransformPropagatorWithCheck propagator(tv1, 2); |
4778 | MaxRootDomainInfoSpanningTree(tv1, 2).traverse(&propagator); |
4779 | |
4780 | auto expect = makeConcreteTensor({22, 105}); |
4781 | expect->split(0, 2); |
4782 | TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv0)); |
4783 | } |
4784 | |
4785 | TEST_F(NVFuserTest, FusionMaxRootDomainInfoSpanningTreePrintTwice_CUDA) { |
4786 | auto fusion = std::make_unique<Fusion>(); |
4787 | FusionGuard fg(fusion.get()); |
4788 | |
4789 | auto tv0 = makeSymbolicTensor(3); |
4790 | fusion->addInput(tv0); |
4791 | |
4792 | auto tv1 = sum(tv0, {0}); |
4793 | auto tv2 = neg(tv1); |
4794 | |
4795 | fusion->addOutput(tv2); |
4796 | |
4797 | tv1->split(0, 10); |
4798 | |
4799 | struct Printer : public MaxInfoSpanningTree::Propagator { |
4800 | std::stringstream ss; |
4801 | virtual void propagateC2P(TensorView* from, TensorView* to) override { |
4802 | ss << "propagateC2P" << std::endl; |
4803 | ss << "from: " << from->name() << std::endl; |
4804 | ss << "to: " << to->name() << std::endl; |
4805 | } |
4806 | virtual void propagateP2C(TensorView* from, TensorView* to) override { |
4807 | ss << "propagateP2C" << std::endl; |
4808 | ss << "from: " << from->name() << std::endl; |
4809 | ss << "to: " << to->name() << std::endl; |
4810 | } |
4811 | virtual void propagateSibling(TensorView* from, TensorView* to) override { |
4812 | ss << "propagateSibling" << std::endl; |
4813 | ss << "from: " << from->name() << std::endl; |
4814 | ss << "to: " << to->name() << std::endl; |
4815 | } |
4816 | } printer1, printer2; |
4817 | printer1.ss << std::endl; |
4818 | printer2.ss << std::endl; |
4819 | |
4820 | MaxRootDomainInfoSpanningTree path(tv1); |
4821 | path.traverse(&printer1); |
4822 | path.traverse(&printer2); |
4823 | |
4824 | auto expect = R"ESCAPE( |
4825 | propagateC2P |
4826 | from: 1 |
4827 | to: 0 |
4828 | propagateP2C |
4829 | from: 1 |
4830 | to: 2 |
)ESCAPE";
4832 | TORCH_CHECK(printer1.ss.str() == expect); |
4833 | TORCH_CHECK(printer2.ss.str() == expect); |
4834 | } |
4835 | |
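// Propagating backward from tv2 and then forward from tv0 must not
// overwrite each other: tv1 keeps the transforms replayed from tv2 and its
// broadcast axes remain broadcast.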
4836 | TEST_F(NVFuserTest, FusionTransformPropagatorNoOverwrite_CUDA) { |
4837 | auto fusion = std::make_unique<Fusion>(); |
4838 | FusionGuard fg(fusion.get()); |
4839 | |
4840 | auto tv0 = makeSymbolicTensor(1); |
4841 | fusion->addInput(tv0); |
4842 | auto tv1 = broadcast(tv0, {true, false, true}); |
4843 | auto tv2 = sin(tv1); |
4844 | fusion->addOutput(tv2); |
4845 | |
4846 | tv0->split(0, 2); |
4847 | tv2->split(1, 2); |
4848 | tv2->split(0, 4); |
4849 | |
4850 | MaxRootDomainInfoSpanningTree path1(tv2); |
4851 | TransformPropagatorWithCheck propagator1(tv2); |
4852 | path1.traverse(&propagator1); |
4853 | |
4854 | MaxRootDomainInfoSpanningTree path2(tv0); |
4855 | TransformPropagatorWithCheck propagator2(tv0); |
4856 | path2.traverse(&propagator2); |
4857 | |
4858 | TORCH_CHECK(tv1->axis(0)->isBroadcast()); |
4859 | TORCH_CHECK(tv1->axis(1)->isBroadcast()); |
4860 | TORCH_CHECK(!tv1->axis(2)->isBroadcast()); |
4861 | TORCH_CHECK(!tv1->axis(3)->isBroadcast()); |
4862 | TORCH_CHECK(tv1->axis(4)->isBroadcast()); |
4863 | |
4864 | auto expect = makeSymbolicTensor(3); |
4865 | expect->split(1, 2); |
4866 | expect->split(0, 4); |
4867 | TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv1)); |
4868 | } |
4869 | |
4870 | TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) { |
4871 | Fusion fusion; |
4872 | FusionGuard fg(&fusion); |
4873 | |
4874 | // Set up your input tensor views |
4875 | TensorView* tv0 = makeContigTensor(1); |
4876 | TensorView* tv1 = makeContigTensor(2); |
4877 | |
4878 | // Register your inputs |
4879 | fusion.addInput(tv0); |
4880 | fusion.addInput(tv1); |
4881 | |
4882 | auto tv2 = set(tv0); |
4883 | // [B, I] |
4884 | auto tv3 = broadcast(tv2, {true, false}); |
4885 | auto tv4 = add(tv3, tv1); |
4886 | auto tv5 = set(tv4); |
4887 | |
4888 | // Register your outputs |
4889 | fusion.addOutput(tv5); |
4890 | |
4891 | tv5->split(0, 8); |
4892 | tv5->split(-1, 8); |
4893 | |
// [Serial, TIDy, TIDx, Serial]
4895 | |
4896 | tv4->computeAt(tv5, -2); |
4897 | tv3->computeAt(tv4, -1); |
4898 | tv2->computeAt(tv3, 0); |
4899 | tv2->split(0, 8); |
4900 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
4901 | tv1->computeAt(tv5, -2); |
4902 | |
4903 | tv5->axis(1)->parallelize(ParallelType::TIDy); |
4904 | tv5->axis(2)->parallelize(ParallelType::TIDx); |
4905 | |
4906 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4907 | |
4908 | at::Tensor in1 = at::randn({16}, options); |
4909 | at::Tensor in2 = at::randn({12, 16}, options); |
4910 | |
4911 | FusionExecutor fe; |
4912 | fe.compileFusion(&fusion, {in1, in2}); |
4913 | auto cg_outputs = fe.runFusion({in1, in2}); |
4914 | |
4915 | auto tv_ref = in1 + in2; |
4916 | |
4917 | testValidate(&fusion, cg_outputs, {in1, in2}, {tv_ref}, __LINE__, __FILE__); |
4918 | } |
4919 | |
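// Transform propagation should be able to skip domains it cannot replay:
// across a broadcast producer (first case) and across reduction domains
// (second case), traversal must complete without throwing.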
4920 | TEST_F(NVFuserTest, FusionSkipReplay_CUDA) { |
4921 | { |
4922 | Fusion fusion; |
4923 | FusionGuard fg(&fusion); |
4924 | |
4925 | TensorView* tv0 = makeContigTensor(1); |
4926 | TensorView* tv1 = makeContigTensor(2); |
4927 | fusion.addInput(tv0); |
4928 | fusion.addInput(tv1); |
4929 | |
4930 | auto tv2 = broadcast(tv0, {false, true}); |
4931 | auto tv3 = add(tv2, tv1); |
4932 | fusion.addOutput(tv3); |
4933 | |
4934 | tv3->split(1, 2, false); |
4935 | |
4936 | TransformPropagatorWithCheck propagator(tv3); |
4937 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
4938 | } |
4939 | |
4940 | { |
4941 | Fusion fusion; |
4942 | FusionGuard fg(&fusion); |
4943 | |
4944 | TensorView* tv0 = makeContigTensor(3); |
4945 | fusion.addInput(tv0); |
4946 | |
4947 | auto tv1 = sum(tv0, {0, 2}); |
4948 | auto tv2 = sin(tv1); |
4949 | fusion.addOutput(tv2); |
4950 | |
4951 | tv0->split(1, 2, false); |
4952 | |
4953 | TransformPropagatorWithCheck propagator(tv0); |
4954 | MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator); |
4955 | } |
4956 | } |
4957 | |
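// Repro for issue #1803: after best-effort inlining, the three Welford
// outputs (avg, var_sum, n) must all end up at the same computeAt position.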
4958 | TEST_F(NVFuserTest, FusionInlineRepro1803_CUDA) { |
4959 | Fusion fusion; |
4960 | FusionGuard fg(&fusion); |
4961 | |
4962 | TensorView* tv0 = makeContigTensor(2); |
4963 | |
4964 | fusion.addInput(tv0); |
4965 | auto tv1 = set(tv0); |
4966 | auto tvs = Welford(tv1, {1}); |
4967 | auto tvo = set(tvs.var_sum); |
4968 | fusion.addOutput(tvo); |
4969 | |
4970 | tvo->split(0, 16); |
4971 | tvo->axis(1)->parallelize(ParallelType::Unroll); |
4972 | |
4973 | tv0->computeAt(tvo, -1, ComputeAtMode::BestEffort); |
4974 | |
4975 | TORCH_CHECK( |
4976 | tvs.var_sum->getComputeAtPosition() == tvs.avg->getComputeAtPosition()); |
4977 | TORCH_CHECK( |
4978 | tvs.var_sum->getComputeAtPosition() == tvs.n->getComputeAtPosition()); |
4979 | TORCH_CHECK(tvs.var_sum->getComputeAtPosition() == 1); |
4980 | } |
4981 | |
4982 | // Unit test for the transform selection logic |
4983 | TEST_F(NVFuserTest, FusionBoundedDirectionSelection1_CUDA) { |
4984 | Fusion fusion; |
4985 | FusionGuard fg(&fusion); |
4986 | |
4987 | TensorView* tv0 = makeContigTensor(2); |
4988 | |
4989 | fusion.addInput(tv0); |
4990 | auto tv1 = set(tv0); |
4991 | auto tv2 = set(tv1); |
4992 | auto tv3 = add(tv2, tv1); |
4993 | fusion.addOutput(tv3); |
4994 | |
4995 | tv3->split(-1, 5); |
4996 | tv3->split(-1, 8); |
4997 | |
4998 | scheduler_utils::BoundedDirectionalTransformPropagator::backward( |
4999 | tv3, -1, {tv0, tv2}); |
5000 | |
5001 | // Check that the splits are replayed on tv2 |
5002 | TORCH_INTERNAL_ASSERT( |
5003 | tv2->nDims() == tv3->nDims(), |
5004 | "Propagator didn't propagate to tv2: " , |
5005 | tv2->toString()); |
5006 | |
5007 | // Check that the splits are replayed on tv1 as well. Even though |
5008 | // one of its consumers, tv2, is part of the boundary, another |
5009 | // consumer is not a boundary, so tv1 should be transformed as well. |
5010 | TORCH_INTERNAL_ASSERT( |
5011 | tv1->nDims() == tv3->nDims(), |
5012 | "Propagator didn't propagate to tv1: " , |
5013 | tv1->toString()); |
5014 | } |
5015 | |
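// Repro for issue #1844: dropout_backward followed by gelu_backward and
// sum_to, validated against the equivalent ATen ops.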
5016 | TEST_F(NVFuserTest, FusionIssueRepro1844_CUDA) { |
5017 | auto fusion = std::make_unique<Fusion>(); |
5018 | FusionGuard fg(fusion.get()); |
5019 | |
5020 | std::vector<int64_t> shape = {2, 1, 768}; |
5021 | std::vector<int64_t> sum_to_shape = {768}; |
5022 | std::vector<int64_t> sum_to_axes = {0, 1}; |
5023 | double kProb = 0.5; |
5024 | |
5025 | std::vector<Int*> sum_to_symb; |
5026 | std::transform( |
5027 | sum_to_shape.begin(), |
5028 | sum_to_shape.end(), |
5029 | std::back_inserter(sum_to_symb), |
5030 | [](int s) -> Int* { return IrBuilder::create<Int>(s); }); |
5031 | |
5032 | TensorView* tv0 = makeContigConcreteTensor(shape); |
5033 | TensorView* tv1 = makeContigConcreteTensor(shape); |
5034 | TensorView* tv2 = makeContigConcreteTensor(shape, DataType::Bool); |
5035 | |
5036 | fusion->addInput(tv0); |
5037 | fusion->addInput(tv1); |
5038 | fusion->addInput(tv2); |
5039 | |
5040 | Double* prob = IrBuilder::create<Double>(kProb); |
5041 | auto grad_input = dropout_backward(tv1, tv2, prob); |
5042 | auto grad_gelu = gelu_backward(grad_input, tv0); |
5043 | auto grad_bias = sum_to(grad_gelu, sum_to_symb); |
5044 | |
5045 | fusion->addOutput(grad_gelu); |
5046 | fusion->addOutput(grad_bias); |
5047 | |
5048 | const auto options = |
5049 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5050 | const auto mask_options = |
5051 | at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0); |
5052 | at::manual_seed(0); |
5053 | |
5054 | at::Tensor a = at::randn(shape, options); |
5055 | at::Tensor b = at::randn(shape, options); |
5056 | at::Tensor c = at::randn(shape, options); |
5057 | auto mask = at::gt(c, 0.0f); |
5058 | std::vector<IValue> aten_inputs = {a, b, mask}; |
5059 | |
5060 | FusionExecutorCache executor_cache(std::move(fusion)); |
5061 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
5062 | |
5063 | auto dinput = at::native_dropout_backward(b, mask, kProb); |
auto dgelu = at::gelu_backward(dinput, a, "none");
5065 | auto dbias = dgelu.sum(sum_to_axes); |
5066 | |
5067 | testValidate( |
5068 | executor_cache.fusion(), |
5069 | cg_outputs, |
5070 | aten_inputs, |
5071 | {dgelu, dbias}, |
5072 | __LINE__, |
5073 | __FILE__); |
5074 | } |
5075 | |
5076 | TEST_F(NVFuserTest, FusionInsertMagicZero1_CUDA) { |
5077 | Fusion fusion; |
5078 | FusionGuard fg(&fusion); |
5079 | |
5080 | auto tv0 = makeSymbolicTensor(2); |
5081 | fusion.addInput(tv0); |
5082 | |
5083 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
5084 | auto tv2 = set(tv1); |
5085 | fusion.addOutput(tv2); |
5086 | |
5087 | tv2->split(0, 32); |
5088 | tv2->split(-1, 2); |
5089 | tv2->reorder({{1, 2}, {2, 1}}); |
5090 | tv2->merge(0); |
5091 | |
5092 | TransformPropagatorWithCheck propagator(tv2); |
5093 | MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); |
5094 | |
5095 | tv0->computeAt(tv2, 1); |
5096 | |
5097 | // The predicate of tv2 should be protected with magic zero |
5098 | GpuLower gpulw(&fusion); |
5099 | TORCH_CHECK( |
5100 | PredicateMagicZeroChecker::isProtected(tv2, gpulw), |
5101 | "Failed to protect the predicates of " , |
5102 | tv2->toString()); |
5103 | } |
5104 | |
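// Repro for issue #1860: an input TensorView built directly with expanded
// broadcast IterDomains, consumed after a softmax, should compile and run.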
5105 | TEST_F(NVFuserTest, FusionRepro1860_CUDA) { |
5106 | auto fusion_ptr = std::make_unique<Fusion>(); |
5107 | Fusion& fusion = *fusion_ptr; |
5108 | FusionGuard fg(&fusion); |
5109 | std::vector<bool> contiguity{true, false, false}; |
5110 | |
5111 | std::vector<int64_t> shape{1, -1, -1}; |
5112 | TensorView* tv0 = makeContigConcreteTensor(shape); |
5113 | fusion.addInput(tv0); |
5114 | TensorView* tv1 = makeContigConcreteTensor(shape); |
5115 | fusion.addInput(tv1); |
5116 | TensorView* tv2 = makeContigConcreteTensor(shape); |
5117 | fusion.addInput(tv2); |
5118 | |
5119 | std::vector<IterDomain*> domain1(3, nullptr); |
5120 | for (const auto i : c10::irange(3)) { |
5121 | if (i == 0) { |
5122 | domain1[i] = |
5123 | IterDomainBuilder( |
5124 | FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1)) |
5125 | .iter_type(IterType::Broadcast) |
5126 | .build(); |
5127 | } else { |
5128 | domain1[i] = |
5129 | IterDomainBuilder( |
5130 | FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1)) |
5131 | .expanded_extent(IrBuilder::create<Int>(1 + i)) |
5132 | .iter_type(IterType::Broadcast) |
5133 | .build(); |
5134 | } |
5135 | } |
5136 | |
5137 | TensorView* tv22 = IrBuilder::create<TensorView>( |
5138 | IrBuilder::create<TensorDomain>(domain1, contiguity), DataType::Float); |
5139 | |
5140 | fusion.addInput(tv22); |
5141 | |
5142 | auto tv3 = add(tv0, tv1); |
5143 | auto tv4 = softmax(tv3, 0); |
5144 | auto tv5 = add(tv4, tv22); |
5145 | fusion.addOutput(tv5); |
5146 | |
5147 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5148 | |
5149 | at::Tensor input1 = at::randn({1, 2, 3}, options); |
5150 | at::Tensor input2 = at::randn({1, 2, 3}, options); |
5151 | at::Tensor input3 = at::randn({1, 2, 3}, options); |
5152 | at::Tensor input4 = at::randn({1, 1, 1}, options).expand({1, 2, 3}); |
5153 | std::vector<IValue> aten_inputs = {input1, input2, input3, input4}; |
5154 | |
5155 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
5156 | auto outputs = executor_cache.runFusionWithInputs(aten_inputs); |
5157 | } |
5158 | |
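// Reducing over an expanded broadcast domain should match the equivalent
// ATen expand + sum.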
5159 | TEST_F(NVFuserTest, FusionExpandReduce_CUDA) { |
5160 | auto fusion = std::make_unique<Fusion>(); |
5161 | FusionGuard fg(fusion.get()); |
5162 | |
5163 | auto tv0 = makeConcreteTensor({1, 8}); |
5164 | fusion->addInput(tv0); |
5165 | |
5166 | auto tv1 = |
5167 | expand(tv0, {IrBuilder::create<Int>(12), IrBuilder::create<Int>(8)}); |
5168 | |
5169 | auto tv2 = sum(tv1, {0}); |
5170 | fusion->addOutput(tv2); |
5171 | |
5172 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5173 | at::manual_seed(0); |
5174 | auto t0 = at::randn({1, 8}, options); |
5175 | |
5176 | FusionExecutorCache executor_cache(std::move(fusion)); |
5177 | auto cg_outputs = executor_cache.runFusionWithInputs({t0}); |
5178 | |
5179 | auto ref = t0.expand({12, 8}).sum({0}); |
5180 | |
5181 | testValidate( |
5182 | executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
5183 | } |
5184 | |
5185 | // Predicate elimination issue repro: |
5186 | TEST_F(NVFuserTest, FusionExpandReduce2_CUDA) { |
5187 | auto fusion = std::make_unique<Fusion>(); |
5188 | FusionGuard fg(fusion.get()); |
5189 | |
5190 | auto tv0 = makeConcreteTensor({1, 4}); |
5191 | fusion->addInput(tv0); |
5192 | |
5193 | auto tv1 = |
5194 | expand(tv0, {IrBuilder::create<Int>(3), IrBuilder::create<Int>(4)}); |
5195 | |
5196 | auto tv2 = sum(tv1, {0}); |
5197 | fusion->addOutput(tv2); |
5198 | |
5199 | // tv2[r{3}, i{4}] |
5200 | tv2->split(0, NamedScalar::getParallelDim(ParallelType::TIDy)); |
5201 | tv2->axis(1)->parallelize(ParallelType::TIDy); |
5202 | tv2->split(0, NamedScalar::getParallelDim(ParallelType::BIDy), false); |
5203 | tv2->axis(0)->parallelize(ParallelType::BIDy); |
5204 | tv2->split(-1, NamedScalar::getParallelDim(ParallelType::TIDx)); |
5205 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
5206 | tv2->axis(-2)->parallelize(ParallelType::BIDx); |
5207 | // [rBIDy, rO, rTIDy, iBIDx, iTIDx] |
5208 | tv2->reorder({{-2, 0}, {-1, 1}, {2, 2}}); |
5209 | // [iBIDx, iTIDx, rTIDy, rBIDy, rO] |
5210 | auto tv3 = tv2->rFactor({-1}); |
5211 | |
5212 | TransformPropagatorWithCheck propagator(tv3); |
5213 | MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); |
5214 | scheduler_utils::parallelizeAllLike(tv3); |
5215 | tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); |
5216 | |
5217 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5218 | at::manual_seed(0); |
5219 | auto t0 = at::randn({1, 4}, options); |
5220 | |
5221 | FusionExecutor fe; |
5222 | fe.compileFusion(fusion.get(), {t0}, LaunchParams(-1, 2, -1, 4, 2, 1)); |
5223 | auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, 2, -1, 4, 2, 1)); |
5224 | |
5225 | auto ref = t0.expand({3, 4}).sum({0}); |
5226 | |
5227 | testValidate( |
5228 | fusion.get(), |
5229 | cg_outputs, |
5230 | {t0}, |
5231 | {ref}, |
5232 | __LINE__, |
5233 | __FILE__, |
5234 | "" , |
5235 | LaunchParams(-1, 2, -1, 4, 2, 1)); |
5236 | } |
5237 | |
5238 | TEST_F(NVFuserTest, FusionExpandBadShapeTest_CUDA) { |
5239 | auto fusion_ptr = std::make_unique<Fusion>(); |
5240 | Fusion& fusion = *fusion_ptr; |
5241 | FusionGuard fg(&fusion); |
5242 | std::vector<bool> contiguity{false, false}; |
5243 | |
5244 | auto tv0 = makeSymbolicTensor(2); |
5245 | fusion.addInput(tv0); |
5246 | |
5247 | std::vector<IterDomain*> domains = { |
5248 | IterDomainBuilder( |
5249 | FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>()) |
5250 | .build(), |
5251 | IterDomainBuilder( |
5252 | FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1)) |
5253 | .expanded_extent(IrBuilder::create<Int>(10)) |
5254 | .iter_type(IterType::Broadcast) |
5255 | .build()}; |
5256 | |
5257 | // expand to 10 |
5258 | TensorView* tv22 = IrBuilder::create<TensorView>( |
5259 | IrBuilder::create<TensorDomain>(domains, contiguity), DataType::Float); |
5260 | |
5261 | fusion.addInput(tv22); |
5262 | |
5263 | auto tv3 = add(tv0, tv22); |
5264 | fusion.addOutput(tv3); |
5265 | |
5266 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5267 | |
5268 | // Incompatible shapes |
5269 | at::Tensor input1 = at::randn({2, 3}, options); |
5270 | // Passing expand size of 5, not 10. Should cause an error |
5271 | at::Tensor input4 = at::randn({2, 1}, options).expand({2, 5}); |
5272 | |
5273 | std::vector<IValue> aten_inputs = {input1, input4}; |
5274 | |
5275 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
5276 | ASSERT_ANY_THROW(executor_cache.runFusionWithInputs(aten_inputs)); |
5277 | } |
5278 | |
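// The pointwise scheduler should handle a fusion that contains both
// broadcast domains and a trivial reduction over one of them.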
5279 | TEST_F( |
5280 | NVFuserTest, |
5281 | FusionPointwiseScheduleWithBroadcastAndTrivialReduction_CUDA) { |
5282 | Fusion fusion; |
5283 | FusionGuard fg(&fusion); |
5284 | |
5285 | auto tv0 = makeContigTensor(3); |
5286 | auto tv1 = makeContigTensor(2); |
5287 | fusion.addInput(tv0); |
5288 | fusion.addInput(tv1); |
5289 | auto tv2 = broadcast(tv0, {false, true, false, true, false, true}); |
5290 | auto tv3 = sin(tv2); |
5291 | auto tv4 = add(tv3, tv1); |
5292 | auto tv5 = sum(tv4, {1}); |
5293 | fusion.addOutput(tv5); |
5294 | |
5295 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5296 | at::Tensor t0 = at::randn({100, 100, 10}, options); |
5297 | at::Tensor t1 = at::randn({10, 20}, options); |
5298 | |
5299 | auto aten_output = (t0.view({100, 1, 100, 1, 10, 1}).sin() + t1).squeeze(1); |
5300 | |
5301 | std::vector<IValue> aten_inputs = {t0, t1}; |
5302 | |
5303 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
5304 | |
5305 | FusionExecutor fe; |
5306 | fe.compileFusion(&fusion, aten_inputs, lparams); |
5307 | auto cg_outputs = fe.runFusion(aten_inputs, lparams); |
5308 | |
5309 | testValidate( |
5310 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5311 | } |
5312 | |
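// inlineMost should inline as far as the leaf domains allow: tv2 can only
// be inlined at position 1 because its consumer tv3 is a transpose.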
5313 | TEST_F(NVFuserTest, FusionInliningMismatchedDims1_CUDA) { |
5314 | Fusion fusion; |
5315 | FusionGuard fg(&fusion); |
5316 | |
5317 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5318 | fusion.addInput(tv0); |
5319 | auto tv1 = sin(tv0); |
5320 | auto tv2 = cos(tv1); |
5321 | auto tv3 = transpose(tv2, 1, 2); |
5322 | auto tv4 = exp(tv3); |
5323 | auto tv5 = tan(tv4); |
5324 | fusion.addOutput(tv5); |
5325 | |
5326 | inlineMost(); |
5327 | |
5328 | TORCH_CHECK(tv5->getComputeAtPosition() == 3); |
5329 | TORCH_CHECK(tv4->getComputeAtPosition() == 3); |
5330 | TORCH_CHECK(tv3->getComputeAtPosition() == 3); |
5331 | TORCH_CHECK(tv2->getComputeAtPosition() == 1); |
5332 | TORCH_CHECK(tv1->getComputeAtPosition() == 3); |
5333 | |
5334 | const auto options = |
5335 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5336 | at::Tensor input = at::randn({2, 3, 4}, options); |
5337 | auto output = input.sin().cos().transpose(1, 2).exp().tan(); |
5338 | |
5339 | FusionExecutor fe; |
5340 | fe.compileFusion(&fusion, {input}); |
5341 | auto cg_outputs = fe.runFusion({input}); |
5342 | |
5343 | testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__); |
5344 | } |
5345 | |
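// Same fusion as above, but inlined with inlineAllAt(tv5, -1, true):
// unlike inlineMost, tv1 also ends up at position 1.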
5346 | TEST_F(NVFuserTest, FusionInliningMismatchedDims2_CUDA) { |
5347 | Fusion fusion; |
5348 | FusionGuard fg(&fusion); |
5349 | |
5350 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5351 | fusion.addInput(tv0); |
5352 | auto tv1 = sin(tv0); |
5353 | auto tv2 = cos(tv1); |
5354 | auto tv3 = transpose(tv2, 1, 2); |
5355 | auto tv4 = exp(tv3); |
5356 | auto tv5 = tan(tv4); |
5357 | fusion.addOutput(tv5); |
5358 | |
5359 | inlineAllAt(tv5, -1, true); |
5360 | |
5361 | TORCH_CHECK(tv5->getComputeAtPosition() == 3); |
5362 | TORCH_CHECK(tv4->getComputeAtPosition() == 3); |
5363 | TORCH_CHECK(tv3->getComputeAtPosition() == 3); |
5364 | TORCH_CHECK(tv2->getComputeAtPosition() == 1); |
5365 | TORCH_CHECK(tv1->getComputeAtPosition() == 1); |
5366 | |
5367 | const auto options = |
5368 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5369 | at::Tensor input = at::randn({2, 3, 4}, options); |
5370 | auto output = input.sin().cos().transpose(1, 2).exp().tan(); |
5371 | |
5372 | FusionExecutor fe; |
5373 | fe.compileFusion(&fusion, {input}); |
5374 | auto cg_outputs = fe.runFusion({input}); |
5375 | |
5376 | testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__); |
5377 | } |
5378 | |
5379 | TEST_F(NVFuserTest, FusionInliningMismatchedDims3_CUDA) { |
5380 | Fusion fusion; |
5381 | FusionGuard fg(&fusion); |
5382 | |
5383 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5384 | fusion.addInput(tv0); |
5385 | auto tv1 = sin(tv0); |
5386 | // broadcasting |
5387 | auto tv2 = broadcast(tv1, {false, true, false, true, false, true}); |
5388 | auto tv3 = relu(tv2); |
5389 | // trivial reduction |
5390 | auto tv4 = sum(tv3, {1, 3, 5}); |
5391 | auto tv5 = cos(tv4); |
5392 | auto tv6 = transpose(tv5, 1, 2); |
5393 | auto tv7 = exp(tv6); |
5394 | auto tv8 = tan(tv7); |
5395 | fusion.addOutput(tv8); |
5396 | |
5397 | for (auto tv : {tv2, tv3, tv4}) { |
5398 | tv->merge(0); |
5399 | tv->merge(1); |
5400 | tv->merge(2); |
5401 | } |
5402 | |
5403 | inlineMost(); |
5404 | |
5405 | TORCH_CHECK(tv8->getComputeAtPosition() == 3); |
5406 | TORCH_CHECK(tv7->getComputeAtPosition() == 3); |
5407 | TORCH_CHECK(tv6->getComputeAtPosition() == 3); |
5408 | TORCH_CHECK(tv5->getComputeAtPosition() == 1); |
5409 | TORCH_CHECK(tv4->getComputeAtPosition() == 3); |
5410 | TORCH_CHECK(tv3->getComputeAtPosition() == 3); |
5411 | TORCH_CHECK(tv2->getComputeAtPosition() == 3); |
5412 | TORCH_CHECK(tv1->getComputeAtPosition() == 3); |
5413 | |
5414 | const auto options = |
5415 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5416 | at::Tensor input = at::randn({2, 3, 4}, options); |
5417 | auto output = input.sin().relu().cos().transpose(1, 2).exp().tan(); |
5418 | |
5419 | FusionExecutor fe; |
5420 | fe.compileFusion(&fusion, {input}); |
5421 | auto cg_outputs = fe.runFusion({input}); |
5422 | |
5423 | testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__); |
5424 | } |
5425 | |
5426 | TEST_F(NVFuserTest, FusionInliningMismatchedDims4_CUDA) { |
5427 | Fusion fusion; |
5428 | FusionGuard fg(&fusion); |
5429 | |
5430 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5431 | fusion.addInput(tv0); |
5432 | auto tv1 = sin(tv0); |
5433 | auto tv2 = exp(tv1); |
5434 | auto tv3 = relu(tv2); |
5435 | auto tv4 = cos(tv3); |
5436 | auto tv5 = tan(tv4); |
5437 | fusion.addOutput(tv5); |
5438 | |
5439 | tv3->merge(1); |
5440 | inlineMost(); |
5441 | |
5442 | TORCH_CHECK(tv5->getComputeAtPosition() == 3); |
5443 | TORCH_CHECK(tv4->getComputeAtPosition() == 3); |
5444 | TORCH_CHECK(tv3->getComputeAtPosition() == 1); |
5445 | TORCH_CHECK(tv2->getComputeAtPosition() == 1); |
5446 | TORCH_CHECK(tv1->getComputeAtPosition() == 3); |
5447 | |
5448 | const auto options = |
5449 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5450 | at::Tensor input = at::randn({2, 3, 4}, options); |
5451 | auto output = input.sin().exp().relu().cos().tan(); |
5452 | |
5453 | FusionExecutor fe; |
5454 | fe.compileFusion(&fusion, {input}); |
5455 | auto cg_outputs = fe.runFusion({input}); |
5456 | |
5457 | testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__); |
5458 | } |
5459 | |
5460 | TEST_F(NVFuserTest, FusionInliningBroadcast_CUDA) { |
5461 | Fusion fusion; |
5462 | FusionGuard fg(&fusion); |
5463 | |
5464 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5465 | fusion.addInput(tv0); |
5466 | auto tv1 = sin(tv0); |
5467 | // broadcasting |
5468 | auto tv2 = broadcast(tv1, {false, true, false, true, false, true}); |
5469 | auto tv3 = cos(tv2); |
5470 | auto tv4 = tan(tv3); |
5471 | fusion.addOutput(tv4); |
5472 | |
5473 | for (auto tv : {tv2, tv3, tv4}) { |
5474 | tv->merge(0); |
5475 | tv->merge(1); |
5476 | tv->merge(2); |
5477 | } |
5478 | |
5479 | inlineMost(); |
5480 | |
5481 | TORCH_CHECK(tv4->getComputeAtPosition() == 3); |
5482 | TORCH_CHECK(tv3->getComputeAtPosition() == 3); |
5483 | TORCH_CHECK(tv2->getComputeAtPosition() == 3); |
5484 | TORCH_CHECK(tv1->getComputeAtPosition() == 3); |
5485 | |
5486 | const auto options = |
5487 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5488 | at::Tensor input = at::randn({2, 3, 4}, options); |
5489 | auto output = input.sin().view({2, 1, 3, 1, 4, 1}).cos().tan(); |
5490 | |
5491 | FusionExecutor fe; |
5492 | fe.compileFusion(&fusion, {input}); |
5493 | auto cg_outputs = fe.runFusion({input}); |
5494 | |
5495 | testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__); |
5496 | } |
5497 | |
5498 | TEST_F(NVFuserTest, FusionInliningBroadcastTrivialReduction_CUDA) { |
5499 | Fusion fusion; |
5500 | FusionGuard fg(&fusion); |
5501 | |
5502 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5503 | fusion.addInput(tv0); |
5504 | auto tv1 = sin(tv0); |
5505 | // broadcasting |
5506 | auto tv2 = broadcast(tv1, {false, true, false, true, false, true}); |
5507 | auto tv3 = tan(tv2); |
5508 | // trivial reduction |
5509 | auto tv4 = sum(tv3, {1, 3, 5}); |
5510 | auto tv5 = cos(tv4); |
5511 | auto tv6 = exp(tv5); |
5512 | fusion.addOutput(tv6); |
5513 | |
5514 | for (auto tv : {tv2, tv3, tv4}) { |
5515 | tv->merge(0); |
5516 | tv->merge(1); |
5517 | tv->merge(2); |
5518 | } |
5519 | |
5520 | inlineMost(); |
5521 | |
5522 | TORCH_CHECK(tv6->getComputeAtPosition() == 3); |
5523 | TORCH_CHECK(tv5->getComputeAtPosition() == 3); |
5524 | TORCH_CHECK(tv4->getComputeAtPosition() == 3); |
5525 | TORCH_CHECK(tv3->getComputeAtPosition() == 3); |
5526 | TORCH_CHECK(tv2->getComputeAtPosition() == 3); |
5527 | TORCH_CHECK(tv1->getComputeAtPosition() == 3); |
5528 | |
5529 | const auto options = |
5530 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5531 | at::Tensor input = at::randn({2, 3, 4}, options); |
5532 | auto output = input.sin().tan().cos().exp(); |
5533 | |
5534 | FusionExecutor fe; |
5535 | fe.compileFusion(&fusion, {input}); |
5536 | auto cg_outputs = fe.runFusion({input}); |
5537 | |
5538 | testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__); |
5539 | } |
5540 | |
5541 | TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayTrivialReduction_CUDA) { |
5542 | Fusion fusion; |
5543 | FusionGuard fg(&fusion); |
5544 | |
5545 | auto tv0 = makeConcreteTensor({2, 1, 3, 1, 4, 1}); |
5546 | fusion.addInput(tv0); |
5547 | auto tv1 = sum(tv0, {1, 3, 5}); |
5548 | auto tv2 = sin(tv1); |
fusion.addOutput(tv2);
5550 | |
5551 | for (auto tv : {tv0, tv1}) { |
5552 | tv->merge(0); |
5553 | tv->merge(1); |
5554 | tv->merge(2); |
5555 | } |
5556 | |
5557 | TORCH_CHECK( |
5558 | TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3); |
5559 | TORCH_CHECK( |
5560 | TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3); |
5561 | TORCH_CHECK( |
5562 | TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3); |
5563 | TORCH_CHECK( |
5564 | TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3); |
5565 | } |
5566 | |
5567 | TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayBroadcast_CUDA) { |
5568 | Fusion fusion; |
5569 | FusionGuard fg(&fusion); |
5570 | |
5571 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5572 | fusion.addInput(tv0); |
5573 | auto tv1 = broadcast(tv0, {false, true, false, true, false, true}); |
5574 | auto tv2 = sin(tv1); |
5575 | fusion.addOutput(tv2); |
5576 | |
5577 | for (auto tv : {tv1, tv2}) { |
5578 | tv->merge(0); |
5579 | tv->merge(1); |
5580 | tv->merge(2); |
5581 | } |
5582 | |
5583 | TORCH_CHECK( |
5584 | TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3); |
5585 | TORCH_CHECK( |
5586 | TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3); |
5587 | TORCH_CHECK( |
5588 | TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3); |
5589 | TORCH_CHECK( |
5590 | TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3); |
5591 | } |
5592 | |
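// After merging and inlining, the corresponding leaf IterDomains of all
// non-input tensors should be mapped in both the LOOP and PERMISSIVE modes
// of the ComputeAtMap.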
5593 | TEST_F(NVFuserTest, FusionIdGraphTrivialReduction_CUDA) { |
5594 | Fusion fusion; |
5595 | FusionGuard fg(&fusion); |
5596 | |
5597 | auto tv0 = makeConcreteTensor({2, 3, 4}); |
5598 | fusion.addInput(tv0); |
5599 | auto tv1 = broadcast(tv0, {false, true, false, true, false, true}); |
5600 | auto tv2 = sum(tv1, {1, 3, 5}); |
5601 | auto tv3 = sin(tv2); |
5602 | fusion.addOutput(tv3); |
5603 | |
5604 | for (auto tv : {tv1, tv2}) { |
5605 | tv->merge(0); |
5606 | tv->merge(1); |
5607 | tv->merge(2); |
5608 | } |
5609 | |
5610 | inlineMost(); |
5611 | |
5612 | ComputeAtMap ca_map(&fusion); |
5613 | |
5614 | auto all_tvs = ir_utils::allTvs(&fusion); |
5615 | for (auto tv1 : all_tvs) { |
5616 | for (auto tv2 : all_tvs) { |
5617 | if (tv1->isFusionInput() || tv2->isFusionInput()) { |
5618 | continue; |
5619 | } |
5620 | for (int i : c10::irange(3)) { |
5621 | auto id1 = tv1->axis(i); |
5622 | auto id2 = tv2->axis(i); |
5623 | TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::LOOP)); |
5624 | TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::PERMISSIVE)); |
5625 | } |
5626 | } |
5627 | } |
5628 | } |
5629 | |
5630 | TEST_F(NVFuserTest, FusionPrint_CUDA) { |
5631 | auto dtypes = { |
5632 | at::kFloat, |
5633 | at::kDouble, |
5634 | at::kHalf, |
5635 | at::kBFloat16, |
5636 | at::kInt, |
5637 | at::kLong, |
5638 | at::kBool}; |
5639 | for (auto dtype : dtypes) { |
5640 | auto fusion = std::make_unique<Fusion>(); |
5641 | FusionGuard fg(fusion.get()); |
5642 | |
5643 | auto tv0 = makeSymbolicTensor(1, aten_to_data_type(dtype)); |
5644 | fusion->addInput(tv0); |
5645 | auto tv1 = print(tv0); |
5646 | auto tv2 = sin(tv1); |
5647 | fusion->addOutput(tv2); |
5648 | |
// There is no way to check what is printed to the console, but we can at
// least validate that, when a print op exists, compilation and execution
// are not broken.
5652 | auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); |
5653 | at::Tensor t0 = at::arange(2, options).to(dtype); |
5654 | |
5655 | FusionExecutorCache executor_cache(std::move(fusion)); |
5656 | auto cg_outputs = executor_cache.runFusionWithInputs({t0}); |
5657 | |
5658 | testValidate( |
5659 | executor_cache.fusion(), |
5660 | cg_outputs, |
5661 | {t0}, |
5662 | {t0.sin()}, |
5663 | __LINE__, |
5664 | __FILE__); |
5665 | } |
5666 | } |
5667 | |
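// Two inputs built from the same symbolic extents must be given matching
// sizes; binding tensors of mismatched shapes should throw.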
5668 | TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) { |
5669 | const auto options = |
5670 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5671 | |
5672 | at::Tensor a = at::randn({123, 456}, options); |
5673 | at::Tensor b = at::randn({123, 456}, options); |
5674 | at::Tensor c = at::randn({321, 654}, options); |
5675 | |
5676 | using return_t = |
5677 | std::pair<std::unique_ptr<FusionExecutorCache>, std::vector<at::Tensor>>; |
5678 | auto matched_add = [](at::Tensor a, at::Tensor b) -> return_t { |
5679 | auto fusion = std::make_unique<Fusion>(); |
5680 | FusionGuard fg(fusion.get()); |
5681 | |
5682 | Val* s1 = IrBuilder::create<Int>(); |
5683 | Val* s2 = IrBuilder::create<Int>(); |
5684 | auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1, s2}); |
5685 | TensorView* tv0 = builder.build(); |
5686 | TensorView* tv1 = builder.build(); |
5687 | |
5688 | fusion->addInput(tv0); |
5689 | fusion->addInput(tv1); |
5690 | |
5691 | auto tv2 = add(tv0, tv1); |
5692 | |
5693 | fusion->addOutput(tv2); |
5694 | |
5695 | auto executor_cache = |
5696 | std::make_unique<FusionExecutorCache>(std::move(fusion)); |
5697 | auto cg_outputs = executor_cache->runFusionWithInputs({a, b}); |
5698 | return {std::move(executor_cache), std::move(cg_outputs)}; |
5699 | }; |
5700 | |
5701 | { |
5702 | auto ret1 = matched_add(a, b); |
5703 | testValidate( |
5704 | ret1.first->fusion(), ret1.second, {a, b}, {a + b}, __LINE__, __FILE__); |
5705 | } |
5706 | |
5707 | { |
5708 | EXPECT_THAT( |
5709 | [&]() { matched_add(a, c); }, |
5710 | ::testing::ThrowsMessage<c10::Error>( |
::testing::HasSubstr("Attempting to bind")));
5712 | } |
5713 | } |
5714 | |
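// A symbolic extent used directly as data (tv0 + s1) should be bound from
// the runtime input size, so a length-123 input of zeros yields an output
// of all 123s.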
5715 | TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) { |
5716 | auto fusion = std::make_unique<Fusion>(); |
5717 | FusionGuard fg(fusion.get()); |
5718 | |
5719 | Val* s1 = IrBuilder::create<Int>(); |
5720 | auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1}); |
5721 | TensorView* tv0 = builder.build(); |
5722 | |
5723 | fusion->addInput(tv0); |
5724 | |
5725 | auto tv1 = add(tv0, s1); |
5726 | |
5727 | fusion->addOutput(tv1); |
5728 | |
5729 | const auto options = |
5730 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5731 | |
5732 | at::Tensor a = at::zeros({123}, options); |
5733 | |
5734 | FusionExecutorCache executor_cache(std::move(fusion)); |
5735 | auto cg_outputs = executor_cache.runFusionWithInputs({a}); |
5736 | |
5737 | testValidate( |
5738 | executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__); |
5739 | } |
5740 | |
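// DependencyCheck::getAllValsBetween should return exactly the vals that
// lie on some path from the given producers to the given consumers, and
// nothing else.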
5741 | TEST_F(NVFuserTest, FusionDependencyCheck_CUDA) { |
5742 | Fusion fusion; |
5743 | FusionGuard fg(&fusion); |
5744 | |
5745 | TensorView* tv0 = makeSymbolicTensor(1); |
5746 | TensorView* tv1 = makeSymbolicTensor(1); |
5747 | TensorView* tv2 = makeSymbolicTensor(1); |
5748 | TensorView* tv3 = makeSymbolicTensor(1); |
5749 | |
5750 | auto tv4 = add(tv0, tv1); |
5751 | auto tv5 = add(tv0, tv2); |
5752 | auto tv6 = add(tv0, tv3); |
5753 | |
5754 | auto tv7 = add(tv1, tv2); |
5755 | auto tv8 = add(tv1, tv3); |
5756 | |
5757 | auto tv9 = add(tv2, tv3); |
5758 | |
5759 | { |
5760 | auto all_vals = DependencyCheck::getAllValsBetween( |
5761 | {tv0, tv1}, {tv4, tv5, tv6, tv7, tv8, tv9}); |
5762 | std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end()); |
5763 | std::vector<Val*> results({tv0, tv1, tv4, tv5, tv6, tv7, tv8}); |
5764 | for (auto result : results) { |
5765 | TORCH_CHECK(all_vals_set.count(result) > 0); |
5766 | all_vals_set.erase(result); |
5767 | } |
5768 | TORCH_CHECK(all_vals_set.empty()); |
5769 | } |
5770 | |
5771 | auto tv10 = add(tv6, tv7); |
5772 | { |
5773 | auto all_vals = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv10}); |
5774 | std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end()); |
5775 | std::vector<Val*> results({tv0, tv1, tv6, tv7, tv10}); |
5776 | for (auto result : results) { |
5777 | TORCH_CHECK(all_vals_set.count(result) > 0); |
5778 | all_vals_set.erase(result); |
5779 | } |
5780 | TORCH_CHECK(all_vals_set.empty()); |
5781 | } |
5782 | } |
5783 | |
5784 | // Repro for issue #1925 |
5785 | TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { |
5786 | Fusion fusion; |
5787 | FusionGuard fg(&fusion); |
5788 | |
5789 | auto tv0 = makeSymbolicTensor(4); |
5790 | auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); |
5791 | fusion.addInput(tv0); |
5792 | fusion.addInput(tv1); |
5793 | auto tv2 = add(tv0, tv1); |
5794 | fusion.addOutput(tv2); |
5795 | |
5796 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5797 | at::Tensor input0 = at::randn({1, 1, 333, 1}, options); |
5798 | at::Tensor input1 = at::randn({1, 1, 333, 1}, options); |
5799 | |
5800 | auto lparams = scheduleTranspose(&fusion, {input0, input1}); |
5801 | |
5802 | FusionExecutor fe; |
5803 | fe.compileFusion(&fusion, {input0, input1}, lparams); |
5804 | auto outputs = fe.runFusion({input0, input1}, lparams); |
5805 | |
5806 | auto tv_ref = input0 + input1; |
5807 | |
5808 | testValidate( |
5809 | &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); |
5810 | } |
5811 | |
5812 | // Repro for issue #1873 |
5813 | TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { |
5814 | Fusion fusion; |
5815 | FusionGuard fg(&fusion); |
5816 | |
5817 | auto tv0 = makeContigTensor(1); |
5818 | auto tv1 = makeContigTensor(2); |
5819 | fusion.addInput(tv0); |
5820 | fusion.addInput(tv1); |
5821 | auto tv2 = set(tv0); |
5822 | auto tv3 = broadcast(tv2, {true, false}); |
5823 | auto tv4 = add(tv3, tv1); |
5824 | fusion.addOutput(tv4); |
5825 | |
5826 | tv4->merge(0); |
5827 | tv4->split(0, 32); |
5828 | |
5829 | tv0->computeAt(tv4, 1); |
5830 | |
5831 | tv2->split(-1, 8); |
5832 | |
5833 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5834 | at::Tensor t0 = at::randn({123}, options); |
5835 | at::Tensor t1 = at::randn({3, 123}, options); |
5836 | |
5837 | FusionExecutor fe; |
5838 | fe.compileFusion(&fusion, {t0, t1}); |
5839 | |
5840 | auto outputs = fe.runFusion({t0, t1}); |
5841 | |
5842 | auto tv_ref = t0 + t1; |
5843 | |
5844 | testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); |
5845 | } |
5846 | |
5847 | TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) { |
5848 | // https://github.com/csarofeen/pytorch/issues/1926 |
5849 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
5850 | auto fusion = fusion_ptr.get(); |
5851 | FusionGuard fg(fusion); |
5852 | |
5853 | TensorView* tv0 = makeSymbolicTensor(2); |
5854 | fusion->addInput(tv0); |
5855 | auto tv1 = set(tv0); |
5856 | auto tv2 = set(tv1); |
5857 | fusion->addOutput(tv2); |
5858 | |
5859 | tv1->setMemoryType(MemoryType::Shared); |
5860 | for (auto tv : {tv1, tv2}) { |
5861 | tv->split(0, 4); |
5862 | tv->reorder({{1, -1}}); |
5863 | tv->split(1, 8); |
5864 | tv->merge(0); |
5865 | tv->split(0, 1); |
5866 | tv->axis(0)->parallelize(ParallelType::BIDx); |
5867 | tv->axis(1)->parallelize(ParallelType::Unswitch); |
5868 | } |
5869 | tv1->merge(2); |
5870 | tv2->reorder({{2, 3}}); |
5871 | tv2->merge(2); |
5872 | for (auto tv : {tv1, tv2}) { |
5873 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
5874 | } |
5875 | |
5876 | inlineMost(); |
5877 | |
5878 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
5879 | at::Tensor t0 = at::randn({5, 5}, options); |
5880 | |
5881 | FusionExecutor fe; |
5882 | fe.compileFusion(fusion, {t0}); |
5883 | auto cg_outputs = fe.runFusion({t0}); |
5884 | auto out = cg_outputs[0]; |
5885 | |
5886 | testValidate(fusion, {out}, {t0}, {t0}, __LINE__, __FILE__); |
5887 | } |
5888 | |
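// Compile a segmented fusion asynchronously, poll isCompiled() until the
// kernels are ready, then check that the fusion still segments into two
// groups and produces the expected result.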
5889 | TEST_F(NVFuserTest, AsyncCompilation_CUDA) { |
5890 | auto fusion = std::make_unique<Fusion>(); |
5891 | FusionGuard fg(fusion.get()); |
5892 | |
5893 | TensorView* tv0 = makeSymbolicTensor(2); |
5894 | TensorView* tv1 = makeSymbolicTensor(1); |
5895 | TensorView* tv2 = makeSymbolicTensor(2); |
5896 | |
5897 | fusion->addInput(tv0); |
5898 | fusion->addInput(tv1); |
5899 | fusion->addInput(tv2); |
5900 | |
5901 | TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0 |
5902 | TensorView* tv4 = |
5903 | max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) |
5904 | TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, |
5905 | // keeps normalization scheduler away) |
5906 | TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce) |
5907 | |
5908 | fusion->addOutput(tv6); |
5909 | |
5910 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5911 | |
5912 | at::Tensor t0 = at::randn({8, 5}, options); |
5913 | at::Tensor t1 = at::randn({5}, options); |
5914 | at::Tensor t2 = at::randn({8, 5}, options); |
5915 | |
5916 | auto t3 = t0.add(1.0); |
5917 | auto t4 = std::get<0>(at::max(t3, 0)); |
5918 | auto t5 = t4.add(t1); |
5919 | auto t6 = t5.add(t2); |
5920 | |
5921 | FusionExecutorCache executor_cache(std::move(fusion)); |
5922 | |
5923 | std::vector<IValue> aten_inputs = {t0, t1, t2}; |
5924 | |
5925 | executor_cache.compileFusionAsync(aten_inputs); |
5926 | |
5927 | while (!executor_cache.isCompiled(aten_inputs)) { |
5928 | std::this_thread::sleep_for(std::chrono::milliseconds(20)); |
5929 | printf("." ); |
5930 | } |
5931 | |
5932 | auto outputs = executor_cache.runFusionWithInputs(aten_inputs); |
5933 | |
5934 | TORCH_CHECK( |
5935 | executor_cache.getMostRecentKernelRuntime()->isSegmented(), |
5936 | "segmentation didn't happen" ); |
5937 | TORCH_CHECK( |
5938 | executor_cache.getMostRecentKernelRuntime() |
5939 | ->fusionSegments() |
5940 | ->groups() |
5941 | .size() == 2, |
5942 | "segmentation didn't happen as expected" ); |
5943 | |
5944 | testValidate( |
5945 | executor_cache.fusion(), outputs, aten_inputs, {t6}, __LINE__, __FILE__); |
5946 | } |
5947 | |
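// Merging broadcast domains together with a trivially reduced domain should
// still propagate, inline fully, and produce the correct result.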
5948 | TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) { |
5949 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
5950 | auto fusion = fusion_ptr.get(); |
5951 | FusionGuard fg(fusion); |
5952 | |
5953 | TensorView* tv0 = makeConcreteTensor({1, 1}); |
5954 | TensorView* tv1 = makeConcreteTensor({-1}); |
5955 | fusion->addInput(tv0); |
5956 | fusion->addInput(tv1); |
5957 | auto tv2 = sum(tv0, {1}); |
5958 | auto tv3 = add(tv2, tv1); |
5959 | fusion->addOutput(tv3); |
5960 | |
5961 | tv0->merge(0); |
5962 | |
5963 | MaxRootDomainInfoSpanningTree tree(tv0); |
5964 | TransformPropagatorWithCheck tp(tv0); |
5965 | tree.traverse(&tp); |
5966 | |
5967 | inlineMost(); |
5968 | |
5969 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
5970 | at::Tensor t0 = at::randn({1, 1}, options); |
5971 | at::Tensor t1 = at::randn({10}, options); |
5972 | |
5973 | FusionExecutor fe; |
5974 | fe.compileFusion(fusion, {t0, t1}); |
5975 | auto cg_outputs = fe.runFusion({t0, t1}); |
5976 | auto out = cg_outputs[0]; |
5977 | |
5978 | testValidate( |
5979 | fusion, {out}, {t0, t1}, {t1 + t0.flatten()}, __LINE__, __FILE__); |
5980 | } |
5981 | |
5982 | TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) { |
5983 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
5984 | auto fusion = fusion_ptr.get(); |
5985 | FusionGuard fg(fusion); |
5986 | |
5987 | TensorView* tv0 = makeConcreteTensor({-1, 1, 1}); |
5988 | TensorView* tv1 = makeConcreteTensor({-1, -1}); |
5989 | fusion->addInput(tv0); |
5990 | fusion->addInput(tv1); |
5991 | auto tv2 = sum(tv0, {1}); |
5992 | auto tv3 = add(tv2, tv1); |
5993 | fusion->addOutput(tv3); |
5994 | |
5995 | tv2->merge(1); |
5996 | tv2->merge(0); |
5997 | |
5998 | MaxRootDomainInfoSpanningTree tree(tv0); |
5999 | TransformPropagatorWithCheck tp(tv0); |
6000 | tree.traverse(&tp); |
6001 | |
6002 | inlineMost(); |
6003 | |
6004 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6005 | at::Tensor t0 = at::randn({10, 1, 1}, options); |
6006 | at::Tensor t1 = at::randn({10, 10}, options); |
6007 | |
6008 | FusionExecutor fe; |
6009 | fe.compileFusion(fusion, {t0, t1}); |
6010 | auto cg_outputs = fe.runFusion({t0, t1}); |
6011 | auto out = cg_outputs[0]; |
6012 | |
6013 | testValidate( |
6014 | fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-1)}, __LINE__, __FILE__); |
6015 | } |
6016 | |
6017 | // Simple test case exercising the null scheduler path. |
6018 | TEST_F(NVFuserTest, FusionNullScheduler_CUDA) { |
6019 | auto fusion = std::make_unique<Fusion>(); |
6020 | FusionGuard fg(fusion.get()); |
6021 | |
6022 | auto tv0 = makeConcreteTensor({1, 1, 1}); |
6023 | fusion->addInput(tv0); |
6024 | |
6025 | auto tv1 = sum(tv0, {0, 1, 2}); |
6026 | |
6027 | fusion->addOutput(tv1); |
6028 | |
6029 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6030 | at::Tensor t0 = at::randn({1, 1, 1}, options); |
6031 | |
6032 | std::vector<IValue> aten_inputs({t0}); |
6033 | |
6034 | FusionExecutorCache executor_cache(std::move(fusion)); |
6035 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
6036 | |
6037 | auto t1 = t0.sum({0, 1, 2}); |
6038 | |
6039 | testValidate( |
6040 | executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); |
6041 | |
6042 | auto groups = |
6043 | executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); |
6044 | |
// Check that all groups in the resulting runtime were scheduled with the
// no-op (null) scheduler.
6046 | for (auto group : groups) { |
6047 | TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp); |
6048 | } |
6049 | } |
6050 | |
6051 | // Simple test case exercising the null scheduler path. |
6052 | TEST_F(NVFuserTest, FusionNullScheduler2_CUDA) { |
6053 | auto fusion = std::make_unique<Fusion>(); |
6054 | FusionGuard fg(fusion.get()); |
6055 | |
6056 | auto tv0 = makeConcreteTensor({0, 1, 9223372036854775807L}); |
6057 | fusion->addInput(tv0); |
6058 | |
6059 | auto tv1 = sum(tv0, {0, 1, 2}); |
6060 | |
6061 | fusion->addOutput(tv1); |
6062 | |
6063 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6064 | at::Tensor t0 = at::randn({0, 1, 9223372036854775807L}, options); |
6065 | |
6066 | std::vector<IValue> aten_inputs({t0}); |
6067 | |
6068 | FusionExecutorCache executor_cache(std::move(fusion)); |
6069 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
6070 | |
6071 | auto t1 = t0.sum({0, 1, 2}); |
6072 | |
6073 | testValidate( |
6074 | executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); |
6075 | |
6076 | auto groups = |
6077 | executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); |
6078 | |
// Check that all groups in the resulting runtime were scheduled with the
// no-op (null) scheduler.
6080 | for (auto group : groups) { |
6081 | TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp); |
6082 | } |
6083 | } |
6084 | |
6085 | // Simple test case exercising the null scheduler path. |
6086 | TEST_F(NVFuserTest, FusionNullScheduler3_CUDA) { |
6087 | auto fusion = std::make_unique<Fusion>(); |
6088 | FusionGuard fg(fusion.get()); |
6089 | |
6090 | auto tv0 = TensorViewBuilder().ndims(0).build(); |
6091 | auto tv1 = TensorViewBuilder().ndims(0).build(); |
6092 | fusion->addInput(tv0); |
6093 | fusion->addInput(tv1); |
6094 | auto tv2 = add(tv0, tv1); |
6095 | fusion->addOutput(tv2); |
6096 | |
6097 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6098 | at::Tensor t0 = at::randn({}, options); |
6099 | at::Tensor t1 = at::randn({}, options); |
6100 | |
6101 | std::vector<IValue> aten_inputs({t0, t1}); |
6102 | |
6103 | FusionExecutorCache executor_cache(std::move(fusion)); |
6104 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
6105 | |
6106 | testValidate( |
6107 | executor_cache.fusion(), |
6108 | cg_outputs, |
6109 | {t0, t1}, |
6110 | {t0 + t1}, |
6111 | __LINE__, |
6112 | __FILE__); |
6113 | |
6114 | auto groups = |
6115 | executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); |
6116 | |
// Check that all groups in the resulting runtime were scheduled with the
// no-op (null) scheduler.
6118 | for (auto group : groups) { |
6119 | TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp); |
6120 | } |
6121 | } |
6122 | |
6123 | TEST_F(NVFuserTest, FusionEmpty_CUDA) { |
6124 | auto fusion = std::make_unique<Fusion>(); |
6125 | FusionGuard fg(fusion.get()); |
6126 | |
6127 | auto tv0 = makeConcreteTensor({10, 10, 10}); |
6128 | auto tv1 = makeConcreteTensor({10, 10, 10}); |
6129 | fusion->addInput(tv0); |
6130 | fusion->addInput(tv1); |
6131 | fusion->addOutput(tv0); |
6132 | fusion->addOutput(tv1); |
6133 | |
6134 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6135 | at::Tensor t0 = at::randn({10, 10, 10}, options); |
6136 | at::Tensor t1 = at::randn({10, 10, 10}, options); |
6137 | |
6138 | std::vector<IValue> aten_inputs({t0, t1}); |
6139 | |
6140 | FusionExecutorCache executor_cache(std::move(fusion)); |
6141 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
6142 | |
6143 | testValidate( |
6144 | executor_cache.fusion(), |
6145 | cg_outputs, |
6146 | {t0, t1}, |
6147 | {t0, t1}, |
6148 | __LINE__, |
6149 | __FILE__); |
6150 | |
6151 | auto groups = |
6152 | executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); |
6153 | |
// Check that all groups in the resulting runtime were scheduled with the
// no-op (null) scheduler.
6155 | for (auto group : groups) { |
6156 | TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp); |
6157 | } |
6158 | } |
6159 | |
6160 | TEST_F(NVFuserTest, FusionMappingRelation_CUDA) { |
6161 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
6162 | auto fusion = fusion_ptr.get(); |
6163 | FusionGuard fg(fusion); |
6164 | |
6165 | TensorView* tv0 = makeConcreteTensor({1, 1}); |
6166 | TensorView* tv1 = makeConcreteTensor({-1, 1, 1}); |
6167 | fusion->addInput(tv0); |
6168 | fusion->addInput(tv1); |
6169 | auto tv2 = set(tv0); |
6170 | auto tv3 = broadcast(tv2, {true, false, false}); |
6171 | auto tv4 = add(tv3, tv1); |
6172 | |
6173 | fusion->addOutput(tv4); |
6174 | |
6175 | tv4->merge(-2); |
6176 | tv4->merge(-1); |
6177 | |
6178 | tv0->computeAt(tv4, -1); |
6179 | tv1->computeAt(tv4, -1); |
6180 | |
6181 | ComputeAtMap ca_map(fusion); |
6182 | |
// FIXME: This is the concerning part, and it motivates further
// formalization of the concrete/permissive mapping relationship:
// an exact mapping should ideally imply a permissive mapping.
6186 | auto tv4_inner_node = tv4->axis(0)->definition()->input(1)->as<IterDomain>(); |
6187 | TORCH_CHECK( |
6188 | ca_map.areMapped(tv2->axis(0), tv4_inner_node, IdMappingMode::EXACT)); |
6189 | TORCH_CHECK(!ca_map.areMapped( |
6190 | tv2->axis(0), tv4_inner_node, IdMappingMode::PERMISSIVE)); |
6191 | |
6192 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6193 | at::Tensor t0 = at::randn({1, 1}, options); |
6194 | at::Tensor t1 = at::randn({2, 1, 1}, options); |
6195 | |
6196 | FusionExecutor fe; |
6197 | fe.compileFusion(fusion, {t0, t1}); |
6198 | auto cg_outputs = fe.runFusion({t0, t1}); |
6199 | auto out = cg_outputs[0]; |
6200 | |
6201 | testValidate( |
6202 | fusion, {out}, {t0, t1}, {t1 + t0.squeeze(0)}, __LINE__, __FILE__); |
6203 | } |
6204 | |
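// Basic check of TensorView::inlineAt: inlining tv1 into its consumer at
// the innermost position should still produce a correct kernel.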
6205 | TEST_F(NVFuserTest, FusionInlineAt_CUDA) { |
6206 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
6207 | auto fusion = fusion_ptr.get(); |
6208 | FusionGuard fg(fusion); |
6209 | |
6210 | TensorView* tv0 = makeSymbolicTensor(2); |
6211 | fusion->addInput(tv0); |
6212 | auto tv1 = sin(tv0); |
6213 | auto tv2 = cos(tv1); |
6214 | fusion->addOutput(tv2); |
6215 | |
6216 | tv1->inlineAt(-1); |
6217 | |
6218 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6219 | at::Tensor t0 = at::randn({100, 2}, options); |
6220 | |
6221 | FusionExecutor fe; |
6222 | fe.compileFusion(fusion, {t0}); |
6223 | auto cg_outputs = fe.runFusion({t0}); |
6224 | auto out = cg_outputs[0]; |
6225 | |
6226 | testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__); |
6227 | } |
6228 | |
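// An output that is just an unmodified input should be forwarded trivially,
// both on the first run and on a subsequent cache hit.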
6229 | TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) { |
6230 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
6231 | auto fusion = fusion_ptr.get(); |
6232 | FusionGuard fg(fusion); |
6233 | |
6234 | TensorView* tv0 = makeConcreteTensor({-1, -1}); |
6235 | TensorView* tv1 = makeConcreteTensor({-1, -1}); |
6236 | fusion->addInput(tv0); |
6237 | fusion->addInput(tv1); |
// Note: tv2 is not actually needed; it is kept here because it previously
// triggered an assertion from sorting in codegen.
6240 | auto tv2 = add(tv1, IrBuilder::create<Double>(3.141)); |
6241 | fusion->addOutput(tv0); |
6242 | |
6243 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6244 | at::Tensor t0 = at::randn({10, 4}, options); |
6245 | at::Tensor t1 = at::randn({10, 4}, options); |
6246 | |
6247 | FusionExecutorCache fec(std::move(fusion_ptr)); |
6248 | auto cg_outputs = fec.runFusionWithInputs({t0, t1}); |
6249 | |
6250 | testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__); |
6251 | |
6252 | // Second run to ensure cache hit handles trivial forwarding properly |
6253 | TORCH_CHECK(fec.isCompiled({t0, t1})); |
6254 | auto cg_outputs2 = fec.runFusionWithInputs({t0, t1}); |
6255 | testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__); |
6256 | } |
6257 | |
6258 | TEST_F(NVFuserTest, FusionTrivialInputForwarding2_CUDA) { |
6259 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
6260 | auto fusion = fusion_ptr.get(); |
6261 | FusionGuard fg(fusion); |
6262 | |
6263 | TensorView* tv0 = makeSymbolicTensor(0); |
6264 | fusion->addInput(tv0); |
6265 | fusion->addOutput(tv0); |
6266 | |
6267 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6268 | at::Tensor t0 = at::randn({}, options); |
6269 | |
6270 | FusionExecutorCache fec(std::move(fusion_ptr)); |
6271 | auto cg_outputs = fec.runFusionWithInputs({t0}); |
6272 | |
6273 | testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); |
6274 | |
6275 | // Second run to ensure cache hit handles trivial forwarding properly |
6276 | TORCH_CHECK(fec.isCompiled({t0})); |
6277 | auto cg_outputs2 = fec.runFusionWithInputs({t0}); |
6278 | testValidate(fusion, cg_outputs2, {t0}, {t0}, __LINE__, __FILE__); |
6279 | } |
6280 | |
6281 | // Simplified repro of issue #2008 |
6282 | TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) { |
6283 | auto fusion_ptr = std::make_unique<Fusion>(); |
6284 | Fusion& fusion = *fusion_ptr; |
6285 | FusionGuard fg(fusion_ptr.get()); |
6286 | |
6287 | std::vector<int64_t> shape({10, 1, 1}); |
6288 | |
6289 | auto tv0 = makeConcreteTensor(shape); |
6290 | fusion.addInput(tv0); |
6291 | |
6292 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
6293 | auto tv2 = sum(tv1, {1, 2}); |
6294 | auto tv3 = broadcast(tv2, {false, true, true}); |
6295 | fusion.addOutput(tv3); |
6296 | |
6297 | tv0->merge(-2, -1)->merge(-2, -1)->split(0, 4); |
6298 | |
6299 | MaxRootDomainInfoSpanningTree tree(tv0); |
6300 | TransformPropagator tp(tv0); |
6301 | tree.traverse(&tp); |
6302 | |
6303 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6304 | at::Tensor t0 = at::randn(shape, options); |
6305 | std::vector<IValue> aten_inputs({t0}); |
6306 | |
6307 | FusionExecutor fe; |
6308 | fe.compileFusion(fusion_ptr.get(), aten_inputs); |
6309 | auto outputs = fe.runFusion(aten_inputs); |
6310 | |
6311 | testValidate(&fusion, outputs, aten_inputs, {t0 + 1}, __LINE__, __FILE__); |
6312 | } |
6313 | |
6314 | namespace { |
6315 | |
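// Returns the vectorization width the pointwise scheduler chose for the
// most recent run, or 1 if the kernel was not vectorized.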
6316 | size_t getVecSizeForPointwise(FusionExecutorCache& fec) { |
6317 | auto most_recent_params = |
6318 | fec.getMostRecentKernelRuntime()->getMostRecentExecutorLog().params; |
6319 | auto params = dynamic_cast<PointwiseParams*>(most_recent_params.get()); |
6320 | if (params->vectorize) { |
6321 | return params->unroll_factor; |
6322 | } |
6323 | return 1; |
6324 | } |
6325 | |
6326 | } // namespace |
6327 | |
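// The innermost dimension is contiguous while the outer dimension is not;
// the expected vectorization width depends on the divisibility of the
// outer stride (see size_and_vec).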
6328 | TEST_F(NVFuserTest, FusionVectorizeStrideContiguity2D_CUDA) { |
6329 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
6330 | auto fusion = fusion_ptr.get(); |
6331 | FusionGuard fg(fusion); |
6332 | |
6333 | TensorView* tv0 = |
6334 | TensorViewBuilder().ndims(2).contiguity({false, true}).build(); |
6335 | fusion->addInput(tv0); |
6336 | auto tv1 = set(tv0); |
6337 | fusion->addOutput(tv1); |
6338 | |
6339 | FusionExecutorCache fec(std::move(fusion_ptr)); |
6340 | fec.profile(true); |
6341 | |
6342 | std::vector<std::pair<int, int>> size_and_vec{{17, 1}, {18, 2}, {32, 4}}; |
6343 | |
6344 | for (auto pair : size_and_vec) { |
6345 | auto size = pair.first; |
6346 | auto vec = pair.second; |
6347 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6348 | at::Tensor t0 = at::randn({1000000, size}, options).narrow(1, 0, 16); |
6349 | auto cg_outputs = fec.runFusionWithInputs({t0}); |
6350 | |
6351 | TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec); |
6352 | |
6353 | testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); |
6354 | } |
6355 | } |
6356 | |
6357 | TEST_F(NVFuserTest, FusionVectorizeStrideContiguity3D_CUDA) { |
6358 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
6359 | auto fusion = fusion_ptr.get(); |
6360 | FusionGuard fg(fusion); |
6361 | |
6362 | TensorView* tv0 = |
6363 | TensorViewBuilder().ndims(3).contiguity({false, true, true}).build(); |
6364 | fusion->addInput(tv0); |
6365 | auto tv1 = set(tv0); |
6366 | fusion->addOutput(tv1); |
6367 | |
6368 | FusionExecutorCache fec(std::move(fusion_ptr)); |
6369 | fec.profile(true); |
6370 | |
6371 | std::vector<std::pair<int, int>> size_and_vec{{17, 1}, {10, 2}, {16, 4}}; |
6372 | |
6373 | for (auto pair : size_and_vec) { |
6374 | auto size = pair.first; |
6375 | auto vec = pair.second; |
6376 | auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); |
6377 | at::Tensor t0 = at::randn({1000000, size, 3}, options).narrow(1, 0, 8); |
6378 | auto cg_outputs = fec.runFusionWithInputs({t0}); |
6379 | |
6380 | TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec); |
6381 | |
6382 | testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); |
6383 | } |
6384 | } |
6385 | |
TEST_F(NVFuserTest, FusionVectorizeStrideContiguity5D_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  auto fusion = fusion_ptr.get();
  FusionGuard fg(fusion);

  TensorView* tv0 = TensorViewBuilder()
                        .ndims(5)
                        .contiguity({false, true, false, true, true})
                        .build();
  fusion->addInput(tv0);
  auto tv1 = set(tv0);
  fusion->addOutput(tv1);

  FusionExecutorCache fec(std::move(fusion_ptr));
  fec.profile(true);

  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);

  std::vector<std::tuple<int, int, int>> sizes_and_vec{
      {9, 17, 1}, {9, 10, 2}, {9, 16, 4}};

  for (auto tup : sizes_and_vec) {
    auto size1 = std::get<0>(tup);
    auto size2 = std::get<1>(tup);
    auto vec = std::get<2>(tup);
    at::Tensor t0 = at::randn({4, size1, 12345, size2, 3}, options)
                        .narrow(1, 0, 8)
                        .narrow(3, 0, 4);
    auto cg_outputs = fec.runFusionWithInputs({t0});

    TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);

    testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
  }
}

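// The strides chosen below make the input overlap itself in memory; the
// scheduler must still pick a vectorization width that is safe for each
// size/stride combination.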
TEST_F(NVFuserTest, FusionVectorizeStrideContiguitySelfOverlapping_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  auto fusion = fusion_ptr.get();
  FusionGuard fg(fusion);

  TensorView* tv0 = TensorViewBuilder()
                        .ndims(5)
                        .contiguity({false, true, false, true, true})
                        .build();
  fusion->addInput(tv0);
  auto tv1 = set(tv0);
  fusion->addOutput(tv1);

  FusionExecutorCache fec(std::move(fusion_ptr));
  fec.profile(true);

  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);

  std::vector<std::tuple<int, int, int, int>> sizes_strides_and_vec{
      {4, 4, 4, 4},
      {4, 4, 2, 2},
      {4, 2, 4, 2},
      {2, 4, 4, 2},
      {4, 4, 1, 1},
      {4, 1, 4, 1},
      {1, 4, 4, 1},
      {2, 2, 2, 2},
      {2, 2, 1, 1},
      {2, 1, 2, 1},
      {1, 2, 2, 1}};

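  // Each tuple is {size, stride1, stride2, expected vectorization width}.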
  for (auto tup : sizes_strides_and_vec) {
    auto size = std::get<0>(tup);
    auto stride1 = std::get<1>(tup);
    auto stride2 = std::get<2>(tup);
    auto vec = std::get<3>(tup);
    std::vector<int64_t> shape = {4, 4, 12345, size, 3};
    std::vector<int64_t> stride = {stride1, stride2 * 12345, stride2, 3, 1};
    at::Tensor t0 = at::empty_strided(shape, stride, options);
    t0.random_();
    auto cg_outputs = fec.runFusionWithInputs({t0});
    TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
    testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
  }
}

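// Stages the input through a circular-buffered shared-memory cache loaded
// with cp.async and checks that the generated cp.async predicate is inlined
// rather than wrapped in an if-then-else.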
TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Requires an Ampere or newer GPU.
  if (!deviceMajorMinorCheck(8)) {
    GTEST_SKIP() << "skipping tests on pre-Ampere GPUs";
    return;
  }

  auto tv0 = makeContigTensor(1);

  fusion.addInput(tv0);

  auto tv1 = set(tv0);

  fusion.addOutput(tv1);

  auto tv_cache = tv0->cacheAfter(LoadStoreOpType::CpAsync);
  tv_cache->setMemoryType(MemoryType::Shared);

  tv1->split(0, 16);
  tv0->computeAt(tv1, 1);

  tv_cache->circularBuffer(10);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input1 = at::randn({255}, options);

  // Checker that verifies the cp.async op has an inlined predicate.
  class InlinedCpAsyncPredChecker : public kir::IrVisitor {
   public:
    using kir::IrVisitor::handle;

   private:
    void handle(kir::IfThenElse* ite) final {
      auto prev_within_ite = within_ite_;
      within_ite_ = true;
      kir::IrVisitor::handle(ite);
      within_ite_ = prev_within_ite;
    }

    void handle(LoadStoreOp* ldst) final {
      if (ldst->opType() == LoadStoreOpType::CpAsync) {
        TORCH_INTERNAL_ASSERT(!within_ite_, "CPASYNC predicate not inlined");
        TORCH_INTERNAL_ASSERT(
            ldst->predicate()->hasValue() &&
                !ldst->predicate()->value()->isConst(),
            "CPASYNC predicate is not generated");
      }
    }

   private:
    bool within_ite_ = false;
  } pred_checker;

  // Check that cp.async is inlined:
  GpuLower gpulw(&fusion);
  pred_checker.handle(gpulw.kernel()->topLevelExprs());

  FusionExecutor fe;
  fe.compileFusion(&fusion, {input1});
  auto cg_outputs = fe.runFusion({input1});

  testValidate(&fusion, cg_outputs, {input1}, {input1}, __LINE__, __FILE__);
}

// Keep this test file under 10K LoC. Create a new file for more tests.

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)