1 | #if defined(USE_CUDA) |
2 | #include <gmock/gmock-matchers.h> |
3 | #include <gtest/gtest.h> |
4 | |
5 | #include <arith.h> |
6 | #include <codegen.h> |
7 | #include <disjoint_set.h> |
8 | #include <executor.h> |
9 | #include <executor_launch_params.h> |
10 | #include <expr_evaluator.h> |
11 | #include <fusion.h> |
12 | #include <fusion_segmenter.h> |
13 | #include <grouped_reduction.h> |
14 | #include <inlining.h> |
15 | #include <ir_all_nodes.h> |
16 | #include <ir_builder.h> |
17 | #include <ir_graphviz.h> |
18 | #include <ir_iostream.h> |
19 | #include <ir_utils.h> |
20 | #include <iter_visitor.h> |
21 | #include <kernel_cache.h> |
22 | #include <kernel_expr_evaluator.h> |
23 | #include <kernel_ir.h> |
24 | #include <kernel_ir_dispatch.h> |
25 | #include <lower2device.h> |
26 | #include <lower_magic_zero.h> |
27 | #include <mutator.h> |
28 | #include <ops/all_ops.h> |
29 | #include <register_interface.h> |
30 | #include <root_domain_map.h> |
31 | #include <scheduler/all_schedulers.h> |
32 | #include <scheduler/reduction_utils.h> |
33 | #include <scheduler/utils.h> |
34 | #include <test/test_gpu_validator.h> |
35 | #include <test/test_utils.h> |
36 | #include <transform_replay.h> |
37 | #include <transform_rfactor.h> |
38 | |
39 | #include <test/cpp/jit/test_utils.h> |
40 | #include <torch/csrc/jit/api/function_impl.h> |
41 | #include <parser.h> |
42 | #include <torch/csrc/jit/ir/irparser.h> |
43 | #include <torch/torch.h> |
44 | |
45 | #include <ATen/cuda/CUDAContext.h> |
46 | #include <ATen/cuda/Exceptions.h> |
47 | #include <c10/cuda/CUDAStream.h> |
48 | |
49 | #include <algorithm> |
50 | #include <iostream> |
51 | #include <sstream> |
52 | #include <thread> |
53 | |
54 | // Tests go in torch::jit |
55 | namespace torch { |
56 | namespace jit { |
57 | |
58 | using namespace torch::jit::fuser::cuda; |
59 | using namespace at::indexing; |
60 | |
61 | // A few smoke tests for IrGraphGenerator |
62 | // (These tests exercise IrGraphGenerator through a non-trivial IR, |
63 | // to make sure that it runs w/o crashing. The actual output is not |
64 | // validated) |
65 | TEST_F(NVFuserTest, FusionIrGraphGenerator_CUDA) { |
66 | Fusion fusion; |
67 | FusionGuard fg(&fusion); |
68 | |
69 | // Make sure we can handle empty IRs |
70 | TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
71 | &fusion, IrGraphGenerator::DetailLevel::Basic) |
72 | .empty()); |
73 | |
74 | // Construct an interesting IR |
75 | TensorView* tv0 = makeSymbolicTensor(2); |
76 | fusion.addInput(tv0); |
77 | |
78 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.141)); |
79 | TensorView* tv3 = broadcast(tv0, {false, true, false, true}); |
80 | TensorView* tv4 = |
81 | reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv3); |
82 | TensorView* tv5 = clamp( |
83 | tv4, IrBuilder::create<Double>(0.f), IrBuilder::create<Double>(1.f)); |
84 | TensorView* tv6 = add(tv2, tv2); |
85 | |
86 | // Another checkpoint before adding outputs |
87 | TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
88 | &fusion, IrGraphGenerator::DetailLevel::Explicit) |
89 | .empty()); |
90 | |
91 | fusion.addOutput(tv6); |
92 | |
93 | tv4->axis(2)->parallelize(ParallelType::BIDy); |
94 | tv6->merge(0); |
95 | tv6->split(0, 4); |
96 | tv6->axis(0)->parallelize(ParallelType::BIDx); |
97 | tv5->reorder({{-1, 0}}); |
98 | tv2->computeAt(tv6, 1); |
99 | |
100 | // Another checkpoint with more node types |
101 | TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
102 | &fusion, IrGraphGenerator::DetailLevel::ComputeOnly) |
103 | .empty()); |
104 | |
105 | for (Val* val : fusion.vals()) { |
106 | if (!val->isFusionInput() && |
107 | val->getValType().value() == ValType::TensorView) { |
108 | TensorView* tv = static_cast<TensorView*>(val); |
109 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
110 | } |
111 | } |
112 | |
113 | // Final IR graph |
114 | TORCH_CHECK(!IrGraphGenerator::toGraphviz( |
115 | &fusion, IrGraphGenerator::DetailLevel::Verbose) |
116 | .empty()); |
117 | } |
118 | |
119 | TEST_F(NVFuserTest, FusionDispatch_CUDA) { |
120 | Fusion fusion; |
121 | FusionGuard fg(&fusion); |
122 | |
123 | Double* f = IrBuilder::create<Double>(2.f); |
124 | std::stringstream ss1, ss2, ss3; |
125 | ss1 << f; |
126 | ss2 << static_cast<Val*>(f); |
127 | ss3 << static_cast<Statement*>(f); |
  TORCH_CHECK(
      ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0,
      "Error with dispatch system where results differ by passing Double* vs Val* vs Statement*.");
131 | } |
132 | |
133 | // Evaluate basic scalar operations with constant values |
134 | TEST_F(NVFuserTest, FusionExprEvalConstants_CUDA) { |
135 | Fusion fusion; |
136 | FusionGuard fg(&fusion); |
137 | |
138 | ExpressionEvaluator evaluator(&fusion); |
139 | |
140 | auto* a = IrBuilder::create<Int>(7); |
141 | auto* b = IrBuilder::create<Int>(3); |
142 | |
143 | // Avoid div operation because it casts int operands to float |
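  // With a = 7 and b = 3: a + b = 10, -((a - b) * (a + b)) = -40,
  // mod(7, 3) = 1, and ceilDiv(7, 3) = (7 + 3 - 1) / 3 = 3.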
144 | checkIntValue(evaluator, neg(a), -7); |
145 | checkIntValue(evaluator, add(a, b), 10); |
146 | checkIntValue(evaluator, neg(mul(sub(a, b), add(a, b))), -40); |
147 | checkIntValue(evaluator, mod(a, b), 1); |
148 | checkIntValue(evaluator, ceilDiv(a, b), 3); |
149 | } |
150 | |
151 | TEST_F(NVFuserTest, FusionExprEvalDouble_CUDA) { |
152 | auto fusion = std::make_unique<Fusion>(); |
153 | FusionGuard fg(fusion.get()); |
154 | auto ten = IrBuilder::create<Double>(10); |
155 | auto two = IrBuilder::create<Double>(2); |
156 | auto three = IrBuilder::create<Double>(3); |
157 | auto val = castOp(DataType::Int, ceilDiv(sub(ten, two), three)); |
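  // ceilDiv((10 - 2), 3) = ceil(8.0 / 3.0) = 3.0 in double arithmetic; the
  // cast to Int then yields 3.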
158 | auto reference = static_cast<int64_t>(std::ceil((10.0 - 2.0) / 3.0)); |
159 | TORCH_CHECK(reference == val->evaluateInt()); |
160 | } |
161 | |
162 | // Evaluate basic scalar operations with bound values |
163 | TEST_F(NVFuserTest, FusionExprEvalBindings_CUDA) { |
164 | Fusion fusion; |
165 | FusionGuard fg(&fusion); |
166 | |
167 | ExpressionEvaluator evaluator(&fusion); |
168 | |
169 | auto* a = IrBuilder::create<Int>(); |
170 | auto* b = IrBuilder::create<Int>(); |
171 | auto* c = add(a, b); |
172 | auto* d = neg(ceilDiv(c, b)); |
173 | auto* e = IrBuilder::create<Int>(0); |
174 | |
175 | // trying to evaluate before binding should give empty results |
176 | TORCH_CHECK(!evaluator.evaluate(a).has_value()); |
177 | TORCH_CHECK(!evaluator.evaluate(d).has_value()); |
178 | |
179 | evaluator.bind(a, 7); |
180 | evaluator.bind(b, 3); |
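  // With a = 7 and b = 3: c = a + b = 10 and d = -ceilDiv(10, 3) = -4.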
181 | |
182 | // can't bind to the results of expressions |
183 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
184 | ASSERT_ANY_THROW(evaluator.bind(c, 100)); |
185 | |
186 | // can't bind to concrete values |
187 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
188 | ASSERT_ANY_THROW(evaluator.bind(e, 100)); |
189 | |
190 | checkIntValue(evaluator, c, 10); |
191 | checkIntValue(evaluator, sub(a, b), 4); |
192 | checkIntValue(evaluator, mod(a, b), 1); |
193 | checkIntValue(evaluator, ceilDiv(a, b), 3); |
194 | checkIntValue(evaluator, d, -4); |
195 | |
196 | // Reset evaluation context |
197 | evaluator = ExpressionEvaluator(&fusion); |
198 | |
199 | evaluator.bind(a, 2); |
200 | evaluator.bind(b, 5); |
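  // With a = 2 and b = 5: c = 7 and d = -ceilDiv(7, 5) = -2.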
201 | |
202 | checkIntValue(evaluator, c, 7); |
203 | checkIntValue(evaluator, sub(a, b), -3); |
204 | checkIntValue(evaluator, mod(a, b), 2); |
205 | checkIntValue(evaluator, ceilDiv(a, b), 1); |
206 | checkIntValue(evaluator, d, -2); |
207 | } |
208 | |
209 | // Evaluate expressions in a simple IR |
210 | TEST_F(NVFuserTest, FusionExprEvalBasic_CUDA) { |
211 | Fusion fusion; |
212 | FusionGuard fg(&fusion); |
213 | |
214 | // Create a non-trivial IR |
215 | TensorView* tv0 = makeSymbolicTensor(2); |
216 | TensorView* tv1 = makeSymbolicTensor(2); |
217 | |
218 | fusion.addInput(tv0); |
219 | fusion.addInput(tv1); |
220 | |
221 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
222 | TensorView* tv3 = add(tv0, tv2); |
223 | |
224 | fusion.addOutput(tv3); |
225 | |
226 | tv3->split(0, 4); |
227 | |
228 | tv0->computeAt(tv3, 1); |
229 | tv1->computeAt(tv3, 1); |
230 | |
231 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
232 | tv2->axis(1)->parallelize(ParallelType::Unroll); |
233 | tv3->axis(1)->parallelize(ParallelType::Unroll); |
234 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
235 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
236 | |
237 | // 1. Create an evaluator |
238 | ExpressionEvaluator evaluator(&fusion); |
239 | |
240 | // 2. Bind values |
241 | // |
242 | // IMPORTANT: |
243 | // a. The bindings are only as stable as the Vals are in the fusion graph |
244 | // b. You must use the original (rootDomain) extents |
245 | // (ex. `tv0->getRootDomain()[0]->extent()` |
246 | // instead of `tv0->axis(0)->extent()`) |
247 | // |
248 | evaluator.bind(tv0->getRootDomain()[0]->extent(), 6); |
249 | evaluator.bind(tv0->getRootDomain()[1]->extent(), 128); |
250 | evaluator.bind(tv1->getRootDomain()[0]->extent(), 6); |
251 | evaluator.bind(tv1->getRootDomain()[1]->extent(), 128); |
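  // After split(0, 4): axis(0) extent = ceilDiv(6, 4) = 2, axis(1) extent = 4,
  // and axis(2) keeps the unsplit extent of 128.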
252 | |
253 | // 3. Evaluate and check result values |
254 | TORCH_CHECK(tv2->domain()->nDims() == 3); |
255 | checkIntValue(evaluator, tv2->axis(0)->extent(), 2); |
256 | checkIntValue(evaluator, tv2->axis(1)->extent(), 4); |
257 | checkIntValue(evaluator, tv2->axis(2)->extent(), 128); |
258 | |
259 | TORCH_CHECK(tv3->domain()->nDims() == 3); |
260 | checkIntValue(evaluator, tv3->axis(0)->extent(), 2); |
261 | checkIntValue(evaluator, tv3->axis(1)->extent(), 4); |
262 | checkIntValue(evaluator, tv3->axis(2)->extent(), 128); |
263 | } |
264 | |
265 | // Evaluate expressions in a more complex IR |
266 | TEST_F(NVFuserTest, FusionExprEvalComplex_CUDA) { |
267 | Fusion fusion; |
268 | FusionGuard fg(&fusion); |
269 | |
270 | TensorView* tv0 = makeSymbolicTensor(2); |
271 | fusion.addInput(tv0); |
272 | |
273 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0)); |
274 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0)); |
275 | TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0)); |
276 | TensorView* tv4 = add(tv2, tv1); |
277 | TensorView* tv5 = add(tv4, tv3); |
278 | TensorView* tv6 = add(tv0, tv3); |
279 | |
280 | fusion.addOutput(tv5); |
281 | fusion.addOutput(tv6); |
282 | |
283 | tv5->reorder({{-1, 0}}); |
284 | |
285 | tv6->split(0, 5); |
286 | tv5->merge(0); |
287 | |
288 | // 1. Create an evaluator |
289 | ExpressionEvaluator evaluator(&fusion); |
290 | |
291 | // 2. Bind values |
292 | evaluator.bind(tv0->getRootDomain()[0]->extent(), 129); |
293 | evaluator.bind(tv0->getRootDomain()[1]->extent(), 127); |
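  // tv5's two axes merge into one of extent 129 * 127 = 16383; tv6's split by
  // 5 yields ceilDiv(129, 5) = 26 outer and 5 inner.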
294 | |
295 | // Evaluate and check extent values |
296 | TORCH_CHECK(tv0->domain()->nDims() == 2); |
297 | checkIntValue(evaluator, tv0->axis(0)->extent(), 129); |
298 | checkIntValue(evaluator, tv0->axis(1)->extent(), 127); |
299 | |
300 | TORCH_CHECK(tv3->domain()->nDims() == 2); |
301 | checkIntValue(evaluator, tv3->axis(0)->extent(), 129); |
302 | checkIntValue(evaluator, tv3->axis(1)->extent(), 127); |
303 | |
304 | TORCH_CHECK(tv4->domain()->nDims() == 2); |
305 | checkIntValue(evaluator, tv4->axis(0)->extent(), 129); |
306 | checkIntValue(evaluator, tv4->axis(1)->extent(), 127); |
307 | |
308 | TORCH_CHECK(tv5->domain()->nDims() == 1); |
309 | checkIntValue(evaluator, tv5->axis(0)->extent(), 16383); |
310 | |
311 | TORCH_CHECK(tv6->domain()->nDims() == 3); |
312 | checkIntValue(evaluator, tv6->axis(0)->extent(), 26); |
313 | checkIntValue(evaluator, tv6->axis(1)->extent(), 5); |
314 | checkIntValue(evaluator, tv6->axis(2)->extent(), 127); |
315 | } |
316 | |
317 | // Evaluate expressions post lowering |
318 | TEST_F(NVFuserTest, FusionExprEvalPostLower_CUDA) { |
319 | Fusion fusion; |
320 | FusionGuard fg(&fusion); |
321 | |
322 | // Create a non-trivial IR |
323 | TensorView* tv0 = makeSymbolicTensor(2); |
324 | TensorView* tv1 = makeSymbolicTensor(2); |
325 | |
326 | fusion.addInput(tv0); |
327 | fusion.addInput(tv1); |
328 | |
329 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
330 | TensorView* tv3 = add(tv0, tv2); |
331 | |
332 | fusion.addOutput(tv3); |
333 | |
334 | tv3->split(0, 4); |
335 | |
336 | tv0->computeAt(tv3, 1); |
337 | tv1->computeAt(tv3, 1); |
338 | |
339 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
340 | tv2->axis(1)->parallelize(ParallelType::Unroll); |
341 | tv3->axis(1)->parallelize(ParallelType::Unroll); |
342 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
343 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
344 | |
345 | auto* bid_x = add(tv3->axis(0)->extent(), IrBuilder::create<Int>(0)); |
346 | auto* tid_x = add(tv3->axis(-1)->extent(), IrBuilder::create<Int>(0)); |
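  // bid_x and tid_x wrap the BIDx and TIDx extents in trivial add-zero
  // expressions so the evaluator can be exercised on them after lowering.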
347 | |
348 | // Lower |
349 | GpuLower gpulw(&fusion); |
350 | |
351 | // 1. Create an evaluation context |
352 | ExpressionEvaluator evaluator(&fusion); |
353 | |
354 | // 2. Bind values |
355 | evaluator.bind(tv0->getRootDomain()[0]->extent(), 6); |
356 | evaluator.bind(tv0->getRootDomain()[1]->extent(), 128); |
357 | evaluator.bind(tv1->getRootDomain()[0]->extent(), 6); |
358 | evaluator.bind(tv1->getRootDomain()[1]->extent(), 128); |
359 | |
360 | // 3. Evaluate and check result values |
361 | TORCH_CHECK(tv2->domain()->nDims() == 3); |
362 | checkIntValue(evaluator, tv2->axis(0)->extent(), 2); |
363 | checkIntValue(evaluator, tv2->axis(1)->extent(), 4); |
364 | checkIntValue(evaluator, tv2->axis(2)->extent(), 128); |
365 | |
366 | TORCH_CHECK(tv3->domain()->nDims() == 3); |
367 | checkIntValue(evaluator, tv3->axis(0)->extent(), 2); |
368 | checkIntValue(evaluator, tv3->axis(1)->extent(), 4); |
369 | checkIntValue(evaluator, tv3->axis(2)->extent(), 128); |
370 | |
371 | checkIntValue(evaluator, bid_x, 2); |
372 | checkIntValue(evaluator, tid_x, 128); |
373 | } |
374 | |
375 | // Kernel IR: Evaluate basic scalar operations with constant values |
376 | TEST_F(NVFuserTest, FusionKernelExprEvalConstants_CUDA) { |
377 | Fusion fusion; |
378 | kir::Kernel kernel(&fusion); |
379 | FusionGuard fg((&kernel)->as<Fusion>()); |
380 | |
381 | auto a = IrBuilder::create<Int>(7); |
382 | auto b = IrBuilder::create<Int>(3); |
383 | auto c = IrBuilder::subExpr(a, b); |
384 | auto d = IrBuilder::divExpr(a, b); |
385 | auto e = IrBuilder::mulExpr(c, d); |
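  // c = 7 - 3 = 4 and d = 7 / 3 = 2 (integer division), so e = 8.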
386 | |
387 | kir::ExpressionEvaluator evaluator; |
388 | |
389 | checkIntValue(evaluator, IrBuilder::negExpr(a), -7); |
390 | checkIntValue(evaluator, IrBuilder::addExpr(a, b), 10); |
391 | checkIntValue(evaluator, IrBuilder::negExpr(e), -8); |
392 | checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1); |
393 | checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3); |
394 | } |
395 | |
396 | // Kernel IR: Evaluate basic scalar operations with bound values |
397 | TEST_F(NVFuserTest, FusionKernelExprEvalBindings_CUDA) { |
398 | Fusion fusion; |
399 | kir::Kernel kernel(&fusion); |
400 | FusionGuard fg((&kernel)->as<Fusion>()); |
401 | |
402 | kir::ExpressionEvaluator evaluator; |
403 | |
404 | auto a = IrBuilder::create<Int>(c10::nullopt); |
405 | auto b = IrBuilder::create<Int>(c10::nullopt); |
406 | auto c = IrBuilder::addExpr(a, b); |
407 | auto d = IrBuilder::negExpr(IrBuilder::ceilDivExpr(c, b)); |
408 | auto e = IrBuilder::create<Int>(0); |
409 | |
410 | // trying to evaluate before binding should give empty results |
411 | TORCH_CHECK(!evaluator.evaluate(a).has_value()); |
412 | TORCH_CHECK(!evaluator.evaluate(d).has_value()); |
413 | |
414 | evaluator.bind(a, 7); |
415 | evaluator.bind(b, 3); |
416 | |
417 | // can't bind to the results of expressions |
418 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
419 | ASSERT_ANY_THROW(evaluator.bind(c, 100)); |
420 | |
421 | // can't bind to concrete values |
422 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
423 | ASSERT_ANY_THROW(evaluator.bind(e, 100)); |
424 | |
425 | checkIntValue(evaluator, c, 10); |
426 | checkIntValue(evaluator, IrBuilder::subExpr(a, b), 4); |
427 | checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1); |
428 | checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3); |
429 | checkIntValue(evaluator, d, -4); |
430 | |
431 | // Reset the evaluation context |
432 | evaluator = kir::ExpressionEvaluator(); |
433 | |
434 | evaluator.bind(a, 2); |
435 | evaluator.bind(b, 5); |
436 | |
437 | checkIntValue(evaluator, c, 7); |
438 | checkIntValue(evaluator, IrBuilder::subExpr(a, b), -3); |
439 | checkIntValue(evaluator, IrBuilder::modExpr(a, b), 2); |
440 | checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 1); |
441 | checkIntValue(evaluator, d, -2); |
442 | } |
443 | |
444 | TEST_F(NVFuserTest, FusionClear_CUDA) { |
445 | Fusion fusion; |
446 | FusionGuard fg(&fusion); |
447 | |
448 | // 1. Create a dummy IR |
449 | |
450 | { |
451 | TensorView* tv0 = makeSymbolicTensor(2); |
452 | TensorView* tv1 = makeSymbolicTensor(2); |
453 | |
454 | fusion.addInput(tv0); |
455 | fusion.addInput(tv1); |
456 | |
457 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
458 | TensorView* tv3 = add(tv0, tv2); |
459 | |
460 | fusion.addOutput(tv3); |
461 | |
462 | tv3->split(0, 4); |
463 | tv0->computeAt(tv3, 1); |
464 | tv1->computeAt(tv3, 1); |
465 | |
466 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
467 | tv2->axis(1)->parallelize(ParallelType::Unroll); |
468 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
469 | } |
470 | |
471 | // 2. Clear the IR |
472 | |
473 | fusion.clear(); |
474 | |
475 | TORCH_CHECK(fusion.unordered_exprs().empty()); |
476 | TORCH_CHECK(fusion.vals().empty()); |
477 | |
478 | TORCH_CHECK(fusion.inputs().empty()); |
479 | TORCH_CHECK(fusion.outputs().empty()); |
480 | |
481 | TORCH_CHECK(ir_utils::getReductionOps(&fusion).empty()); |
482 | |
483 | // 3. Rebuild the IR |
484 | |
485 | { |
486 | TensorView* tv0 = makeSymbolicTensor(3); |
487 | TensorView* tv1 = makeSymbolicTensor(3); |
488 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
489 | TensorView* tv3 = add(tv0, tv2); |
490 | |
491 | fusion.addInput(tv0); |
492 | fusion.addInput(tv1); |
493 | fusion.addOutput(tv3); |
494 | |
495 | // tv3 [i0, i1, i2] |
496 | tv3->reorder({{0, 2}, {2, 0}}); |
497 | // tv3 [i2, i1, i0] |
498 | tv3->split(-1, 4); |
499 | // tv3 [i2, i1, i0outer, i0inner{4}] |
500 | tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); |
501 | // tv3 [i0outer, i0inner{4}, i1, i2] |
502 | tv0->computeAt(tv3, -1); |
503 | tv1->computeAt(tv3, -1); |
504 | tv3->axis(1)->parallelize(ParallelType::BIDx); |
505 | } |
506 | |
507 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
508 | |
509 | at::Tensor input1 = at::randn({16, 8, 8}, options); |
510 | at::Tensor input2 = at::randn_like(input1); |
511 | |
512 | FusionExecutor fe; |
513 | fe.compileFusion(&fusion, {input1, input2}); |
514 | auto outputs = fe.runFusion({input1, input2}); |
515 | |
516 | at::Tensor tv2_ref = input2 + 2.0; |
517 | at::Tensor output_ref = input1 + tv2_ref; |
518 | |
519 | TORCH_CHECK(output_ref.equal(outputs[0])); |
520 | } |
521 | |
522 | TEST_F(NVFuserTest, FusionCopy_CUDA) { |
523 | Fusion original_fusion; |
524 | |
525 | // Create the test IR |
526 | { |
527 | FusionGuard fg(&original_fusion); |
528 | |
529 | auto tv0 = makeSymbolicTensor(3); |
530 | auto tv1 = makeSymbolicTensor(3); |
531 | auto tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
532 | auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); |
533 | |
534 | original_fusion.addInput(tv0); |
535 | original_fusion.addInput(tv1); |
536 | original_fusion.addOutput(tv3); |
537 | |
538 | tv3->reorder({{0, 2}, {2, 0}}); |
539 | tv3->split(-1, 4); |
540 | tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); |
541 | |
542 | tv0->computeAt(tv3, -1); |
543 | tv1->computeAt(tv3, -1); |
544 | |
545 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
546 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
547 | } |
548 | |
549 | // Test copy before lowering |
550 | Fusion clone = original_fusion; |
551 | |
552 | // Compare IR dumps |
553 | std::stringstream original_ir; |
554 | std::stringstream clone_ir; |
555 | original_ir << original_fusion; |
556 | clone_ir << clone; |
557 | ASSERT_EQ(original_ir.str(), clone_ir.str()); |
558 | |
559 | // Lower original fusion |
560 | std::string original_kernel; |
561 | { |
562 | // TODO(kir): remove this guard once we implement the cuda codegen visitor |
563 | FusionGuard fg(&original_fusion); |
564 | original_kernel = |
565 | codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); |
566 | } |
567 | |
568 | // Make sure the "before lowering" clone was not mutated |
569 | // while lowering the original fusion IR |
570 | std::stringstream before_lowering_ir; |
571 | before_lowering_ir << clone; |
572 | ASSERT_EQ(original_ir.str(), before_lowering_ir.str()); |
573 | |
574 | // Test copy after lowering (including assignment operator) |
575 | Fusion before_lowering = clone; |
576 | clone = original_fusion; |
577 | |
578 | // Compare IR dumps |
579 | std::stringstream original_lowered_ir; |
580 | std::stringstream clone_lowered_ir; |
581 | original_lowered_ir << original_fusion; |
582 | clone_lowered_ir << clone; |
583 | ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); |
584 | |
585 | // Lower the "before lowering" and compare kernels |
586 | std::string clone_kernel; |
587 | { |
588 | // TODO(kir): remove this guard once we implement the cuda codegen visitor |
589 | FusionGuard fg(&before_lowering); |
590 | clone_kernel = |
591 | codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); |
592 | } |
593 | ASSERT_EQ(original_kernel, clone_kernel); |
594 | } |
595 | |
596 | TEST_F(NVFuserTest, FusionMove_CUDA) { |
597 | Fusion fusion; |
598 | |
599 | // Create the test IR |
600 | { |
601 | FusionGuard fg(&fusion); |
602 | |
603 | auto tv0 = makeSymbolicTensor(3); |
604 | auto tv1 = makeSymbolicTensor(3); |
605 | auto tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
606 | auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); |
607 | |
608 | fusion.addInput(tv0); |
609 | fusion.addInput(tv1); |
610 | fusion.addOutput(tv3); |
611 | |
612 | tv3->reorder({{0, 2}, {2, 0}}); |
613 | tv3->split(-1, 4); |
614 | tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); |
615 | |
616 | tv0->computeAt(tv3, -1); |
617 | tv1->computeAt(tv3, -1); |
618 | |
619 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
620 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
621 | } |
622 | |
623 | std::stringstream original_ir; |
624 | original_ir << fusion; |
625 | |
626 | // Test move before lowering |
627 | Fusion another_fusion = std::move(fusion); |
628 | |
629 | // Check that the original fusion is "empty" |
630 | // |
631 | // IMPORTANT: these checks assume knowledge of the internal |
632 | // implementation of the move operations. General uses |
633 | // should only assume that the moved-from object is in |
634 | // a valid, but unspecified state. This is similar to the |
635 | // standard library containers: |
636 | // https://en.cppreference.com/w/cpp/utility/move |
637 | // |
638 | TORCH_CHECK(fusion.unordered_exprs().empty()); |
639 | TORCH_CHECK(fusion.vals().empty()); |
640 | TORCH_CHECK(fusion.inputs().empty()); |
641 | TORCH_CHECK(fusion.outputs().empty()); |
642 | |
643 | // clear() has no pre-conditions so it's valid to call on a moved-from object |
644 | fusion.clear(); |
645 | |
646 | // Compare IR dumps |
647 | std::stringstream another_ir; |
648 | another_ir << another_fusion; |
649 | ASSERT_EQ(original_ir.str(), another_ir.str()); |
650 | |
651 | // Lower the fusion IR |
652 | GpuLower lower(&another_fusion); |
653 | |
654 | std::stringstream lowered_ir; |
655 | lowered_ir << another_fusion; |
656 | |
657 | // Test move assignment after lowering |
658 | fusion = std::move(another_fusion); |
659 | |
660 | // Compare IR dumps |
661 | std::stringstream moved_lowered_ir; |
662 | moved_lowered_ir << fusion; |
663 | ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str()); |
664 | } |
665 | |
666 | TEST_F(NVFuserTest, FusionSimpleArith_CUDA) { |
667 | std::stringstream ss1, ss2; |
668 | |
669 | Fusion fusion; |
670 | FusionGuard fg(&fusion); |
671 | |
672 | Double* d1 = IrBuilder::create<Double>(1.f); |
673 | Double* d2 = IrBuilder::create<Double>(2.f); |
674 | Double* d3 = IrBuilder::create<Double>(); |
675 | |
676 | // Disrupt the fusion to make sure guard works well |
677 | { |
678 | Fusion fusion2; |
679 | FusionGuard fg(&fusion2); |
680 | |
681 | Double* d1 = IrBuilder::create<Double>(1.f); |
682 | Double* d2 = IrBuilder::create<Double>(2.f); |
683 | add(d1, d2); |
684 | ss2 << fusion2; |
685 | } |
686 | |
687 | IrBuilder::create<BinaryOp>(BinaryOpType::Add, d3, d1, d2); |
688 | ss1 << fusion; |
689 | |
  TORCH_CHECK(
      ss1.str().compare(ss2.str()) == 0,
      "Error where explicit add nodes don't match implicit add nodes.");
693 | } |
694 | |
695 | TEST_F(NVFuserTest, FusionScalarTypePromote_CUDA) { |
696 | Fusion fusion; |
697 | FusionGuard fg(&fusion); |
698 | |
699 | Bool* b = IrBuilder::create<Bool>(true); |
700 | Double* d = IrBuilder::create<Double>(4.f); |
701 | Int* i = IrBuilder::create<Int>(3); |
702 | ComplexDouble* c = |
703 | IrBuilder::create<ComplexDouble>(c10::complex<double>(1, 2)); |
704 | |
705 | TORCH_CHECK(add(b, b)->getDataType() == DataType::Bool); |
706 | TORCH_CHECK(add(b, d)->getDataType() == DataType::Double); |
707 | TORCH_CHECK(add(b, i)->getDataType() == DataType::Int); |
708 | TORCH_CHECK(add(b, c)->getDataType() == DataType::ComplexDouble); |
709 | |
710 | TORCH_CHECK(add(d, b)->getDataType() == DataType::Double); |
711 | TORCH_CHECK(add(d, d)->getDataType() == DataType::Double); |
712 | TORCH_CHECK(add(d, i)->getDataType() == DataType::Double); |
713 | TORCH_CHECK(add(d, c)->getDataType() == DataType::ComplexDouble); |
714 | |
715 | TORCH_CHECK(add(i, b)->getDataType() == DataType::Int); |
716 | TORCH_CHECK(add(i, d)->getDataType() == DataType::Double); |
717 | TORCH_CHECK(add(i, i)->getDataType() == DataType::Int); |
718 | TORCH_CHECK(add(i, c)->getDataType() == DataType::ComplexDouble); |
719 | |
720 | TORCH_CHECK(add(c, b)->getDataType() == DataType::ComplexDouble); |
721 | TORCH_CHECK(add(c, d)->getDataType() == DataType::ComplexDouble); |
722 | TORCH_CHECK(add(c, i)->getDataType() == DataType::ComplexDouble); |
723 | TORCH_CHECK(add(c, c)->getDataType() == DataType::ComplexDouble); |
724 | } |
725 | |
726 | TEST_F(NVFuserTest, FusionComplexAbsTypes_CUDA) { |
727 | Fusion fusion; |
728 | FusionGuard fg(&fusion); |
729 | |
730 | auto options = at::TensorOptions().device(at::kCUDA, 0); |
731 | auto tensor_cf = at::randn({4, 4, 4}, options.dtype(at::kComplexFloat)); |
732 | auto tensor_cd = at::randn({4, 4, 4}, options.dtype(at::kComplexDouble)); |
733 | |
734 | auto type_cf = TensorType::create(tensor_cf); |
735 | auto tv_cf = IrBuilder::create<TensorView>(type_cf); |
736 | auto type_cd = TensorType::create(tensor_cd); |
737 | auto tv_cd = IrBuilder::create<TensorView>(type_cd); |
738 | |
739 | TORCH_CHECK( |
740 | tensor_cf.abs().scalar_type() == |
741 | data_type_to_aten(abs(tv_cf)->getDataType().value())); |
742 | TORCH_CHECK( |
743 | tensor_cd.abs().scalar_type() == |
744 | data_type_to_aten(abs(tv_cd)->getDataType().value())); |
745 | } |
746 | |
747 | TEST_F(NVFuserTest, FusionRegister_CUDA) { |
748 | Fusion fusion; |
749 | FusionGuard fg(&fusion); |
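  // Vals and Exprs receive sequential names in the order they are registered
  // with the fusion; the checks below rely on that ordering.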
750 | Double* v1 = IrBuilder::create<Double>(1.f); |
751 | Double* v2 = IrBuilder::create<Double>(2.f); |
752 | Val* v3 = binaryOp(BinaryOpType::Add, v1, v2); |
753 | Val* v4 = binaryOp(BinaryOpType::Add, v1, v2); |
754 | TORCH_CHECK(v1->name() + 1 == v2->name()); |
755 | TORCH_CHECK(v2->name() + 1 == v3->name()); |
756 | TORCH_CHECK(v3->name() + 1 == v4->name()); |
757 | TORCH_CHECK(v3->definition()->name() + 1 == v4->definition()->name()); |
758 | } |
759 | |
// Dummy expr with two outputs, used only for the toposort test.
761 | struct DummyExpr : public Expr { |
762 | ~DummyExpr() = default; |
763 | DummyExpr( |
764 | IrBuilderPasskey passkey, |
765 | Val* _outlhs, |
766 | Val* _outrhs, |
767 | Val* _lhs, |
768 | Val* _rhs) |
769 | : Expr(passkey, ExprType::UnaryOp) // Not terribly safe... |
770 | { |
771 | addOutput(_outlhs); |
772 | addOutput(_outrhs); |
773 | addInput(_lhs); |
774 | addInput(_rhs); |
775 | } |
776 | DummyExpr(const DummyExpr& other) = delete; |
777 | DummyExpr& operator=(const DummyExpr& other) = delete; |
778 | DummyExpr(DummyExpr&& other) = delete; |
779 | DummyExpr& operator=(DummyExpr&& other) = delete; |
780 | Expr* shallowCopy() const override { |
781 | return nullptr; |
782 | } |
783 | }; |
784 | |
785 | TEST_F(NVFuserTest, FusionTopoSort_CUDA) { |
786 | Fusion fusion; |
787 | FusionGuard fg(&fusion); |
788 | |
789 | // e0: v3, v2 = dummy(v1, v0) |
790 | // e1: v4 = add(v3, v2) |
791 | // e2: v5 = add(v2, v4) |
792 | // e3: v6 = add(v5, v5) |
793 | Double* v0 = IrBuilder::create<Double>(); |
794 | Double* v1 = IrBuilder::create<Double>(); |
795 | Double* v2 = IrBuilder::create<Double>(); |
796 | Double* v3 = IrBuilder::create<Double>(); |
797 | Double* v4 = IrBuilder::create<Double>(); |
798 | Double* v5 = IrBuilder::create<Double>(); |
799 | Double* v6 = IrBuilder::create<Double>(); |
800 | |
801 | std::vector<Val*> inputs = {v0, v1}; |
802 | for (auto val : inputs) { |
803 | fusion.addInput(val); |
804 | } |
805 | |
806 | Expr* e0 = IrBuilder::create<DummyExpr>(v3, v2, v1, v0); |
807 | Expr* e1 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v4, v3, v2); |
808 | Expr* e2 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v5, v2, v4); |
809 | Expr* e3 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v6, v5, v5); |
810 | |
811 | fusion.addOutput(v2); |
812 | fusion.addOutput(v3); |
813 | auto exprs = fusion.exprs(); |
  TORCH_CHECK(exprs.size() == 1, "Found ", exprs.size(), " but expecting 1");
815 | TORCH_CHECK(exprs[0] == e0); |
816 | |
817 | fusion.addOutput(v5); |
818 | exprs = fusion.exprs(); |
  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
820 | TORCH_CHECK(exprs[0] == e0); |
821 | TORCH_CHECK(exprs[1] == e1); |
822 | TORCH_CHECK(exprs[2] == e2); |
823 | |
824 | fusion.addOutput(v4); |
825 | exprs = fusion.exprs(); |
  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
827 | TORCH_CHECK(exprs[0] == e0); |
828 | TORCH_CHECK(exprs[1] == e1); |
829 | TORCH_CHECK(exprs[2] == e2); |
830 | |
831 | fusion.addOutput(v6); |
832 | exprs = fusion.exprs(); |
  TORCH_CHECK(exprs.size() == 4, "Found ", exprs.size(), " but expecting 4");
834 | TORCH_CHECK(exprs[0] == e0); |
835 | TORCH_CHECK(exprs[1] == e1); |
836 | TORCH_CHECK(exprs[2] == e2); |
837 | TORCH_CHECK(exprs[3] == e3); |
838 | |
839 | TORCH_CHECK(v2->definition()->name() == 0); |
840 | TORCH_CHECK(v3->definition()->name() == 0); |
841 | TORCH_CHECK(v4->definition()->name() == 1); |
842 | TORCH_CHECK(v5->definition()->name() == 2); |
843 | TORCH_CHECK(v6->definition()->name() == 3); |
844 | } |
845 | |
846 | TEST_F(NVFuserTest, FusionTensor_CUDA) { |
847 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
848 | |
849 | Fusion fusion; |
850 | FusionGuard fg(&fusion); |
851 | |
852 | { |
853 | auto tensor = at::randn({2, 3, 4, 5}, options); |
854 | auto tensor_type = TensorType::create(tensor); |
855 | auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type); |
856 | TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); |
857 | TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); |
858 | TORCH_CHECK(fuser_tensor->domain() != nullptr); |
859 | for (const auto i : c10::irange(fuser_tensor->nDims())) { |
      // Size-1 dimensions are marked as broadcast
      TORCH_CHECK(
          fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1));
      // Check contiguity information
864 | TORCH_CHECK(fuser_tensor->domain()->contiguity()[i]); |
865 | } |
866 | } |
867 | |
868 | // TensorType::create fills stride_properties, which helps us to mark |
869 | // IterDomain properly |
870 | // Note: implementation could change, depending on how much we want to invest |
871 | // in our home-brew contiguity coalescing. For now let's make sure that we |
872 | // properly test what we are using. |
873 | { |
874 | auto tensor = at::randn({4, 4, 4}, options); |
875 | auto sliced_tensor = tensor.slice(1, 0, -1, 2); |
876 | |
877 | auto tensor_type = TensorType::create(sliced_tensor); |
878 | auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type); |
879 | TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); |
880 | TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); |
881 | TORCH_CHECK(fuser_tensor->domain() != nullptr); |
882 | for (const auto i : c10::irange(fuser_tensor->nDims())) { |
      // Size-1 dimensions are marked as broadcast; this sliced tensor has none
884 | TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false); |
885 | } |
886 | TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); |
887 | TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); |
888 | TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); |
889 | } |
890 | |
891 | { |
892 | auto tensor = at::randn({2, 3, 4, 5}, options); |
893 | auto permuted_tensor = tensor.permute({0, 3, 1, 2}); |
894 | auto tensor_type = TensorType::create(permuted_tensor); |
895 | auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type); |
896 | TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); |
897 | TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); |
898 | TORCH_CHECK(fuser_tensor->domain() != nullptr); |
899 | for (const auto i : c10::irange(fuser_tensor->nDims())) { |
      // Size-1 dimensions are marked as broadcast; this permuted tensor has none
901 | TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false); |
902 | } |
903 | TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); |
904 | TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); |
905 | TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); |
906 | TORCH_CHECK(!fuser_tensor->domain()->contiguity()[3]); |
907 | } |
908 | } |
909 | |
910 | TEST_F(NVFuserTest, FusionFilterVals_CUDA) { |
911 | Fusion fusion; |
912 | FusionGuard fg(&fusion); |
913 | |
914 | auto tv0 = makeSymbolicTensor(1); |
915 | auto tv1 = makeSymbolicTensor(1); |
916 | auto scalar0 = IrBuilder::create<Double>(0); |
917 | auto scalar1 = IrBuilder::create<Int>(0); |
918 | auto scalar2 = IrBuilder::create<Int>(1); |
919 | |
920 | const std::vector<Val*> vals = {tv0, scalar0, tv1, scalar1, scalar2}; |
921 | |
922 | std::vector<TensorView*> tvs( |
923 | ir_utils::filterByType<TensorView>(vals).begin(), |
924 | ir_utils::filterByType<TensorView>(vals).end()); |
925 | TORCH_CHECK(tvs.size() == 2); |
926 | TORCH_CHECK(tvs[0] == tv0); |
927 | TORCH_CHECK(tvs[1] == tv1); |
928 | |
929 | std::vector<Double*> floats( |
930 | ir_utils::filterByType<Double>(vals).begin(), |
931 | ir_utils::filterByType<Double>(vals).end()); |
932 | TORCH_CHECK(floats.size() == 1); |
933 | TORCH_CHECK(floats[0] == scalar0); |
934 | |
935 | std::vector<Int*> ints( |
936 | ir_utils::filterByType<Int>(vals).begin(), |
937 | ir_utils::filterByType<Int>(vals).end()); |
938 | TORCH_CHECK(ints.size() == 2); |
939 | TORCH_CHECK(ints[0] == scalar1); |
940 | TORCH_CHECK(ints[1] == scalar2); |
941 | |
  TORCH_CHECK(
      ir_utils::filterByType<Expr>(vals).begin() ==
          ir_utils::filterByType<Expr>(vals).end(),
      "Not expecting any results");
946 | } |
947 | |
948 | TEST_F(NVFuserTest, FusionTVSplit_CUDA) { |
949 | Fusion fusion; |
950 | FusionGuard fg(&fusion); |
951 | |
952 | TensorView* tv = makeSymbolicTensor(3); |
953 | |
954 | tv = tv->split(2, 2); |
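  // split(2, 2) replaces axis 2 with an outer axis of extent
  // ceilDiv(old_extent, 2) and an inner axis of constant extent 2.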
955 | TORCH_CHECK(tv->nDims() == 4); |
956 | Expr* outer = tv->axis(2)->extent()->definition(); |
957 | |
958 | TORCH_CHECK( |
959 | outer->getExprType().value() == ExprType::BinaryOp && |
960 | static_cast<BinaryOp*>(outer)->getBinaryOpType() == |
961 | BinaryOpType::CeilDiv && |
962 | static_cast<BinaryOp*>(outer)->lhs()->sameAs( |
963 | tv->getRootDomain()[2]->extent()) && |
964 | static_cast<Int*>(static_cast<BinaryOp*>(outer)->rhs()) |
965 | ->sameAs(IrBuilder::create<Int>(2))); |
966 | |
967 | IterDomain* inner = static_cast<IterDomain*>(tv->axis(3)); |
968 | TORCH_CHECK( |
969 | inner->extent()->isScalar() && |
970 | static_cast<Int*>(inner->extent())->isConst() && |
971 | static_cast<Int*>(inner->extent())->value().value() == 2); |
972 | } |
973 | |
974 | TEST_F(NVFuserTest, FusionTVMerge_CUDA) { |
975 | Fusion fusion; |
976 | FusionGuard fg(&fusion); |
977 | |
978 | TensorView* tv = makeSymbolicTensor(3); |
979 | |
980 | tv = tv->merge(1); |
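  // merge(1) fuses axes 1 and 2; the merged extent is the product of the two
  // root extents.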
981 | Expr* axisOp = tv->axis(1)->extent()->definition(); |
982 | |
983 | TORCH_CHECK( |
984 | tv->nDims() == 2 && axisOp->getExprType() == ExprType::BinaryOp && |
985 | static_cast<BinaryOp*>(axisOp)->getBinaryOpType() == BinaryOpType::Mul && |
986 | static_cast<BinaryOp*>(axisOp)->lhs() == |
987 | tv->getRootDomain()[1]->extent() && |
988 | static_cast<BinaryOp*>(axisOp)->rhs() == |
989 | tv->getRootDomain()[2]->extent()); |
990 | } |
991 | |
992 | TEST_F(NVFuserTest, FusionTVReorder_CUDA) { |
993 | Fusion fusion; |
994 | FusionGuard fg(&fusion); |
995 | |
996 | std::unordered_map<int, int> shift_right{{-1, 0}}; |
997 | |
998 | std::unordered_map<int, int> shift_left{{0, -1}}; |
999 | |
1000 | std::unordered_map<int, int> shift_left_2{{0, -1}, {1, 0}, {2, 1}}; |
1001 | |
1002 | std::unordered_map<int, int> swap{{0, 2}, {2, 0}}; |
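  // Each reorder entry maps {old position, new position}: swap exchanges axes
  // 0 and 2, shift_left moves axis 0 to the end, and shift_right moves the
  // last axis to the front.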
1003 | |
1004 | auto tv = makeSymbolicTensor(3); |
1005 | std::vector<IterDomain*> ref; |
1006 | ref = std::vector<IterDomain*>( |
1007 | tv->domain()->domain().begin(), tv->domain()->domain().end()); |
1008 | |
1009 | tv->reorder(shift_left); |
1010 | for (const auto i : c10::irange(tv->nDims())) { |
1011 | TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); |
1012 | } |
1013 | |
1014 | tv = makeSymbolicTensor(3); |
1015 | ref = std::vector<IterDomain*>( |
1016 | tv->domain()->domain().begin(), tv->domain()->domain().end()); |
1017 | |
  // shift_left_2 expresses the same rotation with every axis listed explicitly
  tv->reorder(shift_left_2);
1019 | for (const auto i : c10::irange(tv->nDims())) { |
1020 | TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); |
1021 | } |
1022 | |
1023 | tv = makeSymbolicTensor(3); |
1024 | ref = std::vector<IterDomain*>( |
1025 | tv->domain()->domain().begin(), tv->domain()->domain().end()); |
1026 | |
1027 | tv->reorder(shift_right); |
1028 | TORCH_CHECK(ref[ref.size() - 1]->sameAs(tv->axis(0))); |
1029 | for (const auto i : c10::irange(1, tv->nDims())) { |
1030 | TORCH_CHECK(ref[i - 1]->sameAs(tv->axis(i))); |
1031 | } |
1032 | |
1033 | tv = makeSymbolicTensor(3); |
1034 | ref = std::vector<IterDomain*>( |
1035 | tv->domain()->domain().begin(), tv->domain()->domain().end()); |
1036 | tv->reorder(swap); |
1037 | TORCH_CHECK(ref[0]->sameAs(tv->axis(2))); |
1038 | TORCH_CHECK(ref[2]->sameAs(tv->axis(0))); |
1039 | TORCH_CHECK(ref[1]->sameAs(tv->axis(1))); |
1040 | } |
1041 | |
1042 | TEST_F(NVFuserTest, FusionEquality_CUDA) { |
1043 | Fusion fusion; |
1044 | FusionGuard fg(&fusion); |
1045 | |
1046 | Double* fval1 = IrBuilder::create<Double>(); |
1047 | Double* fval1_copy = fval1; |
1048 | Double* fval2 = IrBuilder::create<Double>(); |
1049 | Double* fone = IrBuilder::create<Double>(1.0); |
1050 | |
1051 | TORCH_CHECK(fval1->sameAs(fval1_copy)); |
1052 | TORCH_CHECK(!fval1->sameAs(fval2)); |
1053 | TORCH_CHECK(!fone->sameAs(fval1)); |
1054 | TORCH_CHECK(fone->sameAs(IrBuilder::create<Double>(1.0))); |
1055 | |
1056 | Int* ival1 = IrBuilder::create<Int>(); |
1057 | Int* ival1_copy = ival1; |
1058 | Int* ival2 = IrBuilder::create<Int>(); |
1059 | Int* ione = IrBuilder::create<Int>(1); |
1060 | |
1061 | TORCH_CHECK(ival1->sameAs(ival1_copy)); |
1062 | TORCH_CHECK(!ival1->sameAs(ival2)); |
1063 | TORCH_CHECK(!ione->sameAs(ival1)); |
1064 | TORCH_CHECK(ione->sameAs(IrBuilder::create<Int>(1))); |
1065 | |
1066 | BinaryOp* add1 = IrBuilder::create<BinaryOp>( |
1067 | BinaryOpType::Add, IrBuilder::create<Double>(), fval1, ival1); |
1068 | BinaryOp* add1_copy = IrBuilder::create<BinaryOp>( |
1069 | BinaryOpType::Add, IrBuilder::create<Double>(), fval1, ival1); |
1070 | BinaryOp* sub1 = IrBuilder::create<BinaryOp>( |
1071 | BinaryOpType::Sub, IrBuilder::create<Double>(), fval1, ival1); |
1072 | |
1073 | UnaryOp* neg1 = IrBuilder::create<UnaryOp>( |
1074 | UnaryOpType::Neg, IrBuilder::create<Double>(), fval1); |
1075 | UnaryOp* neg2 = IrBuilder::create<UnaryOp>( |
1076 | UnaryOpType::Neg, IrBuilder::create<Double>(), fval2); |
1077 | UnaryOp* neg1_copy = IrBuilder::create<UnaryOp>( |
1078 | UnaryOpType::Neg, IrBuilder::create<Double>(), fval1); |
1079 | |
1080 | TORCH_CHECK(add1->sameAs(add1_copy)); |
1081 | TORCH_CHECK(!add1->sameAs(sub1)); |
1082 | |
1083 | TORCH_CHECK(neg1->sameAs(neg1_copy)); |
1084 | TORCH_CHECK(!static_cast<Expr*>(neg1)->sameAs(add1)); |
1085 | TORCH_CHECK(!neg1->sameAs(neg2)); |
1086 | } |
1087 | |
1088 | TEST_F(NVFuserTest, FusionDependency_CUDA) { |
1089 | Fusion fusion; |
1090 | FusionGuard fg(&fusion); |
1091 | |
1092 | Double* d0 = IrBuilder::create<Double>(0.f); |
1093 | Double* d1 = IrBuilder::create<Double>(1.f); |
1094 | auto d2 = add(d0, d1); |
1095 | |
1096 | auto d3 = add(d2, d2); |
1097 | |
1098 | Double* d4 = IrBuilder::create<Double>(4.f); |
1099 | Double* d5 = IrBuilder::create<Double>(5.f); |
1100 | auto d6 = add(d4, d5); |
1101 | |
1102 | Double* d7 = IrBuilder::create<Double>(7.f); |
1103 | Double* d8 = IrBuilder::create<Double>(8.f); |
1104 | auto d9 = add(d7, d8); |
1105 | |
1106 | auto d10 = add(d6, d9); |
1107 | |
1108 | auto d11 = add(d3, d10); |
1109 | |
1110 | TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d11)); |
1111 | TORCH_CHECK(DependencyCheck::isDependencyOf(d1, d11)); |
1112 | TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d11)); |
1113 | TORCH_CHECK(DependencyCheck::isDependencyOf(d3, d11)); |
1114 | TORCH_CHECK(DependencyCheck::isDependencyOf(d6, d11)); |
1115 | TORCH_CHECK(DependencyCheck::isDependencyOf(d9, d11)); |
1116 | TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d2)); |
1117 | TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d3)); |
1118 | TORCH_CHECK(DependencyCheck::isDependencyOf(d4, d6)); |
1119 | TORCH_CHECK(DependencyCheck::isDependencyOf(d8, d10)); |
1120 | |
1121 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d0)); |
1122 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d1)); |
1123 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d2)); |
1124 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d3)); |
1125 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d4)); |
1126 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d5)); |
1127 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d2, d0)); |
1128 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d3, d2)); |
1129 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d6, d4)); |
1130 | TORCH_CHECK(!DependencyCheck::isDependencyOf(d10, d8)); |
1131 | |
1132 | auto dep_chain = DependencyCheck::getSingleDependencyChain(d0, d11); |
1133 | TORCH_CHECK(dep_chain.back() == d11); |
1134 | dep_chain.pop_back(); |
1135 | TORCH_CHECK(dep_chain.back() == d3); |
1136 | dep_chain.pop_back(); |
1137 | TORCH_CHECK(dep_chain.back() == d2); |
1138 | dep_chain.pop_back(); |
1139 | |
1140 | dep_chain = DependencyCheck::getSingleDependencyChain(d6, d11); |
1141 | TORCH_CHECK(dep_chain.back() == d11); |
1142 | dep_chain.pop_back(); |
1143 | TORCH_CHECK(dep_chain.back() == d10); |
1144 | dep_chain.pop_back(); |
1145 | |
1146 | dep_chain = DependencyCheck::getSingleDependencyChain(d4, d11); |
1147 | TORCH_CHECK(dep_chain.back() == d11); |
1148 | dep_chain.pop_back(); |
1149 | TORCH_CHECK(dep_chain.back() == d10); |
1150 | dep_chain.pop_back(); |
1151 | TORCH_CHECK(dep_chain.back() == d6); |
1152 | dep_chain.pop_back(); |
1153 | |
1154 | dep_chain = DependencyCheck::getSingleDependencyChain(d11, d2); |
1155 | TORCH_CHECK(dep_chain.empty()); |
1156 | } |
1157 | |
1158 | TEST_F(NVFuserTest, FusionParser_CUDA) { |
  // This test may not pass if using a custom block sync, as there may be
  // additional sync calls. Skip the test since it's not specifically about
  // block synchronization.
  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
1163 | return; |
1164 | } |
1165 | auto g = std::make_shared<Graph>(); |
1166 | const auto graph0_string = R"IR( |
1167 | graph(%0 : Float(2, strides=[1]), |
1168 | %1 : Float(2, strides=[1])): |
1169 | %c0 : Float(2, strides=[1]) = aten::mul(%0, %1) |
1170 | %d0 : Float(2, strides=[1]) = aten::mul(%c0, %0) |
1171 | return (%d0))IR" ; |
1172 | parseIR(graph0_string, g.get()); |
1173 | |
1174 | // strides are not yet supported in the irparser. |
1175 | for (auto val : g->block()->inputs()) { |
1176 | if (val->isCompleteTensor()) |
1177 | val->setType(val->type()->castRaw<TensorType>()->contiguous()); |
1178 | } |
1179 | for (auto node : g->block()->nodes()) { |
1180 | for (auto val : node->outputs()) { |
1181 | if (val->isCompleteTensor()) |
1182 | val->setType(val->type()->castRaw<TensorType>()->contiguous()); |
1183 | } |
1184 | } |
1185 | |
1186 | auto fusion = parseJitIR(g); |
1187 | FusionGuard fg(fusion.get()); |
1188 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1189 | // Avoid vectorization here as those kernels can't be lowered twice at the |
1190 | // moment |
1191 | at::Tensor input1 = at::randn({16}, options); |
1192 | at::Tensor input2 = at::randn({16}, options); |
1193 | auto lparams = schedulePointwise(fusion.get(), {input1, input2}); |
1194 | |
1195 | // CONSIDER: |
1196 | // 1. this can be moved to a dedicated "golden" file |
1197 | // 2. use a fuzzy compare (ignore non-significant whitespaces for example) |
1198 | const std::string expected_kernel = R"( |
1199 | __global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 1> T1, Tensor<float, 1> T3) { |
1200 | int64_t i50; |
1201 | i50 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x); |
1202 | if ((i50 < T0.size[0])) { |
1203 | float T5[1]; |
1204 | T5[0] = 0; |
1205 | T5[0] |
1206 | = T1[i50]; |
1207 | float T4[1]; |
1208 | T4[0] = 0; |
1209 | T4[0] |
1210 | = T0[i50]; |
1211 | float T2[1]; |
1212 | T2[0] |
1213 | = T4[0] |
1214 | * T5[0]; |
1215 | float T6[1]; |
1216 | T6[0] |
1217 | = T2[0] |
1218 | * T4[0]; |
1219 | T3[i50] |
1220 | = T6[0]; |
1221 | } |
1222 | } |
1223 | )" ; |
1224 | |
1225 | const std::string actual_kernel = |
1226 | "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); |
1227 | if (expected_kernel.size() != actual_kernel.size() || |
1228 | expected_kernel.compare(actual_kernel) != 0) { |
1229 | std::cerr |
1230 | << " Codegen mismatch, codegen possibly changed, or is incorrect. " |
1231 | << " \n ========= EXPECTED ========= \n" |
1232 | << expected_kernel << "\n========= ACTUAL ========== \n" |
1233 | << actual_kernel << "\n=================" << std::endl; |
1234 | auto it = std::mismatch( |
1235 | expected_kernel.begin(), |
1236 | expected_kernel.end(), |
1237 | actual_kernel.begin(), |
1238 | actual_kernel.end()); |
1239 | std::string actual_mismatched_snippet(it.second, actual_kernel.end()); |
1240 | actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10); |
1241 | std::string expected_mismatched_snippet(it.first, expected_kernel.end()); |
1242 | expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10); |
1243 | std::cerr << "First mismatch found at: " << actual_mismatched_snippet |
1244 | << ", expected: " << expected_mismatched_snippet << std::endl; |
1245 | TORCH_CHECK(false); |
1246 | } |
1247 | |
1248 | FusionExecutor fe; |
1249 | fe.compileFusion(fusion.get(), {input1, input2}, lparams); |
1250 | auto outputs = fe.runFusion({input1, input2}, lparams); |
1251 | at::Tensor output_ref = input1 * input2 * input1; |
1252 | TORCH_CHECK(output_ref.equal(outputs[0])); |
1253 | } |
1254 | |
1255 | TEST_F(NVFuserTest, FusionOuterSplit_CUDA) { |
1256 | Fusion fusion; |
1257 | FusionGuard fg(&fusion); |
1258 | |
1259 | TensorView* tv0 = makeSymbolicTensor(3); |
1260 | |
1261 | IrBuilder::create<BinaryOp>( |
1262 | BinaryOpType::Add, |
1263 | tv0, |
1264 | IrBuilder::create<Double>(0.0), |
1265 | IrBuilder::create<Double>(1.0)); |
1266 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(2.0)); |
1267 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(3.0)); |
1268 | fusion.addOutput(tv2); |
1269 | |
1270 | //[I0, I1, I2] |
1271 | tv2->split(-1, 4, false); |
1272 | //[I0, I1, I2o{4}, I2i] |
1273 | tv2->merge(0); |
1274 | tv2->merge(0); |
1275 | //[I0*I1*I2o{4}, I2i] |
1276 | tv2->split(0, 2); |
1277 | //[I0*I1*I2o{4}o, I0*I1*I2o{4}i{2}, I2i] |
1278 | tv2->reorder({{0, 1}, {1, 0}}); |
  //[I0*I1*I2o{4}i{2}, I0*I1*I2o{4}o, I2i]
1280 | |
1281 | tv0->computeAt(tv2, -1); |
1282 | |
1283 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1284 | |
1285 | at::Tensor output = at::empty({2, 6, 32}, options); |
1286 | |
1287 | FusionExecutor fe; |
1288 | fe.compileFusion(&fusion); |
1289 | fe.runFusion({}, {output}); |
1290 | |
1291 | at::Tensor output_ref = at::zeros_like(output, options); |
1292 | output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; |
1293 | |
1294 | TORCH_CHECK(output_ref.equal(output)); |
1295 | } |
1296 | |
1297 | TEST_F(NVFuserTest, FusionCodeGen_CUDA) { |
1298 | Fusion fusion; |
1299 | FusionGuard fg(&fusion); |
1300 | |
1301 | TensorView* tv0 = makeSymbolicTensor(3); |
1302 | |
1303 | IrBuilder::create<BinaryOp>( |
1304 | BinaryOpType::Add, |
1305 | tv0, |
1306 | IrBuilder::create<Double>(0.0), |
1307 | IrBuilder::create<Double>(1.0)); |
1308 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(2.0)); |
1309 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(3.0)); |
1310 | fusion.addOutput(tv2); |
1311 | |
1312 | //[I0, I1, I2] |
1313 | tv2 = tv2->split(0, 4); |
1314 | //[I0o, I0i{4}, I1, I2] |
1315 | tv2 = tv2->merge(1); |
1316 | //[I0o, I0i{4}*I1, I2] |
1317 | tv2 = tv2->split(-1, 2); |
1318 | //[I0o, I0i{4}*I1, I2o, I2i{2}] |
1319 | tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}}); |
1320 | //[I0i{4}*I1, I0o, I2i{2}, I2o] |
1321 | |
1322 | tv0->computeAt(tv2, -1); |
1323 | |
1324 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1325 | |
1326 | at::Tensor output = at::empty({16, 8, 8}, options); |
1327 | |
1328 | FusionExecutor fe; |
1329 | fe.compileFusion(&fusion); |
1330 | fe.runFusion({}, {output}); |
1331 | |
1332 | at::Tensor output_ref = at::zeros_like(output, options); |
1333 | output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; |
1334 | |
1335 | TORCH_CHECK(output_ref.equal(output)); |
1336 | } |
1337 | |
1338 | TEST_F(NVFuserTest, FusionCodeGen2_CUDA) { |
1339 | Fusion fusion; |
1340 | FusionGuard fg(&fusion); |
1341 | |
1342 | TensorView* tv0 = makeSymbolicTensor(3); |
1343 | TensorView* tv1 = makeSymbolicTensor(3); |
1344 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
1345 | TensorView* tv3 = add(tv0, tv2); |
1346 | |
1347 | fusion.addInput(tv0); |
1348 | fusion.addInput(tv1); |
1349 | fusion.addOutput(tv3); |
1350 | |
1351 | //[I0, I1, I2] |
1352 | tv3->reorder({{0, 2}, {2, 0}}); |
1353 | //[I2, I1, I0] |
1354 | tv3->split(-1, 4); |
1355 | //[I2, I1, I0o, I0i{4}] |
1356 | tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); |
  //[I0o, I0i{4}, I1, I2]
1358 | |
1359 | tv0->computeAt(tv3, -1); |
1360 | tv1->computeAt(tv3, -1); |
1361 | |
1362 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1363 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
1364 | |
1365 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1366 | |
1367 | at::Tensor input1 = at::randn({16, 8, 8}, options); |
1368 | at::Tensor input2 = at::randn_like(input1); |
1369 | |
1370 | FusionExecutor fe; |
1371 | fe.compileFusion(&fusion, {input1, input2}); |
1372 | auto outputs = fe.runFusion({input1, input2}); |
1373 | |
1374 | at::Tensor tv2_ref = input2 + 2.0; |
1375 | at::Tensor output_ref = input1 + tv2_ref; |
1376 | |
1377 | TORCH_CHECK(output_ref.equal(outputs[0])); |
1378 | } |
1379 | |
1380 | TEST_F(NVFuserTest, FusionSimplePWise_CUDA) { |
1381 | Fusion fusion; |
1382 | FusionGuard fg(&fusion); |
1383 | // dimensionality of the problem |
1384 | int nDims = 3; |
1385 | |
1386 | // Set up your input tensor views |
1387 | TensorView* tv0 = makeContigTensor(nDims); |
1388 | TensorView* tv1 = makeContigTensor(nDims); |
1389 | |
1390 | // Register your inputs |
1391 | fusion.addInput(tv0); |
1392 | fusion.addInput(tv1); |
1393 | |
  // Do math with it; it returns a `Val*` but can be static_cast back to a
  // TensorView
1396 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
1397 | TensorView* tv3 = add(tv0, tv2); |
1398 | |
1399 | // Register your outputs |
1400 | fusion.addOutput(tv3); |
1401 | |
  // Do transformations; remember, transformations are applied from outputs to
  // inputs. They don't have to be in this order
1404 | tv3->merge(1); |
1405 | tv3->merge(0); |
1406 | |
1407 | // Split by n_threads |
1408 | tv3->split(0, 128); |
1409 | tv3->split(0, 4); |
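  // Merged extent: 64 * 2 * 128 = 16384. split(0, 128) then split(0, 4) gives
  // axes of extent [32, 4, 128].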
1410 | |
  // For all inputs, computeAt the output inline; temporaries should be
  // squeezed between them
1413 | tv0->computeAt(tv3, -1); |
1414 | tv1->computeAt(tv3, -1); |
1415 | |
1416 | // Parallelize TV3 |
1417 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1418 | tv3->axis(-2)->parallelize(ParallelType::Unroll); |
1419 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
1420 | |
1421 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1422 | |
1423 | at::Tensor input1 = at::randn({64, 2, 128}, options); |
1424 | at::Tensor input2 = at::rand_like(input1); |
1425 | at::Tensor output = at::empty_like(input1); |
1426 | |
1427 | FusionExecutor fe; |
1428 | fe.compileFusion(&fusion, {input1, input2}); |
1429 | fe.runFusion({input1, input2}, {output}); |
1430 | |
1431 | at::Tensor tv2_ref = input2 + 2.0; |
1432 | at::Tensor output_ref = input1 + tv2_ref; |
1433 | |
1434 | TORCH_CHECK(output_ref.equal(output)); |
1435 | } |
1436 | |
1437 | TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) { |
1438 | Fusion fusion; |
1439 | FusionGuard fg(&fusion); |
1440 | // dimensionality of the problem |
1441 | int nDims = 3; |
1442 | |
1443 | // Set up your input tensor views |
1444 | TensorView* tv0 = makeContigTensor(nDims, DataType::ComplexFloat); |
1445 | TensorView* tv1 = makeContigTensor(nDims, DataType::ComplexFloat); |
1446 | |
1447 | // Register your inputs |
1448 | fusion.addInput(tv0); |
1449 | fusion.addInput(tv1); |
1450 | |
  // Do math with it; it returns a `Val*` but can be static_cast back to a
  // TensorView
1453 | c10::complex<double> scalar1(2.0, 3.0); |
1454 | TensorView* tv2 = add(tv1, IrBuilder::create<ComplexDouble>(scalar1)); |
1455 | TensorView* tv3 = add(tv0, tv2); |
1456 | |
1457 | // Register your outputs |
1458 | fusion.addOutput(tv3); |
1459 | |
  // Do transformations; remember, transformations are applied from outputs to
  // inputs. They don't have to be in this order
1462 | tv3->merge(1); |
1463 | tv3->merge(0); |
1464 | |
1465 | // Split by n_threads |
1466 | tv3->split(0, 128); |
1467 | tv3->split(0, 4); |
1468 | |
  // For all inputs, computeAt the output inline; temporaries should be
  // squeezed between them
1471 | tv0->computeAt(tv3, -1); |
1472 | tv1->computeAt(tv3, -1); |
1473 | |
1474 | // Parallelize TV3 |
1475 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1476 | tv3->axis(-2)->parallelize(ParallelType::Unroll); |
1477 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
1478 | |
1479 | auto options = |
1480 | at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0); |
1481 | |
1482 | at::Tensor input1 = at::randn({64, 2, 128}, options); |
1483 | at::Tensor input2 = at::rand_like(input1); |
1484 | at::Tensor output = at::empty_like(input1); |
1485 | |
1486 | FusionExecutor fe; |
1487 | fe.compileFusion(&fusion, {input1, input2}); |
1488 | fe.runFusion({input1, input2}, {output}); |
1489 | |
1490 | at::Tensor tv2_ref = input2 + scalar1; |
1491 | at::Tensor output_ref = input1 + tv2_ref; |
1492 | |
1493 | TORCH_CHECK(output_ref.equal(output)); |
1494 | } |
1495 | |
1496 | TEST_F(NVFuserTest, FusionExecKernel_CUDA) { |
1497 | Fusion fusion; |
1498 | FusionGuard fg(&fusion); |
1499 | |
1500 | // Set up your input tensor views |
1501 | TensorView* tv0 = makeSymbolicTensor(2); |
1502 | TensorView* tv1 = makeSymbolicTensor(2); |
1503 | |
1504 | // Register your inputs |
1505 | fusion.addInput(tv0); |
1506 | fusion.addInput(tv1); |
1507 | |
  // Do math with it; it returns a `Val*` but can be static_cast back to a
  // TensorView
1510 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
1511 | TensorView* tv3 = add(tv0, tv2); |
1512 | |
1513 | // Register your outputs |
1514 | fusion.addOutput(tv3); |
1515 | |
1516 | tv3->merge(0); |
1517 | tv3->split(0, 128); |
1518 | tv3->split(0, 4); |
1519 | |
  // For all inputs, computeAt the output inline; temporaries should be
  // squeezed between them
1522 | tv0->computeAt(tv3, 1); |
1523 | tv1->computeAt(tv3, 1); |
1524 | |
1525 | // Parallelize TV3 |
1526 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1527 | tv2->axis(1)->parallelize(ParallelType::Unroll); |
1528 | tv3->axis(1)->parallelize(ParallelType::Unroll); |
1529 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
1530 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
1531 | |
1532 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1533 | |
1534 | at::Tensor input1 = at::ones({1, 128}, options); |
1535 | at::Tensor input2 = at::ones_like(input1); |
1536 | |
1537 | FusionExecutor fe; |
1538 | fe.compileFusion(&fusion, {input1, input2}); |
1539 | auto outputs = fe.runFusion({input1, input2}); |
1540 | |
  at::Tensor check = at::full({1, 128}, 4, options);
1543 | TORCH_CHECK(outputs[0].equal(check)); |
1544 | } |
1545 | |
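// Host-side integer ceiling division, e.g. ceilDiv_(130, 128) == 2. A
// small mirror of the ceilDiv the scheduler performs on iteration domains.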
1546 | int ceilDiv_(int a, int b) { |
1547 | return (a + b - 1) / b; |
1548 | } |
1549 | |
1550 | TEST_F(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { |
1551 | // Case 1 |
1552 | // tv1 = tv0 * 0.5 |
1553 | // tv2 = tv1 * -1 |
1554 | // tv3 = tv1 + 3 |
1555 | // tv4 = tv1 * 2 |
1556 | // tv5 = tv3 + tv2 |
1557 | // tv6 = tv5 + tv4 |
1558 | // tv7 = tv1 + tv4 |
1559 | Fusion fusion; |
1560 | FusionGuard fg(&fusion); |
1561 | |
1562 | TensorView* tv0 = makeSymbolicTensor(2); |
1563 | fusion.addInput(tv0); |
1564 | |
1565 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5)); |
1566 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0)); |
1567 | TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0)); |
1568 | TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0)); |
1569 | TensorView* tv5 = add(tv3, tv2); |
1570 | |
1571 | TensorView* tv6 = add(tv5, tv4); |
1572 | TensorView* tv7 = add(tv1, tv4); |
1573 | |
1574 | fusion.addOutput(tv6); |
1575 | fusion.addOutput(tv7); |
1576 | |
  // Let's set up to actually run
1578 | tv7->merge(0); |
1579 | tv7->split(0, 128); |
1580 | tv7->split(0, 4); |
1581 | |
1582 | tv7->axis(0)->parallelize(ParallelType::BIDx); |
1583 | |
1584 | tv0->computeAt(tv7, 1); |
1585 | |
1586 | ComputeAtMap ca_map(&fusion); |
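  // ComputeAtMap groups IterDomains that effectively index the same loop.
  // IdMappingMode::PERMISSIVE, used below, is the most inclusive mode: it
  // also maps broadcast domains to the concrete domains they pair with.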
1587 | |
  // The compute-at position of the last tensor should be zero.
1589 | TORCH_CHECK( |
1590 | tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && |
1591 | tv7->getMaxProducerPosition() == 1); |
  TORCH_CHECK(
      tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
      tv6->getMaxProducerPosition() == 1);
1595 | // The position of every other tensor should be 1. |
1596 | for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { |
1597 | TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); |
1598 | |
1599 | TORCH_CHECK( |
1600 | ca_map.areMapped(tv7->axis(0), tv->axis(0), IdMappingMode::PERMISSIVE)); |
1601 | } |
1602 | |
1603 | for (Val* val : fusion.vals()) { |
1604 | if (!val->isFusionInput() && |
1605 | val->getValType().value() == ValType::TensorView) { |
1606 | TensorView* tv = static_cast<TensorView*>(val); |
1607 | tv->axis(1)->parallelize(ParallelType::Unroll); |
1608 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
1609 | } |
1610 | } |
1611 | |
1612 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1613 | |
1614 | at::Tensor aten_input = at::randn({129, 127}, options); |
1615 | |
1616 | auto t1 = aten_input.mul({0.5}); |
1617 | auto t2 = t1.mul({-1.0}); |
1618 | auto t3 = t1.add({3.0}); |
1619 | auto t4 = t1.mul({2.0}); |
1620 | auto t5 = t3.add(t2); |
1621 | auto t6 = t5.add(t4); |
1622 | auto t7 = t1.add(t4); |
1623 | |
1624 | std::vector<at::Tensor> aten_outputs = {t6, t7}; |
1625 | std::vector<at::Tensor> cg_outputs = { |
1626 | at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; |
1627 | |
1628 | FusionExecutor fe; |
1629 | fe.compileFusion(&fusion, {aten_input}); |
1630 | fe.runFusion({aten_input}, cg_outputs); |
1631 | |
1632 | testValidate( |
1633 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
1634 | } |
1635 | |
1636 | TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { |
1637 | // Case 2 |
1638 | // tv1 = tv0 * -1 |
1639 | // tv2 = tv0 + 3 |
1640 | // tv3 = tv0 * 2 |
1641 | // tv4 = tv2 + tv1 |
1642 | // tv5 = tv4 + tv3 |
1643 | // tv6 = tv5 + tv3 |
1644 | Fusion fusion; |
1645 | FusionGuard fg(&fusion); |
1646 | |
1647 | TensorView* tv0 = makeSymbolicTensor(2); |
1648 | fusion.addInput(tv0); |
1649 | |
1650 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0)); |
1651 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0)); |
1652 | TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0)); |
1653 | TensorView* tv4 = add(tv2, tv1); |
1654 | |
1655 | TensorView* tv5 = add(tv4, tv3); |
1656 | TensorView* tv6 = add(tv5, tv3); |
1657 | |
1658 | fusion.addOutput(tv5); |
1659 | fusion.addOutput(tv6); |
1660 | |
  // Let's set up to actually run
1662 | tv6->merge(0); |
1663 | tv6->split(0, 128); |
1664 | tv6->split(0, 4); |
1665 | |
1666 | tv6->axis(0)->parallelize(ParallelType::BIDx); |
1667 | |
1668 | tv0->computeAt(tv6, 1); |
1669 | |
1670 | for (Val* val : fusion.vals()) { |
1671 | if (!val->isFusionInput() && |
1672 | val->getValType().value() == ValType::TensorView) { |
1673 | TensorView* tv = static_cast<TensorView*>(val); |
1674 | |
1675 | tv->axis(1)->parallelize(ParallelType::Unroll); |
1676 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
1677 | } |
1678 | } |
1679 | |
1680 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1681 | at::Tensor input = at::randn({129, 127}, options); |
1682 | |
1683 | auto t1 = input.mul({-1.0}); |
1684 | auto t2 = input.add({3.0}); |
1685 | auto t3 = input.mul({2.0}); |
1686 | auto t4 = t2.add(t1); |
1687 | auto t5 = t4.add(t3); |
1688 | auto t6 = t5.add(t3); |
1689 | |
1690 | std::vector<at::Tensor> aten_outputs = {t5, t6}; |
1691 | |
1692 | FusionExecutor fe; |
1693 | fe.compileFusion(&fusion, {input}); |
1694 | auto cg_outputs = fe.runFusion({input}); |
1695 | |
1696 | testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); |
1697 | } |
1698 | |
1699 | TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { |
1700 | // Case 3 |
1701 | // T2 = T1 * 0.979361 |
1702 | // T3 = T2 * T0 |
1703 | Fusion fusion; |
1704 | FusionGuard fg(&fusion); |
1705 | |
1706 | TensorView* tv0 = makeSymbolicTensor(4); |
1707 | fusion.addInput(tv0); |
1708 | |
1709 | TensorView* tv1 = makeSymbolicTensor(4); |
1710 | fusion.addInput(tv1); |
1711 | |
1712 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361)); |
1713 | TensorView* tv3 = mul(tv2, tv0); |
1714 | |
1715 | fusion.addOutput(tv3); |
1716 | |
  // Let's set up to actually run
1718 | while (tv3->nDims() > 1) |
1719 | tv3->merge(0); |
1720 | tv3->split(0, 128); |
1721 | tv3->split(0, 4); |
1722 | |
1723 | tv0->computeAt(tv3, 1); |
1724 | tv1->computeAt(tv3, 1); |
1725 | |
1726 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1727 | |
1728 | for (Val* val : fusion.vals()) { |
1729 | if (!val->isFusionInput() && |
1730 | val->getValType().value() == ValType::TensorView) { |
1731 | TensorView* tv = static_cast<TensorView*>(val); |
1732 | |
1733 | tv->axis(1)->parallelize(ParallelType::Unroll); |
1734 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
1735 | } |
1736 | } |
1737 | |
1738 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1739 | at::Tensor t0 = at::randn({129, 127, 63, 65}, options); |
1740 | at::Tensor t1 = at::rand_like(t0, options); |
1741 | |
1742 | auto t2 = t1.mul({0.979361}); |
1743 | auto aten_output = t2.mul(t0); |
1744 | |
1745 | std::vector<IValue> aten_inputs = {t0, t1}; |
1746 | |
1747 | at::Tensor cg_output = at::empty_like(t0, options); |
1748 | |
1749 | FusionExecutor fe; |
1750 | fe.compileFusion(&fusion, aten_inputs); |
1751 | fe.runFusion(aten_inputs, {cg_output}); |
1752 | |
1753 | testValidate( |
1754 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
1755 | } |
1756 | |
1757 | TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { |
1758 | // Case 4 |
1759 | // T4 = T2 - T3 |
1760 | // T5 = T1 + T4 |
1761 | // T6 = T5 - T0 |
1762 | Fusion fusion; |
1763 | FusionGuard fg(&fusion); |
1764 | |
1765 | TensorView* tv0 = makeSymbolicTensor(4); |
1766 | fusion.addInput(tv0); |
1767 | |
1768 | TensorView* tv1 = makeSymbolicTensor(4); |
1769 | fusion.addInput(tv1); |
1770 | |
1771 | TensorView* tv2 = makeSymbolicTensor(4); |
1772 | fusion.addInput(tv2); |
1773 | |
1774 | TensorView* tv3 = makeSymbolicTensor(4); |
1775 | fusion.addInput(tv3); |
1776 | |
1777 | TensorView* tv4 = sub(tv2, tv3); |
1778 | TensorView* tv5 = add(tv1, tv4); |
1779 | TensorView* tv6 = sub(tv5, tv0); |
1780 | |
1781 | fusion.addOutput(tv6); |
1782 | |
  // Let's set up to actually run
1784 | while (tv6->nDims() > 1) |
1785 | tv6->merge(0); |
1786 | tv6->split(0, 128); |
1787 | tv6->split(0, 4); |
1788 | |
1789 | tv0->computeAt(tv6, 1); |
1790 | tv1->computeAt(tv6, 1); |
1791 | tv2->computeAt(tv6, 1); |
1792 | tv3->computeAt(tv6, 1); |
1793 | |
1794 | tv6->axis(0)->parallelize(ParallelType::BIDx); |
1795 | |
1796 | for (Val* val : fusion.vals()) { |
1797 | if (!val->isFusionInput() && |
1798 | val->getValType().value() == ValType::TensorView) { |
1799 | TensorView* tv = static_cast<TensorView*>(val); |
1800 | |
1801 | tv->axis(1)->parallelize(ParallelType::Unroll); |
1802 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
1803 | } |
1804 | } |
1805 | |
1806 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1807 | at::Tensor t0 = at::randn({129, 127, 63, 65}, options); |
1808 | at::Tensor t1 = at::rand_like(t0, options); |
1809 | at::Tensor t2 = at::rand_like(t0, options); |
1810 | at::Tensor t3 = at::rand_like(t0, options); |
1811 | |
1812 | auto t4 = t2.sub(t3); |
1813 | auto t5 = t1.add(t4); |
1814 | auto aten_output = t5.sub(t0); |
1815 | |
1816 | std::vector<IValue> aten_inputs = {t0, t1, t2, t3}; |
1817 | |
1818 | FusionExecutor fe; |
1819 | fe.compileFusion(&fusion, aten_inputs); |
1820 | auto cg_outputs = fe.runFusion(aten_inputs); |
1821 | |
1822 | testValidate( |
1823 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
1824 | } |
1825 | |
1826 | TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { |
1827 | // Case 5 |
1828 | // tv2 = tv0 + 2.0 |
1829 | // tv3 = tv1 * tv2 |
1830 | Fusion fusion; |
1831 | FusionGuard fg(&fusion); |
1832 | |
1833 | // Set up your input tensor views |
1834 | TensorView* tv0 = makeSymbolicTensor(2); |
1835 | fusion.addInput(tv0); |
1836 | TensorView* tv1 = makeSymbolicTensor(2); |
1837 | fusion.addInput(tv1); |
1838 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0)); |
1839 | TensorView* tv3 = mul(tv1, tv2); |
1840 | fusion.addOutput(tv3); |
1841 | |
1842 | tv3->merge(0); |
1843 | tv3->split(-1, 8); |
1844 | tv3->split(-1, 4); |
1845 | |
1846 | tv2->computeAt(tv3, 1); |
1847 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1848 | |
1849 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1850 | at::Tensor t0 = at::randn({63, 65}, options); |
1851 | at::Tensor t1 = at::rand_like(t0, options); |
1852 | |
1853 | auto t2 = t0.add(2.0); |
1854 | auto aten_output = t1.mul(t2); |
1855 | |
1856 | std::vector<IValue> aten_inputs = {t0, t1}; |
1857 | |
1858 | FusionExecutor fe; |
1859 | fe.compileFusion(&fusion, aten_inputs); |
1860 | auto cg_outputs = fe.runFusion(aten_inputs); |
1861 | |
1862 | testValidate( |
1863 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
1864 | } |
1865 | |
1866 | TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { |
1867 | Fusion fusion; |
1868 | FusionGuard fg(&fusion); |
1869 | |
1870 | TensorView* tv0 = makeSymbolicTensor(2); |
1871 | fusion.addInput(tv0); |
1872 | TensorView* tv1 = makeSymbolicTensor(2); |
1873 | fusion.addInput(tv1); |
1874 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0)); |
1875 | TensorView* tv3 = mul(tv1, tv2); |
1876 | fusion.addOutput(tv3); |
1877 | |
1878 | tv2->merge(0); |
1879 | tv2->split(-1, 8); |
1880 | tv2->split(-1, 4); |
1881 | tv3->merge(0); |
1882 | tv3->split(-1, 8); |
1883 | |
1884 | tv2->computeAt(tv3, 1); |
1885 | |
1886 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1887 | |
1888 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1889 | at::Tensor t0 = at::randn({63, 65}, options); |
1890 | at::Tensor t1 = at::rand_like(t0, options); |
1891 | |
1892 | auto t2 = t0.add(2.0); |
1893 | auto aten_output = t1.mul(t2); |
1894 | |
1895 | std::vector<IValue> aten_inputs = {t0, t1}; |
1896 | |
1897 | FusionExecutor fe; |
1898 | fe.compileFusion(&fusion, aten_inputs); |
1899 | auto cg_outputs = fe.runFusion(aten_inputs); |
1900 | |
1901 | testValidate( |
1902 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
1903 | } |
1904 | |
1905 | TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { |
1906 | Fusion fusion; |
1907 | FusionGuard fg(&fusion); |
1908 | |
1909 | auto tv0 = makeSymbolicTensor(1); |
1910 | fusion.addInput(tv0); |
1911 | |
1912 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
1913 | |
1914 | auto tv2 = makeSymbolicTensor(1); |
1915 | fusion.addInput(tv2); |
1916 | |
1917 | auto tv3 = add(tv2, IrBuilder::create<Double>(3.0)); |
1918 | |
1919 | auto tv4 = add(tv1, tv3); |
1920 | fusion.addOutput(tv4); |
1921 | |
1922 | auto tv5 = broadcast(tv1, {false, true}); |
1923 | |
1924 | auto tv6 = makeSymbolicTensor(2); |
1925 | fusion.addInput(tv6); |
1926 | |
1927 | auto tv7 = mul(tv5, tv6); |
1928 | |
1929 | fusion.addOutput(tv7); |
1930 | |
1931 | tv7->split(1, 2); |
1932 | tv7->merge(0); |
1933 | tv7->split(0, 4); |
1934 | tv7->split(0, 128); |
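  // Note the split order: the factor-4 split is applied before the
  // factor-128 one, so tv7 is schematically [outer, 128, 4, 2], and axis 1
  // (extent 128) is the one bound to TIDx below.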
1935 | |
1936 | tv7->axis(0)->parallelize(ParallelType::BIDx); |
1937 | tv7->axis(1)->parallelize(ParallelType::TIDx); |
1938 | |
1939 | tv0->computeAt(tv7, 1); |
1940 | auto tv5_domain = tv5->domain()->domain(); |
1941 | |
1942 | // These computeAt transformations should not affect the TV5 domain |
1943 | tv0->computeAt(tv4, -1); |
1944 | tv2->computeAt(tv4, -1); |
1945 | |
1946 | auto tv5_domain_current = tv5->domain()->domain(); |
1947 | TORCH_CHECK(tv5_domain == tv5_domain_current, "Invalid TV5 domain" ); |
1948 | |
1949 | const int numel_x = 100; |
1950 | const int numel_y = 200; |
1951 | |
1952 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1953 | auto t0 = at::randn({numel_x}, options); |
1954 | auto t2 = at::randn({numel_x}, options); |
1955 | auto t6 = at::randn({numel_x, numel_y}, options); |
1956 | |
1957 | auto t1 = t0.add(1.0); |
1958 | auto t3 = t2.add(3.0); |
1959 | auto t4 = t1.add(t3); |
1960 | auto t5 = t1.unsqueeze(1); |
1961 | auto t7 = t5.mul(t6); |
1962 | |
1963 | std::vector<IValue> aten_inputs = {t0, t2, t6}; |
1964 | std::vector<at::Tensor> aten_outputs = {t4, t7}; |
1965 | |
1966 | FusionExecutor fe; |
1967 | fe.compileFusion(&fusion, aten_inputs); |
1968 | auto cg_outputs = fe.runFusion(aten_inputs); |
1969 | |
1970 | testValidate( |
1971 | &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); |
1972 | } |
1973 | |
1974 | TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { |
1975 | Fusion fusion; |
1976 | FusionGuard fg(&fusion); |
1977 | |
1978 | auto tv0 = makeSymbolicTensor(1); |
1979 | fusion.addInput(tv0); |
1980 | |
1981 | auto tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
1982 | |
1983 | auto tv2 = makeSymbolicTensor(1); |
1984 | fusion.addInput(tv2); |
1985 | |
1986 | auto tv3 = add(tv2, IrBuilder::create<Double>(3.0)); |
1987 | |
1988 | auto tv4 = add(tv1, tv3); |
1989 | fusion.addOutput(tv4); |
1990 | |
1991 | auto tv5 = broadcast(tv1, {false, true}); |
1992 | |
1993 | auto tv6 = makeSymbolicTensor(2); |
1994 | fusion.addInput(tv6); |
1995 | |
1996 | auto tv7 = mul(tv5, tv6); |
1997 | |
1998 | fusion.addOutput(tv7); |
1999 | |
2000 | tv7->split(1, 2); |
2001 | tv7->merge(0); |
2002 | tv7->split(0, 128, false); |
2003 | tv7->split(0, 4, false); |
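  // The `false` argument requests an outer split: the factor sizes the
  // outer domain, so split(0, 128, false) yields [128, ceilDiv(N, 128)]
  // rather than the default inner split [ceilDiv(N, 128), 128].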
2004 | |
2005 | tv7->axis(0)->parallelize(ParallelType::BIDx); |
2006 | tv7->axis(1)->parallelize(ParallelType::TIDx); |
2007 | |
2008 | // Reverse computeAt structure from previous test |
2009 | tv0->computeAt(tv4, -1); |
2010 | tv2->computeAt(tv4, -1); |
2011 | tv0->computeAt(tv7, -1); |
2012 | |
2013 | const int numel_x = 100; |
2014 | const int numel_y = 200; |
2015 | |
2016 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2017 | auto t0 = at::randn({numel_x}, options); |
2018 | auto t2 = at::randn({numel_x}, options); |
2019 | auto t6 = at::randn({numel_x, numel_y}, options); |
2020 | |
2021 | auto t1 = t0.add(1.0); |
2022 | auto t3 = t2.add(3.0); |
2023 | auto t4 = t1.add(t3); |
2024 | auto t5 = t1.unsqueeze(1); |
2025 | auto t7 = t5.mul(t6); |
2026 | |
2027 | std::vector<IValue> aten_inputs = {t0, t2, t6}; |
2028 | std::vector<at::Tensor> aten_outputs = {t4, t7}; |
2029 | |
2030 | FusionExecutor fe; |
2031 | fe.compileFusion(&fusion, aten_inputs); |
2032 | auto cg_outputs = fe.runFusion(aten_inputs); |
2033 | |
2034 | testValidate( |
2035 | &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); |
2036 | } |
2037 | |
2038 | TEST_F(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { |
2039 | // Case 1 |
2040 | // tv1 = tv0 * 0.5 |
2041 | // tv2 = tv1 * -1 |
2042 | // tv3 = tv1 + 3 |
2043 | // tv4 = tv1 * 2 |
2044 | // tv5 = tv3 + tv2 |
2045 | // tv6 = tv5 + tv4 |
2046 | // tv7 = tv1 + tv4 |
2047 | Fusion fusion; |
2048 | FusionGuard fg(&fusion); |
2049 | |
2050 | TensorView* tv0 = makeSymbolicTensor(2); |
2051 | fusion.addInput(tv0); |
2052 | |
2053 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5)); |
2054 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0)); |
2055 | TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0)); |
2056 | TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0)); |
2057 | TensorView* tv5 = add(tv3, tv2); |
2058 | |
2059 | TensorView* tv6 = add(tv5, tv4); |
2060 | TensorView* tv7 = add(tv1, tv4); |
2061 | |
2062 | fusion.addOutput(tv6); |
2063 | fusion.addOutput(tv7); |
2064 | |
  // Let's set up to actually run
2066 | tv0->merge(0); |
2067 | tv0->split(0, 128); |
2068 | tv0->split(0, 4); |
2069 | |
2070 | tv0->axis(0)->parallelize(ParallelType::BIDx); |
2071 | |
2072 | tv0->computeWith(tv7, 1); |
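  // computeWith is the producer-side variant of computeAt: tv0's own
  // transformations serve as the reference and are replayed forward onto
  // its consumers up to tv7, instead of a consumer schedule being replayed
  // back onto tv0.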
2073 | |
2074 | GpuLower gpulw(&fusion); |
2075 | |
  // The compute-at position of the last tensor should be zero.
2077 | TORCH_CHECK( |
2078 | tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && |
2079 | tv7->getMaxProducerPosition() == 1); |
  TORCH_CHECK(
      tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
      tv6->getMaxProducerPosition() == 1);
2083 | |
2084 | ComputeAtMap ca_map(&fusion); |
2085 | |
2086 | // The position of every other tensor should be 1. |
2087 | for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { |
2088 | TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); |
2089 | TORCH_CHECK( |
2090 | ca_map.areMapped(tv7->axis(0), tv->axis(0), IdMappingMode::PERMISSIVE)); |
2091 | } |
2092 | |
2093 | for (Val* val : fusion.vals()) { |
2094 | if (!val->isFusionInput() && |
2095 | val->getValType().value() == ValType::TensorView) { |
2096 | TensorView* tv = static_cast<TensorView*>(val); |
2097 | tv->axis(1)->parallelize(ParallelType::Unroll); |
2098 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2099 | } |
2100 | } |
2101 | |
2102 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2103 | |
2104 | at::Tensor aten_input = at::randn({129, 127}, options); |
2105 | |
2106 | auto t1 = aten_input.mul({0.5}); |
2107 | auto t2 = t1.mul({-1.0}); |
2108 | auto t3 = t1.add({3.0}); |
2109 | auto t4 = t1.mul({2.0}); |
2110 | auto t5 = t3.add(t2); |
2111 | auto t6 = t5.add(t4); |
2112 | auto t7 = t1.add(t4); |
2113 | |
2114 | std::vector<at::Tensor> aten_outputs = {t6, t7}; |
2115 | std::vector<at::Tensor> cg_outputs = { |
2116 | at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; |
2117 | |
2118 | FusionExecutor fe; |
2119 | fe.compileFusion(&fusion, {aten_input}); |
2120 | fe.runFusion({aten_input}, cg_outputs); |
2121 | |
2122 | testValidate( |
2123 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
2124 | } |
2125 | |
2126 | TEST_F(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { |
2127 | // Case 2 |
2128 | // tv1 = tv0 * -1 |
2129 | // tv2 = tv0 + 3 |
2130 | // tv3 = tv0 * 2 |
2131 | // tv4 = tv2 + tv1 |
2132 | // tv5 = tv4 + tv3 |
2133 | // tv6 = tv5 + tv3 |
2134 | Fusion fusion; |
2135 | FusionGuard fg(&fusion); |
2136 | |
2137 | TensorView* tv0 = makeSymbolicTensor(2); |
2138 | fusion.addInput(tv0); |
2139 | |
2140 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0)); |
2141 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0)); |
2142 | TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0)); |
2143 | TensorView* tv4 = add(tv2, tv1); |
2144 | |
2145 | TensorView* tv5 = add(tv4, tv3); |
2146 | TensorView* tv6 = add(tv5, tv3); |
2147 | |
2148 | fusion.addOutput(tv5); |
2149 | fusion.addOutput(tv6); |
2150 | |
  // Let's set up to actually run
2152 | tv0->merge(0); |
2153 | tv0->split(0, 128); |
2154 | tv0->split(0, 4); |
2155 | |
2156 | tv0->axis(0)->parallelize(ParallelType::BIDx); |
2157 | |
2158 | tv0->computeWith(tv6, 1); |
2159 | |
2160 | for (Val* val : fusion.vals()) { |
2161 | if (!val->isFusionInput() && |
2162 | val->getValType().value() == ValType::TensorView) { |
2163 | TensorView* tv = static_cast<TensorView*>(val); |
2164 | |
2165 | tv->axis(1)->parallelize(ParallelType::Unroll); |
2166 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2167 | } |
2168 | } |
2169 | |
2170 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2171 | at::Tensor input = at::randn({129, 127}, options); |
2172 | |
2173 | auto t1 = input.mul({-1.0}); |
2174 | auto t2 = input.add({3.0}); |
2175 | auto t3 = input.mul({2.0}); |
2176 | auto t4 = t2.add(t1); |
2177 | auto t5 = t4.add(t3); |
2178 | auto t6 = t5.add(t3); |
2179 | |
2180 | std::vector<at::Tensor> aten_outputs = {t5, t6}; |
2181 | |
2182 | FusionExecutor fe; |
2183 | fe.compileFusion(&fusion, {input}); |
2184 | auto cg_outputs = fe.runFusion({input}); |
2185 | |
2186 | testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); |
2187 | } |
2188 | |
2189 | TEST_F(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { |
2190 | // Case 3 |
2191 | // T2 = T1 * 0.979361 |
2192 | // T3 = T2 * T0 |
2193 | Fusion fusion; |
2194 | FusionGuard fg(&fusion); |
2195 | |
2196 | TensorView* tv0 = makeSymbolicTensor(4); |
2197 | fusion.addInput(tv0); |
2198 | |
2199 | TensorView* tv1 = makeSymbolicTensor(4); |
2200 | fusion.addInput(tv1); |
2201 | |
2202 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361)); |
2203 | TensorView* tv3 = mul(tv2, tv0); |
2204 | |
2205 | fusion.addOutput(tv3); |
2206 | |
  // Let's set up to actually run
2208 | while (tv0->nDims() > 1) |
2209 | tv0->merge(0); |
2210 | tv0->split(0, 128); |
2211 | tv0->split(0, 4); |
2212 | |
2213 | while (tv1->nDims() > 1) |
2214 | tv1->merge(0); |
2215 | tv1->split(0, 128); |
2216 | tv1->split(0, 4); |
2217 | |
2218 | tv0->computeWith(tv3, 1); |
2219 | tv1->computeWith(tv3, 1); |
2220 | |
2221 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
2222 | |
2223 | for (Val* val : fusion.vals()) { |
2224 | if (!val->isFusionInput() && |
2225 | val->getValType().value() == ValType::TensorView) { |
2226 | TensorView* tv = static_cast<TensorView*>(val); |
2227 | |
2228 | tv->axis(1)->parallelize(ParallelType::Unroll); |
2229 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2230 | } |
2231 | } |
2232 | |
2233 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2234 | at::Tensor t0 = at::randn({129, 127, 63, 65}, options); |
2235 | at::Tensor t1 = at::rand_like(t0, options); |
2236 | |
2237 | auto t2 = t1.mul({0.979361}); |
2238 | auto aten_output = t2.mul(t0); |
2239 | |
2240 | std::vector<IValue> aten_inputs = {t0, t1}; |
2241 | |
2242 | at::Tensor cg_output = at::empty_like(t0, options); |
2243 | |
2244 | FusionExecutor fe; |
2245 | fe.compileFusion(&fusion, aten_inputs); |
2246 | fe.runFusion(aten_inputs, {cg_output}); |
2247 | |
2248 | testValidate( |
2249 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
2250 | } |
2251 | |
2252 | TEST_F(NVFuserTest, FusionAdvancedComputeWith4_CUDA) { |
2253 | // Case 4 |
2254 | // T4 = T2 - T3 |
2255 | // T5 = T1 + T4 |
2256 | // T6 = T5 - T0 |
2257 | Fusion fusion; |
2258 | FusionGuard fg(&fusion); |
2259 | |
2260 | TensorView* tv0 = makeSymbolicTensor(4); |
2261 | fusion.addInput(tv0); |
2262 | |
2263 | TensorView* tv1 = makeSymbolicTensor(4); |
2264 | fusion.addInput(tv1); |
2265 | |
2266 | TensorView* tv2 = makeSymbolicTensor(4); |
2267 | fusion.addInput(tv2); |
2268 | |
2269 | TensorView* tv3 = makeSymbolicTensor(4); |
2270 | fusion.addInput(tv3); |
2271 | |
2272 | TensorView* tv4 = sub(tv2, tv3); |
2273 | TensorView* tv5 = add(tv1, tv4); |
2274 | TensorView* tv6 = sub(tv5, tv0); |
2275 | |
2276 | fusion.addOutput(tv6); |
2277 | std::vector<TensorView*> tvs = {tv0, tv1, tv2}; |
2278 | for (auto tv : tvs) { |
    // Let's set up to actually run
2280 | while (tv->nDims() > 1) { |
2281 | tv->merge(0); |
2282 | } |
2283 | tv->split(0, 128); |
2284 | tv->split(0, 4); |
2285 | tv->computeWith(tv6, 1); |
2286 | } |
2287 | |
2288 | tv6->axis(0)->parallelize(ParallelType::BIDx); |
2289 | |
2290 | for (Val* val : fusion.vals()) { |
2291 | if (!val->isFusionInput() && |
2292 | val->getValType().value() == ValType::TensorView) { |
2293 | TensorView* tv = static_cast<TensorView*>(val); |
2294 | |
2295 | tv->axis(1)->parallelize(ParallelType::Unroll); |
2296 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2297 | } |
2298 | } |
2299 | |
2300 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2301 | at::Tensor t0 = at::randn({129, 127, 63, 65}, options); |
2302 | at::Tensor t1 = at::rand_like(t0, options); |
2303 | at::Tensor t2 = at::rand_like(t0, options); |
2304 | at::Tensor t3 = at::rand_like(t0, options); |
2305 | |
2306 | auto t4 = t2.sub(t3); |
2307 | auto t5 = t1.add(t4); |
2308 | auto aten_output = t5.sub(t0); |
2309 | |
2310 | std::vector<IValue> aten_inputs = {t0, t1, t2, t3}; |
2311 | |
2312 | FusionExecutor fe; |
2313 | fe.compileFusion(&fusion, aten_inputs); |
2314 | auto cg_outputs = fe.runFusion(aten_inputs); |
2315 | |
2316 | testValidate( |
2317 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
2318 | } |
2319 | |
2320 | TEST_F(NVFuserTest, FusionAdvancedComputeWith5_CUDA) { |
2321 | // Case 5 |
2322 | // tv2 = tv0 + 2.0 |
2323 | // tv3 = tv1 * tv2 |
2324 | Fusion fusion; |
2325 | FusionGuard fg(&fusion); |
2326 | |
2327 | // Set up your input tensor views |
2328 | TensorView* tv0 = makeSymbolicTensor(2); |
2329 | fusion.addInput(tv0); |
2330 | TensorView* tv1 = makeSymbolicTensor(2); |
2331 | fusion.addInput(tv1); |
2332 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0)); |
2333 | TensorView* tv3 = mul(tv1, tv2); |
2334 | fusion.addOutput(tv3); |
2335 | |
2336 | tv2->merge(0); |
2337 | tv2->split(-1, 8); |
2338 | tv2->split(-1, 4); |
2339 | |
2340 | tv2->computeWith(tv3, 1); |
2341 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
2342 | |
2343 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2344 | at::Tensor t0 = at::randn({63, 65}, options); |
2345 | at::Tensor t1 = at::rand_like(t0, options); |
2346 | |
2347 | auto t2 = t0.add(2.0); |
2348 | auto aten_output = t1.mul(t2); |
2349 | |
2350 | std::vector<IValue> aten_inputs = {t0, t1}; |
2351 | |
2352 | FusionExecutor fe; |
2353 | fe.compileFusion(&fusion, aten_inputs); |
2354 | auto cg_outputs = fe.runFusion(aten_inputs); |
2355 | |
2356 | testValidate( |
2357 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
2358 | } |
2359 | |
2360 | TEST_F(NVFuserTest, FusionAdvancedComputeWith6_CUDA) { |
2361 | Fusion fusion; |
2362 | FusionGuard fg(&fusion); |
2363 | |
2364 | TensorView* tv0 = makeSymbolicTensor(2); |
2365 | fusion.addInput(tv0); |
2366 | TensorView* tv1 = makeSymbolicTensor(2); |
2367 | fusion.addInput(tv1); |
2368 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0)); |
2369 | TensorView* tv3 = mul(tv1, tv2); |
2370 | fusion.addOutput(tv3); |
2371 | |
2372 | tv2->merge(0); |
2373 | tv2->split(-1, 8); |
2374 | tv2->split(-1, 4); |
2375 | tv3->merge(0); |
2376 | tv3->split(-1, 8); |
2377 | |
2378 | tv2->computeWith(tv3, 1); |
2379 | |
2380 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
2381 | |
2382 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2383 | at::Tensor t0 = at::randn({63, 65}, options); |
2384 | at::Tensor t1 = at::rand_like(t0, options); |
2385 | |
2386 | auto t2 = t0.add(2.0); |
2387 | auto aten_output = t1.mul(t2); |
2388 | |
2389 | std::vector<IValue> aten_inputs = {t0, t1}; |
2390 | |
2391 | FusionExecutor fe; |
2392 | fe.compileFusion(&fusion, aten_inputs); |
2393 | auto cg_outputs = fe.runFusion(aten_inputs); |
2394 | |
2395 | testValidate( |
2396 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
2397 | } |
2398 | |
2399 | TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { |
2400 | // tv1 = tv0 * 0.5 |
2401 | // tv2 = tv1 * -1 |
2402 | // tv3 = tv2 * -2 |
2403 | Fusion fusion; |
2404 | FusionGuard fg(&fusion); |
2405 | |
2406 | TensorView* tv0 = makeSymbolicTensor(1); |
2407 | fusion.addInput(tv0); |
2408 | |
2409 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5)); |
2410 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0)); |
2411 | TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0)); |
2412 | fusion.addOutput(tv2); |
2413 | fusion.addOutput(tv3); |
2414 | |
2415 | // This computeAt will affect tv2 as well, even though tv2 is not in |
2416 | // the data-flow path between tv1 and tv3. The reason is that tv1 is |
2417 | // now computed at tv3, so tv2 must also be computed at the same |
  // location. Overall, we effectively merge the expressions of all
  // tensors and compute them in a single loop nest.
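  // As a sketch, the lowered kernel becomes a single nest of the form:
  //   for bidx in ceilDiv(N, 128):   // BIDx
  //     for tidx in 128:             // TIDx
  //       tv1 = tv0 * 0.5; tv2 = tv1 * -1; tv3 = tv1 * -2;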
2421 | TensorView* computeAtTarget = tv3; |
2422 | computeAtTarget->split(0, 128); |
2423 | tv1->computeAt(computeAtTarget, 1); |
2424 | |
2425 | TensorView* affected_tensors[] = {tv1, tv2, tv3}; |
2426 | for (auto tv : affected_tensors) { |
2427 | TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); |
2428 | } |
2429 | |
2430 | GpuLower gpulw(&fusion); |
2431 | |
2432 | TORCH_CHECK(tv1->getComputeAtPosition() == 1); |
2433 | TORCH_CHECK( |
2434 | tv2->getComputeAtPosition() == 0 && tv2->getMaxProducerPosition() == 1); |
2435 | TORCH_CHECK( |
2436 | tv3->getComputeAtPosition() == 0 && tv3->getMaxProducerPosition() == 1); |
2437 | |
2438 | ComputeAtMap ca_map(&fusion); |
2439 | |
2440 | // Note that tv2 is also computed at tv3. |
2441 | for (auto tv : {tv1, tv2}) { |
2442 | TORCH_CHECK(ca_map.areMapped( |
2443 | tv->axis(0), computeAtTarget->axis(0), IdMappingMode::PERMISSIVE)); |
2444 | } |
2445 | |
2446 | TORCH_CHECK(tv3->getComputeAtPosition() == 0); |
2447 | |
2448 | computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); |
2449 | for (auto tv : affected_tensors) { |
2450 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2451 | } |
2452 | |
2453 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2454 | |
2455 | at::Tensor aten_input = at::randn({1000}, options); |
2456 | |
2457 | auto t1 = aten_input * 0.5; |
2458 | auto t2 = t1 * -1.0; |
2459 | auto t3 = t1 * -2.0; |
2460 | |
2461 | std::vector<at::Tensor> aten_outputs = {t2, t3}; |
2462 | |
2463 | std::vector<at::Tensor> cg_outputs = { |
2464 | at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; |
2465 | |
2466 | FusionExecutor fe; |
2467 | fe.compileFusion(&fusion, {aten_input}); |
2468 | fe.runFusion({aten_input}, cg_outputs); |
2469 | |
2470 | testValidate( |
2471 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
2472 | } |
2473 | |
2474 | // Similar to ComputeAtMultiConsumers, but with a common consumer. |
2475 | TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { |
2476 | // tv1 = tv0 * 0.5 |
2477 | // tv2 = tv1 * -1 |
2478 | // tv3 = tv2 * -2 |
2479 | // tv4 = tv2 + tv3 |
2480 | // tv5 = tv4 * 5 |
2481 | Fusion fusion; |
2482 | FusionGuard fg(&fusion); |
2483 | |
2484 | TensorView* tv0 = makeSymbolicTensor(1); |
2485 | fusion.addInput(tv0); |
2486 | |
2487 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5)); |
2488 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0)); |
2489 | TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0)); |
2490 | TensorView* tv4 = add(tv2, tv3); |
2491 | TensorView* tv5 = mul(tv4, IrBuilder::create<Double>(5.0)); |
2492 | fusion.addOutput(tv3); |
2493 | fusion.addOutput(tv4); |
2494 | fusion.addOutput(tv5); |
2495 | |
  // Computing tv1 at tv3. This will affect tv2 as discussed in
  // ComputeAtMultiConsumers. Additionally, in this case, notice that tv4
  // is the common consumer of tv2 and tv3, so they are computed at tv4.
  // The indirect propagation of the computeAt should stop at the common
  // consumer, and no further change should occur. More specifically, the
  // computeAt position of tv4 and tv5 should be zero.
2502 | TensorView* computeAtTarget = tv3; |
2503 | computeAtTarget->split(0, 128); |
2504 | tv1->computeAt(computeAtTarget, 1); |
2505 | |
2506 | TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; |
2507 | for (auto tv : affected_tensors) { |
2508 | TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); |
2509 | } |
2510 | |
2511 | TORCH_CHECK(tv1->getComputeAtPosition() == 1); |
2512 | TORCH_CHECK(tv2->getComputeAtPosition() == 1); |
2513 | TORCH_CHECK(tv3->getComputeAtPosition() == 1); |
2514 | TORCH_CHECK(tv4->getComputeAtPosition() == 0); |
2515 | TORCH_CHECK(tv5->getComputeAtPosition() == 0); |
2516 | |
2517 | computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); |
2518 | |
2519 | for (auto tv : affected_tensors) { |
2520 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2521 | } |
2522 | |
2523 | // Transform tv5 to make it look like the rest |
2524 | tv5->split(0, 128); |
2525 | tv5->axis(1)->parallelize(ParallelType::TIDx); |
2526 | tv5->axis(0)->parallelize(ParallelType::BIDx); |
2527 | |
2528 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2529 | |
2530 | at::Tensor aten_input = at::randn({1000}, options); |
2531 | |
2532 | auto t1 = aten_input * 0.5; |
2533 | auto t2 = t1 * -1.0; |
2534 | auto t3 = t1 * -2.0; |
2535 | auto t4 = t2 + t3; |
2536 | auto t5 = t4 * 5.0; |
2537 | |
2538 | std::vector<at::Tensor> aten_outputs = {t3, t4, t5}; |
2539 | std::vector<at::Tensor> cg_outputs = { |
2540 | at::empty_like(aten_input, options), |
2541 | at::empty_like(aten_input, options), |
2542 | at::empty_like(aten_input, options)}; |
2543 | |
2544 | FusionExecutor fe; |
2545 | fe.compileFusion(&fusion, {aten_input}); |
2546 | fe.runFusion({aten_input}, cg_outputs); |
2547 | |
2548 | testValidate( |
2549 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
2550 | } |
2551 | |
2552 | TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { |
2553 | // tv1 = tv0 * 0.5 |
2554 | // tv2 = tv1 * -1 |
2555 | // tv3 = tv2 * -1 |
2556 | // tv4 = tv1 + 4 |
2557 | // tv5 = tv3 + tv4 |
2558 | Fusion fusion; |
2559 | FusionGuard fg(&fusion); |
2560 | |
2561 | TensorView* tv0 = makeSymbolicTensor(2); |
2562 | fusion.addInput(tv0); |
2563 | |
2564 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5)); |
2565 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0)); |
2566 | TensorView* tv3 = mul(tv2, IrBuilder::create<Double>(-1.0)); |
2567 | TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4.0)); |
2568 | TensorView* tv5 = add(tv3, tv4); |
2569 | |
2570 | fusion.addOutput(tv5); |
2571 | |
2572 | TensorView* computeAtTarget = tv3; |
2573 | |
2574 | computeAtTarget->merge(0); |
2575 | computeAtTarget->split(0, 128); |
2576 | computeAtTarget->split(0, 4); |
2577 | |
2578 | computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); |
2579 | |
2580 | // This computeAt will affect all tensors including tv3, tv4 and |
2581 | // tv5, even though it appears to impact only tv1 and tv2. The |
2582 | // reason is that tv1 is now computed at tv3, so tv4 must also be |
2583 | // computed at the same location. Similarly, the consumer of tv4, |
  // tv5, must also be computed at the same location. Overall, we
  // effectively merge the expressions of all tensors and compute them in
  // a single loop nest. Internally, this is realized by making all
  // tensors, except for those in the path between tv1 and tv3, computed
  // at tv5, which we call the common consumer.
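  // Concretely: tv1 and tv2 remain positioned at the computeAt target,
  // while tv3 and tv4 are computed at tv5, the common consumer.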
2590 | tv1->computeAt(computeAtTarget, 1); |
2591 | |
  // All tensors should have the same dimensionality as the target
2593 | for (Val* val : fusion.vals()) { |
2594 | if (val->isFusionInput() || |
2595 | val->getValType().value() != ValType::TensorView) { |
2596 | continue; |
2597 | } |
2598 | TensorView* tv = val->as<TensorView>(); |
2599 | TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); |
2600 | if (tv == tv5) { |
2601 | TORCH_CHECK(tv->getComputeAtPosition() == 0); |
2602 | } else { |
2603 | TORCH_CHECK(tv->getComputeAtPosition() == 1); |
2604 | } |
2605 | } |
2606 | |
2607 | for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) { |
2608 | if (!tv->isFusionInput()) { |
2609 | tv->axis(1)->parallelize(ParallelType::Unroll); |
2610 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2611 | } |
2612 | } |
2613 | |
2614 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2615 | |
2616 | at::Tensor aten_input = at::randn({129, 127}, options); |
2617 | |
2618 | auto t1 = aten_input.mul({0.5}); |
2619 | auto t2 = t1.mul({-1.0}); |
2620 | auto t3 = t2.mul({-1.0}); |
2621 | auto t4 = t1.add({4.0}); |
2622 | auto aten_output = t3 + t4; |
2623 | |
2624 | at::Tensor cg_output = at::empty_like(aten_input, options); |
2625 | |
2626 | FusionExecutor fe; |
2627 | fe.compileFusion(&fusion, {aten_input}); |
2628 | fe.runFusion({aten_input}, {cg_output}); |
2629 | |
2630 | testValidate( |
2631 | &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); |
2632 | } |
2633 | |
2634 | // Similar to the above common consumer test but adds an additional |
2635 | // tensor that has no common consumer with the other tensors. |
2636 | TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { |
2637 | // tv1 = tv0 * 0.5 |
2638 | // tv2 = tv1 * -1 |
2639 | // tv3 = tv2 * -1 |
2640 | // tv4 = tv1 + 4 |
2641 | // tv5 = tv2 + tv3 |
2642 | // tv6 = tv1 + 6 |
2643 | Fusion fusion; |
2644 | FusionGuard fg(&fusion); |
2645 | |
2646 | TensorView* tv0 = makeSymbolicTensor(2); |
2647 | fusion.addInput(tv0); |
2648 | |
2649 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5)); |
2650 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0)); |
2651 | TensorView* tv3 = mul(tv2, IrBuilder::create<Double>(-1.0)); |
2652 | TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4.0)); |
2653 | TensorView* tv5 = add(tv3, tv4); |
2654 | TensorView* tv6 = add(tv1, IrBuilder::create<Double>(6.0)); |
2655 | |
2656 | fusion.addOutput(tv5); |
2657 | fusion.addOutput(tv6); |
2658 | |
2659 | TensorView* computeAtTarget = tv3; |
2660 | |
2661 | computeAtTarget->merge(0); |
2662 | computeAtTarget->split(0, 128); |
2663 | computeAtTarget->split(0, 4); |
2664 | |
2665 | computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); |
2666 | |
2667 | // This will have the same impact on the tensors except for tv5 and |
2668 | // tv6. tv6 does not have any common consumer with the computeAt |
  // target, but since it uses tv1, it must also be computed at the
2670 | // same location as the other impacted tensors. We can either make |
2671 | // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 |
2672 | // should be computed at tv6 just because the current implementation |
2673 | // orders the computeAt relationship based on the order in which |
2674 | // tensors are specified as outputs. |
2675 | |
2676 | tv1->computeAt(computeAtTarget, 1); |
2677 | |
  // All tensors should have the same dimensionality as the target
2679 | for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) { |
2680 | if (tv->isFusionInput()) { |
2681 | continue; |
2682 | } |
2683 | TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); |
2684 | if (tv == tv5 || tv == tv6) { |
2685 | TORCH_CHECK(tv->getComputeAtPosition() == 0); |
2686 | TORCH_CHECK(tv->getMaxProducerPosition() == 1); |
2687 | } else { |
2688 | TORCH_CHECK(tv->getComputeAtPosition() == 1); |
2689 | } |
2690 | } |
2691 | |
2692 | for (Val* val : fusion.vals()) { |
2693 | if (!val->isFusionInput() && |
2694 | val->getValType().value() == ValType::TensorView) { |
2695 | TensorView* tv = val->as<TensorView>(); |
2696 | tv->axis(1)->parallelize(ParallelType::Unroll); |
2697 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2698 | } |
2699 | } |
2700 | |
2701 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2702 | |
2703 | at::Tensor aten_input = at::randn({129, 127}, options); |
2704 | |
2705 | auto t1 = aten_input.mul({0.5}); |
2706 | auto t2 = t1.mul({-1.0}); |
2707 | auto t3 = t2.mul({-1.0}); |
2708 | auto t4 = t1.add({4.0}); |
2709 | auto t5 = t3 + t4; |
2710 | auto t6 = t1.add({6.0}); |
2711 | |
2712 | std::vector<at::Tensor> aten_outputs = {t5, t6}; |
2713 | std::vector<at::Tensor> cg_outputs = { |
2714 | at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; |
2715 | |
2716 | FusionExecutor fe; |
2717 | fe.compileFusion(&fusion, {aten_input}); |
2718 | fe.runFusion({aten_input}, cg_outputs); |
2719 | |
2720 | testValidate( |
2721 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
2722 | } |
2723 | |
// Similar to ComputeAtCommonConsumer1 but with an additional tensor
// that has no data dependency with the consumer.
2726 | TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { |
2727 | // tv1 = tv0 * 0.5 |
2728 | // tv2 = tv1 * -1 |
2729 | // tv3 = tv1 * -2 |
2730 | // tv4 = tv2 + tv3 |
2731 | // tv5 = tv4 * 5 |
2732 | // tv6 = tv1 * 6 |
2733 | Fusion fusion; |
2734 | FusionGuard fg(&fusion); |
2735 | |
2736 | TensorView* tv0 = makeSymbolicTensor(1); |
2737 | fusion.addInput(tv0); |
2738 | |
2739 | TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5)); |
2740 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0)); |
2741 | TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0)); |
2742 | TensorView* tv4 = add(tv2, tv3); |
2743 | TensorView* tv5 = mul(tv4, IrBuilder::create<Double>(5.0)); |
2744 | // Notice that tv6 is not a consumer of tv4. |
2745 | TensorView* tv6 = mul(tv1, IrBuilder::create<Double>(6.0)); |
2746 | fusion.addOutput(tv3); |
2747 | fusion.addOutput(tv4); |
2748 | fusion.addOutput(tv5); |
2749 | fusion.addOutput(tv6); |
2750 | |
2751 | TensorView* computeAtTarget = tv3; |
2752 | computeAtTarget->split(0, 128); |
2753 | tv1->computeAt(computeAtTarget, 1); |
2754 | |
2755 | TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv5, tv6}; |
2756 | for (auto tv : affected_tensors) { |
2757 | TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); |
2758 | if (tv == tv6 || tv == tv5) { |
2759 | TORCH_CHECK(tv->getComputeAtPosition() == 0); |
2760 | } else { |
2761 | TORCH_CHECK(tv->getComputeAtPosition() == 1); |
2762 | } |
2763 | } |
2764 | |
2765 | computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); |
2766 | |
2767 | for (auto tv : affected_tensors) { |
2768 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
2769 | } |
2770 | |
2771 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
2772 | |
2773 | at::Tensor aten_input = at::randn({1000}, options); |
2774 | |
2775 | auto t1 = aten_input * 0.5; |
2776 | auto t2 = t1 * -1.0; |
2777 | auto t3 = t1 * -2.0; |
2778 | auto t4 = t2 + t3; |
2779 | auto t5 = t4 * 5.0; |
2780 | auto t6 = t1 * 6.0; |
2781 | |
2782 | std::vector<at::Tensor> aten_outputs = {t3, t4, t5, t6}; |
2783 | std::vector<at::Tensor> cg_outputs = { |
2784 | at::empty_like(aten_input, options), |
2785 | at::empty_like(aten_input, options), |
2786 | at::empty_like(aten_input, options), |
2787 | at::empty_like(aten_input, options)}; |
2788 | |
2789 | FusionExecutor fe; |
2790 | fe.compileFusion(&fusion, {aten_input}); |
2791 | fe.runFusion({aten_input}, cg_outputs); |
2792 | |
2793 | testValidate( |
2794 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
2795 | } |
2796 | |
2797 | namespace { |
2798 | |
2799 | void checkIdMapped( |
2800 | ComputeAtRootDomainMap& root_map, |
2801 | TensorView* v0, |
2802 | IterDomain* id0, |
2803 | TensorView* v1, |
2804 | IterDomain* id1, |
2805 | bool should_map) { |
  if (should_map) {
    TORCH_CHECK(
        root_map.canMap(v0->domain(), id0, v1->domain(), id1),
        "Should be mappable: ",
        id0,
        " of ",
        v0,
        " and ",
        id1,
        " of ",
        v1);
  } else {
    TORCH_CHECK(
        !root_map.canMap(v0->domain(), id0, v1->domain(), id1),
        "Should not be mappable: ",
        id0,
        " of ",
        v0,
        " and ",
        id1,
        " of ",
        v1);
  }
2829 | } |
2830 | |
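// Vector overload: the k-th mappable domain of v0 (those flagged true in
// should_map0) must map exactly to the k-th mappable domain of v1; every
// other pairing must be unmappable.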
2831 | void checkIdMapped( |
2832 | TensorView* v0, |
2833 | const std::vector<IterDomain*>& root0, |
2834 | const std::vector<bool> should_map0, |
2835 | TensorView* v1, |
2836 | const std::vector<IterDomain*>& root1, |
2837 | const std::vector<bool> should_map1) { |
2838 | ComputeAtRootDomainMap map; |
2839 | map.build(); |
2840 | TORCH_INTERNAL_ASSERT(root0.size() == should_map0.size()); |
2841 | TORCH_INTERNAL_ASSERT(root1.size() == should_map1.size()); |
2842 | size_t idx0 = 0; |
2843 | for (const auto i : c10::irange(root0.size())) { |
2844 | size_t idx1 = 0; |
2845 | for (const auto j : c10::irange(root1.size())) { |
2846 | if (should_map0[i] && should_map1[j] && idx0 == idx1) { |
2847 | checkIdMapped(map, v0, root0[i], v1, root1[j], true); |
2848 | } else { |
2849 | checkIdMapped(map, v0, root0[i], v1, root1[j], false); |
2850 | } |
2851 | if (should_map1[j]) |
2852 | ++idx1; |
2853 | } |
2854 | if (should_map0[i]) |
2855 | ++idx0; |
2856 | } |
2857 | } |
2858 | |
2859 | void checkIdMapped( |
2860 | TensorView* v0, |
2861 | const std::vector<IterDomain*>& root0, |
2862 | TensorView* v1, |
2863 | const std::vector<IterDomain*>& root1) { |
2864 | checkIdMapped( |
2865 | v0, |
2866 | root0, |
2867 | std::vector<bool>(root0.size(), true), |
2868 | v1, |
2869 | root1, |
2870 | std::vector<bool>(root1.size(), true)); |
2871 | } |
2872 | |
2873 | } // namespace |
2874 | |
2875 | TEST_F(NVFuserTest, FusionRootMappingBasic_CUDA) { |
2876 | Fusion fusion; |
2877 | FusionGuard fg(&fusion); |
2878 | |
2879 | TensorView* tv0 = makeSymbolicTensor(2); |
2880 | TensorView* tv1 = makeSymbolicTensor(2); |
2881 | |
2882 | fusion.addInput(tv0); |
2883 | fusion.addInput(tv1); |
2884 | auto tv3 = broadcast(tv0, {true, false, false}); |
2885 | auto tv4 = broadcast(tv1, {false, true, false}); |
2886 | auto tv5 = add(tv3, tv4); |
2887 | fusion.addOutput(tv5); |
2888 | |
2889 | checkIdMapped( |
2890 | tv0, |
2891 | tv0->getRootDomain(), |
2892 | {true, true}, |
2893 | tv4, |
2894 | tv4->getRootDomain(), |
2895 | {false, true, true}); |
2896 | checkIdMapped( |
2897 | tv1, |
2898 | tv1->getRootDomain(), |
2899 | {true, true}, |
2900 | tv4, |
2901 | tv4->getRootDomain(), |
2902 | {true, false, true}); |
2903 | checkIdMapped( |
2904 | tv0, |
2905 | tv0->getRootDomain(), |
2906 | {false, true}, |
2907 | tv1, |
2908 | tv1->getRootDomain(), |
2909 | {false, true}); |
2910 | checkIdMapped( |
2911 | tv0, |
2912 | tv0->getRootDomain(), |
2913 | {true, true}, |
2914 | tv5, |
2915 | tv5->getRootDomain(), |
2916 | {false, true, true}); |
2917 | checkIdMapped( |
2918 | tv1, |
2919 | tv1->getRootDomain(), |
2920 | {true, true}, |
2921 | tv5, |
2922 | tv5->getRootDomain(), |
2923 | {true, false, true}); |
2924 | checkIdMapped(tv3, tv3->getRootDomain(), tv4, tv4->getRootDomain()); |
2925 | checkIdMapped(tv3, tv3->getRootDomain(), tv5, tv5->getRootDomain()); |
2926 | checkIdMapped(tv4, tv4->getRootDomain(), tv5, tv5->getRootDomain()); |
2927 | } |
2928 | |
2929 | TEST_F(NVFuserTest, FusionRootMappingRfactor_CUDA) { |
2930 | Fusion fusion; |
2931 | FusionGuard fg(&fusion); |
2932 | |
2933 | // [I,I] |
2934 | TensorView* tv0 = makeSymbolicTensor(2); |
2935 | // [I,I,I] |
2936 | TensorView* tv1 = makeSymbolicTensor(3); |
2937 | |
  // [I,I,R]
2939 | auto tv2 = sum(tv1, {2}); |
2940 | auto tv3 = add(tv2, tv0); |
2941 | |
2942 | fusion.addInput(tv0); |
2943 | fusion.addInput(tv1); |
2944 | fusion.addOutput(tv3); |
2945 | |
2946 | // scheduling: |
  // [B,I,R0,R1=128], root = [B,I,R]
2948 | tv2->split(2, 128); |
2949 | |
2950 | // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] |
2951 | auto tv4 = tv2->rFactor({3}); |
2952 | |
2953 | checkIdMapped(tv1, tv1->getRootDomain(), tv4, tv4->getRootDomain()); |
2954 | checkIdMapped( |
2955 | tv4, |
2956 | tv4->getRFactorDomain(), |
2957 | {true, true, true, false}, |
2958 | tv2, |
2959 | tv2->getRootDomain(), |
2960 | {true, true, true}); |
2961 | checkIdMapped( |
2962 | tv1, |
2963 | tv1->getRootDomain(), |
2964 | {true, true, false}, |
2965 | tv2, |
2966 | tv2->getRootDomain(), |
2967 | {true, true, false}); |
2968 | checkIdMapped( |
2969 | tv1, |
2970 | tv1->getRootDomain(), |
2971 | {true, true, false}, |
2972 | tv3, |
2973 | tv3->getRootDomain(), |
2974 | {true, true}); |
2975 | checkIdMapped( |
2976 | tv2, |
2977 | tv2->getRootDomain(), |
2978 | {true, true, false}, |
2979 | tv3, |
2980 | tv3->getRootDomain(), |
2981 | {true, true}); |
2982 | checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain()); |
2983 | checkIdMapped( |
2984 | tv0, |
2985 | tv0->getRootDomain(), |
2986 | {true, true}, |
2987 | tv1, |
2988 | tv1->getRootDomain(), |
2989 | {true, true, false}); |
2990 | checkIdMapped( |
2991 | tv0, |
2992 | tv0->getRootDomain(), |
2993 | {true, true}, |
2994 | tv2, |
2995 | tv2->getRootDomain(), |
2996 | {true, true, false}); |
2997 | checkIdMapped( |
2998 | tv0, |
2999 | tv0->getRootDomain(), |
3000 | {true, true}, |
3001 | tv4, |
3002 | tv4->getRFactorDomain(), |
3003 | {true, true, false, false}); |
3004 | checkIdMapped( |
3005 | tv0, |
3006 | tv0->getRootDomain(), |
3007 | {true, true}, |
3008 | tv4, |
3009 | tv4->getRootDomain(), |
3010 | {true, true, false}); |
3011 | } |
3012 | |
3013 | TEST_F(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) { |
3014 | Fusion fusion; |
3015 | FusionGuard fg(&fusion); |
3016 | |
3017 | TensorView* tv0 = makeSymbolicTensor(2); |
3018 | auto tv1 = sum(tv0, {1}); |
3019 | auto tv2 = broadcast(tv1, {false, true}); |
3020 | fusion.addOutput(tv2); |
3021 | |
3022 | // The second dimension cannot be mapped as it would require recomputation. |
3023 | checkIdMapped(tv0, tv0->getRootDomain(), tv1, tv1->getRootDomain()); |
3024 | checkIdMapped( |
3025 | tv1, |
3026 | tv1->getRootDomain(), |
3027 | {true, false}, |
3028 | tv2, |
3029 | tv2->getRootDomain(), |
3030 | {true, false}); |
3031 | checkIdMapped( |
3032 | tv0, |
3033 | tv0->getRootDomain(), |
3034 | {true, false}, |
3035 | tv2, |
3036 | tv2->getRootDomain(), |
3037 | {true, false}); |
3038 | } |
3039 | |
3040 | TEST_F(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) { |
3041 | Fusion fusion; |
3042 | FusionGuard fg(&fusion); |
3043 | |
3044 | TensorView* tv0 = makeSymbolicTensor(2); |
3045 | auto tv1 = sum(tv0, {1}); |
3046 | auto tv2 = broadcast(tv1, {false, true}); |
3047 | auto tv3 = add(tv0, tv2); |
3048 | fusion.addOutput(tv3); |
3049 | |
3050 | checkIdMapped( |
3051 | tv0, |
3052 | tv0->getRootDomain(), |
3053 | {true, false}, |
3054 | tv1, |
3055 | tv1->getRootDomain(), |
3056 | {true, false}); |
3057 | checkIdMapped( |
3058 | tv1, |
3059 | tv1->getRootDomain(), |
3060 | {true, false}, |
3061 | tv2, |
3062 | tv2->getRootDomain(), |
3063 | {true, false}); |
3064 | checkIdMapped( |
3065 | tv0, |
3066 | tv0->getRootDomain(), |
3067 | {true, false}, |
3068 | tv3, |
3069 | tv3->getRootDomain(), |
3070 | {true, false}); |
3071 | checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); |
3072 | } |
3073 | |
3074 | TEST_F(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) { |
3075 | Fusion fusion; |
3076 | FusionGuard fg(&fusion); |
3077 | |
3078 | TensorView* tv0 = makeSymbolicTensor(2); |
3079 | auto tv1 = sum(tv0, {1}); |
3080 | auto tv2 = broadcast(tv1, {false, true}); |
3081 | fusion.addOutput(tv2); |
3082 | |
3083 | tv1->split(-1, 4); |
3084 | auto tv3 = tv1->rFactor({-2}); |
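  // tv3 is the rFactor producer: it partially reduces over the split-out
  // ceilDiv(R, 4) domain, and tv1 then reduces the remaining factor of 4.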
3085 | |
3086 | checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain()); |
3087 | checkIdMapped( |
3088 | tv3, |
3089 | tv3->getMaybeRFactorDomain(), |
3090 | {true, false, true}, |
3091 | tv1, |
3092 | tv1->getRootDomain(), |
3093 | {true, true}); |
3094 | checkIdMapped( |
3095 | tv1, |
3096 | tv1->getRootDomain(), |
3097 | {true, false}, |
3098 | tv2, |
3099 | tv2->getRootDomain(), |
3100 | {true, false}); |
3101 | } |
3102 | |
3103 | TEST_F(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) { |
3104 | Fusion fusion; |
3105 | FusionGuard fg(&fusion); |
3106 | |
3107 | TensorView* tv0 = makeSymbolicTensor(2); |
3108 | auto tv1 = sum(tv0, {1}); |
3109 | auto tv2 = broadcast(tv1, {false, true}); |
3110 | auto tv3 = add(tv0, tv2); |
3111 | fusion.addOutput(tv3); |
3112 | |
3113 | tv1->split(-1, 4); |
3114 | auto tv4 = tv1->rFactor({-2}); |
3115 | |
3116 | checkIdMapped( |
3117 | tv0, |
3118 | tv0->getRootDomain(), |
3119 | {true, false}, |
3120 | tv4, |
3121 | tv4->getRootDomain(), |
3122 | {true, false}); |
3123 | checkIdMapped( |
3124 | tv4, |
3125 | tv4->getMaybeRFactorDomain(), |
3126 | {true, false, true}, |
3127 | tv1, |
3128 | tv1->getRootDomain(), |
3129 | {true, true}); |
3130 | checkIdMapped( |
3131 | tv1, |
3132 | tv1->getRootDomain(), |
3133 | {true, false}, |
3134 | tv2, |
3135 | tv2->getRootDomain(), |
3136 | {true, false}); |
3137 | checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); |
3138 | checkIdMapped( |
3139 | tv0, |
3140 | tv0->getRootDomain(), |
3141 | {true, false}, |
3142 | tv2, |
3143 | tv2->getRootDomain(), |
3144 | {true, false}); |
3145 | } |
3146 | |
3147 | // Reproducer of issue #749 |
TEST_F(NVFuserTest, FusionRootMappingReductionDependency5_CUDA) {
3149 | Fusion fusion; |
3150 | FusionGuard fg(&fusion); |
3151 | |
3152 | auto tv0 = makeSymbolicTensor(2); |
3153 | fusion.addInput(tv0); |
3154 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3155 | auto tv2 = sum(tv1, {1}); |
3156 | auto tv3 = broadcast(tv2, {false, true}); |
3157 | auto tv4 = add(tv0, tv3); |
3158 | auto tv5 = add(tv4, tv1); |
3159 | fusion.addOutput(tv5); |
3160 | |
3161 | checkIdMapped( |
3162 | tv0, |
3163 | tv0->getRootDomain(), |
3164 | {true, false}, |
3165 | tv1, |
3166 | tv1->getRootDomain(), |
3167 | {true, false}); |
3168 | checkIdMapped( |
3169 | tv1, |
3170 | tv1->getRootDomain(), |
3171 | {true, false}, |
3172 | tv2, |
3173 | tv2->getRootDomain(), |
3174 | {true, false}); |
3175 | checkIdMapped( |
3176 | tv2, |
3177 | tv2->getRootDomain(), |
3178 | {true, false}, |
3179 | tv3, |
3180 | tv3->getRootDomain(), |
3181 | {true, false}); |
3182 | checkIdMapped( |
3183 | tv3, |
3184 | tv3->getRootDomain(), |
3185 | {true, true}, |
3186 | tv4, |
3187 | tv4->getRootDomain(), |
3188 | {true, true}); |
3189 | checkIdMapped( |
3190 | tv0, |
3191 | tv0->getRootDomain(), |
3192 | {true, false}, |
3193 | tv4, |
3194 | tv4->getRootDomain(), |
3195 | {true, false}); |
3196 | checkIdMapped( |
3197 | tv4, |
3198 | tv4->getRootDomain(), |
3199 | {true, true}, |
3200 | tv5, |
3201 | tv5->getRootDomain(), |
3202 | {true, true}); |
3203 | } |
3204 | |
3205 | // Similar to RootMappingReductionDependency5 but with rFactor |
TEST_F(NVFuserTest, FusionRootMappingReductionDependency6_CUDA) {
3207 | Fusion fusion; |
3208 | FusionGuard fg(&fusion); |
3209 | |
3210 | auto tv0 = makeSymbolicTensor(2); |
3211 | fusion.addInput(tv0); |
3212 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3213 | auto tv2 = sum(tv1, {1}); |
3214 | auto tv3 = broadcast(tv2, {false, true}); |
3215 | auto tv4 = add(tv0, tv3); |
3216 | auto tv5 = add(tv4, tv1); |
3217 | fusion.addOutput(tv5); |
3218 | |
3219 | tv2->split(1, 4); |
3220 | auto tv6 = tv2->rFactor({-1}); |
3221 | |
3222 | checkIdMapped( |
3223 | tv0, |
3224 | tv0->getRootDomain(), |
3225 | {true, false}, |
3226 | tv1, |
3227 | tv1->getRootDomain(), |
3228 | {true, false}); |
3229 | checkIdMapped( |
3230 | tv1, |
3231 | tv1->getRootDomain(), |
3232 | {true, false}, |
3233 | tv6, |
3234 | tv6->getRootDomain(), |
3235 | {true, false}); |
3236 | checkIdMapped( |
3237 | tv6, |
3238 | tv6->getMaybeRFactorDomain(), |
3239 | {true, true, false}, |
3240 | tv2, |
3241 | tv2->getRootDomain(), |
3242 | {true, true}); |
3243 | checkIdMapped( |
3244 | tv1, |
3245 | tv1->getRootDomain(), |
3246 | {true, false}, |
3247 | tv2, |
3248 | tv2->getRootDomain(), |
3249 | {true, false}); |
3250 | checkIdMapped( |
3251 | tv2, |
3252 | tv2->getRootDomain(), |
3253 | {true, false}, |
3254 | tv3, |
3255 | tv3->getRootDomain(), |
3256 | {true, false}); |
3257 | checkIdMapped( |
3258 | tv3, |
3259 | tv3->getRootDomain(), |
3260 | {true, true}, |
3261 | tv4, |
3262 | tv4->getRootDomain(), |
3263 | {true, true}); |
3264 | checkIdMapped( |
3265 | tv0, |
3266 | tv0->getRootDomain(), |
3267 | {true, false}, |
3268 | tv4, |
3269 | tv4->getRootDomain(), |
3270 | {true, false}); |
3271 | checkIdMapped( |
3272 | tv4, |
3273 | tv4->getRootDomain(), |
3274 | {true, true}, |
3275 | tv5, |
3276 | tv5->getRootDomain(), |
3277 | {true, true}); |
3278 | } |
3279 | |
3280 | TEST_F( |
3281 | NVFuserTest, |
3282 | FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) { |
3283 | Fusion fusion; |
3284 | FusionGuard fg(&fusion); |
3285 | |
3286 | TensorView* tv0 = makeSymbolicTensor(1); |
3287 | auto tv1 = broadcast(tv0, {false, true}); |
3288 | auto tv2 = broadcast(tv0, {true, false}); |
3289 | fusion.addOutput(tv1); |
3290 | fusion.addOutput(tv2); |
3291 | |
3292 | // If there is no common consumer, there is no recomputation constraint. |
3293 | checkIdMapped( |
3294 | tv0, |
3295 | tv0->getRootDomain(), |
3296 | {true}, |
3297 | tv1, |
3298 | tv1->getRootDomain(), |
3299 | {true, false}); |
3300 | checkIdMapped( |
3301 | tv0, |
3302 | tv0->getRootDomain(), |
3303 | {true}, |
3304 | tv2, |
3305 | tv2->getRootDomain(), |
3306 | {false, true}); |
3307 | checkIdMapped( |
3308 | tv1, |
3309 | tv1->getRootDomain(), |
3310 | {true, false}, |
3311 | tv2, |
3312 | tv2->getRootDomain(), |
3313 | {false, true}); |
3314 | } |
3315 | |
3316 | TEST_F(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) { |
3317 | Fusion fusion; |
3318 | FusionGuard fg(&fusion); |
3319 | |
3320 | auto tv0 = makeSymbolicTensor(1); |
3321 | fusion.addInput(tv0); |
3322 | auto tv1 = makeSymbolicTensor(2); |
3323 | fusion.addInput(tv1); |
3324 | auto tv2 = makeSymbolicTensor(2); |
3325 | fusion.addInput(tv2); |
3326 | auto tv3 = broadcast(tv0, {false, true}); |
3327 | auto tv4 = add(tv1, tv3); |
3328 | fusion.addOutput(tv4); |
3329 | auto tv5 = add(tv2, tv3); |
3330 | fusion.addOutput(tv5); |
3331 | |
  // A broadcast domain can be used with multiple domains of
  // different sizes. In this test, the broadcast domain of tv3 has
  // two consumers, tv4 and tv5, which may have different sizes. Each
  // consumer is used with the broadcast domain of tv3, but since the
  // two consumers are not guaranteed to have the same size, it is not
  // possible to map their domains with each other.
3338 | checkIdMapped( |
3339 | tv0, |
3340 | tv0->getRootDomain(), |
3341 | {true}, |
3342 | tv3, |
3343 | tv3->getRootDomain(), |
3344 | {true, false}); |
3345 | checkIdMapped( |
3346 | tv0, |
3347 | tv0->getRootDomain(), |
3348 | {true}, |
3349 | tv1, |
3350 | tv1->getRootDomain(), |
3351 | {true, false}); |
3352 | checkIdMapped( |
3353 | tv0, |
3354 | tv0->getRootDomain(), |
3355 | {true}, |
3356 | tv2, |
3357 | tv2->getRootDomain(), |
3358 | {true, false}); |
3359 | checkIdMapped( |
3360 | tv1, |
3361 | tv1->getRootDomain(), |
3362 | {true, false}, |
3363 | tv2, |
3364 | tv2->getRootDomain(), |
3365 | {true, false}); |
3366 | checkIdMapped( |
3367 | tv1, |
3368 | tv1->getRootDomain(), |
3369 | {true, false}, |
3370 | tv3, |
3371 | tv3->getRootDomain(), |
3372 | {true, false}); |
3373 | checkIdMapped( |
3374 | tv2, |
3375 | tv2->getRootDomain(), |
3376 | {true, false}, |
3377 | tv3, |
3378 | tv3->getRootDomain(), |
3379 | {true, false}); |
3380 | checkIdMapped( |
3381 | tv3, |
3382 | tv3->getRootDomain(), |
3383 | {true, false}, |
3384 | tv4, |
3385 | tv4->getRootDomain(), |
3386 | {true, false}); |
3387 | checkIdMapped( |
3388 | tv3, |
3389 | tv3->getRootDomain(), |
3390 | {true, false}, |
3391 | tv5, |
3392 | tv5->getRootDomain(), |
3393 | {true, false}); |
3394 | checkIdMapped( |
3395 | tv4, |
3396 | tv4->getRootDomain(), |
3397 | {true, false}, |
3398 | tv5, |
3399 | tv5->getRootDomain(), |
3400 | {true, false}); |
3401 | } |
3402 | |
3403 | TEST_F(NVFuserTest, FusionRootMappingBroadcast_CUDA) { |
3404 | Fusion fusion; |
3405 | FusionGuard fg(&fusion); |
3406 | |
3407 | auto tv0 = makeSymbolicTensor(1); |
3408 | // tv0[I0] |
3409 | fusion.addInput(tv0); |
3410 | auto tv1 = broadcast(tv0, {true, false}); |
3411 | // tv1[B1, I0] |
3412 | auto tv2 = broadcast(tv1, {true, false, false}); |
3413 | // tv2[B2, B1, I0] |
3414 | fusion.addOutput(tv2); |
3415 | |
  // In this case, tv1 and tv2 have one and two broadcast domains,
  // respectively. It is the second broadcast domain of tv2 that is
  // mapped to the broadcast of tv1.
3419 | checkIdMapped( |
3420 | tv0, |
3421 | tv0->getRootDomain(), |
3422 | {true}, |
3423 | tv1, |
3424 | tv1->getRootDomain(), |
3425 | {false, true}); |
3426 | checkIdMapped( |
3427 | tv1, |
3428 | tv1->getRootDomain(), |
3429 | {true, true}, |
3430 | tv2, |
3431 | tv2->getRootDomain(), |
3432 | {false, true, true}); // Not {true, false, true} |
3433 | checkIdMapped( |
3434 | tv0, |
3435 | tv0->getRootDomain(), |
3436 | {true}, |
3437 | tv2, |
3438 | tv2->getRootDomain(), |
3439 | {false, false, true}); |
3440 | } |
3441 | |
3442 | // Reproducer of issue #723 |
3443 | TEST_F(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) { |
3444 | Fusion fusion; |
3445 | FusionGuard fg(&fusion); |
3446 | |
3447 | auto tv0 = makeSymbolicTensor(1); |
3448 | auto tv1 = makeSymbolicTensor(2); |
3449 | |
3450 | fusion.addInput(tv0); |
3451 | fusion.addInput(tv1); |
3452 | |
3453 | auto tv2 = broadcast(tv0, {true, false}); |
3454 | auto tv3 = sum(tv2, {0}); |
3455 | auto tv4 = add(tv2, tv1); |
3456 | |
3457 | fusion.addOutput(tv3); |
3458 | fusion.addOutput(tv4); |
3459 | |
3460 | ComputeAtRootDomainMap map; |
3461 | map.build(); |
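  // Although tv3 reduces the broadcast axis of tv2 (a trivial reduction),
  // tv2 must still be computable at tv4, so tv2's root domain remains
  // mapped to both of its consumers.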
3462 | |
3463 | checkIdMapped( |
3464 | map, tv2, tv2->getRootDomain()[0], tv4, tv4->getRootDomain()[0], true); |
3465 | checkIdMapped( |
3466 | map, tv2, tv2->getRootDomain()[0], tv3, tv3->getRootDomain()[0], true); |
3467 | |
3468 | tv2->computeAt(tv4, -1); |
3469 | |
3470 | const int x = 11; |
3471 | const int y = 12; |
3472 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3473 | at::Tensor t0 = at::randn({x}, options); |
3474 | at::Tensor t1 = at::randn({y, x}, options); |
3475 | std::vector<IValue> aten_inputs = {t0, t1}; |
3476 | |
3477 | FusionExecutor fe; |
3478 | fe.compileFusion(&fusion, aten_inputs); |
3479 | auto outputs = fe.runFusion(aten_inputs); |
3480 | |
3481 | auto t3 = t0; |
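  // (tv3 sums tv2 over its broadcast axis, a trivial reduction, so the
  // reference for tv3 is simply t0)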
3482 | auto t4 = t0.unsqueeze(0).expand({y, x}) + t1; |
3483 | |
3484 | testValidate(&fusion, outputs, aten_inputs, {t3, t4}, __LINE__, __FILE__); |
3485 | } |
3486 | |
3487 | // Repro of issue #1950 |
3488 | TEST_F(NVFuserTest, FusionRootMappingRepro1950_CUDA) { |
3489 | Fusion fusion; |
3490 | FusionGuard fg(&fusion); |
3491 | auto tv0 = makeSymbolicTensor(3); |
3492 | auto tv1 = makeSymbolicTensor(3); |
3493 | auto tv2 = makeSymbolicTensor(3); |
3494 | |
3495 | fusion.addInput(tv0); |
3496 | fusion.addInput(tv1); |
3497 | fusion.addInput(tv2); |
3498 | |
3499 | auto tv3 = set(tv0); |
3500 | auto tv4 = mul(tv1, tv3); |
3501 | auto tv5 = mul(tv1, tv2); |
3502 | auto tv6 = mul(tv5, tv3); |
3503 | auto tv7 = sum(tv6, {2}); |
3504 | auto tv8 = broadcast(tv7, {false, false, true}); |
3505 | auto tv9 = mul(tv3, tv8); |
3506 | |
3507 | // Issue #1950 was caused by a particular traversal ordering based |
3508 | // on the output tensor ordering as below |
3509 | fusion.addOutput(tv9); |
3510 | fusion.addOutput(tv5); |
3511 | fusion.addOutput(tv4); |
3512 | |
3513 | ComputeAtRootDomainMap root_map; |
3514 | root_map.build(); |
3515 | |
3516 | checkIdMapped(root_map, tv4, tv4->axis(-1), tv9, tv9->axis(-1), false); |
3517 | } |
3518 | |
3519 | TEST_F(NVFuserTest, FusionDetectSelfMappedDomains_CUDA) { |
3520 | Fusion fusion; |
3521 | FusionGuard fg(&fusion); |
3522 | |
3523 | auto tv0 = makeSymbolicTensor(1); |
3524 | fusion.addInput(tv0); |
3525 | // [I1] |
3526 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
3527 | // [B2, I2] |
3528 | auto tv2 = broadcast(tv1, {true, false}); |
3529 | // [I3, B3] |
3530 | auto tv3 = broadcast(tv1, {false, true}); |
3531 | // [I4, I5] |
3532 | auto tv4 = add(tv2, tv3); |
3533 | fusion.addOutput(tv4); |
3534 | |
  // IterDomainGraph maps B2, I3 and I4 together, and similarly I2,
  // B3 and I5. The problem is that I1 is mapped with both of the ID
  // groups, so eventually all of the IDs end up mapped
  // together. IterDomainGraph should throw an exception as this
  // pattern of domain mappings is not supported.
3540 | |
3541 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
3542 | ASSERT_ANY_THROW({ IterDomainGraph id_graph(&fusion); }); |
3543 | } |
3544 | |
3545 | TEST_F(NVFuserTest, FusionScalarInputs_CUDA) { |
3546 | Fusion fusion; |
3547 | FusionGuard fg(&fusion); |
3548 | |
3549 | TensorView* tv0 = makeSymbolicTensor(2); |
3550 | fusion.addInput(tv0); |
3551 | TensorView* tv1 = makeSymbolicTensor(2); |
3552 | fusion.addInput(tv1); |
3553 | |
3554 | Double* d0 = IrBuilder::create<Double>(); |
3555 | fusion.addInput(d0); |
3556 | Double* d1 = IrBuilder::create<Double>(); |
3557 | fusion.addInput(d1); |
3558 | Double* d2 = IrBuilder::create<Double>(); |
3559 | fusion.addInput(d2); |
3560 | Double* d3 = IrBuilder::create<Double>(); |
3561 | fusion.addInput(d3); |
3562 | Val* d4 = mul(d0, d1); |
3563 | Val* d5 = sub(d2, d3); |
3564 | |
3565 | TensorView* tv2 = sub(tv1, d4); |
3566 | TensorView* tv3 = add(tv0, d5); |
3567 | TensorView* tv4 = mul(tv3, tv2); |
3568 | |
3569 | fusion.addOutput(tv4); |
3570 | |
  // Let's set up to actually run
3572 | while (tv4->nDims() > 1) |
3573 | tv4->merge(0); |
3574 | tv4->split(0, 128); |
3575 | tv4->split(0, 4); |
3576 | |
3577 | tv0->computeAt(tv4, 1); |
3578 | tv1->computeAt(tv4, 1); |
3579 | |
3580 | tv4->axis(0)->parallelize(ParallelType::BIDx); |
3581 | |
3582 | for (Val* val : fusion.vals()) { |
3583 | if (!val->isFusionInput() && |
3584 | val->getValType().value() == ValType::TensorView) { |
3585 | TensorView* tv = static_cast<TensorView*>(val); |
3586 | |
3587 | tv->axis(1)->parallelize(ParallelType::Unroll); |
3588 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
3589 | } |
3590 | } |
3591 | |
3592 | // d4 = d0 * d1 |
3593 | // d5 = d2 - d3 |
3594 | // t2 = t1 - d4 |
3595 | // t3 = t0 + d5 |
3596 | // t4 = t3 * t2 |
3597 | |
3598 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3599 | |
3600 | float fl0 = 0.1; |
3601 | float fl1 = -0.2; |
3602 | float fl2 = 0.3; |
3603 | float fl3 = -0.4; |
3604 | float fl4 = fl0 * fl1; |
3605 | float fl5 = fl2 - fl3; |
3606 | |
3607 | at::Tensor t0 = at::randn({129, 127}, options); |
3608 | at::Tensor t1 = at::rand_like(t0, options); |
3609 | |
3610 | auto t2 = t1.sub(fl4); |
3611 | auto t3 = t0.add(fl5); |
3612 | auto aten_output = t3.mul(t2); |
3613 | |
3614 | at::Tensor cg_output = at::empty_like(t0, options); |
3615 | |
3616 | at::Scalar test(fl0); |
3617 | |
3618 | std::vector<IValue> aten_inputs = { |
3619 | t0, |
3620 | t1, |
3621 | at::Scalar(fl0), |
3622 | at::Scalar(fl1), |
3623 | at::Scalar(fl2), |
3624 | at::Scalar(fl3)}; |
3625 | |
3626 | FusionExecutor fe; |
3627 | fe.compileFusion(&fusion, aten_inputs); |
3628 | fe.runFusion(aten_inputs, {cg_output}); |
3629 | |
3630 | testValidate( |
3631 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
3632 | } |
3633 | |
3634 | TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) { |
3635 | Fusion fusion; |
3636 | FusionGuard fg(&fusion); |
3637 | |
3638 | // Set up your input tensor views |
3639 | TensorView* tv0 = makeSymbolicTensor(3); |
3640 | TensorView* tv1 = makeSymbolicTensor(3); |
3641 | |
3642 | // Register your inputs |
3643 | fusion.addInput(tv0); |
3644 | fusion.addInput(tv1); |
3645 | |
  // Do math with it; it returns a `Val*` but can be static_cast back to a
  // TensorView
3648 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
3649 | TensorView* tv3 = add(tv0, tv2); |
3650 | |
3651 | // Register your outputs |
3652 | fusion.addOutput(tv3); |
3653 | |
3654 | int block_size = 16; |
3655 | |
3656 | tv3->merge(0, 1); |
3657 | tv3->merge(0, 1); |
3658 | |
3659 | tv3->split(0, block_size); |
3660 | tv3->split(0, 4); |
3661 | |
  // For all inputs, computeAt the output inline; temporaries should be
  // squeezed between them
3664 | tv0->computeAt(tv3, 1); |
3665 | tv1->computeAt(tv3, 1); |
3666 | |
3667 | // Parallelize |
3668 | tv2->axis(1)->parallelize(ParallelType::Unroll); |
3669 | tv3->axis(1)->parallelize(ParallelType::Unroll); |
3670 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
3671 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
3672 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
3673 | |
3674 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3675 | |
3676 | at::Tensor input0 = at::randn({129, 13, 3}, options); |
3677 | at::Tensor input1 = at::randn({129, 13, 3}, options); |
3678 | |
3679 | FusionExecutor fe; |
3680 | fe.compileFusion(&fusion, {input0, input1}); |
3681 | auto outputs = fe.runFusion({input0, input1}); |
3682 | |
3683 | TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); |
3684 | } |
3685 | |
3686 | /* |
3687 | * Helper function for single op testing that generates a codegen operand |
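 *
 * For example, {ValType::TensorView, DataType::Float} yields a 2-D symbolic
 * float tensor, while {ValType::Scalar, DataType::Float} yields a Double
 * scalar, since float scalars are represented as Double in the IR.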
3688 | */ |
3689 | |
3690 | Val* gen_jit_operand(std::pair<ValType, DataType> desc) { |
3691 | if (desc.first == ValType::TensorView) { |
3692 | return makeSymbolicTensor(2, desc.second); |
3693 | } else if (desc.first == ValType::Scalar) { |
3694 | if (desc.second == DataType::Float) { |
3695 | return IrBuilder::create<Double>(); |
3696 | } else if (desc.second == DataType::Double) { |
3697 | return IrBuilder::create<Double>(); |
3698 | } else if (desc.second == DataType::ComplexFloat) { |
3699 | return IrBuilder::create<ComplexDouble>(); |
3700 | } else if (desc.second == DataType::ComplexDouble) { |
3701 | return IrBuilder::create<ComplexDouble>(); |
3702 | } else if (desc.second == DataType::Int) { |
3703 | return IrBuilder::create<Int>(); |
3704 | } else { |
3705 | TORCH_CHECK(false, "Not currently supported type: " , desc.first); |
3706 | } |
3707 | } else { |
3708 | TORCH_CHECK(false, "Not currently supported type: " , desc.first); |
3709 | } |
3710 | return nullptr; |
3711 | } |
3712 | |
3713 | /* |
3714 | * Helper function for single op testing that generates an ATen operand |
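 *
 * For example, a {ValType::TensorView, DataType::Float} descriptor with
 * rand=true yields a random {blocks, threads} CUDA float tensor; with
 * rand=false it yields an uninitialized tensor of the same shape, used
 * below as the codegen output buffer.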
3715 | */ |
3716 | |
3717 | IValue gen_aten_operand( |
3718 | std::pair<ValType, DataType> desc, |
3719 | int blocks, |
3720 | int threads, |
3721 | bool rand) { |
3722 | if (desc.first == ValType::TensorView) { |
3723 | if (desc.second == DataType::Double || desc.second == DataType::Float || |
3724 | desc.second == DataType::ComplexDouble || |
3725 | desc.second == DataType::ComplexFloat || |
3726 | desc.second == DataType::Half || desc.second == DataType::BFloat16) { |
3727 | auto options = at::TensorOptions() |
3728 | .dtype(data_type_to_aten(desc.second)) |
3729 | .device(at::kCUDA, 0); |
3730 | if (rand) { |
3731 | return IValue(at::rand({blocks, threads}, options)); |
3732 | } else { |
3733 | return IValue(at::empty({blocks, threads}, options)); |
3734 | } |
3735 | } else if (desc.second == DataType::Int || desc.second == DataType::Int32) { |
3736 | auto dtype = desc.second == DataType::Int32 ? at::kInt : at::kLong; |
3737 | if (rand) { |
3738 | auto options = |
3739 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3740 | return IValue(at::randn({blocks, threads}, options).mul(5).to(dtype)); |
3741 | } else { |
3742 | auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); |
3743 | return IValue(at::empty({blocks, threads}, options)); |
3744 | } |
3745 | } else if (desc.second == DataType::Bool) { |
3746 | if (rand) { |
3747 | auto options = |
3748 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
3749 | return IValue( |
3750 | at::rand({blocks, threads}, options).round().to(at::kBool)); |
3751 | } else { |
3752 | auto options = |
3753 | at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0); |
3754 | return IValue(at::empty({blocks, threads}, options)); |
3755 | } |
3756 | } else { |
3757 | TORCH_CHECK(false, "Not currently supported type: " , desc.second) |
3758 | } |
3759 | } else if (desc.first == ValType::Scalar) { |
    // IValue scalars can only be double, int64, or bool
3761 | if (desc.second == DataType::ComplexDouble || |
3762 | desc.second == DataType::ComplexFloat) { |
3763 | return IValue(at::Scalar(c10::complex<double>(1.0, 0.0))); |
3764 | } else if ( |
3765 | desc.second == DataType::Double || desc.second == DataType::Float || |
3766 | desc.second == DataType::Half || desc.second == DataType::BFloat16) { |
3767 | return IValue(at::Scalar(1.0)); |
3768 | } else if (desc.second == DataType::Int) { |
3769 | return IValue(at::Scalar(1)); |
3770 | } else { |
3771 | TORCH_CHECK(false, "Not currently supported type: " , desc.first); |
3772 | } |
3773 | } else { |
3774 | TORCH_CHECK(false, "Not currently supported type: " , desc.first); |
3775 | } |
3776 | return nullptr; |
3777 | } |
3778 | |
3779 | /* |
 * Templatized helper function to generate a single-op comparison between the
 * CUDA JIT codegen and the ATen library.
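 *
 * The index_sequence parameter pack expands the input-descriptor tuple:
 * for a two-input op, std::get<0>(it) and std::get<1>(it) are each turned
 * into a fusion input via gen_jit_operand and an ATen input via
 * gen_aten_operand, so adding an operand only requires growing the tuple.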
3782 | */ |
3783 | |
3784 | using OutputPair = std::pair<ValType, DataType>; |
3785 | template < |
3786 | typename AtenFunc, |
3787 | typename JitFunc, |
3788 | typename InputTuple, |
3789 | size_t... NumInputs> |
3790 | void test_op( |
3791 | int blocks, |
3792 | int threads, |
3793 | std::string op_str, |
3794 | AtenFunc af, |
3795 | JitFunc jf, |
3796 | OutputPair op, |
3797 | InputTuple it, |
3798 | std::index_sequence<NumInputs...>) { |
3799 | Fusion fusion; |
3800 | FusionGuard fg(&fusion); |
3801 | |
  // Generate the JIT function inputs and add them as inputs to the fusion
  // graph
3804 | std::array<Val*, sizeof...(NumInputs)> jit_inputs = { |
3805 | gen_jit_operand(std::get<NumInputs>(it))...}; |
3806 | std::for_each(jit_inputs.begin(), jit_inputs.end(), [&fusion](Val* v) { |
3807 | fusion.addInput(v); |
3808 | }); |
3809 | TensorView* out = |
3810 | static_cast<TensorView*>(jf(std::get<NumInputs>(jit_inputs)...)); |
3811 | fusion.addOutput(out); |
3812 | |
3813 | std::for_each(jit_inputs.begin(), jit_inputs.end(), [out](Val* v) { |
3814 | if (v->getValType() == ValType::TensorView) |
3815 | static_cast<TensorView*>(v)->computeAt(out, -1); |
3816 | }); |
3817 | out->axis(0)->parallelize(ParallelType::BIDx); |
3818 | out->axis(-1)->parallelize(ParallelType::TIDx); |
3819 | |
3820 | std::array<IValue, sizeof...(NumInputs)> aten_inputs = {gen_aten_operand( |
3821 | std::get<NumInputs>(it), blocks, threads, /*rand*/ true)...}; |
3822 | const at::ArrayRef<IValue> aten_inputs_ivalues(aten_inputs); |
3823 | |
3824 | at::Tensor cg_output = |
3825 | gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); |
3826 | std::vector<at::Tensor> output_vect = {cg_output}; |
3827 | cudaDeviceSynchronize(); |
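  // If the fusion contains RNG ops, fix the seed so that the fusion run and
  // the ATen reference run below draw identical random sequences.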
3828 | if (fusion.isStochastic()) |
3829 | at::manual_seed(0); |
3830 | |
3831 | FusionExecutor fe; |
3832 | fe.compileFusion(&fusion, aten_inputs_ivalues); |
3833 | fe.runFusion(aten_inputs_ivalues, output_vect); |
3834 | cudaDeviceSynchronize(); |
3835 | |
3836 | if (fusion.isStochastic()) |
3837 | at::manual_seed(0); |
3838 | at::Tensor aten_output = af(aten_inputs); |
  cudaDeviceSynchronize(); // This sync shouldn't be necessary
3840 | |
3841 | std::string op_msg = "Operation " + op_str; |
3842 | |
3843 | testValidate( |
3844 | &fusion, |
3845 | {cg_output}, |
3846 | aten_inputs, |
3847 | {aten_output}, |
3848 | __LINE__, |
3849 | __FILE__, |
3850 | op_msg); |
3851 | } |
3852 | |
3853 | /* |
 * Templatized helper function that uses variadic templates to
 * process a variable-length input tuple of different operand types.
3856 | */ |
3857 | template <typename AtenFunc, typename JitFunc, typename InputTuple> |
3858 | void test_op( |
3859 | int blocks, |
3860 | int threads, |
3861 | std::string op_str, |
3862 | AtenFunc af, |
3863 | JitFunc jf, |
3864 | OutputPair op, |
3865 | InputTuple it) { |
3866 | static constexpr auto size = std::tuple_size<InputTuple>::value; |
3867 | test_op( |
3868 | blocks, |
3869 | threads, |
3870 | op_str, |
3871 | af, |
3872 | jf, |
3873 | op, |
3874 | it, |
3875 | std::make_index_sequence<size>{}); |
3876 | } |
3877 | |
3878 | TEST_F(NVFuserTest, FusionUnaryOps_CUDA) { |
3879 | using OpTuple = |
3880 | std::tuple<at::Tensor (*)(const at::Tensor&), UnaryOpType, std::string>; |
3881 | |
3882 | // [Note: explicit tuple type for uniform initialization list] |
  // Tuple type must be explicitly specified for each uniform initialization
  // list within the vector to make this code compatible with some old
  // environments we still need to support, e.g. gcc 5.4 + cuda 9.2.
  std::vector<OpTuple> ops{
      OpTuple{at::acos, UnaryOpType::Acos, "acos"},
      OpTuple{at::asin, UnaryOpType::Asin, "asin"},
      OpTuple{at::atan, UnaryOpType::Atan, "atan"},
      // There does not appear to be an appropriate ATen function for atanh
      // OpTuple{at::atanh, UnaryOpType::Atanh, "atanh"},
      OpTuple{at::cos, UnaryOpType::Cos, "cos"},
      OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"},
      OpTuple{at::exp, UnaryOpType::Exp, "exp"},
      // OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"},
      OpTuple{at::log, UnaryOpType::Log, "log"},
      OpTuple{at::log10, UnaryOpType::Log10, "log10"},
      OpTuple{at::neg, UnaryOpType::Neg, "neg"},
      OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"},
      OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"},
      OpTuple{at::sin, UnaryOpType::Sin, "sin"},
      OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"},
      OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"},
      OpTuple{at::tan, UnaryOpType::Tan, "tan"},
      OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"},
      OpTuple{at::isfinite, UnaryOpType::IsFinite, "isfinite"},
      OpTuple{at::isinf, UnaryOpType::IsInf, "isinf"},
      OpTuple{at::isnan, UnaryOpType::IsNan, "isnan"},
      OpTuple{at::isreal, UnaryOpType::IsReal, "isreal"},
  };
3911 | |
  // The following ops have no complex support in eager mode
  std::vector<OpTuple> ops_without_complex{
      OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"},
      OpTuple{at::floor, UnaryOpType::Floor, "floor"},
      OpTuple{at::frac, UnaryOpType::Frac, "frac"},
      OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"},
      OpTuple{at::round, UnaryOpType::Round, "round"},
      OpTuple{at::relu, UnaryOpType::Relu, "relu"},
      OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"},
      OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"},
      OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"},
      OpTuple{at::erf, UnaryOpType::Erf, "erf"},
      OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"},
      OpTuple{at::isneginf, UnaryOpType::IsNegInf, "isneginf"},
      OpTuple{at::isposinf, UnaryOpType::IsPosInf, "isposinf"},
  };
3928 | |
  // The following ops only support complex
  std::vector<OpTuple> ops_complex_only{
      // real is supported via UnaryOpType::Set for non-complex types, and
      // UnaryOpType::Real requires input to be complex
      OpTuple{at::real, UnaryOpType::Real, "real"},
      OpTuple{at::imag, UnaryOpType::Imag, "imag"},
  };
3936 | |
  // Complex support for the following ops is not working in nvFuser yet
  std::vector<OpTuple> ops_skip_complex{
      // TODO: abs is actually supported in nvFuser, but it has a bug!!!
      // In eager mode, abs(complex_tensor) returns a floating point tensor,
      // but nvFuser wrongly returns a complex tensor! We need to:
      // 1. change our type promotion logic to make a special case for abs
      // 2. figure out why this bug is not detected here; we should bump up
      //    test coverage
      OpTuple{at::abs, UnaryOpType::Abs, "abs"},
      // TODO: the following two ops fail with a compilation error like
      // "undefined function rsqrt(complex)". We could implement them in
      // helpers.cu, but I think it is better to check with Jiterator first,
      // because Jiterator uses the same string for complex support.
      OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"},
      OpTuple{at::log2, UnaryOpType::Log2, "log2"}};
3952 | |
3953 | std::vector<DataType> dtypes = { |
3954 | DataType::Float, |
3955 | DataType::Double, |
3956 | DataType::ComplexFloat, |
3957 | DataType::ComplexDouble}; |
3958 | |
3959 | for (auto dtype : dtypes) { |
3960 | auto ops_to_test = ops; |
3961 | if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) { |
3962 | ops_to_test.insert( |
3963 | ops_to_test.end(), |
3964 | ops_without_complex.begin(), |
3965 | ops_without_complex.end()); |
3966 | ops_to_test.insert( |
3967 | ops_to_test.end(), ops_skip_complex.begin(), ops_skip_complex.end()); |
3968 | } else { |
3969 | ops_to_test.insert( |
3970 | ops_to_test.end(), ops_complex_only.begin(), ops_complex_only.end()); |
3971 | } |
    std::for_each(ops_to_test.begin(), ops_to_test.end(), [&](OpTuple& op) {
3973 | test_op( |
3974 | /*blocks*/ 640, |
3975 | /*threads*/ 64, |
3976 | /*name*/ std::get<2>(op), |
3977 | /*Aten Func */ |
3978 | [&op](std::array<IValue, 1>& vals) { |
3979 | return std::get<0>(op)(vals[0].toTensor()); |
3980 | }, |
3981 | /*JIT Func */ |
3982 | [&op](Val* in1) -> Val* { return unaryOp(std::get<1>(op), in1); }, |
3983 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
3984 | /*Inputs Tuple*/ |
3985 | std::make_tuple(std::make_pair(ValType::TensorView, dtype))); |
3986 | }); |
3987 | } |
3988 | |
3989 | dtypes = {DataType::Int, DataType::Int32, DataType::Bool}; |
3990 | for (auto dtype : dtypes) { |
3991 | test_op( |
3992 | /*blocks*/ 128, |
3993 | /*threads*/ 64, |
3994 | /*name*/ "bitwise_not" , |
3995 | /*Aten Func */ |
3996 | [](std::array<IValue, 1>& vals) { |
3997 | return at::bitwise_not(vals[0].toTensor()); |
3998 | }, |
3999 | /*JIT Func */ |
4000 | [](Val* in1) -> Val* { return unaryOp(UnaryOpType::Not, in1); }, |
4001 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4002 | /*Inputs Tuple*/ |
4003 | std::make_tuple(std::make_pair(ValType::TensorView, dtype))); |
4004 | } |
4005 | } |
4006 | |
4007 | TEST_F(NVFuserTest, FusionBinaryOps_CUDA) { |
4008 | using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); |
4009 | using OpTuple = std::tuple<AtenFuncSig, BinaryOpType, std::string>; |
4010 | |
4011 | std::vector<DataType> dtypes = { |
4012 | DataType::Double, |
4013 | DataType::Float, |
4014 | DataType::ComplexFloat, |
4015 | DataType::ComplexDouble}; |
4016 | |
4017 | // see [Note: explicit tuple type for uniform initialization list] |
  std::vector<OpTuple> equal_ops{
      OpTuple{at::eq, BinaryOpType::Eq, "eq"},
      OpTuple{at::ne, BinaryOpType::NE, "ne"}};
4021 | |
4022 | // Complex numbers are not ordered |
  std::vector<OpTuple> order_ops{
      OpTuple{at::ge, BinaryOpType::GE, "ge"},
      OpTuple{at::gt, BinaryOpType::GT, "gt"},
      OpTuple{at::le, BinaryOpType::LE, "le"},
      OpTuple{at::lt, BinaryOpType::LT, "lt"}};
4028 | |
4029 | // see [Note: explicit tuple type for uniform initialization list] |
  std::vector<OpTuple> math_ops{
      OpTuple{at::div, BinaryOpType::Div, "div"},
      OpTuple{at::mul, BinaryOpType::Mul, "mul"},
      OpTuple{at::pow, BinaryOpType::Pow, "pow"}};
4034 | |
  // The following ops have no complex support in eager mode
  std::vector<OpTuple> math_ops_without_complex{
      OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"},
      OpTuple{at::max, BinaryOpType::Max, "max"},
      OpTuple{at::min, BinaryOpType::Min, "min"},
      OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"},
      // NOTE: Remainder does not match the Aten impl exactly
      // despite using an identical function.
      OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"}};
4044 | |
4045 | for (auto dtype : dtypes) { |
4046 | auto logic_ops = equal_ops; |
4047 | if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) { |
4048 | logic_ops.insert(logic_ops.end(), order_ops.begin(), order_ops.end()); |
4049 | } |
4050 | std::for_each(logic_ops.begin(), logic_ops.end(), [&](OpTuple& op) { |
4051 | test_op( |
4052 | /*blocks*/ 640, |
4053 | /*threads*/ 64, |
4054 | /*name*/ std::get<2>(op), |
4055 | /*Aten Func */ |
4056 | [&op](std::array<IValue, 2>& vals) { |
4057 | return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); |
4058 | }, |
4059 | /*JIT Func */ |
4060 | [&op](Val* in1, Val* in2) -> Val* { |
4061 | return binaryOp(std::get<1>(op), in1, in2); |
4062 | }, |
4063 | /*Output */ std::make_pair(ValType::TensorView, DataType::Bool), |
4064 | /*Inputs Tuple*/ |
4065 | std::make_tuple( |
4066 | std::make_pair(ValType::TensorView, dtype), |
4067 | std::make_pair(ValType::TensorView, dtype))); |
4068 | }); |
4069 | |
4070 | auto enabled_math_ops = math_ops; |
4071 | if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) { |
4072 | enabled_math_ops.insert( |
4073 | enabled_math_ops.end(), |
4074 | math_ops_without_complex.begin(), |
4075 | math_ops_without_complex.end()); |
4076 | } |
4077 | std::for_each( |
4078 | enabled_math_ops.begin(), enabled_math_ops.end(), [&](OpTuple& op) { |
4079 | test_op( |
4080 | /*blocks*/ 640, |
4081 | /*threads*/ 64, |
4082 | /*name*/ std::get<2>(op), |
4083 | /*Aten Func */ |
4084 | [&op](std::array<IValue, 2>& vals) { |
4085 | return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); |
4086 | }, |
4087 | /*JIT Func */ |
4088 | [&op](Val* in1, Val* in2) -> Val* { |
4089 | return binaryOp(std::get<1>(op), in1, in2); |
4090 | }, |
4091 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4092 | /*Inputs Tuple*/ |
4093 | std::make_tuple( |
4094 | std::make_pair(ValType::TensorView, dtype), |
4095 | std::make_pair(ValType::TensorView, dtype))); |
4096 | }); |
4097 | |
4098 | test_op( |
4099 | /*blocks*/ 640, |
4100 | /*threads*/ 64, |
4101 | /*name*/ "add_alpha" , |
4102 | /*Aten Func */ |
4103 | [](std::array<IValue, 3>& vals) { |
4104 | return at::add( |
4105 | vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); |
4106 | }, |
4107 | /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&add_alpha), |
4108 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4109 | /*Inputs Tuple*/ |
4110 | std::make_tuple( |
4111 | std::make_pair(ValType::TensorView, dtype), |
4112 | std::make_pair(ValType::TensorView, dtype), |
4113 | std::make_pair(ValType::Scalar, dtype))); |
4114 | |
4115 | test_op( |
4116 | /*blocks*/ 640, |
4117 | /*threads*/ 64, |
4118 | /*name*/ "sub_alpha" , |
4119 | /*Aten Func */ |
4120 | [](std::array<IValue, 3>& vals) { |
4121 | return at::sub( |
4122 | vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); |
4123 | }, |
4124 | /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&sub_alpha), |
4125 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4126 | /*Inputs Tuple*/ |
4127 | std::make_tuple( |
4128 | std::make_pair(ValType::TensorView, dtype), |
4129 | std::make_pair(ValType::TensorView, dtype), |
4130 | std::make_pair(ValType::Scalar, dtype))); |
4131 | } |
4132 | } |
4133 | |
4134 | TEST_F(NVFuserTest, FusionTernaryOps_CUDA) { |
4135 | std::vector<DataType> dtypes = { |
4136 | DataType::Double, |
4137 | DataType::Float, |
4138 | DataType::ComplexFloat, |
4139 | DataType::ComplexDouble}; |
4140 | |
4141 | for (auto dtype : dtypes) { |
    // clamp and threshold are not supported for complex in eager mode
4143 | if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) { |
4144 | test_op( |
4145 | /*blocks*/ 640, |
4146 | /*threads*/ 64, |
4147 | /*name*/ "clamp" , |
4148 | /*Aten Func */ |
4149 | [](std::array<IValue, 1>& vals) { |
4150 | return at::clamp(vals[0].toTensor(), 0.f, 1.f); |
4151 | }, |
4152 | /*JIT Func */ |
4153 | [&](Val* in1) -> Val* { |
4154 | if (dtype == DataType::Float) { |
4155 | return clamp( |
4156 | in1, |
4157 | IrBuilder::create<Double>(0.f), |
4158 | IrBuilder::create<Double>(1.f)); |
4159 | } else { |
4160 | return clamp( |
4161 | in1, |
4162 | IrBuilder::create<Double>(0.f), |
4163 | IrBuilder::create<Double>(1.f)); |
4164 | } |
4165 | }, |
4166 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4167 | /*Inputs Tuple*/ |
4168 | std::make_tuple(std::make_pair(ValType::TensorView, dtype))); |
4169 | test_op( |
4170 | /*blocks*/ 640, |
4171 | /*threads*/ 64, |
4172 | /*name*/ "threshold" , |
4173 | /*Aten Func */ |
4174 | [](std::array<IValue, 1>& vals) { |
4175 | return at::threshold(vals[0].toTensor(), 0.f, 1.f); |
4176 | }, |
4177 | /*JIT Func */ |
4178 | [&](Val* in1) -> Val* { |
4179 | if (dtype == DataType::Float) { |
4180 | return threshold( |
4181 | in1, |
4182 | IrBuilder::create<Double>(0.f), |
4183 | IrBuilder::create<Double>(1.f)); |
4184 | } else { |
4185 | return threshold( |
4186 | in1, |
4187 | IrBuilder::create<Double>(0.f), |
4188 | IrBuilder::create<Double>(1.f)); |
4189 | } |
4190 | }, |
4191 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4192 | /*Inputs Tuple*/ |
4193 | std::make_tuple(std::make_pair(ValType::TensorView, dtype))); |
4194 | } |
4195 | test_op( |
4196 | /*blocks*/ 640, |
4197 | /*threads*/ 64, |
4198 | /*name*/ "where" , |
4199 | /*Aten Func */ |
4200 | [](std::array<IValue, 3>& vals) { |
4201 | return at::where( |
4202 | vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); |
4203 | }, |
4204 | /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&where), |
4205 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4206 | /*Inputs Tuple*/ |
4207 | std::make_tuple( |
4208 | std::make_pair(ValType::TensorView, DataType::Bool), |
4209 | std::make_pair(ValType::TensorView, dtype), |
4210 | std::make_pair(ValType::TensorView, dtype))); |
4211 | } |
4212 | } |
4213 | |
4214 | TEST_F(NVFuserTest, FusionCompoundOps_CUDA) { |
4215 | std::vector<DataType> dtypes = { |
4216 | DataType::Double, |
4217 | DataType::Float, |
4218 | DataType::ComplexFloat, |
4219 | DataType::ComplexDouble}; |
4220 | |
4221 | for (auto dtype : dtypes) { |
4222 | test_op( |
4223 | /*blocks*/ 640, |
4224 | /*threads*/ 64, |
4225 | /*name*/ "lerp" , |
4226 | /*Aten Func */ |
4227 | [](std::array<IValue, 3>& vals) { |
4228 | return at::lerp( |
4229 | vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); |
4230 | }, |
4231 | /*JIT Func */ static_cast<Val* (*)(Val*, Val*, Val*)>(&lerp), |
4232 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4233 | /*Inputs Tuple*/ |
4234 | std::make_tuple( |
4235 | std::make_pair(ValType::TensorView, dtype), |
4236 | std::make_pair(ValType::TensorView, dtype), |
4237 | std::make_pair(ValType::TensorView, dtype))); |
4238 | test_op( |
4239 | /*blocks*/ 640, |
4240 | /*threads*/ 64, |
4241 | /*name*/ "addcmul" , |
4242 | /*Aten Func */ |
4243 | [](std::array<IValue, 4>& vals) { |
4244 | return at::addcmul( |
4245 | vals[0].toTensor(), |
4246 | vals[1].toTensor(), |
4247 | vals[2].toTensor(), |
4248 | vals[3].toScalar()); |
4249 | }, |
4250 | /*JIT Func */ |
4251 | static_cast<Val* (*)(Val*, Val*, Val*, Val*)>(&addcmul), |
4252 | /*Output */ std::make_pair(ValType::TensorView, dtype), |
4253 | /*Inputs Tuple*/ |
4254 | std::make_tuple( |
4255 | std::make_pair(ValType::TensorView, dtype), |
4256 | std::make_pair(ValType::TensorView, dtype), |
4257 | std::make_pair(ValType::TensorView, dtype), |
4258 | std::make_pair(ValType::Scalar, dtype))); |
4259 | } |
4260 | } |
4261 | |
4262 | TEST_F(NVFuserTest, FusionCastOps_CUDA) { |
4263 | Fusion fusion; |
4264 | FusionGuard fg(&fusion); |
4265 | |
4266 | TensorView* tv0 = makeSymbolicTensor(2, DataType::Half); |
4267 | |
4268 | TensorView* intrm1 = castOp(DataType::Float, tv0); |
4269 | TensorView* out = castOp(DataType::Half, intrm1); |
4270 | |
4271 | fusion.addInput(tv0); |
4272 | fusion.addOutput(out); |
4273 | tv0->computeAt(out, -1); |
4274 | |
4275 | out->axis(0)->parallelize(ParallelType::BIDx); |
4276 | out->axis(-1)->parallelize(ParallelType::TIDx); |
4277 | |
4278 | auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); |
4279 | |
4280 | at::Tensor input1 = at::randn({1, 4}, options); |
4281 | at::Tensor ref_output = at::empty_like(input1); |
4282 | |
4283 | std::array<IValue, 1> inputs = {input1}; |
4284 | const at::ArrayRef<IValue> input_ivalues(inputs); |
4285 | |
4286 | FusionExecutor fe; |
4287 | fe.compileFusion(&fusion, input_ivalues); |
4288 | auto outputs = fe.runFusion(input_ivalues); |
4289 | |
4290 | ref_output = at::_cast_Half(at::_cast_Double(input1)); |
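  // Upcasting Half to Float or Double is exact, so this Half->Double->Half
  // round trip matches the fusion's Half->Float->Half result bit for bit.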
4291 | |
  TORCH_CHECK(
      outputs[0].equal(ref_output),
      "\nOp Type: -- ",
      "cast FP16->FP32->FP16",
      " -- had a mismatch.\n",
      "\nABS MAX DIFF: ",
      outputs[0].sub(ref_output).abs().max(),
      "\n");
4300 | } |
4301 | |
4302 | // Start off simple, block on the outer dim |
4303 | // block stride + thread all reduce + unrolling on inner dim |
4304 | TEST_F(NVFuserTest, FusionReduction1_CUDA) { |
4305 | Fusion fusion; |
4306 | FusionGuard fg(&fusion); |
4307 | |
4308 | // Set up your input tensor views |
4309 | TensorView* tv0 = makeSymbolicTensor(2); |
4310 | fusion.addInput(tv0); |
4311 | |
4312 | // tv1[I0, R1] = tv0[I0, I1] |
4313 | TensorView* tv1 = |
4314 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
4315 | fusion.addOutput(tv1); |
4316 | |
  TORCH_CHECK(
      ir_utils::getReductionOps(&fusion).size(),
      "Could not detect reduction in fusion.");
4320 | |
4321 | tv1->split(1, 128); |
4322 | // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] |
4323 | tv1->split(1, 4); |
4324 | // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1] |
4325 | |
4326 | TensorView* tv2 = tv1->rFactor({1}); |
4327 | // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] |
4328 | // tv1[I0, R1oi{4}, R1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] |
4329 | |
4330 | TensorView* tv3 = tv1->rFactor({1}); |
4331 | // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] |
4332 | // tv3[I0, R1oi{4}, Ir1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] |
4333 | // tv1[I0, R1i{128}] = tv3[I0, R1oi{4}, Ir1i{128}] |
4334 | |
4335 | // Incrementally, can print in between for debugging |
4336 | tv0->computeAt(tv2, 1); |
4337 | tv2->computeAt(tv3, 1); |
4338 | tv3->computeAt(tv1, 1); |
4339 | |
  // Redo it all at once, because why not.
4341 | tv0->computeAt(tv1, 1); |
4342 | |
4343 | tv2->axis(2)->parallelize(ParallelType::Unroll); |
4344 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
4345 | |
4346 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
4347 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
4348 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
4349 | |
4350 | int numel_x = 65000; |
4351 | int numel_y = 1025; |
4352 | |
4353 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4354 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
4355 | at::Tensor cg_output = at::empty({numel_x}, options); |
4356 | |
4357 | FusionExecutor fe; |
4358 | fe.compileFusion(&fusion, {input}); |
4359 | fe.runFusion({input}, {cg_output}); |
4360 | |
4361 | auto aten_output = input.to(at::kDouble).sum({1}); |
4362 | |
4363 | testValidate( |
4364 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
4365 | } |
4366 | |
4367 | TEST_F(NVFuserTest, FusionReduction2_CUDA) { |
4368 | Fusion fusion; |
4369 | FusionGuard fg(&fusion); |
4370 | |
4371 | // Set up your input tensor views |
4372 | TensorView* tv0 = makeSymbolicTensor(2); |
4373 | fusion.addInput(tv0); |
4374 | |
4375 | // tv1[I0, R1] = tv0[I0, I1] |
4376 | TensorView* tv1 = |
4377 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
4378 | |
4379 | fusion.addOutput(tv1); |
4380 | |
  // Switches to try some different scenarios. Maybe we should iterate
  // over all permutations.
4383 | bool bind_bidx = true; |
4384 | bool bind_tidx = true; |
4385 | bool bind_tidy = true; |
4386 | bool bind_unroll = true; |
4387 | |
4388 | int numel_x = 1025; // Cannot exceed block dim max size / tidy |
4389 | int numel_y = 129; |
4390 | int tidx = 16; |
4391 | int tidy = 8; |
4392 | int unroll_factor = 4; |
4393 | |
4394 | tv1->split(1, tidx); |
4395 | // tv1[I0, R1o, R1i{tidx}] = tv0[I0, I1] |
4396 | |
4397 | tv1->split(1, unroll_factor); |
4398 | // tv1[I0, R1oo, R1oi{unroll}, R1i{tidx}] = tv0[I0, I1] |
4399 | |
4400 | tv1->split(0, tidy); |
4401 | |
4402 | TensorView* tv2 = tv1->rFactor({-3}); |
4403 | // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] |
4404 | // tv1[I0o, I0i{tidy}, R1oi{unroll}, R1i{tidx}] |
4405 | |
4406 | TensorView* tv3 = tv1->rFactor({-2}); |
4407 | // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] |
4408 | // tv3[I0, R1oi{unroll}, Ir1i{tidx}] |
4409 | // tv1[I0o, I0i{tidy}, R1i{tidx}] |
4410 | |
4411 | tv0->computeAt(tv1, -2); |
4412 | |
4413 | if (bind_unroll) |
4414 | tv2->axis(-2)->parallelize(ParallelType::Unroll); |
4415 | if (bind_bidx) |
4416 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
4417 | if (bind_tidy) |
4418 | tv1->axis(1)->parallelize(ParallelType::TIDy); |
4419 | |
4420 | if (bind_tidx) { |
4421 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
4422 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
4423 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
4424 | } |
4425 | |
4426 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4427 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
4428 | |
4429 | FusionExecutor fe; |
4430 | fe.compileFusion(&fusion, {input}); |
4431 | auto cg_outputs = fe.runFusion({input}); |
4432 | |
4433 | auto aten_output = input.to(at::kDouble).sum({1}); |
4434 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
4435 | } |
4436 | |
4437 | TEST_F(NVFuserTest, FusionReduction3_CUDA) { |
4438 | // What if Z participates in the reduction with X? |
4439 | Fusion fusion; |
4440 | FusionGuard fg(&fusion); |
4441 | |
4442 | // Set up your input tensor views |
4443 | TensorView* tv0 = makeSymbolicTensor(2); |
4444 | fusion.addInput(tv0); |
4445 | |
4446 | // tv1[I0, R1] = tv0[I0, I1] |
4447 | TensorView* tv1 = |
4448 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
4449 | |
4450 | fusion.addOutput(tv1); |
4451 | |
4452 | int numel_x = 1025; // Cannot exceed block dim max size / tidy |
4453 | int numel_y = 129; |
4454 | int tidx = 16; |
4455 | int tidz = 8; |
4456 | |
4457 | tv1->split(1, tidz); |
4458 | // tv1[I0, R1o, R1i{tidz}] = tv0[I0, I1] |
4459 | |
4460 | tv1->split(1, tidx); |
4461 | // tv1[I0, R1oo, R1oi{tidx}, R1i{tidz}] = tv0[I0, I1] |
4462 | |
4463 | TensorView* tv2 = tv1->rFactor({-3}); |
4464 | // tv2[I0, >R1oo<, Ir1oi{tidx}, Ir1i{tidz}] |
4465 | // tv1[I0o, R1oi{tidx}, R1i{tidz}] |
4466 | |
4467 | tv0->computeAt(tv1, -3); |
4468 | |
4469 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
4470 | tv1->axis(-2)->parallelize(ParallelType::TIDx); |
4471 | tv1->axis(-1)->parallelize(ParallelType::TIDz); |
4472 | |
4473 | tv2->axis(-2)->parallelize(ParallelType::TIDx); |
4474 | tv2->axis(-1)->parallelize(ParallelType::TIDz); |
4475 | |
4476 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4477 | at::Tensor aten_input = at::randn({numel_x, numel_y}, options); |
4478 | at::Tensor cg_output = at::empty({numel_x}, options); |
4479 | |
4480 | FusionExecutor fe; |
4481 | fe.compileFusion(&fusion, {aten_input}); |
4482 | fe.runFusion({aten_input}, {cg_output}); |
4483 | |
4484 | auto aten_output = aten_input.to(at::kDouble).sum({1}); |
4485 | |
4486 | testValidate( |
4487 | &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); |
4488 | } |
4489 | |
4490 | TEST_F(NVFuserTest, FusionReduction4_CUDA) { |
4491 | Fusion fusion; |
4492 | FusionGuard fg(&fusion); |
4493 | |
4494 | // Set up your input tensor views |
4495 | TensorView* tv0 = makeSymbolicTensor(2); |
4496 | TensorView* tv1 = makeSymbolicTensor(2); |
4497 | |
4498 | TensorView* tv2 = add(tv0, tv1); |
4499 | // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] |
4500 | |
4501 | fusion.addInput(tv0); |
4502 | fusion.addInput(tv1); |
4503 | |
4504 | TensorView* tv3 = |
4505 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv2); |
4506 | // tv3[I0, R1] = tv2[I0, I1] |
4507 | |
4508 | TensorView* tv4 = makeSymbolicTensor(1); |
4509 | fusion.addInput(tv4); |
4510 | |
4511 | // tv5[I0] = tv3[I0, R1] * tv4[I0] |
4512 | TensorView* tv5 = mul(tv3, tv4); |
4513 | fusion.addOutput(tv5); |
4514 | |
4515 | int tidx = 16; |
4516 | |
4517 | // RFactor the reduction |
4518 | tv3->split(1, tidx); |
4519 | // tv3[I0, R1o, R1i{tidx}] = tv2[I0, I1] |
4520 | |
4521 | TensorView* tv6 = tv3->rFactor({-2}); |
4522 | // tv6[I0, R1o, iR1i{tidx}] = tv2[I0, I1] |
  // tv3[I0, R1i{tidx}] = tv6[I0, R1o, iR1i{tidx}]
4524 | tv2->computeAt(tv6, 2); |
4525 | |
4526 | // Compute at inline with tv5 (only 1D) |
4527 | tv6->computeAt(tv3, 1); |
4528 | tv3->computeAt(tv5, 1); |
4529 | |
4530 | tv5->axis(0)->parallelize(ParallelType::BIDx); |
4531 | |
  // Only the intermediate tensors need this, but it doesn't hurt to do it on
  // the inputs tv0, tv1, and tv4 as well
4534 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
4535 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
4536 | tv6->axis(-1)->parallelize(ParallelType::TIDx); |
4537 | |
4538 | int numel_x = 1025; |
4539 | int numel_y = 129; |
4540 | |
4541 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4542 | at::Tensor t0 = at::randn({numel_x, numel_y}, options); |
4543 | at::Tensor t1 = at::randn({numel_x, numel_y}, options); |
4544 | at::Tensor t4 = at::randn({numel_x}, options); |
4545 | |
4546 | FusionExecutor fe; |
4547 | fe.compileFusion(&fusion, {t0, t1, t4}); |
4548 | auto cg_outputs = fe.runFusion({t0, t1, t4}); |
4549 | |
4550 | auto t2 = t0.add(t1); |
4551 | auto t3 = t2.to(at::kDouble).sum({1}); |
4552 | auto aten_output = t3.mul(t4); |
4553 | |
4554 | testValidate( |
4555 | &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); |
4556 | } |
4557 | |
4558 | TEST_F(NVFuserTest, FusionReduction5_CUDA) { |
4559 | Fusion fusion; |
4560 | FusionGuard fg(&fusion); |
4561 | |
4562 | // Set up your input tensor views |
4563 | TensorView* tv0 = makeSymbolicTensor(3); |
4564 | |
4565 | fusion.addInput(tv0); |
4566 | |
4567 | TensorView* tv1 = |
4568 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
4569 | |
4570 | fusion.addOutput(tv1); |
4571 | |
4572 | int bidy = 2; |
4573 | int tidy = 4; |
4574 | int tidx = 5; |
4575 | |
4576 | int dim1 = 11; |
4577 | |
4578 | tv1->split(-2, tidy); |
4579 | |
4580 | TensorView* tv2 = tv1->rFactor({-3}); |
4581 | |
4582 | tv0->computeAt(tv1, 1); |
4583 | tv1->axis(0)->parallelize(ParallelType::BIDy); |
4584 | |
4585 | for (auto* val : fusion.vals()) { |
4586 | if (!val->isFusionInput() && |
4587 | val->getValType().value() == ValType::TensorView) { |
4588 | val->as<TensorView>()->axis(-1)->parallelize(ParallelType::TIDx); |
4589 | } |
4590 | } |
4591 | |
4592 | tv2->axis(-2)->parallelize(ParallelType::TIDy); |
4593 | tv1->axis(-2)->parallelize(ParallelType::TIDy); |
4594 | |
4595 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4596 | at::Tensor input = at::randn({bidy, dim1, tidx}, options); |
4597 | |
4598 | at::Tensor cg_output = at::empty({bidy, tidx}, options); |
4599 | |
4600 | FusionExecutor fe; |
4601 | fe.compileFusion(&fusion, {input}); |
4602 | fe.runFusion({input}, {cg_output}); |
4603 | |
4604 | auto aten_output = input.to(at::kDouble).sum({1}); |
4605 | testValidate( |
4606 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
4607 | } |
4608 | |
4609 | TEST_F(NVFuserTest, FusionReduction6_CUDA) { |
4610 | Fusion fusion; |
4611 | FusionGuard fg(&fusion); |
4612 | |
4613 | const int bdimx = 64; |
4614 | const int bdimy = 8; |
4615 | |
4616 | // Set up your input tensor views |
4617 | TensorView* tv0 = makeSymbolicTensor(3); |
4618 | fusion.addInput(tv0); |
4619 | |
4620 | // tv1[I0, R1, R2] = tv0[I0, I1, I2] |
4621 | TensorView* tv1 = |
4622 | reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create<Double>(0), tv0); |
4623 | fusion.addOutput(tv1); |
4624 | |
  TORCH_CHECK(
      ir_utils::getReductionOps(&fusion).size(),
      "Could not detect reduction in fusion.");
4628 | |
4629 | tv1->split(2, bdimx); |
4630 | // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] |
4631 | tv1->split(1, bdimy); |
4632 | // tv1[I0, R1o, R1i{8}, R2o, R2i{128}] = tv0[I0, I1, I2] |
4633 | |
4634 | TensorView* tv2 = tv1->rFactor({3}); |
4635 | // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] |
4636 | // tv1[I0, R1o, R1i{8}, R2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}] |
4637 | |
4638 | TensorView* tv3 = tv1->rFactor({1}); |
4639 | // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] |
4640 | // tv3[I0, R1o, I1i{8}, I2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}] |
4641 | // tv1[I0, R1i{8}, R2i{128}] = tv3[I0, R1o, I1i{8}, I2i{128}] |
4642 | |
4643 | tv3->computeAt(tv1, 1); |
4644 | tv2->computeAt(tv3, 2); |
4645 | |
4646 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
4647 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
4648 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
4649 | |
4650 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
4651 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
4652 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
4653 | |
4654 | tv1->axis(-2)->parallelize(ParallelType::TIDy); |
4655 | tv3->axis(-2)->parallelize(ParallelType::TIDy); |
4656 | tv2->axis(-3)->parallelize(ParallelType::TIDy); |
4657 | |
4658 | int numel_x = 650; |
4659 | int numel_y = 1000; |
4660 | int numel_z = 4; |
4661 | |
4662 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4663 | at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); |
4664 | |
4665 | FusionExecutor fe; |
4666 | fe.compileFusion(&fusion, {input}); |
4667 | auto cg_outputs = fe.runFusion({input}); |
4668 | |
4669 | auto aten_output = input.to(at::kDouble).sum({1, 2}); |
4670 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
4671 | } |
4672 | |
4673 | TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) { |
4674 | Fusion fusion; |
4675 | FusionGuard fg(&fusion); |
4676 | |
4677 | TensorView* tv0 = makeSymbolicTensor(2); |
4678 | fusion.addInput(tv0); |
4679 | TensorView* tv1 = max(tv0, {0}); |
4680 | TensorView* tv2 = sum(tv0, {0}); |
4681 | |
4682 | fusion.addOutput(tv1); |
4683 | fusion.addOutput(tv2); |
4684 | |
4685 | int numel_x = 4; |
4686 | int numel_y = 2; |
4687 | |
4688 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
4689 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
4690 | |
4691 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
4692 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
4693 | |
4694 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4695 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
4696 | |
4697 | FusionExecutor fe; |
4698 | fe.compileFusion(&fusion, {input}); |
4699 | auto cg_outputs = fe.runFusion({input}); |
4700 | |
4701 | std::vector<at::Tensor> aten_outputs = { |
4702 | std::get<0>(input.to(at::kDouble).max(0)), input.to(at::kDouble).sum(0)}; |
4703 | testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); |
4704 | } |
4705 | |
4706 | TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) { |
4707 | Fusion fusion; |
4708 | FusionGuard fg(&fusion); |
4709 | |
4710 | auto tv0 = makeSymbolicTensor(2); |
4711 | fusion.addInput(tv0); |
4712 | auto tv1 = sum(tv0, {0}); |
4713 | auto tv2 = sum(tv1, {0}); |
4714 | fusion.addOutput(tv2); |
4715 | |
4716 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
4717 | tv1->axis(1)->parallelize(ParallelType::BIDy); |
4718 | tv2->axis(0)->parallelize(ParallelType::BIDy); |
4719 | |
4720 | FusionExecutor fe; |
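  // tv2 performs a second grid reduction over the output of tv1's grid
  // reduction, which is expected to be rejected at compile time.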
4721 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
4722 | ASSERT_ANY_THROW(fe.compileFusion(&fusion)); |
4723 | } |
4724 | |
4725 | TEST_F(NVFuserTest, FusionReductionTFT_CUDA) { |
4726 | Fusion fusion; |
4727 | FusionGuard fg(&fusion); |
4728 | |
4729 | // Set up your input tensor views |
4730 | TensorView* tv0 = makeSymbolicTensor(2); |
4731 | fusion.addInput(tv0); |
4732 | |
4733 | // tv1[I0, R1] = tv0[I0, I1] |
4734 | TensorView* tv1 = |
4735 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
4736 | |
4737 | fusion.addOutput(tv1); |
4738 | |
4739 | int numel_x = 1025; |
4740 | int numel_y = 129; |
4741 | int tidx = 16; |
4742 | int tidy = 8; |
4743 | int tidz = 8; |
4744 | |
4745 | tv1->split(1, tidx); |
4746 | // tv1[I0, R1o, R1i{tidx}] |
4747 | |
4748 | tv1->split(1, tidz); |
4749 | // tv1[I0, R1oo, R1Oi{tidz}, R1R1i{tidx}] |
4750 | |
4751 | tv1->split(0, tidy); |
4752 | // tv1[I0o, I0i, R1oo, R1Oi{tidz}, R1R1i{tidx}] |
4753 | |
4754 | TensorView* tv2 = tv1->rFactor({2}); |
4755 | // tv2[I0o, I0i, R1oo, I1Oi{tidz}, I11i{tidx}] |
4756 | // tv1[I0o, I0i, R1Oi{tidz}, R1R1i{tidx}] |
4757 | |
4758 | tv2->computeAt(tv1, 2); |
4759 | |
4760 | tv1->axis(1)->parallelize(ParallelType::TIDy); |
4761 | |
4762 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
4763 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
4764 | |
4765 | tv1->axis(-2)->parallelize(ParallelType::TIDz); |
4766 | tv2->axis(-2)->parallelize(ParallelType::TIDz); |
4767 | |
4768 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4769 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
4770 | at::Tensor cg_output = at::empty({numel_x}, options); |
4771 | |
4772 | FusionExecutor fe; |
4773 | fe.compileFusion(&fusion, {input}); |
4774 | fe.runFusion({input}, {cg_output}); |
4775 | |
4776 | auto aten_output = input.to(at::kDouble).sum({1}); |
4777 | testValidate( |
4778 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
4779 | } |
4780 | |
4781 | TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) { |
4782 | // based off FusionReduction4 |
4783 | Fusion fusion; |
4784 | FusionGuard fg(&fusion); |
4785 | |
4786 | // Set up your input tensor views |
4787 | TensorView* tv0 = makeSymbolicTensor(2); |
4788 | TensorView* tv1 = makeSymbolicTensor(2); |
4789 | |
4790 | TensorView* tv2 = add(tv0, tv1); |
4791 | // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] |
4792 | |
4793 | fusion.addInput(tv0); |
4794 | fusion.addInput(tv1); |
4795 | |
4796 | TensorView* tv3 = |
4797 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv2); |
4798 | // tv3[I0, R1] = tv2[I0, I1] |
4799 | |
4800 | TensorView* tv4 = makeSymbolicTensor(1); |
4801 | fusion.addInput(tv4); |
4802 | |
4803 | // tv5[I0] = tv3[I0, R1] * tv4[I0] |
4804 | TensorView* tv5 = mul(tv3, tv4); |
4805 | fusion.addOutput(tv5); |
4806 | |
4807 | // RFactor the reduction |
4808 | tv3->split(1, 16, false); |
  // tv3[I0, R1o{16}, R1i{ceilDiv(I1, 16)}] = tv2[I0, I1]
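  // (The third argument `false` requests an outer split: the factor 16
  // becomes the extent of the outer domain R1o{16}, and the inner domain
  // gets the remaining ceilDiv(I1, 16) extent.)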
4810 | |
4811 | TensorView* tv6 = tv3->rFactor({-2}); |
  // tv6[I0, R1o{16}, iR1i{ceilDiv(I1, 16)}] = tv2[I0, I1]
  // tv3[I0, R1i{ceilDiv(I1, 16)}] = tv6[I0, R1o{16}, iR1i{ceilDiv(I1, 16)}]
4814 | tv2->computeAt(tv6, 2); |
4815 | |
4816 | // Compute at inline with tv5 (only 1D) |
4817 | tv6->computeAt(tv3, 1); |
4818 | tv3->computeAt(tv5, 1); |
4819 | |
4820 | tv5->axis(0)->parallelize(ParallelType::BIDx); |
4821 | |
  // Only the intermediate tensors need this, but it doesn't hurt to do it on
  // the inputs tv0, tv1, and tv4 as well
4824 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
4825 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
4826 | tv6->axis(-1)->parallelize(ParallelType::TIDx); |
4827 | |
4828 | int numel_x = 1025; |
4829 | int numel_y = 129; |
4830 | |
4831 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4832 | at::Tensor t0 = at::randn({numel_x, numel_y}, options); |
4833 | at::Tensor t1 = at::randn({numel_x, numel_y}, options); |
4834 | at::Tensor t4 = at::randn({numel_x}, options); |
4835 | |
4836 | FusionExecutor fe; |
4837 | fe.compileFusion(&fusion, {t0, t1, t4}); |
4838 | auto cg_outputs = fe.runFusion({t0, t1, t4}); |
4839 | |
4840 | auto t2 = t0.add(t1); |
4841 | auto t3 = t2.to(at::kDouble).sum({1}); |
4842 | auto aten_output = t3.mul(t4); |
4843 | |
4844 | testValidate( |
4845 | &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); |
4846 | } |
4847 | |
4848 | TEST_F(NVFuserTest, FusionBranches_CUDA) { |
4849 | Fusion fusion; |
4850 | FusionGuard fg(&fusion); |
4851 | |
4852 | // Set up your input tensor views |
4853 | TensorView* tv0 = makeSymbolicTensor(2); |
4854 | TensorView* tv1 = makeSymbolicTensor(2); |
4855 | TensorView* tv2 = makeSymbolicTensor(2); |
4856 | fusion.addInput(tv0); |
4857 | fusion.addInput(tv1); |
4858 | fusion.addInput(tv2); |
4859 | |
4860 | auto tv3 = add(tv0, IrBuilder::create<Double>(1.0)); |
4861 | auto tv4 = add(tv3, tv1); |
4862 | auto tv5 = add(tv3, tv2); |
4863 | auto tv6 = add(tv4, tv5); |
4864 | |
4865 | fusion.addOutput(tv6); |
4866 | |
4867 | constexpr int x = 63, y = 33; |
4868 | |
4869 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4870 | |
4871 | at::Tensor t0 = at::randn({x, y}, options); |
4872 | at::Tensor t1 = at::randn({x, y}, options); |
4873 | at::Tensor t2 = at::randn({x, y}, options); |
4874 | |
4875 | FusionExecutor fe; |
4876 | tv6->merge(0); |
4877 | tv6->split(0, 128); |
4878 | tv6->split(0, 4); |
4879 | |
4880 | tv6->axis(0)->parallelize(ParallelType::BIDx); |
4881 | |
4882 | tv0->computeAt(tv6, 1); |
4883 | tv1->computeAt(tv6, 1); |
4884 | tv2->computeAt(tv6, 1); |
4885 | |
4886 | tv3->axis(-2)->parallelize(ParallelType::Unroll); |
4887 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
4888 | tv4->axis(-2)->parallelize(ParallelType::Unroll); |
4889 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
4890 | tv5->axis(-2)->parallelize(ParallelType::Unroll); |
4891 | tv5->axis(-1)->parallelize(ParallelType::TIDx); |
4892 | tv6->axis(-1)->parallelize(ParallelType::TIDx); |
4893 | |
4894 | std::vector<IValue> aten_inputs = {t0, t1, t2}; |
4895 | |
4896 | fe.compileFusion(&fusion, aten_inputs); |
4897 | auto cg_outputs = fe.runFusion(aten_inputs); |
4898 | |
4899 | auto t3 = t0.add(1.0); |
4900 | auto t4 = t3.add(t1); |
4901 | auto t5 = t3.add(t2); |
4902 | auto aten_output = t4.add(t5); |
4903 | |
4904 | testValidate( |
4905 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
4906 | } |
4907 | |
4908 | TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) { |
4909 | Fusion fusion; |
4910 | FusionGuard fg(&fusion); |
4911 | |
4912 | // Set up your input tensor views |
4913 | TensorView* tv0 = makeSymbolicTensor(2); |
4914 | fusion.addInput(tv0); |
4915 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.5)); |
4916 | |
4917 | TensorView* tv2 = makeSymbolicTensor(2); |
4918 | fusion.addInput(tv2); |
4919 | TensorView* tv3 = makeSymbolicTensor(2); |
4920 | fusion.addInput(tv3); |
4921 | TensorView* tv4 = sub(tv2, tv3); |
4922 | |
4923 | TensorView* tv5 = broadcast(tv1, {false, false, true}); |
4924 | TensorView* tv6 = broadcast(tv4, {true, false, false}); |
4925 | |
4926 | TensorView* tv7 = add(tv5, tv6); |
4927 | fusion.addOutput(tv7); |
4928 | |
4929 | tv7->split(-1, 4); |
4930 | tv7->split(0, 8); |
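// tv7[I0o, I0i{8}, I1, I2o, I2i{4}]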
4931 | |
4932 | tv0->computeAt(tv7, -1); |
4933 | tv2->computeAt(tv7, -1); |
4934 | |
4935 | tv7->axis(0)->parallelize(ParallelType::BIDx); |
4936 | tv7->axis(-1)->parallelize(ParallelType::TIDx); |
4937 | |
4938 | constexpr int x = 63, y = 33, z = 15; |
4939 | |
4940 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
4941 | |
4942 | at::Tensor t0 = at::randn({x, y}, options); |
4943 | at::Tensor t1 = t0.add(1.5); |
4944 | |
4945 | at::Tensor t2 = at::randn({y, z}, options); |
4946 | at::Tensor t3 = at::randn({y, z}, options); |
4947 | |
4948 | at::Tensor t4 = t2.sub(t3); |
4949 | at::Tensor t5 = t1.unsqueeze(-1).expand({x, y, z}); |
4950 | |
4951 | at::Tensor t6 = t4.expand({x, y, z}); |
4952 | |
4953 | at::Tensor aten_output = t5.add(t6); |
4954 | |
4955 | std::vector<IValue> aten_inputs = {t0, t2, t3}; |
4956 | |
4957 | FusionExecutor fe; |
4958 | fe.compileFusion(&fusion, aten_inputs); |
4959 | auto cg_outputs = fe.runFusion(aten_inputs); |
4960 | |
4961 | testValidate( |
4962 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
4963 | } |
4964 | |
4965 | TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) { |
4966 | Fusion fusion; |
4967 | FusionGuard fg(&fusion); |
4968 | |
4969 | // Set up your input tensor views |
4970 | TensorView* tv0 = makeSymbolicTensor(2); |
4971 | fusion.addInput(tv0); |
4972 | TensorView* tv1 = makeSymbolicTensor(2); |
4973 | fusion.addInput(tv1); |
4974 | |
4975 | TensorView* tv2 = add(tv0, tv1); |
4976 | |
4977 | TensorView* tv3 = broadcast(tv2, {false, false, true}); |
4978 | |
4979 | TensorView* tv4 = makeSymbolicTensor(2); |
4980 | fusion.addInput(tv4); |
4981 | |
4982 | TensorView* tv5 = sub(tv4, IrBuilder::create<Double>(0.1)); |
4983 | |
4984 | TensorView* tv6 = broadcast(tv5, {true, false, false}); |
4985 | |
4986 | TensorView* tv7 = add(tv3, tv6); |
4987 | |
4988 | fusion.addOutput(tv7); |
4989 | |
4990 | tv7->merge(0, 1); |
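// tv7[I0*I1, I2]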
4991 | |
4992 | tv0->computeAt(tv7, -1); |
4993 | tv4->computeAt(tv7, -1); |
4994 | |
4995 | tv7->axis(0)->parallelize(ParallelType::BIDx); |
4996 | tv7->axis(-1)->parallelize(ParallelType::TIDx); |
4997 | |
4998 | constexpr int x = 63, y = 33, z = 15; |
4999 | |
5000 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5001 | |
5002 | at::Tensor t0 = at::randn({x, y}, options); |
5003 | at::Tensor t1 = at::randn({x, y}, options); |
5004 | at::Tensor t2 = t0.add(t1); |
5005 | at::Tensor t3 = t2.unsqueeze(-1).expand({x, y, z}); |
5006 | |
5007 | at::Tensor t4 = at::randn({y, z}, options); |
5008 | at::Tensor t5 = t4.sub(0.1); |
5009 | at::Tensor t6 = t5.expand({x, y, z}); |
5010 | at::Tensor aten_output = t3.add(t6); |
5011 | |
5012 | at::Tensor cg_output = at::empty({x, y, z}, options); |
5013 | |
5014 | std::vector<IValue> aten_inputs = {t0, t1, t4}; |
5015 | |
5016 | FusionExecutor fe; |
5017 | fe.compileFusion(&fusion, aten_inputs); |
5018 | fe.runFusion(aten_inputs, {cg_output}); |
5019 | |
5020 | testValidate( |
5021 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5022 | } |
5023 | |
5024 | TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) { |
5025 | Fusion fusion; |
5026 | FusionGuard fg(&fusion); |
5027 | |
5028 | // Set up input tensor views |
5029 | // tv0[I1, B{1}] |
5030 | TensorView* tv0 = makeConcreteTensor({-1, 1}); |
5031 | fusion.addInput(tv0); |
5032 | |
// tv2[I0, I1, I2]
5034 | TensorView* tv2 = makeSymbolicTensor(3); |
5035 | fusion.addInput(tv2); |
5036 | |
5037 | TensorView* tv3 = add(tv0, tv2); |
5038 | |
5039 | fusion.addOutput(tv3); |
5040 | |
5041 | tv3->merge(0); |
5042 | tv3->merge(0); |
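// tv3[I0*I1*I2]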
5043 | |
5044 | tv0->computeAt(tv3, -1); |
5045 | tv2->computeAt(tv3, -1); |
5046 | |
5047 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
5048 | |
5049 | constexpr int x = 2, y = 3, z = 4; |
5050 | |
5051 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5052 | |
5053 | at::Tensor t0 = at::randn({y, 1}, options); |
5054 | at::Tensor t2 = at::randn({x, y, z}, options); |
5055 | auto aten_output = t0.add(t2); |
5056 | |
5057 | std::vector<IValue> aten_inputs = {t0, t2}; |
5058 | at::Tensor cg_output = at::empty({x, y, z}, options); |
5059 | |
5060 | FusionExecutor fe; |
5061 | fe.compileFusion(&fusion, aten_inputs); |
5062 | fe.runFusion(aten_inputs, {cg_output}); |
5063 | |
5064 | testValidate( |
5065 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5066 | } |
5067 | |
5068 | TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) { |
5069 | Fusion fusion; |
5070 | FusionGuard fg(&fusion); |
5071 | |
5072 | // Set up your input tensor views |
5073 | TensorView* tv0 = makeConcreteTensor({1, -1}); |
5074 | |
5075 | TensorView* tv1 = makeSymbolicTensor(3); |
5076 | fusion.addInput(tv0); |
5077 | fusion.addInput(tv1); |
5078 | |
5079 | TensorView* tv3 = add(tv0, tv1); |
5080 | |
5081 | tv3->merge(0); |
5082 | tv3->merge(0); |
5083 | tv3->split(0, 128); |
5084 | tv3->split(0, 4); |
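// tv3[(I0*I1*I2)/128/4, 4, 128]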
5085 | |
5086 | fusion.addOutput(tv3); |
5087 | |
5088 | tv0->computeAt(tv3, -1); |
5089 | tv1->computeAt(tv3, -1); |
5090 | |
5091 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
5092 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
5093 | tv3->axis(-2)->parallelize(ParallelType::Unroll); |
5094 | |
5095 | constexpr int x = 63, y = 33, z = 15; |
5096 | |
5097 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5098 | |
5099 | at::Tensor t0 = at::randn({1, z}, options); |
5100 | at::Tensor t1 = at::randn({x, y, z}, options); |
5101 | |
5102 | auto aten_output = t0.add(t1); |
5103 | |
5104 | at::Tensor cg_output = at::empty({x, y, z}, options); |
5105 | |
5106 | std::vector<IValue> aten_inputs = {t0, t1}; |
5107 | |
5108 | FusionExecutor fe; |
5109 | fe.compileFusion(&fusion, aten_inputs); |
5110 | fe.runFusion(aten_inputs, {cg_output}); |
5111 | |
5112 | testValidate( |
5113 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5114 | } |
5115 | |
5116 | TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) { |
5117 | Fusion fusion; |
5118 | FusionGuard fg(&fusion); |
5119 | |
5120 | constexpr int m = 2, k = 3, n = 4; |
5121 | auto tv0 = makeConcreteTensor({m, k}); |
5122 | auto tv1 = makeConcreteTensor({k, n}); |
5123 | |
5124 | fusion.addInput(tv0); |
5125 | fusion.addInput(tv1); |
5126 | |
5127 | TensorView* tv2 = broadcast(tv0, {false, false, true}); |
5128 | TensorView* tv3 = broadcast(tv1, {true, false, false}); |
5129 | |
5130 | TensorView* tv4 = add(tv2, tv3); |
5131 | |
5132 | fusion.addOutput(tv4); |
5133 | |
5134 | tv4->merge(0); |
5135 | tv4->merge(0); |
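// tv4[m*k*n]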
5136 | |
5137 | tv0->computeAt(tv4, -1); |
5138 | tv1->computeAt(tv4, -1); |
5139 | |
5140 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5141 | |
5142 | at::Tensor t0 = at::randn({m, k}, options); |
5143 | at::Tensor t1 = at::randn({k, n}, options); |
5144 | |
5145 | auto t2 = t0.unsqueeze(-1).expand({m, k, n}); |
5146 | auto t3 = t1.expand({m, k, n}); |
5147 | auto aten_output = t2.add(t3); |
5148 | |
5149 | at::Tensor cg_output = at::empty({m, k, n}, options); |
5150 | |
5151 | std::vector<IValue> aten_inputs = {t0, t1}; |
5152 | |
5153 | FusionExecutor fe; |
5154 | fe.compileFusion(&fusion, aten_inputs); |
5155 | fe.runFusion(aten_inputs, {cg_output}); |
5156 | |
5157 | testValidate( |
5158 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5159 | } |
5160 | |
5161 | TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) { |
5162 | Fusion fusion; |
5163 | FusionGuard fg(&fusion); |
5164 | |
5165 | int x = 2, y = 3, z = 4; |
5166 | |
5167 | auto tv0 = makeConcreteTensor({y}); |
5168 | auto tv1 = div(tv0, IrBuilder::create<Double>(2.0)); |
5169 | auto tv2 = broadcast(tv1, {false, true}); |
5170 | auto tv3 = makeConcreteTensor({y, z}); |
5171 | auto tv4 = mul(tv2, tv3); |
5172 | auto tv5 = broadcast(tv4, {true, false, false}); |
5173 | auto tv6 = makeConcreteTensor({x, y, z}); |
5174 | auto tv7 = add(tv5, tv6); |
5175 | |
5176 | // tv0[ i1 ] = input |
5177 | // tv1[ i1 ] = tv0/2.0 |
5178 | // tv2[ i1, b2] = bcast(tv1) |
5179 | // tv3[ i1, i2] = input |
5180 | // tv4[ i1, i2] = tv2 * tv3 |
5181 | // tv5[b0, i1, i2] = bcast(tv4) |
5182 | // tv6[i0, i1, i2] = input |
5183 | // tv7[i0, i1, i2] = tv5 + tv6 |
5184 | |
5185 | // tv4 = bcast(tv1) * tv3 |
5186 | // tv7 = bcast(tv4) + tv6 |
5187 | |
5188 | fusion.addInput(tv0); |
5189 | fusion.addInput(tv3); |
5190 | fusion.addInput(tv6); |
5191 | |
5192 | fusion.addOutput(tv7); |
5193 | |
5194 | tv7->merge(0); |
5195 | tv7->merge(0); |
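// tv7[i0*i1*i2]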
5196 | tv0->computeAt(tv7, -1); |
5197 | |
5198 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5199 | |
5200 | at::Tensor t0 = at::randn({y}, options); |
5201 | at::Tensor t3 = at::randn({y, z}, options); |
5202 | at::Tensor t6 = at::randn({x, y, z}, options); |
5203 | |
5204 | auto t4 = t0.div(2.0).unsqueeze(-1).expand({y, z}) * t3; |
5205 | auto aten_output = t4.unsqueeze(0).expand({x, y, z}) + t6; |
5206 | |
5207 | std::vector<IValue> aten_inputs = {t0, t3, t6}; |
5208 | |
5209 | FusionExecutor fe; |
5210 | fe.compileFusion(&fusion, aten_inputs); |
5211 | auto cg_outputs = fe.runFusion(aten_inputs); |
5212 | |
5213 | testValidate( |
5214 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5215 | } |
5216 | |
5217 | TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) { |
5218 | Fusion fusion; |
5219 | FusionGuard fg(&fusion); |
5220 | |
5221 | int x = 2, y = 3, z = 4; |
5222 | |
5223 | auto tv0 = makeConcreteTensor({y, z}); |
5224 | auto tv1 = div(tv0, IrBuilder::create<Double>(2.0)); |
5225 | auto tv2 = sum(tv1, {1}); |
5226 | auto tv3 = broadcast(tv2, {true, false}); |
5227 | auto tv4 = makeConcreteTensor({x, y}); |
5228 | auto tv5 = add(tv3, tv4); |
5229 | |
5230 | // tv0[ i1, i2] = input |
5231 | // tv1[ i1, i2] = tv0/2.0 |
5232 | // tv2[ i1 ] = sum(tv1, 1) |
5233 | // tv3[b0, i1 ] = bcast(tv2) |
5234 | // tv4[i0, i1 ] = input |
5235 | // tv5[i0, i1 ] = tv3 + tv4 |
5236 | |
5237 | // tv2 = sum(tv0/2.0, 1) |
5238 | // tv5 = bcast(tv2) + tv4 |
5239 | |
5240 | fusion.addInput(tv0); |
5241 | fusion.addInput(tv4); |
5242 | |
5243 | fusion.addOutput(tv5); |
5244 | |
5245 | tv5->merge(0); |
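// tv5[i0*i1]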
5246 | tv0->computeAt(tv5, -1); |
5247 | tv1->computeAt(tv2, -1); |
5248 | |
5249 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5250 | |
5251 | at::Tensor t0 = at::randn({y, z}, options); |
5252 | at::Tensor t4 = at::randn({x, y}, options); |
5253 | |
5254 | FusionExecutor fe; |
5255 | fe.compileFusion(&fusion, {t0, t4}); |
5256 | auto cg_outputs = fe.runFusion({t0, t4}); |
5257 | |
5258 | auto t1 = t0.div(2.0); |
5259 | auto t2 = t1.to(at::kDouble).sum(1); |
5260 | auto t3 = t2.unsqueeze(0).expand({x, y}); |
5261 | auto aten_output = t3.add(t4); |
5262 | |
5263 | testValidate( |
5264 | &fusion, {cg_outputs}, {t0, t4}, {aten_output}, __LINE__, __FILE__); |
5265 | } |
5266 | |
5267 | TEST_F(NVFuserTest, FusionAdvancedIndexing1_CUDA) { |
5268 | Fusion fusion; |
5269 | FusionGuard fg(&fusion); |
5270 | |
5271 | int w = 3, x = 4, y = 7, z = 8; |
5272 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5273 | |
5274 | auto tv0 = makeSymbolicTensor(3); |
5275 | auto tv1 = makeSymbolicTensor(4); |
5276 | fusion.addInput(tv0); |
5277 | fusion.addInput(tv1); |
5278 | |
5279 | auto tv2 = add(tv0, IrBuilder::create<Double>(1.0)); |
5280 | auto tv3 = broadcast(tv2, {true, false, false, false}); |
5281 | auto tv4 = add(tv3, tv1); |
5282 | |
5283 | fusion.addOutput(tv4); |
5284 | |
5285 | tv4->merge(0); |
5286 | tv4->merge(0); |
5287 | tv4->merge(0); |
5288 | |
5289 | tv4->split(0, 128); |
5290 | tv4->split(0, 4); |
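// tv4[(I0*I1*I2*I3)/128/4, 4, 128]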
5291 | |
5292 | tv2->computeAt(tv4, 1); |
5293 | |
5294 | tv4->axis(0)->parallelize(ParallelType::BIDx); |
5295 | tv4->axis(1)->parallelize(ParallelType::Unroll); |
5296 | tv4->axis(2)->parallelize(ParallelType::TIDx); |
5297 | |
5298 | tv3->axis(1)->parallelize(ParallelType::Unroll); |
5299 | tv3->axis(2)->parallelize(ParallelType::TIDx); |
5300 | |
5301 | tv2->axis(1)->parallelize(ParallelType::Unroll); |
5302 | tv2->axis(2)->parallelize(ParallelType::TIDx); |
5303 | |
5304 | FusionExecutor fe; |
5305 | |
5306 | at::Tensor t0 = at::randn({x, y, z}, options); |
5307 | at::Tensor t1 = at::randn({w, x, y, z}, options); |
5308 | |
5309 | auto t3 = t0.add(1.0); |
5310 | auto aten_output = t3.add(t1); |
5311 | |
5312 | std::vector<IValue> aten_inputs = {t0, t1}; |
5313 | |
5314 | fe.compileFusion(&fusion, aten_inputs); |
5315 | auto cg_outputs = fe.runFusion(aten_inputs); |
5316 | |
5317 | testValidate( |
5318 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5319 | } |
5320 | |
5321 | TEST_F(NVFuserTest, FusionAdvancedIndexing2_CUDA) { |
5322 | Fusion fusion; |
5323 | FusionGuard fg(&fusion); |
5324 | |
5325 | int w = 3, x = 4, y = 7, z = 8; |
5326 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5327 | |
5328 | auto tv0 = makeSymbolicTensor(3); |
5329 | auto tv1 = makeSymbolicTensor(4); |
5330 | fusion.addInput(tv0); |
5331 | fusion.addInput(tv1); |
5332 | |
5333 | auto tv2 = add(tv0, IrBuilder::create<Double>(1.0)); |
5334 | auto tv3 = broadcast(tv2, {true, false, false, false}); |
5335 | auto tv4 = add(tv3, tv1); |
5336 | |
5337 | fusion.addOutput(tv4); |
5338 | |
5339 | tv4->merge(-2); |
5340 | tv4->merge(-2); |
5341 | tv4->merge(-2); |
5342 | |
5343 | tv4->split(0, 128); |
5344 | tv4->split(0, 4); |
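// tv4[(I0*I1*I2*I3)/128/4, 4, 128]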
5345 | |
5346 | tv2->computeAt(tv4, 1); |
5347 | |
5348 | tv4->axis(0)->parallelize(ParallelType::BIDx); |
5349 | tv4->axis(1)->parallelize(ParallelType::Unroll); |
5350 | tv4->axis(2)->parallelize(ParallelType::TIDx); |
5351 | |
5352 | tv3->axis(1)->parallelize(ParallelType::Unroll); |
5353 | tv3->axis(2)->parallelize(ParallelType::TIDx); |
5354 | |
5355 | tv2->axis(1)->parallelize(ParallelType::Unroll); |
5356 | tv2->axis(2)->parallelize(ParallelType::TIDx); |
5357 | |
5358 | FusionExecutor fe; |
5359 | |
5360 | at::Tensor t0 = at::randn({x, y, z}, options); |
5361 | at::Tensor t1 = at::randn({w, x, y, z}, options); |
5362 | |
5363 | auto t3 = t0.add(1.0); |
5364 | auto aten_output = t3.add(t1); |
5365 | |
5366 | std::vector<IValue> aten_inputs = {t0, t1}; |
5367 | |
5368 | fe.compileFusion(&fusion, aten_inputs); |
5369 | auto cg_outputs = fe.runFusion(aten_inputs); |
5370 | |
5371 | testValidate( |
5372 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5373 | } |
5374 | |
5375 | TEST_F(NVFuserTest, FusionAdvancedIndexing3_CUDA) { |
5376 | Fusion fusion; |
5377 | FusionGuard fg(&fusion); |
5378 | |
5379 | int w = 3, x = 4, y = 7, z = 8; |
5380 | |
5381 | auto tv0 = makeSymbolicTensor(3); |
5382 | auto tv1 = makeSymbolicTensor(4); |
5383 | fusion.addInput(tv0); |
5384 | fusion.addInput(tv1); |
5385 | |
5386 | auto tv2 = add(tv0, IrBuilder::create<Double>(1.0)); |
5387 | auto tv3 = add(tv2, tv1); |
5388 | fusion.addOutput(tv3); |
5389 | |
5390 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5391 | at::Tensor t0 = at::randn({x, y, z}, options); |
5392 | at::Tensor t1 = at::randn({w, x, y, z}, options); |
5393 | |
5394 | auto t2 = t0.add(1.0); |
5395 | auto aten_output = t2.add(t1); |
5396 | |
5397 | std::vector<IValue> aten_inputs = {t0, t1}; |
5398 | |
5399 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
5400 | |
5401 | FusionExecutor fe; |
5402 | fe.compileFusion(&fusion, aten_inputs, lparams); |
5403 | auto cg_outputs = fe.runFusion(aten_inputs, lparams); |
5404 | |
5405 | testValidate( |
5406 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5407 | } |
5408 | |
5409 | TEST_F(NVFuserTest, FusionAdvancedIndexing4_CUDA) { |
5410 | Fusion fusion; |
5411 | FusionGuard fg(&fusion); |
5412 | |
5413 | // Set up your input tensor views |
5414 | TensorView* tv0 = makeConcreteTensor({4, 8}); |
5415 | fusion.addInput(tv0); |
5416 | TensorView* tv1 = makeConcreteTensor({4, 4, 8}); |
5417 | fusion.addInput(tv1); |
5418 | |
5419 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(1)); |
5420 | TensorView* tv3 = broadcast(tv2, {true, false, false}); |
5421 | TensorView* tv4 = add(tv3, tv1); |
5422 | fusion.addOutput(tv4); |
5423 | |
5424 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5425 | at::Tensor t0 = at::randn({4, 8}, options); |
5426 | at::Tensor t1 = at::randn({4, 4, 8}, options); |
5427 | |
5428 | auto t2 = t0.add(1.0); |
5429 | auto aten_output = t2.add(t1); |
5430 | |
5431 | std::vector<IValue> aten_inputs = {t0, t1}; |
5432 | |
5433 | FusionExecutor fe; |
5434 | fe.compileFusion(&fusion, aten_inputs); |
5435 | auto cg_outputs = fe.runFusion(aten_inputs); |
5436 | |
5437 | testValidate( |
5438 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5439 | } |
5440 | |
5441 | TEST_F(NVFuserTest, FusionAdvancedIndexing5_CUDA) { |
5442 | Fusion fusion; |
5443 | FusionGuard fg(&fusion); |
5444 | |
5445 | // Set up your input tensor views |
5446 | TensorView* tv0 = makeSymbolicTensor(1); |
5447 | fusion.addInput(tv0); |
5448 | TensorView* tv1 = makeSymbolicTensor(3); |
5449 | fusion.addInput(tv1); |
5450 | |
5451 | TensorView* tv2 = add(tv0, IrBuilder::create<Double>(1)); |
5452 | TensorView* tv3 = broadcast(tv2, {true, false, true}); |
5453 | TensorView* tv4 = add(tv3, tv1); |
5454 | fusion.addOutput(tv4); |
5455 | |
5456 | tv3->merge(0)->merge(0)->split(0, 2)->split(0, 3); |
5457 | tv4->merge(0)->merge(0)->split(0, 2)->split(0, 3); |
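// tv3, tv4: [(I0*I1*I2)/2/3, 3, 2]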
5458 | |
5459 | tv0->computeAt(tv4, 1); |
5460 | tv1->computeAt(tv4, 1); |
5461 | |
5462 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5463 | at::Tensor t0 = at::randn({7}, options); |
5464 | at::Tensor t1 = at::randn({5, 7, 11}, options); |
5465 | |
5466 | auto t2 = t0.add(1.0); |
5467 | auto aten_output = t2.unsqueeze(-1).add(t1); |
5468 | |
5469 | std::vector<IValue> aten_inputs = {t0, t1}; |
5470 | |
5471 | FusionExecutor fe; |
5472 | fe.compileFusion(&fusion, aten_inputs); |
5473 | auto cg_outputs = fe.runFusion(aten_inputs); |
5474 | |
5475 | testValidate( |
5476 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5477 | } |
5478 | |
5479 | TEST_F(NVFuserTest, FusionAdvancedIndexing6_CUDA) { |
5480 | Fusion fusion; |
5481 | FusionGuard fg(&fusion); |
5482 | |
5483 | std::vector<int64_t> tensor0_shape{7, 4, 7}; |
5484 | std::vector<int64_t> tensor1_shape{4, 7}; |
5485 | |
5486 | TensorView* tv0 = makeSymbolicTensor(tensor0_shape.size()); |
5487 | fusion.addInput(tv0); |
5488 | TensorView* tv1 = makeSymbolicTensor(tensor1_shape.size()); |
5489 | fusion.addInput(tv1); |
5490 | |
5491 | TensorView* tv2 = add(tv0, tv1); |
5492 | TensorView* tv3 = sum(tv2, {0, 1}); |
5493 | fusion.addOutput(tv3); |
5494 | |
5495 | const auto options = |
5496 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5497 | |
5498 | at::Tensor input0 = at::randn(tensor0_shape, options); |
5499 | at::Tensor input1 = at::randn(tensor1_shape, options); |
5500 | |
5501 | std::vector<int64_t> reduction_axes{0, 1}; |
5502 | auto reduction_params = getReductionHeuristics(&fusion, {input0, input1}); |
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
5504 | scheduleReduction(&fusion, *reduction_params); |
5505 | |
5506 | FusionExecutor fe; |
5507 | fe.compileFusion(&fusion, {input0, input1}, reduction_params->lparams); |
5508 | auto cg_outputs = fe.runFusion({input0, input1}, reduction_params->lparams); |
5509 | |
5510 | auto aten_output = input0.add(input1).to(at::kDouble).sum(reduction_axes); |
5511 | |
5512 | testValidate( |
5513 | &fusion, |
5514 | cg_outputs, |
5515 | {input0, input1}, |
5516 | {aten_output}, |
5517 | __LINE__, |
5518 | __FILE__, |
5519 | "" , |
5520 | reduction_params->lparams); |
5521 | } |
5522 | |
5523 | TEST_F(NVFuserTest, FusionAdvancedIndexing7_CUDA) { |
// This covers the same issue as test 6, but with a manual schedule, so it
// stays valid even if the heuristics exercised in 6 change.
5526 | Fusion fusion; |
5527 | FusionGuard fg(&fusion); |
5528 | |
5529 | auto tv0 = makeSymbolicTensor(1); |
5530 | fusion.addInput(tv0); |
5531 | |
5532 | auto tv1 = broadcast(tv0, {false, true}); |
5533 | |
5534 | auto tv2 = makeSymbolicTensor(2); |
5535 | fusion.addInput(tv2); |
5536 | |
5537 | auto tv3 = add(tv1, tv2); |
5538 | auto tv4 = sum(tv3, {0, 1}); |
5539 | fusion.addOutput(tv4); |
5540 | |
5541 | tv4->merge(0, 1); |
5542 | tv4->split(0, 128); |
5543 | tv4->split(0, 4); |
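// tv4[Roo, Roi{4}, Ri{128}]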
5544 | |
5545 | auto tv5 = tv4->rFactor({0, 1}); |
5546 | |
5547 | tv5->computeAt(tv4, -1); |
5548 | tv0->computeAt(tv5, -1); |
5549 | |
5550 | tv4->axis(0)->parallelize(ParallelType::TIDx); |
5551 | |
5552 | const int numel_x = 100; |
5553 | const int numel_y = 200; |
5554 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5555 | auto at_t0 = at::randn({numel_x}, options); |
5556 | auto at_t1 = at::randn({numel_x, numel_y}, options); |
5557 | |
5558 | FusionExecutor fe; |
5559 | fe.compileFusion(&fusion, {at_t0, at_t1}); |
5560 | auto cg_outputs = fe.runFusion({at_t0, at_t1}); |
5561 | |
5562 | auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) |
5563 | .to(at::kDouble) |
5564 | .sum(); |
5565 | |
5566 | testValidate( |
5567 | &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); |
5568 | } |
5569 | |
5570 | TEST_F(NVFuserTest, FusionAdvancedIndexing8_CUDA) { |
5571 | // Same as 7 but with outer splits instead of inner |
5572 | Fusion fusion; |
5573 | FusionGuard fg(&fusion); |
5574 | |
5575 | auto tv0 = makeSymbolicTensor(1); |
5576 | fusion.addInput(tv0); |
5577 | |
5578 | auto tv1 = broadcast(tv0, {false, true}); |
5579 | |
5580 | auto tv2 = makeSymbolicTensor(2); |
5581 | fusion.addInput(tv2); |
5582 | |
5583 | auto tv3 = add(tv1, tv2); |
5584 | auto tv4 = sum(tv3, {0, 1}); |
5585 | fusion.addOutput(tv4); |
5586 | |
5587 | tv4->merge(0, 1); |
5588 | tv4->split(0, 128, false); |
5589 | tv4->split(0, 4, false); |
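// tv4[Roo{4}, Roi{32}, Ri] (outer splits)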
5590 | |
5591 | auto tv5 = tv4->rFactor({0, 1}); |
5592 | |
5593 | tv5->computeAt(tv4, -1); |
5594 | tv0->computeAt(tv5, -1); |
5595 | |
5596 | tv4->axis(0)->parallelize(ParallelType::TIDx); |
5597 | |
5598 | const int numel_x = 100; |
5599 | const int numel_y = 200; |
5600 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5601 | auto at_t0 = at::randn({numel_x}, options); |
5602 | auto at_t1 = at::randn({numel_x, numel_y}, options); |
5603 | |
5604 | FusionExecutor fe; |
5605 | fe.compileFusion(&fusion, {at_t0, at_t1}); |
5606 | auto cg_outputs = fe.runFusion({at_t0, at_t1}); |
5607 | |
5608 | auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) |
5609 | .to(at::kDouble) |
5610 | .sum(); |
5611 | |
5612 | testValidate( |
5613 | &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); |
5614 | } |
5615 | |
5616 | TEST_F(NVFuserTest, FusionAdvancedIndexing9_CUDA) { |
// Unlike 7 and 8, the broadcast result here is both a fusion output and an
// input to a following op, and the fusion is scheduled with schedulePointwise.
5618 | Fusion fusion; |
5619 | FusionGuard fg(&fusion); |
5620 | |
5621 | auto tv0 = makeSymbolicTensor(1); |
5622 | fusion.addInput(tv0); |
5623 | |
5624 | auto tv1 = broadcast(tv0, {false, true}); |
5625 | |
5626 | auto tv2 = mul(tv1, IrBuilder::create<Double>(2)); |
5627 | fusion.addOutput(tv2); |
5628 | |
5629 | auto tv3 = makeSymbolicTensor(3); |
5630 | fusion.addInput(tv3); |
5631 | |
5632 | auto tv4 = add(tv3, tv2); |
5633 | fusion.addOutput(tv4); |
5634 | |
5635 | const int numel_x = 200; |
5636 | const int numel_y = 300; |
5637 | const int numel_z = 400; |
5638 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5639 | auto at_t0 = at::randn({numel_y}, options); |
5640 | auto at_t3 = at::randn({numel_x, numel_y, numel_z}, options); |
5641 | std::vector<IValue> aten_inputs = {at_t0, at_t3}; |
5642 | |
5643 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
5644 | |
5645 | FusionExecutor fe; |
5646 | fe.compileFusion(&fusion, aten_inputs, lparams); |
5647 | auto cg_outputs = fe.runFusion(aten_inputs, lparams); |
5648 | |
5649 | auto at_t1 = at_t0.unsqueeze(-1); |
5650 | auto at_t2 = at_t1.mul(2.0); |
5651 | |
5652 | auto at_t4 = at_t3.add(at_t2); |
5653 | |
5654 | testValidate( |
5655 | &fusion, cg_outputs, aten_inputs, {at_t2, at_t4}, __LINE__, __FILE__); |
5656 | } |
5657 | |
5658 | TEST_F(NVFuserTest, FusionAdvancedIndexing10_CUDA) { |
5659 | Fusion fusion; |
5660 | FusionGuard fg(&fusion); |
5661 | |
5662 | // Set up your input tensor views |
5663 | TensorView* tv0 = makeContigTensor(2); |
5664 | TensorView* tv1 = makeContigTensor(2); |
5665 | |
5666 | // Register your inputs |
5667 | fusion.addInput(tv0); |
5668 | fusion.addInput(tv1); |
5669 | |
// Do math with it; ops return a Val*, which can be static_cast back to a
// TensorView
5672 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0)); |
5673 | TensorView* tv3 = add(tv0, tv2); |
5674 | |
5675 | // Register your outputs |
5676 | fusion.addOutput(tv3); |
5677 | |
5678 | auto tv0_cache = tv0->cacheAfter(); |
5679 | auto tv1_cache = tv1->cacheAfter(); |
5680 | |
5681 | std::vector<TensorView*> tvs = {tv0_cache, tv1_cache, tv2, tv3}; |
5682 | |
5683 | for (auto tv : tvs) { |
5684 | tv->split(1, 2, false); |
5685 | tv->split(1, 1); |
5686 | tv->split(-1, 4); |
5687 | // [I0, 2, 1, I1/2/4, 4] |
5688 | tv->reorder({{1, 2}, {2, 3}, {3, 1}}); |
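// [I0, I1/2/4, 2, 1, 4]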
5689 | tv->axis(0)->parallelize(ParallelType::BIDx); |
5690 | tv->axis(1)->parallelize(ParallelType::TIDx); |
5691 | } |
5692 | |
// computeAt all inputs into the output; the temporaries are squeezed
// between them
5695 | tv0->computeAt(tv3, 1); |
5696 | tv1->computeAt(tv3, 1); |
5697 | |
5698 | tv0_cache->axis(-1)->parallelize(ParallelType::Vectorize); |
5699 | tv1_cache->axis(-1)->parallelize(ParallelType::Vectorize); |
5700 | |
5701 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5702 | |
5703 | at::Tensor input1 = at::randn({64, 128}, options); |
5704 | at::Tensor input2 = at::rand_like(input1); |
5705 | at::Tensor output = at::empty_like(input1); |
5706 | |
5707 | FusionExecutor fe; |
5708 | fe.compileFusion(&fusion, {input1, input2}); |
5709 | fe.runFusion({input1, input2}, {output}); |
5710 | |
5711 | at::Tensor tv2_ref = input2 + 2.0; |
5712 | at::Tensor output_ref = input1 + tv2_ref; |
5713 | |
5714 | TORCH_CHECK(output_ref.equal(output)); |
5715 | } |
5716 | |
5717 | TEST_F(NVFuserTest, FusionAdvancedIndexing11_CUDA) { |
5718 | Fusion fusion; |
5719 | FusionGuard fg(&fusion); |
5720 | |
5721 | int w = 3, x = 4, y = 7, z = 8; |
5722 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5723 | |
5724 | auto tv0 = makeSymbolicTensor(4); |
5725 | auto tv1 = makeSymbolicTensor(1); |
5726 | fusion.addInput(tv0); |
5727 | fusion.addInput(tv1); |
5728 | |
5729 | auto tv2 = add(tv1, IrBuilder::create<Double>(1.0)); |
5730 | auto tv3 = broadcast(tv2, {true, false, true, true}); |
5731 | auto tv4 = add(tv3, tv0); |
5732 | |
5733 | fusion.addOutput(tv4); |
5734 | |
5735 | tv4->merge(0); |
5736 | tv4->merge(1); |
5737 | |
5738 | tv4->split(1, 32); |
5739 | tv4->split(0, 1); |
5740 | |
5741 | tv4->reorder({{2, 1}}); |
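// tv4[I0*I1, (I2*I3)/32, 1, 32]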
5742 | |
5743 | tv2->computeAt(tv4, 3); |
5744 | |
5745 | tv2->setMemoryType(MemoryType::Global); |
5746 | |
5747 | tv4->axis(0)->parallelize(ParallelType::BIDx); |
5748 | tv4->axis(1)->parallelize(ParallelType::BIDy); |
5749 | tv4->axis(2)->parallelize(ParallelType::Unswitch); |
5750 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
5751 | |
5752 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
5753 | |
5754 | FusionExecutor fe; |
5755 | |
5756 | at::Tensor t0 = at::randn({w, x, y, z}, options); |
5757 | at::Tensor t1 = at::randn({x}, options); |
5758 | |
5759 | auto t3 = t1.add(1.0).unsqueeze(-1).unsqueeze(-1); |
5760 | auto aten_output = t3.add(t0); |
5761 | |
5762 | std::vector<IValue> aten_inputs = {t0, t1}; |
5763 | |
5764 | fe.compileFusion(&fusion, aten_inputs); |
5765 | auto cg_outputs = fe.runFusion(aten_inputs); |
5766 | |
5767 | testValidate( |
5768 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5769 | } |
5770 | |
5771 | // Intended to stress the lowering of our code generator |
5772 | TEST_F(NVFuserTest, FusionAdvancedLowering1_CUDA) { |
5773 | Fusion fusion; |
5774 | FusionGuard fg(&fusion); |
5775 | |
5776 | TensorView* tv0 = makeConcreteTensor({9, 5}); |
5777 | fusion.addInput(tv0); |
5778 | |
5779 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1)); |
5780 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2)); |
5781 | TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3)); |
5782 | TensorView* tv4 = sum(tv3, {1}); |
5783 | |
5784 | fusion.addOutput(tv2); |
5785 | fusion.addOutput(tv4); |
5786 | |
5787 | tv4->split(1, 4); |
5788 | auto tv5 = tv4->rFactor({2}); |
5789 | |
5790 | tv1->computeAt(tv5, 2); |
5791 | |
5792 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5793 | at::manual_seed(1); |
5794 | at::Tensor aten_input = at::randn({9, 5}, options); |
5795 | |
5796 | auto t1 = aten_input.add(1.0); |
5797 | auto t2 = t1.add(2.0); |
5798 | auto t3 = t1.add(3.0); |
5799 | auto t4 = t3.sum(1); |
5800 | |
5801 | std::vector<at::Tensor> aten_outputs = {t2, t4}; |
5802 | |
5803 | FusionExecutor fe; |
5804 | fe.compileFusion(&fusion, {aten_input}); |
5805 | auto cg_outputs = fe.runFusion({aten_input}); |
5806 | |
5807 | testValidate( |
5808 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
5809 | } |
5810 | |
5811 | TEST_F(NVFuserTest, FusionAdvancedLowering2_CUDA) { |
5812 | Fusion fusion; |
5813 | FusionGuard fg(&fusion); |
5814 | |
5815 | // Progressively broadcast tensors |
5816 | TensorView* tv0 = makeSymbolicTensor(1); |
5817 | fusion.addInput(tv0); |
5818 | TensorView* tv1 = makeSymbolicTensor(2); |
5819 | fusion.addInput(tv1); |
5820 | TensorView* tv2 = makeSymbolicTensor(3); |
5821 | fusion.addInput(tv2); |
5822 | |
5823 | TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); |
5824 | TensorView* tv4 = broadcast(tv3, {false, true}); |
5825 | TensorView* tv5 = add(tv4, tv1); |
5826 | TensorView* tv6 = add(tv5, tv2); |
5827 | |
5828 | fusion.addOutput(tv6); |
5829 | |
5830 | // Split inner dimension |
5831 | tv6->split(1, 4); |
5832 | // Merge middle dims with outer dimensions |
5833 | tv6->merge(2); |
5834 | tv6->merge(0); |
5835 | |
5836 | // tv6[I0*I1o, I1i*I2] |
5837 | |
5838 | // Compute everything inline |
5839 | tv0->computeAt(tv6, -1); |
5840 | |
5841 | tv6->axis(0)->parallelize(ParallelType::BIDx); |
5842 | tv6->axis(1)->parallelize(ParallelType::TIDx); |
5843 | |
5844 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5845 | int x = 13, y = 9, z = 5; |
5846 | at::Tensor t0 = at::randn({y}, options); |
5847 | at::Tensor t1 = at::randn({y, z}, options); |
5848 | at::Tensor t2 = at::randn({x, y, z}, options); |
5849 | |
5850 | auto t3 = t0.add(1.0); |
5851 | auto t4 = t3.unsqueeze(-1); |
5852 | auto t5 = t4.add(t1); |
5853 | auto t6 = t5.add(t2); |
5854 | |
5855 | std::vector<IValue> aten_inputs = {t0, t1, t2}; |
5856 | std::vector<at::Tensor> aten_outputs = {t6}; |
5857 | |
5858 | FusionExecutor fe; |
5859 | fe.compileFusion(&fusion, aten_inputs); |
5860 | auto cg_outputs = fe.runFusion(aten_inputs); |
5861 | |
5862 | testValidate( |
5863 | &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); |
5864 | } |
5865 | |
5866 | // TODO: Complete test |
5867 | TEST_F(NVFuserTest, FusionAdvancedLowering3_CUDA) { |
5868 | Fusion fusion; |
5869 | FusionGuard fg(&fusion); |
5870 | |
5871 | auto tv0 = makeConcreteTensor({1, -1}); |
5872 | auto tv1 = makeSymbolicTensor(2); |
5873 | fusion.addInput(tv0); |
5874 | fusion.addInput(tv1); |
5875 | |
5876 | // [b0, i1] |
5877 | auto tv2 = add(tv0, IrBuilder::create<Double>(2.0)); |
5878 | |
5879 | // [i0, i1] |
5880 | auto tv3 = add(tv1, IrBuilder::create<Double>(3.0)); |
5881 | |
5882 | // [b0, i1] |
5883 | auto tv4 = add(tv2, IrBuilder::create<Double>(4.0)); |
5884 | |
// [i0, i1]
5886 | auto tv5 = add(tv2, tv3); |
5887 | |
5888 | fusion.addOutput(tv4); |
5889 | fusion.addOutput(tv5); |
5890 | |
5891 | tv0->computeAt(tv4, -1); |
5892 | |
5893 | tv3->setMemoryType(MemoryType::Global); |
5894 | |
5895 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5896 | int x = 13, y = 9; |
5897 | at::Tensor t0 = at::randn({1, y}, options); |
5898 | at::Tensor t1 = at::randn({x, y}, options); |
5899 | |
5900 | auto t4 = t0 + 2 + 4; |
5901 | auto t5 = t0 + 2 + t1 + 3; |
5902 | |
5903 | std::vector<IValue> aten_inputs = {t0, t1}; |
5904 | std::vector<at::Tensor> aten_outputs = {t4, t5}; |
5905 | |
5906 | FusionExecutor fe; |
5907 | fe.compileFusion(&fusion, aten_inputs); |
5908 | auto cg_outputs = fe.runFusion(aten_inputs); |
5909 | |
5910 | testValidate( |
5911 | &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); |
5912 | } |
5913 | |
// This exercises indexing with broadcast root axes. Non-broadcast
5915 | // axes need to be preferred when propagating index exprs to root |
5916 | // axes. See, e.g., Index::getConsumerIndex_impl. |
5917 | TEST_F(NVFuserTest, FusionAdvancedLowering4_CUDA) { |
5918 | Fusion fusion; |
5919 | FusionGuard fg(&fusion); |
5920 | |
5921 | auto tv0 = makeSymbolicTensor(1); |
5922 | fusion.addInput(tv0); |
5923 | auto tv1 = broadcast(tv0, {false, true}); |
5924 | auto tv2 = broadcast(tv1, {false, false, true}); |
5925 | auto tv3 = makeSymbolicTensor(3); |
5926 | fusion.addInput(tv3); |
5927 | auto tv4 = add(tv2, tv3); |
5928 | fusion.addOutput(tv4); |
5929 | |
5930 | tv4->merge(1)->merge(0); |
5931 | tv4->split(0, 8); |
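// tv4[(I0*I1*I2)/8, 8]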
5932 | tv0->computeAt(tv4, 1); |
5933 | |
5934 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5935 | const int bx = 10; |
5936 | const int by = 20; |
5937 | const int bz = 30; |
5938 | at::Tensor t0 = at::randn({bx}, options); |
5939 | at::Tensor t3 = at::randn({bx, by, bz}, options); |
5940 | std::vector<IValue> aten_inputs = {t0, t3}; |
5941 | |
5942 | FusionExecutor fe; |
5943 | fe.compileFusion(&fusion, aten_inputs); |
5944 | auto cg_outputs = fe.runFusion(aten_inputs); |
5945 | |
5946 | auto aten_output = |
5947 | t0.unsqueeze(-1).expand({bx, by}).unsqueeze(-1).expand({bx, by, bz}) + t3; |
5948 | |
5949 | testValidate( |
5950 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
5951 | } |
5952 | |
5953 | TEST_F(NVFuserTest, FusionAdvancedLowering5_CUDA) { |
5954 | Fusion fusion; |
5955 | FusionGuard fg(&fusion); |
5956 | |
5957 | TensorView* tv0 = makeConcreteTensor({5, 4, 3}); |
5958 | fusion.addInput(tv0); |
5959 | |
5960 | TensorView* tv1 = makeConcreteTensor({5, 3}); |
5961 | fusion.addInput(tv1); |
5962 | |
5963 | auto tv2 = broadcast(tv1, {false, true, false}); |
5964 | |
5965 | auto tv3 = add(tv0, tv2); |
5966 | |
5967 | fusion.addOutput(tv3); |
5968 | |
5969 | tv2->merge(0); |
5970 | tv1->computeAt(tv2, 1); |
5971 | |
5972 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
5973 | at::manual_seed(1); |
5974 | at::Tensor t0 = at::randn({5, 4, 3}, options); |
5975 | at::Tensor t1 = at::randn({5, 3}, options); |
5976 | auto t2 = t1.unsqueeze(1); |
5977 | auto t3 = t0 + t2; |
5978 | |
5979 | std::vector<IValue> aten_inputs = {t0, t1}; |
5980 | std::vector<at::Tensor> aten_outputs = {t3}; |
5981 | |
5982 | FusionExecutor fe; |
5983 | fe.compileFusion(&fusion, aten_inputs); |
5984 | auto cg_outputs = fe.runFusion(aten_inputs); |
5985 | |
5986 | testValidate( |
5987 | &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); |
5988 | } |
5989 | |
5990 | TEST_F(NVFuserTest, FusionAdvancedLowering6_CUDA) { |
5991 | Fusion fusion; |
5992 | FusionGuard fg(&fusion); |
5993 | |
5994 | TensorView* tv0 = makeConcreteTensor({5, 4, 3}); |
5995 | fusion.addInput(tv0); |
5996 | auto tv1 = makeConcreteTensor({4}); |
5997 | fusion.addInput(tv1); |
5998 | auto tv2 = unaryOp(UnaryOpType::Set, tv0); |
5999 | auto tv3 = unaryOp(UnaryOpType::Set, tv1); |
6000 | |
6001 | auto tv4 = sum(tv2, {0, 2}); |
6002 | auto tv5 = add(tv4, tv3); |
6003 | fusion.addOutput(tv5); |
6004 | |
6005 | auto tv6 = broadcast(tv3, {true, false, true}); |
6006 | auto tv7 = add(tv2, tv6); |
6007 | fusion.addOutput(tv7); |
6008 | |
6009 | tv2->computeAt(tv4, -1, ComputeAtMode::BestEffort); |
6010 | tv3->computeAt(tv7, -1, ComputeAtMode::BestEffort); |
6011 | |
6012 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6013 | at::manual_seed(1); |
6014 | at::Tensor t0 = at::randn({5, 4, 3}, options); |
6015 | at::Tensor t1 = at::randn({4}, options); |
6016 | |
6017 | auto t2 = t0; |
6018 | auto t3 = t1; |
6019 | |
6020 | std::vector<int64_t> reduction_axes{0, 2}; |
6021 | auto t4 = t2.sum(reduction_axes); |
6022 | auto t5 = add(t4, t3); |
6023 | auto t6 = t3.unsqueeze(0).unsqueeze(-1); |
6024 | auto t7 = t2.add(t6); |
6025 | |
6026 | std::vector<IValue> aten_inputs = {t0, t1}; |
6027 | std::vector<at::Tensor> aten_outputs = {t5, t7}; |
6028 | |
6029 | FusionExecutor fe; |
6030 | fe.compileFusion(&fusion, aten_inputs); |
6031 | auto cg_outputs = fe.runFusion(aten_inputs); |
6032 | |
6033 | testValidate( |
6034 | &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); |
6035 | } |
6036 | |
// Test a simple GEMM while also exercising FusionExecutor features
6038 | TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) { |
6039 | Fusion fusion; |
6040 | FusionGuard fg(&fusion); |
6041 | |
6042 | // Set up your input tensor views |
6043 | TensorView* tv0 = makeSymbolicTensor(2); // M, K |
6044 | TensorView* tv1 = makeSymbolicTensor(2); // K, N |
6045 | fusion.addInput(tv0); |
6046 | fusion.addInput(tv1); |
6047 | |
6048 | TensorView* tv2 = broadcast(tv0, {false, false, true}); |
6049 | // tv2[I0, I1, B] = tv0[I0, I1] |
6050 | |
6051 | TensorView* tv3 = broadcast(tv1, {true, false, false}); |
6052 | // tv3[B, I1, I2] = tv1[I1, I2] |
6053 | |
6054 | // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2] |
6055 | TensorView* tv4 = mul(tv2, tv3); |
6056 | // tv5[I0, R1, I2] = tv4[I0, I1, I2] |
6057 | TensorView* tv5 = sum(tv4, {1}); |
6058 | fusion.addOutput(tv5); |
6059 | |
6060 | tv5->split(1, 32); |
6061 | // tv5[I0, R1o, R1i{32}, I2] |
6062 | |
6063 | auto tv6 = tv5->rFactor({1}); |
6064 | // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2] |
6065 | // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2] |
6066 | |
6067 | tv5->split(0, 4); |
6068 | tv5->split(-1, 4); |
6069 | // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] |
6071 | |
6072 | tv0->computeAt(tv5, -1); |
6073 | tv1->computeAt(tv5, -1); |
6074 | |
6075 | // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] |
6076 | // tv5[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}] |
// --> (the "|" marks the computeAt position)
6078 | // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] |
6079 | // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] |
6080 | // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] |
6081 | |
6082 | tv0->computeAt(tv6, -1); |
6083 | tv1->computeAt(tv6, -1); |
6084 | // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] |
6085 | // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] |
6086 | // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] |
6087 | |
6088 | tv5->axis(0)->parallelize(ParallelType::BIDz); |
6089 | tv5->axis(1)->parallelize(ParallelType::TIDz); |
6090 | |
6091 | tv5->axis(-2)->parallelize(ParallelType::BIDy); |
6092 | tv5->axis(-1)->parallelize(ParallelType::TIDy); |
6093 | |
6094 | tv5->axis(2)->parallelize(ParallelType::TIDx); |
6095 | tv6->axis(2)->parallelize(ParallelType::TIDx); |
6096 | |
6097 | constexpr int M = 65, K = 33, N = 17; |
6098 | |
6099 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6100 | |
6101 | at::Tensor t0 = at::randn({M, K}, options); |
6102 | at::Tensor t1 = at::randn({K, N}, options); |
6103 | |
6104 | FusionExecutor fe; |
6105 | fe.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); |
// Let's specify a few bounds in the launch params to make sure it works
6107 | fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); |
6108 | |
6109 | // Make sure bad launch params throws |
6110 | // TODO: Re-enable once we have parallelization validation in. |
6111 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
6112 | // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); |
6113 | |
6114 | // Don't specify any launch params |
6115 | auto cg_outputs = fe.runFusion({t0, t1}); |
6116 | |
6117 | auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble)); |
6118 | |
6119 | testValidate( |
6120 | &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); |
6121 | } |
6122 | |
6123 | // Softmax with a 1D tensor. Parallelized only with a single thread block. |
6124 | TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) { |
6125 | Fusion fusion; |
6126 | FusionGuard fg(&fusion); |
6127 | |
6128 | const int tidx = 128; |
6129 | const int dimx = 1000; |
6130 | |
6131 | // Set up your input tensor views |
6132 | TensorView* input_tv0 = makeSymbolicTensor(1); |
6133 | fusion.addInput(input_tv0); |
6134 | |
6135 | TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0); |
6136 | TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); |
6137 | TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {true}); |
6138 | |
// Replicate exp_tv1 as exp_tv1_copy because exp_tv1 is going to be
// computed at sum_exp_rf_tv5.
6141 | TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0); |
6142 | |
6143 | TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); |
6144 | |
6145 | fusion.addOutput(output_tv4); |
6146 | |
6147 | bcast_sum_tv3->split(0, tidx); |
6148 | |
6149 | sum_exp_tv2->split(-1, tidx); |
6150 | TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); |
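// sum_exp_rf_tv5[R0o, I0i{tidx}] = exp_tv1[I0]
// sum_exp_tv2[R0i{tidx}] = sum_exp_rf_tv5[R0o, I0i{tidx}]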
6151 | |
6152 | output_tv4->split(-1, tidx); |
6153 | |
6154 | exp_tv1->computeAt(sum_exp_rf_tv5, -1); |
6155 | exp_tv1_copy->computeAt(output_tv4, -1); |
6156 | |
6157 | TensorView* tensors_to_parallelize[] = { |
6158 | sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; |
6159 | |
6160 | for (auto tv : tensors_to_parallelize) { |
6161 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
6162 | } |
6163 | |
6164 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6165 | at::Tensor t0 = at::randn({dimx}, options); |
6166 | at::Tensor cg_output = at::empty({dimx}, options); |
6167 | at::Tensor t3_output = at::empty_like(cg_output, options); |
6168 | |
6169 | FusionExecutor fe; |
6170 | fe.compileFusion(&fusion, {t0}); |
6171 | fe.runFusion({t0}, {cg_output}); |
6172 | |
6173 | auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); |
6174 | |
6175 | testValidate(&fusion, {cg_output}, {t0}, {aten_output}, __LINE__, __FILE__); |
6176 | } |
6177 | |
6178 | // Softmax with a 1D tensor with input normalization. |
6179 | TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { |
6180 | Fusion fusion; |
6181 | FusionGuard fg(&fusion); |
6182 | |
6183 | const int tidx = 128; |
6184 | const int dimx = 1000; |
6185 | |
6186 | // Set up your input tensor views |
6187 | TensorView* input_tv0 = makeSymbolicTensor(1); |
6188 | fusion.addInput(input_tv0); |
6189 | |
6190 | // Normalize with the max value before computing exp. |
6191 | TensorView* max_val_tv1 = reductionOp( |
6192 | BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), input_tv0); |
6193 | TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {true}); |
6194 | TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); |
6195 | TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); |
6196 | TensorView* sum_exp_tv5 = sum(exp_tv4, {-1}); |
6197 | TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {true}); |
6198 | |
// Replicate sub_tv3 and exp_tv4 as copies because the originals are going
// to be computed at sum_exp_rf_tv9.
6201 | TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2); |
6202 | TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy); |
6203 | |
6204 | TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6); |
6205 | |
6206 | fusion.addOutput(output_tv7); |
6207 | bcast_max_tv2->split(0, tidx); |
6208 | bcast_sum_tv6->split(0, tidx); |
6209 | |
6210 | max_val_tv1->split(-1, tidx); |
6211 | TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2}); |
6212 | |
6213 | sum_exp_tv5->split(-1, tidx); |
6214 | TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2}); |
6215 | |
6216 | output_tv7->split(-1, tidx); |
6217 | |
6218 | sub_tv3->computeAt(sum_exp_rf_tv9, -1); |
6219 | sub_tv3_copy->computeAt(output_tv7, -1); |
6220 | |
6221 | TensorView* tensors_to_parallelize[] = { |
6222 | max_val_tv1, |
6223 | bcast_max_tv2, |
6224 | sum_exp_tv5, |
6225 | bcast_sum_tv6, |
6226 | output_tv7, |
6227 | max_val_rf_tv8, |
6228 | sum_exp_rf_tv9}; |
6229 | |
6230 | for (auto tv : tensors_to_parallelize) { |
6231 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
6232 | } |
6233 | |
6234 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6235 | at::Tensor input = at::randn({dimx}, options); |
6236 | at::Tensor t3_output = at::empty({dimx}, options); |
6237 | |
6238 | FusionExecutor fe; |
6239 | fe.compileFusion(&fusion, {input}); |
6240 | auto cg_outputs = fe.runFusion({input}); |
6241 | |
6242 | auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); |
6243 | |
6244 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
6245 | } |
6246 | |
// Softmax with a 3D tensor, where the innermost (3rd) dimension is
// normalized. Parallelized with multiple thread blocks.
6249 | TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) { |
6250 | Fusion fusion; |
6251 | FusionGuard fg(&fusion); |
6252 | |
6253 | const int tidx = 32; |
6254 | const int dimx = 32; |
6255 | const int dimy = 16; |
6256 | const int dimz = 130; |
6257 | |
6258 | // Set up your input tensor views |
6259 | TensorView* input_tv0 = makeSymbolicTensor(3); |
6260 | fusion.addInput(input_tv0); |
6261 | |
6262 | TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0); |
6263 | TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); |
6264 | TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true}); |
6265 | |
// Replicate exp_tv1 as exp_tv1_copy because exp_tv1 is going to be
// computed at sum_exp_rf_tv5.
6268 | TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0); |
6269 | |
6270 | TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); |
6271 | |
6272 | fusion.addOutput(output_tv4); |
6273 | |
6274 | bcast_sum_tv3->split(-1, tidx); |
6275 | |
6276 | sum_exp_tv2->split(-1, tidx); |
6277 | TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); |
6278 | |
6279 | output_tv4->split(-1, tidx); |
6280 | |
6281 | exp_tv1->computeAt(sum_exp_rf_tv5, -1); |
6282 | exp_tv1_copy->computeAt(output_tv4, -1); |
6283 | |
6284 | TensorView* tensors_to_parallelize[] = { |
6285 | sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; |
6286 | |
6287 | for (auto tv : tensors_to_parallelize) { |
6288 | tv->axis(0)->parallelize(ParallelType::BIDx); |
6289 | tv->axis(1)->parallelize(ParallelType::BIDy); |
6290 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
6291 | } |
6292 | |
6293 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6294 | at::Tensor input = at::randn({dimx, dimy, dimz}, options); |
6295 | |
6296 | at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); |
6297 | |
6298 | FusionExecutor fe; |
6299 | fe.compileFusion(&fusion, {input}); |
6300 | fe.runFusion({input}, {cg_output}); |
6301 | |
6302 | auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); |
6303 | |
6304 | testValidate( |
6305 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
6306 | } |
6307 | |
6308 | // Softmax with a 3D tensor with input normalization. |
6309 | TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { |
6310 | Fusion fusion; |
6311 | FusionGuard fg(&fusion); |
6312 | |
6313 | const int tidx = 32; |
6314 | const int dimx = 32; |
6315 | const int dimy = 16; |
6316 | const int dimz = 130; |
6317 | |
6318 | // Set up your input tensor views |
6319 | TensorView* input_tv0 = makeSymbolicTensor(3); |
6320 | fusion.addInput(input_tv0); |
6321 | |
6322 | // Normalize with the max value before computing exp. |
6323 | TensorView* max_val_tv1 = reductionOp( |
6324 | BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), input_tv0); |
6325 | TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {false, false, true}); |
6326 | TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); |
6327 | TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); |
6328 | TensorView* sum_exp_tv5 = sum(exp_tv4, {-1}); |
6329 | TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {false, false, true}); |
6330 | |
// Replicate sub_tv3 and exp_tv4 as copies because the originals are going
// to be computed at sum_exp_rf_tv9.
6333 | TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2); |
6334 | TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy); |
6335 | |
6336 | TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6); |
6337 | |
6338 | fusion.addOutput(output_tv7); |
6339 | |
6340 | bcast_max_tv2->split(-1, tidx); |
6341 | bcast_sum_tv6->split(-1, tidx); |
6342 | |
6343 | max_val_tv1->split(-1, tidx); |
6344 | TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2}); |
6345 | |
6346 | sum_exp_tv5->split(-1, tidx); |
6347 | TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2}); |
6348 | |
6349 | output_tv7->split(-1, tidx); |
6350 | |
6351 | sub_tv3->computeAt(sum_exp_rf_tv9, -1); |
6352 | sub_tv3_copy->computeAt(output_tv7, -1); |
6353 | |
6354 | TensorView* tensors_to_parallelize[] = { |
6355 | max_val_tv1, |
6356 | bcast_max_tv2, |
6357 | sum_exp_tv5, |
6358 | bcast_sum_tv6, |
6359 | output_tv7, |
6360 | max_val_rf_tv8, |
6361 | sum_exp_rf_tv9}; |
6362 | |
6363 | for (auto tv : tensors_to_parallelize) { |
6364 | tv->axis(0)->parallelize(ParallelType::BIDx); |
6365 | tv->axis(1)->parallelize(ParallelType::BIDy); |
6366 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
6367 | } |
6368 | |
6369 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6370 | at::Tensor input = at::randn({dimx, dimy, dimz}, options); |
6371 | at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); |
6372 | |
6373 | FusionExecutor fe; |
6374 | fe.compileFusion(&fusion, {input}); |
6375 | auto cg_outputs = fe.runFusion({input}); |
6376 | |
6377 | auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); |
6378 | |
6379 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
6380 | } |
6381 | |
6382 | TEST_F(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { |
6383 | Fusion fusion; |
6384 | FusionGuard fg(&fusion); |
6385 | |
6386 | // Set up your input tensor views |
6387 | TensorView* tv0 = makeSymbolicTensor(2); |
6388 | fusion.addInput(tv0); |
6389 | |
6390 | auto tv1 = sum(tv0, {1}); |
6391 | auto tv2 = broadcast(tv1, {false, true}); |
6392 | |
6393 | auto tv3 = add(tv0, IrBuilder::create<Double>(1.0)); |
6394 | |
6395 | auto tv4 = mul(tv2, tv3); |
6396 | |
6397 | auto tv5 = sum(tv4, {1}); |
6398 | auto tv6 = broadcast(tv5, {false, true}); |
6399 | |
6400 | auto tv7 = sub(tv6, tv4); |
6401 | fusion.addOutput(tv7); |
6402 | |
6403 | tv1->computeAt(tv7, 1); |
6404 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
6405 | ASSERT_ANY_THROW(tv1->computeAt(tv7, -1)); |
6406 | } |
6407 | |
6408 | // Similar to FusionReduction but uses grid reduction |
6409 | TEST_F(NVFuserTest, FusionGridReduction1_CUDA) { |
6410 | const int gdimx = 32; |
6411 | const int bdimx = 128; |
6412 | |
6413 | Fusion fusion; |
6414 | FusionGuard fg(&fusion); |
6415 | |
6416 | // Set up your input tensor views |
6417 | TensorView* tv0 = makeSymbolicTensor(2); |
6418 | fusion.addInput(tv0); |
6419 | |
6420 | // tv1[I0, R1] = tv0[I0, I1] |
6421 | TensorView* tv1 = |
6422 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
6423 | fusion.addOutput(tv1); |
6424 | |
6425 | TORCH_CHECK( |
6426 | ir_utils::getReductionOps(&fusion).size(), |
6427 | "Could not detect reduction in fusion." ); |
6428 | |
6429 | tv1->split(1, bdimx); |
6430 | // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] |
6431 | tv1->split(1, gdimx); |
6432 | // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] |
6433 | |
6434 | TensorView* tv2 = tv1->rFactor({1}); |
6435 | // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] |
6436 | // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] |
6437 | |
// Compute at incrementally; we can print in between for debugging
6439 | tv0->computeAt(tv2, 1); |
6440 | tv2->computeAt(tv1, 1); |
6441 | |
// Redo it all at once, because why not.
6443 | tv0->computeAt(tv1, 1); |
6444 | |
6445 | tv1->axis(0)->parallelize(ParallelType::BIDy); |
6446 | tv1->axis(1)->parallelize(ParallelType::BIDx); |
6447 | tv2->axis(2)->parallelize(ParallelType::BIDx); |
6448 | |
6449 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
6450 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
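  // The reduction is spread over BIDx and TIDx, so completing the sum
  // requires a grid reduction: blocks combine partial results through a
  // global-memory workspace.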
6451 | |
6452 | int numel_x = 10000; |
6453 | int numel_y = 65000; |
6454 | |
6455 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6456 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
6457 | at::Tensor cg_output = at::empty({numel_x}, options); |
6458 | |
6459 | FusionExecutor fe; |
6460 | fe.compileFusion(&fusion, {input}); |
6461 | fe.runFusion({input}, {cg_output}); |
6462 | |
6463 | auto aten_output = input.to(at::kDouble).sum({1}); |
6464 | |
6465 | testValidate( |
6466 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
6467 | } |
6468 | |
6469 | // Same test as the above but uses BIDy and TIDx for reduction |
6470 | TEST_F(NVFuserTest, FusionGridReduction2_CUDA) { |
6471 | const int gdimy = 32; |
6472 | const int bdimx = 128; |
6473 | |
6474 | Fusion fusion; |
6475 | FusionGuard fg(&fusion); |
6476 | |
6477 | // Set up your input tensor views |
6478 | TensorView* tv0 = makeSymbolicTensor(2); |
6479 | fusion.addInput(tv0); |
6480 | |
6481 | // tv1[I0, R1] = tv0[I0, I1] |
6482 | TensorView* tv1 = |
6483 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
6484 | fusion.addOutput(tv1); |
6485 | |
6486 | TORCH_CHECK( |
6487 | ir_utils::getReductionOps(&fusion).size(), |
6488 | "Could not detect reduction in fusion." ); |
6489 | |
6490 | tv1->split(1, bdimx); |
6491 | // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] |
6492 | tv1->split(1, gdimy); |
6493 | // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] |
6494 | |
6495 | TensorView* tv2 = tv1->rFactor({1}); |
6496 | // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] |
6497 | // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] |
6498 | |
  // Apply computeAt incrementally; we can print in between for debugging
  tv0->computeAt(tv2, 1);
  tv2->computeAt(tv1, 1);

  // Redo it all at once, because why not.
  tv0->computeAt(tv1, 1);
6505 | |
6506 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
6507 | tv1->axis(1)->parallelize(ParallelType::BIDy); |
6508 | tv2->axis(2)->parallelize(ParallelType::BIDy); |
6509 | |
6510 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
6511 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
6512 | |
6513 | int numel_x = 10000; |
6514 | int numel_y = 65000; |
6515 | |
6516 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6517 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
6518 | |
6519 | FusionExecutor fe; |
6520 | fe.compileFusion(&fusion, {input}); |
6521 | auto cg_outputs = fe.runFusion({input}); |
6522 | |
6523 | auto aten_output = input.to(at::kDouble).sum({1}); |
6524 | |
6525 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
6526 | } |
6527 | |
6528 | // Same test but uses BIDy and BIDz for reduction. No TID used. |
6529 | TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) { |
  // Grid reductions without any threads are serial reductions.
  // Keep these numbers low so our error isn't too high compared to
  // normal CUDA reductions.
6533 | const int gdimz = 15; |
6534 | const int gdimy = 9; |
6535 | |
6536 | Fusion fusion; |
6537 | FusionGuard fg(&fusion); |
6538 | |
6539 | // Set up your input tensor views |
6540 | TensorView* tv0 = makeSymbolicTensor(2); |
6541 | fusion.addInput(tv0); |
6542 | |
6543 | // tv1[I0, R1] = tv0[I0, I1] |
6544 | TensorView* tv1 = |
6545 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
6546 | fusion.addOutput(tv1); |
6547 | |
6548 | TORCH_CHECK( |
6549 | ir_utils::getReductionOps(&fusion).size(), |
6550 | "Could not detect reduction in fusion." ); |
6551 | |
  tv1->split(1, gdimy);
  // tv1[I0, R1o, R1i{9}] = tv0[I0, I1]
  tv1->split(1, gdimz);
  // tv1[I0, R1oo, R1oi{15}, R1i{9}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({1});
  // tv2[I0, R1oo, Ir1oi{15}, Ir1i{9}] = tv0[I0, I1]
  // tv1[I0, R1oi{15}, R1i{9}] = tv2[I0, R1oo, Ir1oi{15}, Ir1i{9}]
6560 | |
  // Apply computeAt incrementally; we can print in between for debugging
  tv0->computeAt(tv2, 1);
  tv2->computeAt(tv1, 1);

  // Redo it all at once, because why not.
  tv0->computeAt(tv1, 1);
6567 | |
6568 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
6569 | tv1->axis(1)->parallelize(ParallelType::BIDz); |
6570 | tv2->axis(2)->parallelize(ParallelType::BIDz); |
6571 | tv1->axis(-1)->parallelize(ParallelType::BIDy); |
6572 | tv2->axis(-1)->parallelize(ParallelType::BIDy); |
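  // All reduction axes are bound to block dimensions with no TID
  // parallelism, so the cross-block combination is effectively serial, as
  // noted at the top of this test.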
6573 | |
6574 | int numel_x = 100; |
6575 | int numel_y = 6500; |
6576 | |
6577 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6578 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
6579 | at::Tensor cg_output = at::empty({numel_x}, options); |
6580 | |
6581 | FusionExecutor fe; |
6582 | fe.compileFusion(&fusion, {input}); |
6583 | fe.runFusion({input}, {cg_output}); |
6584 | |
6585 | auto aten_output = input.to(at::kDouble).sum({1}); |
6586 | testValidate( |
6587 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
6588 | } |
6589 | |
// Same as FusionGridReduction3dim1 but reduces dimension 0
6591 | TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) { |
  // Grid reductions without any threads are serial reductions.
  // Keep these numbers low so our error isn't too high compared to
  // normal CUDA reductions.
6595 | const int gdimz = 15; |
6596 | const int gdimy = 9; |
6597 | |
6598 | Fusion fusion; |
6599 | FusionGuard fg(&fusion); |
6600 | |
6601 | // Set up your input tensor views |
6602 | TensorView* tv0 = makeSymbolicTensor(2); |
6603 | fusion.addInput(tv0); |
6604 | |
6605 | // tv1[R0, I1] = tv0[I0, I1] |
6606 | TensorView* tv1 = |
6607 | reductionOp(BinaryOpType::Add, {0}, IrBuilder::create<Double>(0), tv0); |
6608 | fusion.addOutput(tv1); |
6609 | |
6610 | TORCH_CHECK( |
6611 | ir_utils::getReductionOps(&fusion).size(), |
6612 | "Could not detect reduction in fusion." ); |
6613 | |
  tv1->split(0, gdimy);
  // tv1[R0o, R0i{9}, I1] = tv0[I0, I1]
  tv1->split(0, gdimz);
  // tv1[R0oo, R0oi{15}, R0i{9}, I1] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({0});
  // tv2[R0oo, I0oi{15}, I0i{9}, I1] = tv0[I0, I1]
  // tv1[      R0oi{15}, R0i{9}, I1] = tv2[R0oo, I0oi{15}, I0i{9}, I1]
6622 | |
6623 | // Note that computeAt isn't going to make anything better as there |
6624 | // is no dynamically sized dimension. |
6625 | |
6626 | // Map parallelism as [Serial, BIDz, BIDy, BIDx] |
6627 | tv1->axis(-1)->parallelize(ParallelType::BIDx); |
6628 | tv2->axis(-1)->parallelize(ParallelType::BIDx); |
6629 | tv1->axis(-2)->parallelize(ParallelType::BIDy); |
6630 | tv2->axis(-2)->parallelize(ParallelType::BIDy); |
6631 | tv1->axis(-3)->parallelize(ParallelType::BIDz); |
6632 | tv2->axis(-3)->parallelize(ParallelType::BIDz); |
6633 | |
6634 | int numel_x = 6500; |
6635 | int numel_y = 100; |
6636 | |
6637 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6638 | at::manual_seed(0); |
6639 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
6640 | |
6641 | FusionExecutor fe; |
6642 | fe.compileFusion(&fusion, {input}); |
6643 | auto cg_outputs = fe.runFusion({input}); |
6644 | |
6645 | auto aten_output = input.to(at::kDouble).sum({0}); |
6646 | |
6647 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
6648 | } |
6649 | |
// This is similar to FusionReduction, but swaps BIDx and TIDx
6651 | TEST_F(NVFuserTest, FusionGridReduction4_CUDA) { |
6652 | Fusion fusion; |
6653 | FusionGuard fg(&fusion); |
6654 | |
6655 | const int bdimx = 128; |
6656 | const int gdimx = 1024; |
6657 | |
6658 | // Set up your input tensor views |
6659 | TensorView* tv0 = makeSymbolicTensor(2); |
6660 | fusion.addInput(tv0); |
6661 | |
6662 | // tv1[I0, R1] = tv0[I0, I1] |
6663 | TensorView* tv1 = |
6664 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
6665 | fusion.addOutput(tv1); |
6666 | |
6667 | TORCH_CHECK( |
6668 | ir_utils::getReductionOps(&fusion).size(), |
6669 | "Could not detect reduction in fusion." ); |
6670 | |
6671 | tv1->split(1, gdimx); |
6672 | // tv1[I0, R1o, R1i{1024}] = tv0[I0, I1] |
6673 | tv1->split(1, 4); |
  // tv1[I0, R1oo, R1oi{4}, R1i{1024}] = tv0[I0, I1]
6675 | |
6676 | TensorView* tv2 = tv1->rFactor({1}); |
6677 | // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1] |
6678 | // tv1[I0, R1oi{4}, R1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] |
6679 | |
6680 | TensorView* tv3 = tv1->rFactor({1}); |
6681 | // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1] |
6682 | // tv3[I0, R1oi{4}, Ir1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] |
6683 | // tv1[I0, R1i{1024}] = tv3[I0, R1oi{4}, Ir1i{1024}] |
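  // The two rFactor calls build a three-stage reduction tree: tv2 folds R1oo
  // serially, tv3 folds R1oi{4}, and tv1 performs the final reduction over
  // R1i{1024}, which is bound to BIDx below (a grid reduction).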
6684 | |
  // Apply computeAt incrementally; we can print in between for debugging
  tv0->computeAt(tv2, 1);
  tv2->computeAt(tv3, 1);
  tv3->computeAt(tv1, 1);

  // Redo it all at once, because why not.
  tv0->computeAt(tv1, 1);
6692 | |
6693 | tv2->axis(2)->parallelize(ParallelType::Unroll); |
6694 | tv1->axis(0)->parallelize(ParallelType::TIDx); |
6695 | |
6696 | tv1->axis(-1)->parallelize(ParallelType::BIDx); |
6697 | tv2->axis(-1)->parallelize(ParallelType::BIDx); |
6698 | tv3->axis(-1)->parallelize(ParallelType::BIDx); |
6699 | |
6700 | int numel_x = bdimx; |
6701 | int numel_y = 65000; |
6702 | |
6703 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6704 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
6705 | at::Tensor cg_output = at::empty({numel_x}, options); |
6706 | |
6707 | FusionExecutor fe; |
6708 | fe.compileFusion(&fusion, {input}); |
6709 | fe.runFusion({input}, {cg_output}); |
6710 | |
6711 | auto aten_output = input.to(at::kDouble).sum({1}); |
6712 | testValidate( |
6713 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
6714 | } |
6715 | |
6716 | // Grid reduction with 2D thread blocks but only TIDx and BIDx are |
6717 | // mapped to a reduction dim |
6718 | TEST_F(NVFuserTest, FusionGridReduction5_CUDA) { |
6719 | Fusion fusion; |
6720 | FusionGuard fg(&fusion); |
6721 | |
6722 | const int bdimx = 64; |
6723 | const int bdimy = 16; |
6724 | const int gdimx = 4; |
6725 | |
6726 | // Set up your input tensor views |
6727 | TensorView* tv0 = makeSymbolicTensor(2); |
6728 | fusion.addInput(tv0); |
6729 | |
6730 | // tv1[I0, R1] = tv0[I0, I1] |
6731 | TensorView* tv1 = |
6732 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
6733 | fusion.addOutput(tv1); |
6734 | |
6735 | TORCH_CHECK( |
6736 | ir_utils::getReductionOps(&fusion).size(), |
6737 | "Could not detect reduction in fusion." ); |
6738 | |
6739 | tv1->split(1, bdimx); |
6740 | // tv1[I0, R1o, R1i{64}] = tv0[I0, I1] |
6741 | tv1->split(1, gdimx); |
6742 | // tv1[I0, R1oo, R1oi{4}, R1i{64}] = tv0[I0, I1] |
6743 | |
6744 | TensorView* tv2 = tv1->rFactor({1}); |
6745 | // tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] = tv0[I0, I1] |
6746 | // tv1[I0, R1oi{4}, R1i{64}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] |
6747 | |
6748 | tv0->computeAt(tv1, 1); |
6749 | |
6750 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
6751 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
6752 | |
6753 | tv1->axis(-2)->parallelize(ParallelType::BIDx); |
6754 | tv2->axis(-2)->parallelize(ParallelType::BIDx); |
6755 | |
6756 | tv1->axis(0)->parallelize(ParallelType::TIDy); |
6757 | |
6758 | int numel_x = bdimy; |
6759 | int numel_y = 6500; |
6760 | |
6761 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6762 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
6763 | |
6764 | FusionExecutor fe; |
6765 | fe.compileFusion(&fusion, {input}); |
6766 | auto cg_outputs = fe.runFusion({input}); |
6767 | |
6768 | auto aten_output = input.to(at::kDouble).sum({1}); |
6769 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
6770 | } |
6771 | |
6772 | // Similar to FusionGridReduction1 but with 3D tensors |
6773 | TEST_F(NVFuserTest, FusionGridReduction6_CUDA) { |
6774 | Fusion fusion; |
6775 | FusionGuard fg(&fusion); |
6776 | |
6777 | // Set up your input tensor views |
6778 | TensorView* tv0 = makeSymbolicTensor(3); |
6779 | fusion.addInput(tv0); |
6780 | |
6781 | // tv1[I0, R1, R2] = tv0[I0, I1, I2] |
6782 | TensorView* tv1 = |
6783 | reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create<Double>(0), tv0); |
6784 | fusion.addOutput(tv1); |
6785 | |
6786 | TORCH_CHECK( |
6787 | ir_utils::getReductionOps(&fusion).size(), |
6788 | "Could not detect reduction in fusion." ); |
6789 | |
6790 | // Splitting for TID |
6791 | tv1->split(2, 128); |
6792 | // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] |
6793 | |
6794 | // Splitting for BID |
6795 | tv1->split(1, 128); |
6796 | |
6797 | // tv1[I0, R1o, R1i{128}, R2o, R2i{128}] = tv0[I0, I1, I2] |
6798 | |
6799 | TensorView* tv2 = tv1->rFactor({3}); |
6800 | // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] |
6801 | // tv1[I0, R1o, R1i{128}, R2i{128}] |
6802 | |
6803 | TensorView* tv3 = tv1->rFactor({1}); |
6804 | // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] |
6805 | // tv3[I0, R1o, I1i{128}, I2i{128}] |
6806 | // tv1[I0, R1i{128}, R2i{128}] |
6807 | |
6808 | tv3->computeAt(tv1, 1); |
6809 | tv2->computeAt(tv3, 3); |
6810 | |
6811 | tv1->axis(0)->parallelize(ParallelType::BIDy); |
6812 | |
6813 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
6814 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
6815 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
6816 | |
6817 | tv1->axis(-2)->parallelize(ParallelType::BIDx); |
6818 | tv2->axis(-3)->parallelize(ParallelType::BIDx); |
6819 | tv3->axis(-2)->parallelize(ParallelType::BIDx); |
6820 | |
6821 | int numel_x = 6500; |
6822 | int numel_y = 200; |
6823 | int numel_z = numel_y; |
6824 | |
6825 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6826 | at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); |
6827 | at::Tensor cg_output = at::empty({numel_x}, options); |
6828 | |
6829 | FusionExecutor fe; |
6830 | fe.compileFusion(&fusion, {input}); |
6831 | fe.runFusion({input}, {cg_output}); |
6832 | |
6833 | auto aten_output = input.to(at::kDouble).sum({1, 2}); |
6834 | |
6835 | testValidate( |
6836 | &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); |
6837 | } |
6838 | |
6839 | // See issue #1049 |
6840 | TEST_F(NVFuserTest, FusionGridReduction7_CUDA) { |
6841 | Fusion fusion; |
6842 | FusionGuard fg(&fusion); |
6843 | |
6844 | auto tv0 = makeSymbolicTensor(1); |
6845 | fusion.addInput(tv0); |
6846 | |
6847 | auto tv1 = sum(tv0, {0}); |
6848 | fusion.addOutput(tv1); |
6849 | |
6850 | tv1->split(0, 1000); |
6851 | |
6852 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
6853 | tv1->axis(1)->parallelize(ParallelType::BIDy); |
6854 | |
6855 | const int numel_x = 1; |
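  // With a single-element input, the inner split axis (extent 1000, bound to
  // BIDy) is almost entirely predicated out; the grid reduction must still
  // synchronize across all blocks (regression test for issue #1049).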
6856 | |
6857 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6858 | at::Tensor input = at::randn({numel_x}, options); |
6859 | at::Tensor cg_output = at::empty({numel_x}, options); |
6860 | |
6861 | FusionExecutor fe; |
6862 | fe.compileFusion(&fusion, {input}); |
6863 | auto out = fe.runFusion({input}); |
6864 | |
6865 | auto aten_output = input.sum({0}); |
6866 | |
6867 | testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); |
6868 | } |
6869 | |
6870 | TEST_F(NVFuserTest, FusionGridReduction8_CUDA) { |
6871 | Fusion fusion; |
6872 | FusionGuard fg(&fusion); |
6873 | |
6874 | auto tv0 = makeSymbolicTensor(2); |
6875 | fusion.addInput(tv0); |
6876 | |
6877 | auto tv1 = sum(tv0, {0}); |
6878 | fusion.addOutput(tv1); |
6879 | |
6880 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
6881 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
6882 | |
6883 | const int numel_x = 2; |
6884 | const int numel_y = 4; |
6885 | |
6886 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6887 | at::Tensor input = at::randn({numel_x, numel_y}, options); |
6888 | |
6889 | FusionExecutor fe; |
6890 | fe.compileFusion(&fusion, {input}); |
6891 | auto out = fe.runFusion({input}); |
6892 | |
6893 | auto aten_output = input.sum({0}); |
6894 | |
6895 | testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); |
6896 | } |
6897 | |
6898 | TEST_F(NVFuserTest, FusionGridReduction9_CUDA) { |
6899 | Fusion fusion; |
6900 | FusionGuard fg(&fusion); |
6901 | |
6902 | auto tv0 = makeSymbolicTensor(2); |
6903 | fusion.addInput(tv0); |
6904 | auto tv1 = sum(tv0, {1}); |
6905 | |
6906 | auto tv2 = makeSymbolicTensor(1); |
6907 | fusion.addInput(tv2); |
6908 | |
6909 | auto tv3 = add(tv2, tv1); |
6910 | fusion.addOutput(tv3); |
6911 | |
6912 | tv1->split(1, 2); |
6913 | |
6914 | tv1->axis(1)->parallelize(ParallelType::BIDx); |
6915 | tv1->axis(2)->parallelize(ParallelType::BIDy); |
6916 | |
6917 | tv1->computeAt(tv3, 1); |
6918 | |
6919 | const int numel_x = 4; |
6920 | const int numel_y = 10; |
6921 | |
6922 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6923 | at::Tensor t0 = at::randn({numel_x, numel_y}, options); |
6924 | at::Tensor t2 = at::randn({numel_x}, options); |
6925 | |
6926 | std::vector<IValue> aten_inputs = {t0, t2}; |
6927 | |
6928 | FusionExecutor fe; |
6929 | fe.compileFusion(&fusion, aten_inputs); |
6930 | auto cg_output = fe.runFusion(aten_inputs); |
6931 | |
6932 | auto aten_output = t0.sum({1}).add(t2); |
6933 | |
6934 | testValidate(&fusion, cg_output, {t0, t2}, {aten_output}, __LINE__, __FILE__); |
6935 | } |
6936 | |
6937 | TEST_F(NVFuserTest, FusionGridReduction10_CUDA) { |
6938 | Fusion fusion; |
6939 | FusionGuard fg(&fusion); |
6940 | |
6941 | auto tv0 = makeSymbolicTensor(4); |
6942 | fusion.addInput(tv0); |
6943 | |
6944 | auto tv1 = sum(tv0, {-1}); |
6945 | auto tv2 = sum(tv1, {-1}); |
6946 | auto tv3 = sum(tv2, {-1}); |
6947 | |
6948 | fusion.addOutput(tv3); |
6949 | tv1->axis(0)->parallelize(ParallelType::TIDx); |
6950 | tv1->axis(1)->parallelize(ParallelType::BIDx); |
6951 | tv1->axis(2)->parallelize(ParallelType::TIDy); |
6952 | tv1->axis(3)->parallelize(ParallelType::TIDz); |
6953 | |
6954 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
6955 | tv2->axis(1)->parallelize(ParallelType::BIDx); |
6956 | tv2->axis(2)->parallelize(ParallelType::TIDy); |
6957 | |
6958 | tv3->axis(0)->parallelize(ParallelType::TIDx); |
6959 | tv3->axis(1)->parallelize(ParallelType::BIDx); |
6960 | |
6961 | tv0->computeAt(tv3, 1); |
6962 | |
6963 | const int numel_w = 2; |
6964 | const int numel_x = 3; |
6965 | const int numel_y = 4; |
6966 | const int numel_z = 5; |
6967 | |
6968 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
6969 | at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); |
6970 | |
6971 | FusionExecutor fe; |
6972 | fe.compileFusion(&fusion, {t0}); |
6973 | auto cg_output = fe.runFusion({t0}); |
6974 | |
6975 | auto aten_output = t0.sum({1, 2, 3}); |
6976 | |
6977 | testValidate(&fusion, cg_output, {t0}, {aten_output}, __LINE__, __FILE__); |
6978 | } |
6979 | |
6980 | TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) { |
6981 | int bid_x = 3; |
6982 | int tid_x = 2; |
6983 | int red_dim = 0; |
6984 | |
6985 | Fusion fusion; |
6986 | FusionGuard fg(&fusion); |
6987 | |
6988 | // Set up your input tensor views |
6989 | TensorView* tv0 = makeSymbolicTensor(2); |
6990 | fusion.addInput(tv0); |
6991 | |
6992 | TensorView* tv1 = reductionOp( |
6993 | BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0); |
6994 | fusion.addOutput(tv1); |
6995 | |
6996 | tv1->split(-1, tid_x); |
6997 | tv1->axis(-2)->parallelize(ParallelType::BIDx); |
6998 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
6999 | |
7000 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7001 | at::Tensor input = at::randn({16, bid_x * tid_x}, options); |
7002 | |
7003 | FusionExecutor fe; |
7004 | fe.compileFusion(&fusion, {input}); |
7005 | auto cg_outputs = fe.runFusion({input}); |
7006 | |
7007 | auto aten_output = input.to(at::kDouble).sum({red_dim}); |
7008 | |
7009 | testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); |
7010 | } |
7011 | |
7012 | TEST_F(NVFuserTest, FusionSplitBCast_CUDA) { |
7013 | Fusion fusion; |
7014 | FusionGuard fg(&fusion); |
7015 | |
7016 | // Set up your input tensor views |
7017 | TensorView* input_tv0 = makeSymbolicTensor(3); |
7018 | TensorView* input_tv1 = makeSymbolicTensor(3); |
7019 | fusion.addInput(input_tv0); |
7020 | fusion.addInput(input_tv1); |
7021 | |
7022 | TensorView* sum_tv2 = reductionOp( |
7023 | BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), input_tv0); |
7024 | TensorView* bcast_tv3 = broadcast(sum_tv2, {false, false, true}); |
7025 | TensorView* output_tv4 = div(input_tv1, bcast_tv3); |
7026 | |
7027 | sum_tv2->split(-1, 32); |
7028 | TensorView* sum_rf_tv5 = sum_tv2->rFactor({-2}); |
7029 | |
7030 | bcast_tv3->split(-1, 32); |
7031 | output_tv4->split(-1, 32); |
7032 | |
7033 | sum_rf_tv5->axis(0)->parallelize(ParallelType::BIDx); |
7034 | sum_tv2->axis(0)->parallelize(ParallelType::BIDx); |
7035 | bcast_tv3->axis(0)->parallelize(ParallelType::BIDx); |
7036 | output_tv4->axis(0)->parallelize(ParallelType::BIDx); |
7037 | |
7038 | sum_rf_tv5->axis(1)->parallelize(ParallelType::BIDy); |
7039 | sum_tv2->axis(1)->parallelize(ParallelType::BIDy); |
7040 | bcast_tv3->axis(1)->parallelize(ParallelType::BIDy); |
7041 | output_tv4->axis(1)->parallelize(ParallelType::BIDy); |
7042 | |
7043 | sum_rf_tv5->axis(-1)->parallelize(ParallelType::TIDx); |
7044 | sum_tv2->axis(-1)->parallelize(ParallelType::TIDx); |
7045 | bcast_tv3->axis(-1)->parallelize(ParallelType::TIDx); |
7046 | output_tv4->axis(-1)->parallelize(ParallelType::TIDx); |
7047 | |
7048 | fusion.addOutput(output_tv4); |
7049 | |
7050 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7051 | at::Tensor t0 = at::randn({32, 32, 128}, options); |
7052 | at::Tensor t1 = at::randn({32, 32, 128}, options); |
7053 | at::Tensor cg_output = at::empty({32, 32, 128}, options); |
7054 | |
7055 | FusionExecutor fe; |
7056 | fe.compileFusion(&fusion, {t0, t1}); |
7057 | fe.runFusion({t0, t1}, {cg_output}); |
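  // Note: this test only checks that compilation and execution succeed; the
  // output is not validated against a reference.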
7058 | } |
7059 | |
7060 | TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) { |
7061 | Fusion fusion; |
7062 | FusionGuard fg(&fusion); |
7063 | |
7064 | TensorView* tv0 = makeSymbolicTensor(2); |
7065 | fusion.addInput(tv0); |
7066 | |
7067 | // reduce then broadcast |
7068 | auto tv1 = sum(tv0, {0}); |
7069 | auto tv2 = broadcast(tv1, {false, true}); |
7070 | |
7071 | TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast()); |
7072 | } |
7073 | |
7074 | TEST_F(NVFuserTest, FusionBCastReduce_CUDA) { |
7075 | Fusion fusion; |
7076 | FusionGuard fg(&fusion); |
7077 | |
7078 | // Set up your input tensor views |
7079 | TensorView* tv0 = makeSymbolicTensor(2); |
7080 | |
7081 | auto tv1 = broadcast(tv0, {true, false, false}); |
7082 | auto tv2 = sum(tv1, {1}); |
7083 | TORCH_CHECK( |
7084 | tv2->axis(0)->isBroadcast() && tv2->axis(1)->isReduction() && |
7085 | !tv2->axis(2)->isBroadcast() && !tv2->axis(2)->isReduction()); |
7086 | } |
7087 | |
7088 | // Multiple consumer reduction with computeAt |
7089 | // https://github.com/csarofeen/pytorch/issues/110 |
7090 | TEST_F(NVFuserTest, FusionReductionMultiConsumer_CUDA) { |
7091 | Fusion fusion; |
7092 | FusionGuard fg(&fusion); |
7093 | TensorView* tv0 = makeSymbolicTensor(2); |
7094 | fusion.addInput(tv0); |
7095 | auto tv1 = unaryOp(UnaryOpType::Exp, tv0); |
7096 | auto tv2 = |
7097 | reductionOp(BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), tv1); |
7098 | auto tv3 = |
7099 | reductionOp(BinaryOpType::Min, {-1}, IrBuilder::create<Double>(0), tv1); |
7100 | auto tv4 = add(tv2, tv3); |
7101 | fusion.addOutput(tv4); |
7102 | tv1->computeAt(tv2, -1, ComputeAtMode::BestEffort); |
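  // tv1 feeds two reduction consumers (tv2 and tv3), so BestEffort must pick
  // a computeAt position that is valid for both; sharing both axes gives
  // position 2, asserted below (regression test for issue #110).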
7103 | |
7104 | TORCH_CHECK(tv1->getComputeAtPosition() == 2); |
7105 | } |
7106 | |
7107 | TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { |
7108 | for (const auto i : c10::irange(2)) { |
7109 | Fusion fusion; |
7110 | FusionGuard fg(&fusion); |
7111 | |
7112 | // Set up your input tensor views |
7113 | TensorView* tv0 = makeSymbolicTensor(1); |
7114 | fusion.addInput(tv0); |
7115 | |
7116 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
7117 | auto tv2 = add(tv0, IrBuilder::create<Double>(1)); |
7118 | TensorView* tv3 = add(tv1, tv2); |
7119 | // Set outputs tv2 or tv1 and then tv3 |
7120 | if (i == 0) { |
7121 | fusion.addOutput(tv2); |
7122 | } else { |
7123 | fusion.addOutput(tv1); |
7124 | } |
7125 | fusion.addOutput(tv3); |
7126 | |
7127 | if (i == 0) { |
7128 | tv1->computeAt(tv3, -1); |
7129 | } else { |
7130 | tv2->computeAt(tv3, -1); |
7131 | } |
7132 | |
7133 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7134 | at::Tensor aten_input = at::randn({100}, options); |
7135 | std::vector<at::Tensor> aten_outputs = { |
7136 | aten_input + 1, (aten_input + 1) * 2}; |
7137 | |
7138 | FusionExecutor fe; |
7139 | fe.compileFusion(&fusion, {aten_input}); |
7140 | auto cg_outputs = fe.runFusion({aten_input}); |
7141 | |
7142 | testValidate( |
7143 | &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); |
7144 | } |
7145 | } |
7146 | |
7147 | TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { |
7148 | Fusion fusion; |
7149 | FusionGuard fg(&fusion); |
7150 | |
7151 | // Set up your input tensor views |
7152 | TensorView* tv0 = makeSymbolicTensor(2); |
7153 | fusion.addInput(tv0); |
7154 | |
7155 | auto tv1 = add(tv0, IrBuilder::create<Double>(1)); |
7156 | auto tv2 = add(tv0, IrBuilder::create<Double>(1)); |
7157 | TensorView* tv3 = add(tv1, tv2); |
7158 | fusion.addOutput(tv3); |
7159 | |
7160 | tv3->split(-1, 32); |
7161 | |
7162 | tv1->computeAt(tv3, -1); |
7163 | tv2->computeAt(tv3, -2); |
7164 | |
7165 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7166 | at::Tensor aten_input = at::randn({100, 100}, options); |
7167 | auto aten_output = (aten_input + 1) * 2; |
7168 | |
7169 | at::Tensor cg_output = at::empty_like(aten_input, options); |
7170 | |
7171 | FusionExecutor fe; |
7172 | fe.compileFusion(&fusion, {aten_input}); |
7173 | fe.runFusion({aten_input}, {cg_output}); |
7174 | |
7175 | testValidate( |
7176 | &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7177 | } |
7178 | |
7179 | TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { |
7180 | Fusion fusion; |
7181 | FusionGuard fg(&fusion); |
7182 | |
7183 | const int64_t dimx = 13; |
7184 | const int64_t dimy = 15; |
7185 | |
7186 | TensorView* tv0 = makeConcreteTensor({dimx, dimy}); |
7187 | fusion.addInput(tv0); |
7188 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1)); |
7189 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2)); |
7190 | TensorView* tv3 = add(tv2, IrBuilder::create<Double>(3)); |
7191 | TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4)); |
7192 | TensorView* tv5 = mul(tv2, tv4); |
7193 | fusion.addOutput(tv5); |
7194 | |
7195 | tv1->computeAt(tv2, 2); |
7196 | tv3->computeAt(tv4, 1); |
7197 | tv4->computeAt(tv5, 2); |
7198 | |
7199 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7200 | at::Tensor aten_input = at::randn({dimx, dimy}, options); |
7201 | auto t1 = aten_input.add(1.); |
7202 | auto t2 = t1.add(2.); |
7203 | auto t3 = t2.add(3.); |
7204 | auto t4 = t3.add(4.); |
7205 | auto aten_output = t2.mul(t4); |
7206 | |
7207 | torch::jit::fuser::cuda::FusionExecutor fe; |
7208 | fe.compileFusion(&fusion, {aten_input}); |
7209 | auto cg_outputs = fe.runFusion({aten_input}); |
7210 | |
7211 | testValidate( |
7212 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7213 | } |
7214 | |
7215 | TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) { |
7216 | Fusion fusion; |
7217 | FusionGuard fg(&fusion); |
7218 | |
7219 | TensorView* tv0 = makeSymbolicTensor(1); |
7220 | fusion.addInput(tv0); |
7221 | |
7222 | auto tv1 = sum(tv0, {0}); |
7223 | auto tv2 = add(tv1, IrBuilder::create<Double>(1)); |
7224 | fusion.addOutput(tv2); |
7225 | TORCH_CHECK(tv2->nDims() == 0); |
7226 | tv1->computeAt(tv2, 0); |
7227 | |
7228 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7229 | at::Tensor aten_input = at::randn({100}, options); |
7230 | auto aten_output = aten_input.to(at::kDouble).sum() + 1; |
7231 | |
7232 | FusionExecutor fe; |
7233 | fe.compileFusion(&fusion, {aten_input}); |
7234 | auto cg_outputs = fe.runFusion({aten_input}); |
7235 | |
7236 | testValidate( |
7237 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7238 | } |
7239 | |
7240 | TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) { |
7241 | Fusion fusion; |
7242 | FusionGuard fg(&fusion); |
7243 | |
7244 | TensorView* tv0 = makeSymbolicTensor(0); |
7245 | fusion.addInput(tv0); |
7246 | |
7247 | auto tv1 = broadcast(tv0, {true, true}); |
7248 | TORCH_CHECK(tv1->nDims() == 2); |
7249 | |
7250 | TensorView* tv2 = makeSymbolicTensor(2); |
7251 | fusion.addInput(tv2); |
7252 | |
7253 | auto tv3 = add(tv1, tv2); |
7254 | auto tv4 = sum(tv3, {0, 1}); |
7255 | fusion.addOutput(tv4); |
7256 | |
7257 | tv3->computeAt(tv4, -1); |
7258 | tv3->axis(-2)->parallelize(ParallelType::TIDx); |
7259 | tv3->axis(-1)->parallelize(ParallelType::TIDy); |
7260 | |
7261 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7262 | at::Tensor t0 = at::randn({}, options); |
7263 | at::Tensor t1 = at::randn({10, 10}, options); |
7264 | |
7265 | auto aten_output = (t0.unsqueeze(-1).unsqueeze(-1).expand({10, 10}) + t1) |
7266 | .to(at::kDouble) |
7267 | .sum(); |
7268 | |
7269 | std::vector<IValue> aten_inputs = {t0, t1}; |
7270 | at::Tensor cg_output = at::empty({}, options); |
7271 | |
7272 | FusionExecutor fe; |
7273 | fe.compileFusion(&fusion, aten_inputs); |
7274 | fe.runFusion(aten_inputs, {cg_output}); |
7275 | |
7276 | testValidate( |
7277 | &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); |
7278 | } |
7279 | |
7280 | TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) { |
7281 | Fusion fusion; |
7282 | FusionGuard fg(&fusion); |
7283 | |
7284 | const int bdimx = 32; |
7285 | const int gdimx = 32; |
7286 | |
7287 | TensorView* tv0 = makeSymbolicTensor(1); |
7288 | fusion.addInput(tv0); |
7289 | |
7290 | auto tv1 = sum(tv0, {0}); |
7291 | fusion.addOutput(tv1); |
7292 | |
7293 | tv1->split(0, bdimx); |
7294 | tv1->split(0, gdimx); |
7295 | auto tv2 = tv1->rFactor({0}); |
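  // rFactor({0}) peels the serial outermost reduction into tv2, leaving tv1
  // with the two split axes that are bound to BIDx and TIDx below.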
7296 | |
7297 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
7298 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
7299 | tv1->axis(-2)->parallelize(ParallelType::BIDx); |
7300 | tv2->axis(-2)->parallelize(ParallelType::BIDx); |
7301 | |
7302 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7303 | at::Tensor aten_input = at::randn({1000}, options); |
7304 | auto aten_output = aten_input.to(at::kDouble).sum(); |
7305 | |
7306 | at::Tensor cg_output = at::empty({}, options); |
7307 | |
7308 | FusionExecutor fe; |
7309 | fe.compileFusion(&fusion, {aten_input}); |
7310 | fe.runFusion({aten_input}, {cg_output}); |
7311 | |
7312 | testValidate( |
7313 | &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7314 | } |
7315 | |
7316 | TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) { |
7317 | Fusion fusion; |
7318 | FusionGuard fg(&fusion); |
7319 | const int tidx = 128; |
7320 | |
7321 | // Set up your input tensor views |
7322 | TensorView* tv0 = makeSymbolicTensor(2); |
7323 | fusion.addInput(tv0); |
7324 | |
7325 | auto tv1 = sum(tv0, {1}); |
7326 | auto tv2 = broadcast(tv1, {false, true}); |
7327 | |
7328 | tv1->split(1, tidx); |
7329 | auto tv3 = tv1->rFactor({-2}); |
7330 | |
7331 | TensorView* tv4 = makeSymbolicTensor(2); |
7332 | fusion.addInput(tv4); |
7333 | |
7334 | auto tv5 = add(tv2, tv4); |
7335 | fusion.addOutput(tv5); |
7336 | tv5->split(1, tidx); |
7337 | |
7338 | tv3->computeAt(tv5, 1); |
7339 | |
7340 | tv2->split(1, tidx); |
7341 | |
7342 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
7343 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
7344 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
7345 | tv5->axis(-1)->parallelize(ParallelType::TIDx); |
7346 | |
7347 | tv5->axis(0)->parallelize(ParallelType::BIDx); |
7348 | |
7349 | int x = 63, y = 200; |
7350 | |
7351 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7352 | |
7353 | at::Tensor t0 = at::randn({x, y}, options); |
7354 | at::Tensor t4 = at::randn({x, y}, options); |
7355 | |
7356 | auto t3 = t0.to(at::kDouble).sum({1}).unsqueeze(-1).expand({x, y}); |
7357 | auto aten_output = t3.add(t4); |
7358 | |
7359 | std::vector<IValue> aten_inputs = {t0, t4}; |
7360 | FusionExecutor fe; |
7361 | fe.compileFusion(&fusion, {t0, t4}); |
7362 | auto cg_outputs = fe.runFusion({t0, t4}); |
7363 | |
7364 | testValidate( |
7365 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
7366 | } |
7367 | |
7368 | TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) { |
7369 | Fusion fusion; |
7370 | FusionGuard fg(&fusion); |
7371 | |
7372 | TensorView* tv0 = makeConcreteTensor({2, 3}); |
7373 | fusion.addInput(tv0); |
7374 | |
7375 | TensorView* tv1 = broadcast(tv0, {true, false, true, false, true}); |
7376 | |
7377 | fusion.addOutput(tv1); |
7378 | |
7379 | const auto options = |
7380 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7381 | |
7382 | at::Tensor aten_input = at::randn({2, 3}, options); |
7383 | auto aten_output = aten_input.unsqueeze(2).unsqueeze(1).unsqueeze(0); |
7384 | |
7385 | FusionExecutor fe; |
7386 | fe.compileFusion(&fusion, {aten_input}); |
7387 | auto cg_outputs = fe.runFusion({aten_input}); |
7388 | |
7389 | testValidate( |
7390 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7391 | } |
7392 | |
7393 | TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { |
7394 | Fusion fusion; |
7395 | FusionGuard fg(&fusion); |
7396 | |
7397 | TensorView* tv0 = makeConcreteTensor({2, 3, 4, 5, 6}); |
7398 | fusion.addInput(tv0); |
7399 | |
7400 | TensorView* tv1 = sum(tv0, {0, 2, -1}, /*keep_dim=*/true); |
7401 | |
7402 | fusion.addOutput(tv1); |
7403 | |
7404 | const auto options = |
7405 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7406 | |
7407 | at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options); |
7408 | auto aten_output = |
7409 | aten_input.to(at::kDouble).sum({0, 2, -1}, /*keepdim=*/true); |
7410 | |
7411 | FusionExecutor fe; |
7412 | fe.compileFusion(&fusion, {aten_input}); |
7413 | auto cg_outputs = fe.runFusion({aten_input}); |
7414 | |
7415 | testValidate( |
7416 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7417 | } |
7418 | |
7419 | TEST_F(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { |
7420 | constexpr int bid_x = 80; |
7421 | constexpr int tid_x = 4096; |
7422 | constexpr int red_dim = 1; |
7423 | |
7424 | Fusion fusion; |
7425 | FusionGuard fg(&fusion); |
7426 | |
7427 | // Set up your input tensor views |
7428 | TensorView* tv0 = makeConcreteTensor({bid_x, tid_x}); |
7429 | fusion.addInput(tv0); |
7430 | |
7431 | TensorView* tv1 = reductionOp( |
7432 | BinaryOpType::Add, |
7433 | {red_dim}, |
7434 | IrBuilder::create<Double>(0), |
7435 | tv0, |
7436 | /*keep_dim=*/true); |
7437 | |
7438 | fusion.addOutput(tv1); |
7439 | |
7440 | const auto options = |
7441 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7442 | |
7443 | at::Tensor aten_input = at::randn({bid_x, tid_x}, options); |
7444 | auto aten_output = |
7445 | aten_input.to(at::kDouble).sum({red_dim}, /*keepdim=*/true); |
7446 | |
7447 | // Apply reduction heuristic |
7448 | auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); |
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
7450 | scheduleReduction(&fusion, *reduction_params); |
7451 | |
7452 | auto lparams = reduction_params->lparams; |
7453 | |
7454 | FusionExecutor fe; |
7455 | fe.compileFusion(&fusion, {aten_input}, lparams); |
7456 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
7457 | |
7458 | testValidate( |
7459 | &fusion, |
7460 | cg_outputs, |
7461 | {aten_input}, |
7462 | {aten_output}, |
7463 | __LINE__, |
7464 | __FILE__, |
7465 | "" , |
7466 | lparams); |
7467 | } |
7468 | |
7469 | TEST_F(NVFuserTest, FusionSumTo_CUDA) { |
7470 | Fusion fusion; |
7471 | FusionGuard fg(&fusion); |
7472 | |
7473 | std::vector<int64_t> tensor_shape{2, 3, 4, 5, 6}; |
7474 | std::vector<int64_t> sum_to_shape{1, 5, 6}; |
7475 | |
7476 | std::vector<int64_t> tensor_shape_ref{2, 3, 4, 5, 6}; |
7477 | std::vector<int64_t> sum_to_shape_ref{1, 5, 6}; |
7478 | |
7479 | std::vector<Int*> sum_to_symb; |
7480 | std::transform( |
7481 | sum_to_shape.begin(), |
7482 | sum_to_shape.end(), |
7483 | std::back_inserter(sum_to_symb), |
7484 | [](int s) -> Int* { return IrBuilder::create<Int>(s); }); |
7485 | |
7486 | TensorView* tv0 = makeConcreteTensor(tensor_shape); |
7487 | fusion.addInput(tv0); |
7488 | |
7489 | TensorView* tv1 = sum_to(tv0, sum_to_symb); |
7490 | fusion.addOutput(tv1); |
7491 | |
7492 | const auto options = |
7493 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7494 | |
7495 | at::Tensor aten_input = at::randn(tensor_shape_ref, options); |
7496 | auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); |
7497 | |
7498 | FusionExecutor fe; |
7499 | fe.compileFusion(&fusion, {aten_input}); |
7500 | auto cg_outputs = fe.runFusion({aten_input}); |
7501 | |
7502 | TORCH_CHECK( |
7503 | cg_outputs[0].dim() == static_cast<int64_t>(sum_to_shape.size()), |
7504 | "sum_to not keeping the final dimension" ); |
7505 | |
7506 | testValidate( |
7507 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7508 | } |
7509 | |
7510 | TEST_F(NVFuserTest, FusionSumToNoop_CUDA) { |
7511 | Fusion fusion; |
7512 | FusionGuard fg(&fusion); |
7513 | |
7514 | std::vector<int64_t> tensor_shape{4, 5, 6}; |
7515 | std::vector<int64_t> sum_to_shape{4, 5, 6}; |
7516 | |
7517 | std::vector<int64_t> tensor_shape_ref{4, 5, 6}; |
7518 | std::vector<int64_t> sum_to_shape_ref{4, 5, 6}; |
7519 | |
7520 | std::vector<Int*> sum_to_symb; |
7521 | std::transform( |
7522 | sum_to_shape.begin(), |
7523 | sum_to_shape.end(), |
7524 | std::back_inserter(sum_to_symb), |
7525 | [](int s) -> Int* { return IrBuilder::create<Int>(s); }); |
7526 | |
7527 | TensorView* tv0 = makeConcreteTensor(tensor_shape); |
7528 | fusion.addInput(tv0); |
7529 | |
7530 | TensorView* tv1 = sum_to(tv0, sum_to_symb); |
7531 | |
  // Dummy op so that tv0 is not both an input and an output
7533 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(0)); |
7534 | fusion.addOutput(tv2); |
7535 | |
7536 | const auto options = |
7537 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7538 | |
7539 | at::Tensor aten_input = at::randn(tensor_shape_ref, options); |
7540 | |
7541 | FusionExecutor fe; |
7542 | fe.compileFusion(&fusion, {aten_input}); |
7543 | auto cg_outputs = fe.runFusion({aten_input}); |
7544 | auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); |
7545 | |
7546 | TORCH_CHECK( |
7547 | cg_outputs[0].dim() == static_cast<int64_t>(sum_to_shape.size()), |
7548 | "sum_to not keeping the final dimension" ); |
7549 | |
7550 | testValidate( |
7551 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
7552 | } |
7553 | |
7554 | TEST_F(NVFuserTest, FusionReductionScheduler_CUDA) { |
7555 | constexpr int bid_x = 80; |
7556 | constexpr int tid_x = 4096; |
7557 | constexpr int red_dim = 1; |
7558 | |
7559 | Fusion fusion; |
7560 | FusionGuard fg(&fusion); |
7561 | |
7562 | // Set up your input tensor views |
7563 | TensorView* tv0 = makeSymbolicTensor(2); |
7564 | fusion.addInput(tv0); |
7565 | |
7566 | TensorView* tv1 = reductionOp( |
7567 | BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0); |
7568 | fusion.addOutput(tv1); |
7569 | |
7570 | const auto options = |
7571 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7572 | |
7573 | at::Tensor aten_input = at::randn({bid_x, tid_x}, options); |
7574 | auto aten_output = aten_input.to(at::kDouble).sum({red_dim}); |
7575 | |
7576 | // Apply reduction heuristic |
7577 | auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); |
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
7579 | scheduleReduction(&fusion, *reduction_params); |
7580 | |
7581 | auto lparams = reduction_params->lparams; |
7582 | |
7583 | FusionExecutor fe; |
7584 | fe.compileFusion(&fusion, {aten_input}, lparams); |
  // No broadcasting needed, so the last optional argument is omitted.
7586 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
7587 | |
7588 | testValidate( |
7589 | &fusion, |
7590 | cg_outputs, |
7591 | {aten_input}, |
7592 | {aten_output}, |
7593 | __LINE__, |
7594 | __FILE__, |
7595 | "" , |
7596 | lparams); |
7597 | } |
7598 | |
// This test checks that the system correctly handles the case where both a
// regular reduction and a trivial reduction exist in the fusion. Trivial
// reductions deserve testing because they are handled more like broadcasts
// than like reductions.
7603 | TEST_F(NVFuserTest, FusionReductionWithTrivialReduction_CUDA) { |
7604 | constexpr int bid_x = 80; |
7605 | constexpr int tid_x = 4096; |
7606 | |
7607 | std::vector<std::vector<int64_t>> shapes = { |
7608 | {-1, -1, 1}, {-1, 1, -1}, {1, -1, -1}}; |
7609 | |
7610 | for (auto shape : shapes) { |
7611 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
7612 | Fusion& fusion = *fusion_ptr; |
7613 | FusionGuard fg(&fusion); |
7614 | |
7615 | std::vector<std::vector<int64_t>> reduction_dims = { |
7616 | {0}, |
7617 | {1}, |
7618 | {2}, |
7619 | {0, 1}, |
7620 | {0, 2}, |
7621 | {1, 2}, |
7622 | {0, 1, 2}, |
7623 | }; |
7624 | |
7625 | // Set up your input tensor views |
7626 | TensorView* tv0 = makeConcreteTensor(shape); |
7627 | fusion.addInput(tv0); |
7628 | |
7629 | for (auto rdims : reduction_dims) { |
7630 | std::vector<int> rdims_(rdims.begin(), rdims.end()); |
7631 | auto tv = sum(tv0, rdims_); |
7632 | fusion.addOutput(tv); |
7633 | } |
7634 | |
7635 | const auto options = |
7636 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7637 | |
7638 | auto concrete_shape = shape; |
7639 | std::deque<int64_t> concrete_values = {bid_x, tid_x}; |
7640 | for (auto& s : concrete_shape) { |
7641 | if (s == -1) { |
7642 | s = concrete_values.front(); |
7643 | concrete_values.pop_front(); |
7644 | } |
7645 | } |
7646 | |
7647 | at::Tensor aten_input = at::randn(concrete_shape, options); |
7648 | std::vector<at::Tensor> aten_outputs; |
7649 | for (auto rdims : reduction_dims) { |
7650 | aten_outputs.push_back(aten_input.sum(rdims)); |
7651 | } |
7652 | |
7653 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
7654 | auto cg_outputs = executor_cache.runFusionWithInputs({aten_input}); |
7655 | |
7656 | testValidate( |
7657 | &fusion, |
7658 | cg_outputs, |
7659 | {aten_input}, |
7660 | aten_outputs, |
7661 | __LINE__, |
7662 | __FILE__, |
7663 | "" ); |
7664 | } |
7665 | } |
7666 | |
7667 | // Simple reduction parallelized on a symbolic size. |
7668 | TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) { |
7669 | Fusion fusion; |
7670 | FusionGuard fg(&fusion); |
7671 | |
7672 | // Set up your input tensor views |
7673 | TensorView* tv0 = makeSymbolicTensor(2); |
7674 | fusion.addInput(tv0); |
7675 | |
7676 | // tv1[I0, R1] = tv0[I0, I1] |
7677 | TensorView* tv1 = |
7678 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
7679 | fusion.addOutput(tv1); |
7680 | |
  // The interface should just be a direct split with a parallel type; we
  // could then fold the parallelize call into the split.
7683 | tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); |
  // tv1[I0, R1o, R1i{TIDx}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({1});
  // tv2[I0, R1o, Ir1i{TIDx}] = tv0[I0, I1]
  // tv1[I0, R1i{TIDx}] = tv2[I0, R1o, Ir1i{TIDx}]
7689 | |
  // Apply computeAt incrementally; we can print in between for debugging
7691 | tv0->computeAt(tv2, 1); |
7692 | tv2->computeAt(tv1, 1); |
7693 | |
7694 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
7695 | |
7696 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
7697 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
7698 | |
7699 | int numel_x = 65000; |
7700 | int numel_y = 1025; |
7701 | |
7702 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7703 | at::Tensor aten_input = at::randn({numel_x, numel_y}, options); |
7704 | auto aten_output = aten_input.to(at::kDouble).sum({1}); |
7705 | |
7706 | // How many threads to use for the block reduction |
7707 | int runtime_threadIdx_dim = 128; |
7708 | |
7709 | LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); |
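  // LaunchParams takes (gdimx, gdimy, gdimz, bdimx, bdimy, bdimz); -1 leaves
  // a dimension to be inferred, so only bdimx (TIDx) is pinned here, to 128.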
7710 | |
7711 | FusionExecutor fe; |
7712 | fe.compileFusion(&fusion, {aten_input}, lparams); |
7713 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
7714 | |
7715 | testValidate( |
7716 | &fusion, |
7717 | cg_outputs, |
7718 | {aten_input}, |
7719 | {aten_output}, |
7720 | __LINE__, |
7721 | __FILE__, |
7722 | "" , |
7723 | lparams); |
7724 | } |
7725 | |
7726 | TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { |
7727 | const std::vector<int> red_dims = {0, 2}; |
  // The copy is needed because CodeGen requires int while PyTorch requires
  // int64_t for a vector of reduction dimensions
7730 | const std::vector<int64_t> red_dims64 = {0, 2}; |
7731 | const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20}; |
7732 | const std::vector<int64_t> tensor_dims_out = {10, 20}; |
7733 | |
7734 | Fusion fusion; |
7735 | FusionGuard fg(&fusion); |
7736 | |
7737 | // Set up your input tensor views |
7738 | TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); |
7739 | fusion.addInput(tv0); |
7740 | |
7741 | TensorView* tv1 = reductionOp( |
7742 | BinaryOpType::Add, red_dims, IrBuilder::create<Double>(0), tv0); |
7743 | fusion.addOutput(tv1); |
7744 | |
7745 | const auto options = |
7746 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7747 | at::Tensor aten_input = at::randn(tensor_dims_in, options); |
7748 | auto aten_output = aten_input.to(at::kDouble).sum(red_dims64); |
7749 | at::Tensor cg_output = at::empty(tensor_dims_out, options); |
7750 | |
7751 | // Apply reduction heuristic |
7752 | auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); |
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
7754 | scheduleReduction(&fusion, *reduction_params); |
7755 | auto lparams = reduction_params->lparams; |
7756 | |
7757 | FusionExecutor fe; |
7758 | fe.compileFusion(&fusion, {aten_input}, lparams); |
7759 | fe.runFusion({aten_input}, {cg_output}, lparams); |
7760 | |
7761 | testValidate( |
7762 | &fusion, |
7763 | {cg_output}, |
7764 | {aten_input}, |
7765 | {aten_output}, |
7766 | __LINE__, |
7767 | __FILE__, |
7768 | "" , |
7769 | lparams); |
7770 | } |
7771 | |
7772 | TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { |
7773 | const std::vector<int> red_dims = {1, 3}; |
  // The copy is needed because CodeGen requires int while PyTorch requires
  // int64_t for a vector of reduction dimensions
7776 | const std::vector<int64_t> red_dims64 = {1, 3}; |
7777 | const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20}; |
7778 | |
7779 | Fusion fusion; |
7780 | FusionGuard fg(&fusion); |
7781 | |
7782 | // Set up your input tensor views |
7783 | TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); |
7784 | fusion.addInput(tv0); |
7785 | |
7786 | TensorView* tv1 = reductionOp( |
7787 | BinaryOpType::Add, red_dims, IrBuilder::create<Double>(0), tv0); |
7788 | fusion.addOutput(tv1); |
7789 | |
7790 | const auto options = |
7791 | at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
7792 | at::Tensor aten_input = at::randn(tensor_dims_in, options); |
7793 | auto aten_output = aten_input.to(at::kDouble).sum(red_dims64); |
7794 | |
7795 | auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); |
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
7797 | scheduleReduction(&fusion, *reduction_params); |
7798 | auto lparams = reduction_params->lparams; |
7799 | |
7800 | FusionExecutor fe; |
7801 | fe.compileFusion(&fusion, {aten_input}, lparams); |
7802 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
7803 | |
7804 | testValidate( |
7805 | &fusion, |
7806 | cg_outputs, |
7807 | {aten_input}, |
7808 | {aten_output}, |
7809 | __LINE__, |
7810 | __FILE__, |
7811 | "" , |
7812 | lparams); |
7813 | } |
7814 | |
7815 | TEST_F(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) { |
7816 | std::vector<DataType> dtypes = { |
7817 | DataType::Double, DataType::Float, DataType::Half}; |
7818 | // TODO: add test for complex. Currently complex fails with the following |
7819 | // NVRTC compilation error message: |
7820 | // error: no suitable user-defined conversion from |
7821 | // "CudaCodeGen::std::complex<double>" to "CudaCodeGen::std::complex<float>" |
7822 | // exists |
7823 | #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 |
7824 | if (at::cuda::getDeviceProperties(0)->major >= 8) { |
7825 | dtypes.insert(dtypes.end(), DataType::BFloat16); |
7826 | } |
7827 | #endif |
7828 | |
7829 | std::vector<int> red_dims; |
7830 | |
  // Cut down the number of iterations by only using
  // every other power of 2.
7833 | for (int i = 1; i <= 1024 * 1024; i <<= 2) { |
7834 | red_dims.push_back(i); |
7835 | } |
7836 | |
7837 | for (auto dtype : dtypes) { |
7838 | at::ScalarType aten_dtype = data_type_to_aten(dtype); |
7839 | for (auto& rdim : red_dims) { |
7840 | Fusion fusion; |
7841 | FusionGuard fg(&fusion); |
7842 | |
7843 | bool is_fp16 = dtype == DataType::Half; |
7844 | bool is_bf16 = dtype == DataType::BFloat16; |
7845 | |
7846 | TensorView* tv0 = makeSymbolicTensor(1, dtype); |
7847 | fusion.addInput(tv0); |
7848 | |
7849 | TensorView* tv0_cast = tv0; |
7850 | if (is_fp16 || is_bf16) { |
7851 | tv0_cast = castOp(DataType::Float, tv0); |
7852 | } |
7853 | |
7854 | TensorView* tv1 = sum(tv0_cast, {0}); |
7855 | |
7856 | TensorView* tv1_cast = tv1; |
7857 | if (is_fp16) { |
7858 | tv1_cast = castOp(DataType::Half, tv1); |
7859 | } |
7860 | if (is_bf16) { |
7861 | tv1_cast = castOp(DataType::BFloat16, tv1); |
7862 | } |
7863 | |
7864 | fusion.addOutput(tv1_cast); |
7865 | |
7866 | auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); |
7867 | |
7868 | at::Tensor aten_input = at::randn({rdim}, options); |
7869 | auto aten_output = aten_input.to(at::kDouble).sum({0}); |
7870 | |
7871 | auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); |
      TORCH_CHECK(reduction_params != nullptr, "Reduction is not found!");
7873 | scheduleReduction(&fusion, *reduction_params); |
7874 | auto lparams = reduction_params->lparams; |
7875 | |
7876 | FusionExecutor fe; |
7877 | fe.compileFusion(&fusion, {aten_input}, lparams); |
7878 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
7879 | |
7880 | testValidate( |
7881 | &fusion, |
7882 | cg_outputs, |
7883 | {aten_input}, |
7884 | {aten_output}, |
7885 | __LINE__, |
7886 | __FILE__, |
7887 | "" , |
7888 | lparams); |
7889 | } |
7890 | } |
7891 | } |
7892 | |
7893 | TEST_F(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { |
7894 | std::vector<DataType> dtypes = { |
7895 | DataType::Double, DataType::Float, DataType::Half}; |
7896 | // TODO: add complex support. Currently, complex fails with the following |
7897 | // NVRTC compilation error: |
7898 | // error: no instance of overloaded function "__shfl_xor_sync" matches the |
7899 | // argument list |
7900 | #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 |
7901 | if (at::cuda::getDeviceProperties(0)->major >= 8) { |
7902 | dtypes.insert(dtypes.end(), DataType::BFloat16); |
7903 | } |
7904 | #endif |
7905 | |
7906 | std::vector<int> red_axis = {1, 0}; |
7907 | std::vector<int> output_dims = {160, 320}; |
7908 | std::vector<int> red_dims; |
7909 | |
  // Cut down the number of iterations by only using
  // every other power of 2.
7912 | for (int i = 1; i <= 1024 * 1024; i <<= 2) { |
7913 | red_dims.push_back(i); |
7914 | } |
7915 | |
7916 | for (auto dtype : dtypes) { |
7917 | at::ScalarType aten_dtype = data_type_to_aten(dtype); |
7918 | for (auto& axis : red_axis) { |
7919 | for (auto& odim : output_dims) { |
7920 | for (auto& rdim : red_dims) { |
7921 | Fusion fusion; |
7922 | FusionGuard fg(&fusion); |
7923 | |
7924 | bool is_fp16 = dtype == DataType::Half; |
7925 | bool is_bf16 = dtype == DataType::BFloat16; |
7926 | |
7927 | TensorView* tv0 = makeSymbolicTensor(2, dtype); |
7928 | fusion.addInput(tv0); |
7929 | |
7930 | TensorView* tv0_cast = tv0; |
7931 | if (is_fp16 || is_bf16) { |
7932 | tv0_cast = castOp(DataType::Float, tv0); |
7933 | } |
7934 | |
7935 | TensorView* tv1 = sum(tv0_cast, {axis}); |
7936 | |
7937 | TensorView* tv1_cast = tv1; |
7938 | if (is_fp16) { |
7939 | tv1_cast = castOp(DataType::Half, tv1); |
7940 | } |
7941 | if (is_bf16) { |
7942 | tv1_cast = castOp(DataType::BFloat16, tv1); |
7943 | } |
7944 | fusion.addOutput(tv1_cast); |
7945 | |
7946 | auto options = |
7947 | at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); |
7948 | |
7949 | at::Tensor aten_input = |
7950 | (axis ? at::randn({odim, rdim}, options) |
7951 | : at::randn({rdim, odim}, options)); |
7952 | |
7953 | auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); |
          TORCH_CHECK(reduction_params != nullptr, "Reduction is not found!");
7955 | scheduleReduction(&fusion, *reduction_params); |
7956 | auto lparams = reduction_params->lparams; |
7957 | |
7958 | FusionExecutor fe; |
7959 | fe.compileFusion(&fusion, {aten_input}, lparams); |
7960 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
7961 | auto aten_output = aten_input.to(at::kDouble).sum({axis}); |
7962 | testValidate( |
7963 | &fusion, |
7964 | cg_outputs, |
7965 | {aten_input}, |
7966 | {aten_output}, |
7967 | __LINE__, |
7968 | __FILE__, |
7969 | "" , |
7970 | lparams); |
7971 | } |
7972 | } |
7973 | } |
7974 | } |
7975 | } |
7976 | |
7977 | TEST_F(NVFuserTest, FusionCacheBefore_CUDA) { |
7978 | // TVM Cache Write |
7979 | Fusion fusion; |
7980 | FusionGuard fg(&fusion); |
7981 | |
7982 | TensorView* tv0 = makeSymbolicTensor(2); |
7983 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
7984 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0)); |
7985 | fusion.addInput(tv0); |
7986 | fusion.addOutput(tv2); |
7987 | |
7988 | // Before: TV2 = TV1 * 3 |
7989 | // After: TV3 = TV1 * 3; |
7990 | // TV2 = TV3; |
7991 | TensorView* tv3 = tv2->cacheBefore(); |
7992 | |
7993 | constexpr int BSX = 32; |
7994 | tv2->split(-1, BSX); |
7995 | tv0->computeAt(tv2, -1); |
7996 | |
7997 | // Thread and Block binding |
7998 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
7999 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
8000 | |
8001 | constexpr int M = 32, N = 750; |
8002 | |
8003 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8004 | at::Tensor aten_input = at::randn({M, N}, options); |
8005 | at::Tensor aten_output = (aten_input + 1.0) * 3.0; |
8006 | |
8007 | FusionExecutor fe; |
8008 | fe.compileFusion(&fusion, {aten_input}); |
8009 | auto cg_outputs = fe.runFusion({aten_input}); |
8010 | |
8011 | testValidate( |
8012 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
8013 | } |
8014 | |
8015 | TEST_F(NVFuserTest, FusionCacheAfter_CUDA) { |
8016 | // TVM Cache Read |
8017 | Fusion fusion; |
8018 | FusionGuard fg(&fusion); |
8019 | |
8020 | TensorView* tv0 = makeSymbolicTensor(2); |
8021 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
8022 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0)); |
8023 | fusion.addInput(tv0); |
8024 | fusion.addOutput(tv2); |
8025 | |
8026 | // Before: TV1 = TV0 + 1 |
8027 | // After: TV3 = TV0; |
8028 | // TV1 = TV3 + 1 |
8029 | TensorView* tv3 = tv0->cacheAfter(); |
8030 | |
8031 | constexpr int BSX = 32; |
8032 | tv2->split(-1, BSX); |
8033 | tv0->computeAt(tv2, -1); |
8034 | |
8035 | // Thread and Block binding |
8036 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
8037 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
8038 | |
8039 | constexpr int M = 32, N = 457; |
8040 | |
8041 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8042 | at::Tensor aten_input = at::randn({M, N}, options); |
8043 | at::Tensor aten_output = (aten_input + 1.0) * 3.0; |
8044 | |
8045 | FusionExecutor fe; |
8046 | fe.compileFusion(&fusion, {aten_input}); |
8047 | auto cg_outputs = fe.runFusion({aten_input}); |
8048 | |
8049 | testValidate( |
8050 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
8051 | } |
8052 | |
8053 | TEST_F(NVFuserTest, FusionCacheFork_CUDA) { |
8054 | Fusion fusion; |
8055 | FusionGuard fg(&fusion); |
8056 | |
8057 | TensorView* tv0 = makeSymbolicTensor(2); |
8058 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0)); |
8059 | TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0)); |
8060 | fusion.addInput(tv0); |
8061 | fusion.addOutput(tv1); |
8062 | fusion.addOutput(tv2); |
8063 | // Before: TV1 = TV0 + 1 |
// TV2 = TV1 * 3
8065 | // Output: TV1, TV2 |
8066 | |
8067 | // After: TV1 = TV0 + 1 |
8068 | // TV3 = TV1 |
// TV2 = TV1 * 3
8070 | // Output: TV3, TV2 |
8071 | |
8072 | // cacheFork !!does not!! automatically apply ComputeAt to the cache |
8073 | auto tv3 = tv1->cacheFork(); |
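// tv1 is both a fusion output and an intermediate consumed by tv2;
// cacheFork splits those two roles so each can be scheduled separately.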
8074 | |
8075 | constexpr int BSX = 32; |
8076 | tv2->split(-1, BSX); |
8077 | tv0->computeAt(tv2, -1); |
8078 | |
8079 | // Thread and Block binding |
8080 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
8081 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
8082 | |
8083 | constexpr int M = 32, N = 457; |
8084 | |
8085 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8086 | at::Tensor aten_input = at::randn({M, N}, options); |
8087 | at::Tensor aten_output1 = aten_input + 1.0; |
8088 | at::Tensor aten_output2 = aten_output1 * 3.0; |
8089 | |
8090 | FusionExecutor fe; |
8091 | fe.compileFusion(&fusion, {aten_input}); |
8092 | auto cg_outputs = fe.runFusion({aten_input}); |
8093 | |
8094 | testValidate( |
8095 | &fusion, |
8096 | cg_outputs, |
8097 | {aten_input}, |
8098 | {aten_output1, aten_output2}, |
8099 | __LINE__, |
8100 | __FILE__); |
8101 | } |
8102 | |
8103 | TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) { |
8104 | Fusion fusion; |
8105 | FusionGuard fg(&fusion); |
8106 | |
8107 | TensorView* tv0 = makeSymbolicTensor(2); |
8108 | TensorView* tv1 = makeSymbolicTensor(2); |
8109 | TensorView* tv2 = makeSymbolicTensor(2); |
8110 | TensorView* tv3 = makeSymbolicTensor(2); |
8111 | TensorView* tv4 = sub(tv2, tv3); |
8112 | TensorView* tv5 = add(tv1, tv4); |
8113 | TensorView* tv6 = sub(tv5, tv0); |
8114 | fusion.addInput(tv0); |
8115 | fusion.addInput(tv1); |
8116 | fusion.addInput(tv2); |
8117 | fusion.addInput(tv3); |
8118 | fusion.addOutput(tv6); |
8119 | // t6 = ((t1 + (t2 - t3)) - t0) |
8120 | |
8121 | tv5->cacheAfter(); |
8122 | tv5->cacheBefore(); |
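// Caching tv5 on both sides surrounds it with copies: cacheAfter gives
// its consumers a cached read, cacheBefore gives it a cached producer.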
8123 | |
// Cache ops are placed before any scheduling transformations
8125 | constexpr int BSX = 32; |
8126 | tv6->split(-1, BSX); |
8127 | tv2->computeAt(tv6, -1); |
8128 | |
8129 | // Thread and Block binding |
8130 | tv6->axis(0)->parallelize(ParallelType::BIDx); |
8131 | tv6->axis(-1)->parallelize(ParallelType::TIDx); |
8132 | |
8133 | constexpr int M = 32, N = 810; |
8134 | |
8135 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8136 | at::Tensor t0 = at::randn({M, N}, options); |
8137 | at::Tensor t1 = at::randn({M, N}, options); |
8138 | at::Tensor t2 = at::randn({M, N}, options); |
8139 | at::Tensor t3 = at::randn({M, N}, options); |
8140 | |
8141 | std::vector<IValue> aten_inputs = {t0, t1, t2, t3}; |
8142 | at::Tensor aten_output = (t1 + (t2 - t3)) - t0; |
8143 | |
8144 | FusionExecutor fe; |
8145 | fe.compileFusion(&fusion, aten_inputs); |
8146 | auto cg_outputs = fe.runFusion(aten_inputs); |
8147 | |
8148 | testValidate( |
8149 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
8150 | } |
8151 | |
8152 | TEST_F(NVFuserTest, FusionCacheBcast_CUDA) { |
8153 | Fusion fusion; |
8154 | FusionGuard fg(&fusion); |
8155 | |
8156 | // Algorithm |
8157 | TensorView* tv0 = makeSymbolicTensor(1); // (M, 1) |
8158 | TensorView* tv1 = broadcast(tv0, {false, true}); |
8159 | TensorView* tv2 = makeSymbolicTensor(1); // (1, N) |
8160 | TensorView* tv3 = broadcast(tv2, {true, false}); |
8161 | TensorView* tv4 = mul(tv1, tv3); |
8162 | fusion.addInput(tv0); |
8163 | fusion.addInput(tv2); |
8164 | fusion.addOutput(tv4); |
8165 | |
8166 | // Case 1 |
8167 | tv0->cacheAfter(); |
8168 | |
8169 | // Case 2 |
8170 | tv1->cacheBefore(); |
8171 | |
8172 | // Case 3 |
8173 | tv1->cacheAfter(); |
8174 | |
8175 | // Case 4 |
8176 | TensorView* tv8 = tv4->cacheBefore(); |
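// The four cases above cover the cache placements of interest: an input
// cached after its load (1), an intermediate cached on both sides (2, 3),
// and an output cached before its write (4).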
8177 | |
8178 | constexpr int BSX = 128; |
8179 | tv4->split(0, BSX); |
8180 | tv4->split(-1, BSX); |
8181 | tv4->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); |
// M/BSX, N/BSX, BSX, BSX
8183 | tv0->computeAt(tv4, 2); |
8184 | tv2->computeAt(tv4, 2); |
8185 | // 0, 1 | 2, 3, 4 |
8186 | |
8187 | tv4->axis(0)->parallelize(ParallelType::BIDx); |
8188 | tv4->axis(1)->parallelize(ParallelType::BIDy); |
8189 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
8190 | // Manual Replay on TV3 |
8191 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
8192 | tv8->axis(-1)->parallelize(ParallelType::TIDx); |
8193 | |
8194 | constexpr int M = 92, N = 500; |
8195 | |
8196 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8197 | at::Tensor t0 = at::randn({M}, options); |
8198 | at::Tensor t1 = at::randn({N}, options); |
8199 | std::vector<IValue> aten_inputs = {t0, t1}; |
8200 | at::Tensor aten_output = |
8201 | t0.to(at::kDouble).unsqueeze(1).matmul(t1.to(at::kDouble).unsqueeze(0)); |
8202 | |
8203 | FusionExecutor fe; |
8204 | fe.compileFusion(&fusion, aten_inputs); |
8205 | auto cg_outputs = fe.runFusion(aten_inputs); |
8206 | |
8207 | testValidate( |
8208 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
8209 | } |
8210 | |
8211 | TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) { |
8212 | Fusion fusion; |
8213 | FusionGuard fg(&fusion); |
8214 | |
8215 | TensorView* tv0 = makeSymbolicTensor(1); |
8216 | TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1)); |
8217 | TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2)); |
8218 | TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); |
8219 | TensorView* tv4 = add(tv3, IrBuilder::create<Double>(2)); |
8220 | |
8221 | fusion.addInput(tv0); |
8222 | fusion.addOutput(tv2); |
8223 | fusion.addOutput(tv4); |
8224 | |
8225 | auto tv5 = tv1->cacheBefore(); |
8226 | auto tv6 = tv3->cacheBefore(); |
8227 | tv5->setMemoryType(MemoryType::Shared); |
8228 | tv6->setMemoryType(MemoryType::Shared); |
8229 | |
8230 | tv1->computeAt(tv2, -1); |
8231 | tv3->computeAt(tv4, -1); |
8232 | |
8233 | // Fails because tensor must be recomputed twice |
8234 | // auto tv7 = tv0->cacheAfter(); |
8235 | |
8236 | constexpr int N = 800; |
8237 | |
8238 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8239 | at::Tensor aten_input = at::randn({N}, options); |
8240 | auto aten_output = (aten_input + 1) + 2; |
8241 | |
8242 | FusionExecutor fe; |
8243 | fe.compileFusion(&fusion, {aten_input}); |
8244 | auto cg_outputs = fe.runFusion({aten_input}); |
8245 | |
8246 | testValidate( |
8247 | &fusion, |
8248 | cg_outputs, |
8249 | {aten_input}, |
8250 | {aten_output, aten_output}, |
8251 | __LINE__, |
8252 | __FILE__); |
8253 | } |
8254 | |
8255 | TEST_F(NVFuserTest, FusionSmem_CUDA) { |
8256 | Fusion fusion; |
8257 | FusionGuard fg(&fusion); |
8258 | |
8259 | // Algorithm |
8260 | TensorView* tv0 = makeSymbolicTensor(2); // (M, N) |
8261 | TensorView* tv1 = makeSymbolicTensor(2); // (M, N) |
8262 | TensorView* tv2 = mul(tv0, tv1); |
8263 | fusion.addInput(tv0); |
8264 | fusion.addInput(tv1); |
8265 | fusion.addOutput(tv2); |
8266 | |
8267 | // Schedule |
8268 | TensorView* tv3 = tv0->cacheAfter(); |
8269 | TensorView* tv4 = tv1->cacheAfter(); |
8270 | tv3->setMemoryType(MemoryType::Shared); |
8271 | tv4->setMemoryType(MemoryType::Shared); |
8272 | |
8273 | constexpr int BSY = 32; |
8274 | constexpr int BSX = 128; |
8275 | tv2->split(0, BSY); |
8276 | tv2->split(2, BSX); |
// M/BSY, BSY, N/BSX, BSX
8278 | tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); |
// M/BSY, N/BSX, BSY, BSX
8280 | |
8281 | tv0->computeAt(tv2, 2); |
8282 | tv1->computeAt(tv2, 2); |
8283 | |
8284 | // Thread and Block binding |
8285 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
8286 | tv2->axis(1)->parallelize(ParallelType::BIDy); |
8287 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
8288 | // Manual Binding |
8289 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
8290 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
8291 | |
8292 | constexpr int M = 128, N = 10240; |
8293 | |
8294 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8295 | at::Tensor t0 = at::randn({M, N}, options); |
8296 | at::Tensor t1 = at::randn({M, N}, options); |
8297 | at::Tensor aten_output = mul(t0, t1); |
8298 | |
8299 | std::vector<IValue> aten_inputs = {t0, t1}; |
8300 | |
8301 | FusionExecutor fe; |
8302 | fe.compileFusion(&fusion, {t0, t1}); |
8303 | auto cg_outputs = fe.runFusion({t0, t1}); |
8304 | |
8305 | testValidate( |
8306 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
8307 | |
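// No extra syncthreads should be needed in this schedule to protect
// write-after-read (WAR) hazards on the shared-memory buffers.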
8308 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); |
8309 | } |
8310 | |
8311 | TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { |
8312 | Fusion fusion; |
8313 | FusionGuard fg(&fusion); |
8314 | |
8315 | // Algorithm |
8316 | TensorView* tv0 = makeSymbolicTensor(3); // M, K, N |
8317 | TensorView* tv1 = sum(tv0, {1}); // M, R, N |
8318 | fusion.addInput(tv0); |
8319 | fusion.addOutput(tv1); |
8320 | |
8321 | TensorView* tv2 = tv0->cacheAfter(); |
8322 | tv2->setMemoryType(MemoryType::Shared); |
8323 | |
8324 | // Schedule |
8325 | constexpr int BSX = 32; |
8326 | tv1->split(2, BSX); |
8327 | tv1->split(1, 128); |
8328 | tv1->split(0, BSX); |
// M/BSX, BSX, K/128, 128, N/BSX, BSX
8330 | tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); |
8331 | TensorView* tv3 = tv1->rFactor({-2}); |
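// rFactor({-2}) splits the reduction in two: tv3 performs a partial
// reduction over the rfactored axis, and tv1 reduces what remains.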
8332 | |
8333 | tv0->computeAt(tv1, -2); |
8334 | tv0->computeAt(tv3, -2); |
8335 | |
8336 | // Thread and Block binding |
8337 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
8338 | tv1->axis(1)->parallelize(ParallelType::BIDy); |
8339 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
8340 | // Manual Binding |
8341 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
8342 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
8343 | |
8344 | constexpr int M = 154, K = 45, N = 1524; |
8345 | |
8346 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8347 | at::Tensor aten_input = at::randn({M, K, N}, options); |
8348 | at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1}); |
8349 | |
8350 | FusionExecutor fe; |
8351 | fe.compileFusion(&fusion, {aten_input}); |
8352 | auto cg_outputs = fe.runFusion({aten_input}); |
8353 | |
8354 | testValidate( |
8355 | &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); |
8356 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); |
8357 | } |
8358 | |
8359 | TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { |
8360 | Fusion fusion; |
8361 | FusionGuard fg(&fusion); |
8362 | |
8363 | // Algorithm |
8364 | TensorView* tv0 = makeSymbolicTensor(2); // (M, K) |
8365 | TensorView* tv1 = makeSymbolicTensor(2); // (K, N) |
8366 | TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) |
8367 | TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) |
8368 | TensorView* tv4 = mul(tv2, tv3); // M, K, N |
8369 | TensorView* tv5 = sum(tv4, {1}); // M, R, N |
8370 | fusion.addInput(tv0); |
8371 | fusion.addInput(tv1); |
8372 | fusion.addOutput(tv5); |
8373 | |
8374 | // Schedule |
8375 | constexpr int BSX = 16; |
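// Note: the uneven split factors (BSX - 1, BSX + 1) appear deliberate;
// they exercise the parallel-dimension exactness analysis checked below.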
8376 | tv5->split(2, BSX - 1); |
8377 | tv5->split(1, BSX); |
8378 | tv5->split(0, BSX + 1); |
8379 | // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX |
8380 | tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); |
8381 | // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX |
8382 | TensorView* tv6 = tv5->rFactor({-1}); |
8383 | |
8384 | tv2->setMemoryType(MemoryType::Shared); |
8385 | tv3->setMemoryType(MemoryType::Shared); |
8386 | tv4->setMemoryType(MemoryType::Shared); |
8387 | tv6->setMemoryType(MemoryType::Shared); |
8388 | |
8389 | tv0->computeAt(tv5, 3); |
8390 | tv1->computeAt(tv5, 3); |
8391 | |
8392 | // Thread and Block binding |
8393 | tv5->axis(0)->parallelize(ParallelType::BIDx); |
8394 | tv5->axis(1)->parallelize(ParallelType::BIDy); |
8395 | tv5->axis(-2)->parallelize(ParallelType::TIDy); |
8396 | tv5->axis(-1)->parallelize(ParallelType::TIDx); |
8397 | // Manual Binding |
8398 | tv2->axis(-3)->parallelize(ParallelType::TIDy); |
8399 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
8400 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
8401 | tv4->axis(-3)->parallelize(ParallelType::TIDy); |
8402 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
8403 | tv6->axis(-3)->parallelize(ParallelType::TIDy); |
8404 | tv6->axis(-2)->parallelize(ParallelType::TIDx); |
8405 | |
// Make sure BIDx is marked as exact (see issue #1119)
8407 | GpuLower gpulw(&fusion); |
8408 | TORCH_CHECK(gpulw.parallelDimensionMap().isExact(ParallelType::BIDx)); |
8409 | |
8410 | constexpr int M = 154, K = 45, N = 1524; |
8411 | |
8412 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8413 | at::Tensor t0 = at::randn({M, K}, options); |
8414 | at::Tensor t1 = at::randn({K, N}, options); |
8415 | |
8416 | std::vector<IValue> aten_inputs = {t0, t1}; |
8417 | at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble)); |
8418 | |
8419 | FusionExecutor fe; |
8420 | fe.compileFusion(&fusion, {t0, t1}); |
8421 | auto cg_outputs = fe.runFusion({t0, t1}); |
8422 | |
8423 | testValidate( |
8424 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
8425 | |
8426 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); |
8427 | } |
8428 | |
8429 | TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { |
8430 | Fusion fusion; |
8431 | FusionGuard fg(&fusion); |
8432 | |
8433 | // Algorithm |
8434 | TensorView* tv0 = makeSymbolicTensor(2); // (M, K) |
8435 | TensorView* tv1 = makeSymbolicTensor(2); // (K, N) |
8436 | TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) |
8437 | TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) |
8438 | TensorView* tv4 = mul(tv2, tv3); // M, K, N |
8439 | TensorView* tv5 = sum(tv4, {1}); // M, R, N |
8440 | fusion.addInput(tv0); |
8441 | fusion.addInput(tv1); |
8442 | fusion.addOutput(tv5); |
8443 | |
8444 | // Schedule |
8445 | // Remove reduction axis from tv5 |
8446 | // tv6 = (M, R, N) |
8447 | // tv5 = (M, N) |
8448 | TensorView* tv6 = tv5->cacheBefore(); |
8449 | |
8450 | constexpr int BSX = 16; |
8451 | tv5->split(1, BSX); |
8452 | tv5->split(0, BSX); |
8453 | // M/BSX, BSX, N/BSX, BSX |
8454 | tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); |
8455 | // tv5 = M/BSX, N/BSX, MSX, NSX |
8456 | |
tv6->computeAt(tv5, 2);
8459 | |
8460 | tv6->split(-1, BSX); |
8461 | // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX |
8462 | tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); |
8463 | // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX |
8464 | TensorView* tv7 = tv6->rFactor({-1}); |
8465 | // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr |
8466 | // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX |
8467 | |
8468 | tv0->computeAt(tv6, 3); |
8469 | tv1->computeAt(tv6, 3); |
8470 | |
8471 | tv0->computeAt(tv7, 3); |
8472 | tv1->computeAt(tv7, 3); |
8473 | |
8474 | tv2->setMemoryType(MemoryType::Shared); |
8475 | tv3->setMemoryType(MemoryType::Shared); |
8476 | tv4->setMemoryType(MemoryType::Shared); |
8477 | tv6->setMemoryType(MemoryType::Shared); |
8478 | tv7->setMemoryType(MemoryType::Shared); |
8479 | // Memory Type |
8480 | |
8481 | // Thread and Block binding |
8482 | tv5->axis(0)->parallelize(ParallelType::BIDx); |
8483 | tv5->axis(1)->parallelize(ParallelType::BIDy); |
8484 | tv5->axis(-2)->parallelize(ParallelType::TIDy); |
8485 | tv5->axis(-1)->parallelize(ParallelType::TIDx); |
8486 | // Manual Binding |
8487 | tv2->axis(-3)->parallelize(ParallelType::TIDy); |
8488 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
8489 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
8490 | tv4->axis(-3)->parallelize(ParallelType::TIDy); |
8491 | tv4->axis(-1)->parallelize(ParallelType::TIDx); |
8492 | |
8493 | tv7->axis(-3)->parallelize(ParallelType::TIDy); |
8494 | tv7->axis(-2)->parallelize(ParallelType::TIDx); |
8495 | |
8496 | tv6->axis(-2)->parallelize(ParallelType::TIDy); |
8497 | tv6->axis(-1)->parallelize(ParallelType::TIDx); |
8498 | |
8499 | constexpr int M = 154, K = 45, N = 1524; |
8500 | |
8501 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8502 | at::Tensor t0 = at::randn({M, K}, options); |
8503 | at::Tensor t1 = at::randn({K, N}, options); |
8504 | at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble)); |
8505 | |
8506 | std::vector<IValue> aten_inputs = {t0, t1}; |
8507 | |
8508 | FusionExecutor fe; |
8509 | fe.compileFusion(&fusion, aten_inputs); |
8510 | auto cg_outputs = fe.runFusion(aten_inputs); |
8511 | |
8512 | testValidate( |
8513 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
8514 | |
8515 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); |
8516 | } |
8517 | |
8518 | TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { |
8519 | Fusion fusion; |
8520 | FusionGuard fg(&fusion); |
8521 | |
8522 | TensorView* x = makeSymbolicTensor(2); |
8523 | fusion.addInput(x); |
8524 | TensorView* max_val = reductionOp( |
8525 | BinaryOpType::Max, |
8526 | {-1}, |
8527 | IrBuilder::create<Double>(std::numeric_limits<float>::lowest()), |
8528 | x); // (M) |
8529 | TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B) |
8530 | TensorView* x_max_sub = sub(x, bcast_max); // (M, N) |
8531 | TensorView* exp = unaryOp(UnaryOpType::Exp, x_max_sub); // (M, N) |
8532 | TensorView* sum_exp = sum(exp, {-1}); // (M, R) |
8533 | TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B) |
8534 | TensorView* softmax = div(exp, bcast_sum); // (M, N) |
8535 | fusion.addOutput(softmax); |
8536 | |
8537 | // Read Input into Shared Memory |
8538 | // Load Input + Pwise into shared memory |
8539 | auto cache_x = x->cacheAfter(); |
8540 | cache_x->setMemoryType(MemoryType::Shared); |
8541 | exp->setMemoryType(MemoryType::Shared); |
8542 | |
8543 | std::vector<TensorView*> all_tensors( |
8544 | {x, |
8545 | cache_x, |
8546 | max_val, |
8547 | bcast_max, |
8548 | x_max_sub, |
8549 | exp, |
8550 | sum_exp, |
8551 | bcast_sum, |
8552 | softmax}); |
8553 | |
8554 | auto tidx = IrBuilder::create<Int>(); |
8555 | fusion.addInput(tidx); |
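// tidx is a runtime scalar input used as the split factor; it is bound
// to 128 when the fusion is compiled and run below.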
8556 | |
8557 | for (auto tensor : all_tensors) { |
8558 | tensor->split(-1, tidx); |
8559 | } |
8560 | |
8561 | auto sum_exp_rf = sum_exp->rFactor({1}); |
8562 | all_tensors.push_back(sum_exp_rf); |
8563 | |
8564 | // computeAt |
8565 | x->computeAt(x_max_sub, 1); |
8566 | exp->computeAt(softmax, 1); |
8567 | x_max_sub->computeAt(exp, 2); |
8568 | |
8569 | softmax->axis(0)->parallelize(ParallelType::BIDx); |
8570 | for (auto tensor : all_tensors) { |
8571 | tensor->axis(-1)->parallelize(ParallelType::TIDx); |
8572 | } |
8573 | |
8574 | const int64_t dimx = 1024; |
8575 | const int64_t dimy = 4096; |
8576 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8577 | at::Tensor aten_input = at::randn({dimx, dimy}, options); |
8578 | auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); |
8579 | |
8580 | torch::jit::fuser::cuda::FusionExecutor fe; |
8581 | fe.compileFusion(&fusion, {aten_input, 128}); |
8582 | auto cg_outputs = fe.runFusion({aten_input, 128}); |
8583 | |
8584 | testValidate( |
8585 | &fusion, |
8586 | cg_outputs, |
8587 | {aten_input, 128}, |
8588 | {aten_output}, |
8589 | __LINE__, |
8590 | __FILE__); |
8591 | } |
8592 | |
8593 | TEST_F(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) { |
8594 | Fusion fusion; |
8595 | FusionGuard fg(&fusion); |
8596 | |
8597 | const int kReductionAxis = 3; |
8598 | std::vector<int64_t> input_shape{10, 10, 10, 67}; |
8599 | TensorView* input = makeSymbolicTensor(input_shape.size()); |
8600 | fusion.addInput(input); |
8601 | |
8602 | auto output = softmax(input, kReductionAxis); |
8603 | |
8604 | fusion.addOutput(output); |
8605 | |
8606 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8607 | at::Tensor aten_input = at::randn(input_shape, options); |
8608 | auto aten_output = |
8609 | at::_softmax(aten_input.to(at::kDouble), kReductionAxis, false); |
8610 | |
8611 | auto reduction_params = getPersistentHeuristics(&fusion, {aten_input}); |
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
8613 | |
8614 | schedulePersistentKernel(&fusion, *reduction_params); |
8615 | |
8616 | auto lparams = reduction_params->lparams; |
8617 | |
8618 | torch::jit::fuser::cuda::FusionExecutor fe; |
8619 | fe.compileFusion(&fusion, {aten_input}, lparams); |
8620 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
8621 | |
8622 | testValidate( |
8623 | &fusion, |
8624 | cg_outputs, |
8625 | {aten_input}, |
8626 | {aten_output}, |
8627 | __LINE__, |
__FILE__,
"",
8630 | lparams); |
8631 | } |
8632 | |
8633 | TEST_F(NVFuserTest, FusionTestMaskSoftmax_CUDA) { |
// This test exercises softmax over all padding tokens, as BERT
// might see with a fully padded sequence.
8637 | Fusion fusion; |
8638 | FusionGuard fg(&fusion); |
8639 | |
8640 | const int kReductionAxis = 3; |
8641 | std::vector<int64_t> input_shape{256, 16, 128, 128}; |
8642 | TensorView* input = makeSymbolicTensor(input_shape.size()); |
8643 | TensorView* mask = makeSymbolicTensor(input_shape.size()); |
8644 | fusion.addInput(input); |
8645 | fusion.addInput(mask); |
8646 | |
8647 | auto out1 = add(input, mask); |
8648 | auto output = softmax(out1, kReductionAxis); |
8649 | |
8650 | fusion.addOutput(output); |
8651 | |
8652 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8653 | at::Tensor aten_input = at::randn(input_shape, options); |
8654 | at::Tensor aten_mask = at::ones(input_shape, options); |
// -10,000 is used as the padding value because padding tokens need a
// value that contributes close to zero to the softmax. BERT, in
// particular, does not use -Infinity because a softmax over all padding
// tokens could then divide by zero and produce a NaN result.
8661 | aten_mask = aten_mask * -10000.0; |
8662 | auto aten_out1 = aten_input + aten_mask; |
8663 | auto aten_output = at::_softmax(aten_out1, kReductionAxis, false); |
8664 | |
8665 | auto reduction_params = |
8666 | getPersistentHeuristics(&fusion, {aten_input, aten_mask}); |
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
8668 | |
8669 | schedulePersistentKernel(&fusion, *reduction_params); |
8670 | |
8671 | auto lparams = reduction_params->lparams; |
8672 | |
8673 | torch::jit::fuser::cuda::FusionExecutor fe; |
8674 | fe.compileFusion(&fusion, {aten_input, aten_mask}, lparams); |
8675 | auto cg_outputs = fe.runFusion({aten_input, aten_mask}, lparams); |
8676 | |
8677 | testValidate( |
8678 | &fusion, |
8679 | cg_outputs, |
8680 | {aten_input, aten_mask}, |
8681 | {aten_output}, |
8682 | __LINE__, |
__FILE__,
"",
8685 | lparams); |
8686 | } |
8687 | |
8688 | TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { |
8689 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
8690 | Fusion& fusion = *fusion_ptr.get(); |
8691 | FusionGuard fg(&fusion); |
8692 | |
8693 | std::vector<int64_t> shape{20, 100, 35, 67}; |
8694 | std::vector<int64_t> norm_shape{67}; |
8695 | |
8696 | const size_t kM = shape.size(); |
8697 | const size_t kN = norm_shape.size(); |
8698 | const size_t kOuterNumDims = kM - kN; |
8699 | |
8700 | std::vector<int64_t> outer_shape; |
8701 | for (const auto idx : c10::irange(kOuterNumDims)) { |
8702 | outer_shape.push_back(shape[idx]); |
8703 | } |
8704 | for (const auto idx : c10::irange(kOuterNumDims, kM)) { |
8705 | outer_shape.push_back(1); |
8706 | } |
8707 | |
8708 | auto grad_out = makeSymbolicTensor(shape.size()); |
8709 | auto input = makeSymbolicTensor(shape.size()); |
8710 | auto mean = makeConcreteTensor(outer_shape); |
8711 | auto rstd = makeConcreteTensor(outer_shape); |
8712 | auto weight = makeSymbolicTensor(norm_shape.size()); |
8713 | auto bias = makeSymbolicTensor(norm_shape.size()); |
8714 | fusion.addInput(grad_out); |
8715 | fusion.addInput(input); |
8716 | fusion.addInput(mean); |
8717 | fusion.addInput(rstd); |
8718 | fusion.addInput(weight); |
8719 | fusion.addInput(bias); |
8720 | |
8721 | auto grads = layer_norm_backward( |
8722 | grad_out, |
8723 | input, |
8724 | norm_shape, |
8725 | mean, |
8726 | rstd, |
8727 | weight, |
8728 | bias, |
8729 | {true, true, true}); |
8730 | |
8731 | fusion.addOutput(grads.grad_input); |
8732 | fusion.addOutput(grads.grad_weight); |
8733 | fusion.addOutput(grads.grad_bias); |
8734 | |
8735 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8736 | at::Tensor aten_grad_out = at::randn(shape, options); |
8737 | at::Tensor aten_input = at::randn(shape, options); |
8738 | at::Tensor aten_weight = at::randn(norm_shape, options); |
8739 | at::Tensor aten_bias = at::randn(norm_shape, options); |
8740 | auto at_weight = c10::optional<at::Tensor>(aten_weight); |
8741 | auto at_bias = c10::optional<at::Tensor>(aten_bias); |
8742 | |
8743 | const float kEps = 1e-5; |
8744 | auto aten_results = |
8745 | at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps); |
8746 | auto aten_output = std::get<0>(aten_results); |
8747 | auto aten_mean = std::get<1>(aten_results); |
8748 | auto aten_rstd = std::get<2>(aten_results); |
8749 | |
8750 | FusionExecutorCache fec(std::move(fusion_ptr)); |
8751 | std::vector<IValue> aten_inputs = { |
8752 | aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; |
8753 | auto cg_outputs = fec.runFusionWithInputs(aten_inputs); |
8754 | |
8755 | auto aten_gradients = at::native_layer_norm_backward( |
8756 | aten_grad_out.to(at::kDouble), |
8757 | aten_input.to(at::kDouble), |
8758 | norm_shape, |
8759 | aten_mean.to(at::kDouble), |
8760 | aten_rstd.to(at::kDouble), |
8761 | c10::optional<at::Tensor>(aten_weight.to(at::kDouble)), |
8762 | c10::optional<at::Tensor>(aten_bias.to(at::kDouble)), |
8763 | {true, true, true}); |
8764 | |
8765 | testValidate( |
8766 | &fusion, |
8767 | cg_outputs, |
8768 | aten_inputs, |
8769 | {std::get<0>(aten_gradients), |
8770 | std::get<1>(aten_gradients), |
8771 | std::get<2>(aten_gradients)}, |
8772 | __LINE__, |
8773 | __FILE__); |
8774 | } |
8775 | |
8776 | TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormBackward_CUDA) { |
8777 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
8778 | Fusion& fusion = *fusion_ptr.get(); |
8779 | FusionGuard fg(&fusion); |
8780 | const int64_t NORM_SIZE = 1024; |
8781 | std::vector<int64_t> shape{8, 56, NORM_SIZE}; |
8782 | std::vector<int64_t> norm_shape{NORM_SIZE}; |
8783 | |
8784 | const size_t kM = shape.size(); |
8785 | const size_t kN = norm_shape.size(); |
8786 | const size_t kOuterNumDims = kM - kN; |
8787 | |
8788 | std::vector<int64_t> outer_shape; |
8789 | for (const auto idx : c10::irange(kOuterNumDims)) { |
8790 | outer_shape.push_back(shape[idx]); |
8791 | } |
8792 | for (const auto idx : c10::irange(kOuterNumDims, kM)) { |
8793 | outer_shape.push_back(1); |
8794 | } |
8795 | |
8796 | auto grad_out = makeContigTensor(shape.size()); |
8797 | auto input = makeContigTensor(shape.size()); |
8798 | auto rstd = makeConcreteTensor(outer_shape); |
8799 | auto weight = makeContigTensor(norm_shape.size()); |
8800 | fusion.addInput(grad_out); |
8801 | fusion.addInput(input); |
8802 | fusion.addInput(rstd); |
8803 | fusion.addInput(weight); |
8804 | |
8805 | auto grads = rms_norm_backward( |
8806 | grad_out, input, norm_shape, rstd, weight, {true, true}); |
8807 | |
8808 | fusion.addOutput(grads.grad_input); |
8809 | fusion.addOutput(grads.grad_weight); |
8810 | |
8811 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8812 | at::Tensor aten_grad_out = at::randn(shape, options); |
8813 | at::Tensor aten_input = at::randn(shape, options); |
8814 | at::Tensor aten_weight = at::randn(norm_shape, options); |
8815 | auto at_weight = c10::optional<at::Tensor>(aten_weight); |
8816 | |
8817 | const float kEps = 1e-6; |
8818 | auto pow2 = at::pow(aten_input, 2); |
8819 | auto sum = at::sum(pow2, -1, true); |
8820 | auto var = at::mul(sum, 1.0 / NORM_SIZE); |
8821 | auto aten_rstd = at::pow(at::add(var, kEps), -0.5); |
8822 | |
8823 | FusionExecutorCache fec(std::move(fusion_ptr)); |
8824 | std::vector<IValue> aten_inputs = { |
8825 | aten_grad_out, aten_input, aten_rstd, aten_weight}; |
8826 | auto cg_outputs = fec.runFusionWithInputs(aten_inputs); |
8827 | |
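// Manual ATen reference for rms_norm_backward. In closed form (our
// reading of the code below):
//   grad_input = (rstd / H) * (H * dy * w - sum(dy * w)
//                - x * rstd * sum(dy * w * x * rstd))
// with the sums taken over the normalized dimension.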
8828 | auto in_mul_rstd = at::mul(aten_input, aten_rstd); |
8829 | auto grad_out_mul = at::mul(aten_grad_out, in_mul_rstd); |
8830 | auto aten_grad_weight = at::sum(grad_out_mul, c10::IntArrayRef{0, 1}); |
8831 | auto sum_loss1 = at::sum(at::mul(aten_grad_out, aten_weight), -1, true); |
8832 | auto sum_loss2 = at::sum( |
8833 | at::mul( |
8834 | at::mul(at::mul(aten_grad_out, aten_weight), aten_input), aten_rstd), |
8835 | -1, |
8836 | true); |
8837 | |
8838 | const float fH = NORM_SIZE; |
8839 | auto term1 = at::mul(aten_rstd, 1.0 / fH); |
8840 | auto aten_grad_input = at::mul(at::mul(aten_grad_out, fH), aten_weight); |
8841 | aten_grad_input = at::sub(aten_grad_input, sum_loss1); |
8842 | aten_grad_input = at::sub( |
8843 | aten_grad_input, at::mul(at::mul(aten_input, aten_rstd), sum_loss2)); |
8844 | aten_grad_input = at::mul(aten_grad_input, term1); |
8845 | testValidate( |
8846 | &fusion, |
8847 | cg_outputs, |
8848 | aten_inputs, |
8849 | {aten_grad_input, aten_grad_weight}, |
8850 | __LINE__, |
8851 | __FILE__); |
8852 | } |
8853 | |
8854 | TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { |
8855 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
8856 | Fusion& fusion = *fusion_ptr.get(); |
8857 | FusionGuard fg(&fusion); |
8858 | |
8859 | const float kEps = 1e-5; |
8860 | Double* eps_ptr = IrBuilder::create<Double>(kEps); |
8861 | |
8862 | std::vector<int64_t> input_shape{20, 100, 35, 67}; |
8863 | std::vector<int64_t> norm_shape{67}; |
8864 | |
8865 | auto input = makeSymbolicTensor(input_shape.size()); |
8866 | fusion.addInput(input); |
8867 | |
8868 | auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr); |
8869 | |
8870 | fusion.addOutput(result.output); |
8871 | fusion.addOutput(result.mean); |
8872 | fusion.addOutput(result.invstd); |
8873 | |
8874 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8875 | at::Tensor aten_input = at::randn(input_shape, options); |
8876 | c10::optional<at::Tensor> aten_weight = c10::nullopt; |
8877 | c10::optional<at::Tensor> aten_bias = c10::nullopt; |
8878 | auto aten_outputs = at::native_layer_norm( |
8879 | aten_input, norm_shape, aten_weight, aten_bias, kEps); |
8880 | |
8881 | // Check reduction axis is same for all reductions |
8882 | // Generate Launch Parameters |
8883 | auto reduction_params = getPersistentHeuristics(&fusion, {aten_input}); |
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
8885 | |
8886 | FusionExecutorCache fec(std::move(fusion_ptr)); |
8887 | auto cg_outputs = fec.runFusionWithInputs({aten_input}); |
8888 | |
8889 | testValidate( |
8890 | &fusion, |
8891 | cg_outputs, |
8892 | {aten_input}, |
8893 | {std::get<0>(aten_outputs), |
8894 | std::get<1>(aten_outputs), |
8895 | std::get<2>(aten_outputs)}, |
8896 | __LINE__, |
__FILE__,
"");
8899 | } |
8900 | |
8901 | TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormalization_CUDA) { |
8902 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
8903 | Fusion& fusion = *fusion_ptr.get(); |
8904 | FusionGuard fg(&fusion); |
8905 | |
8906 | int64_t NORM_SIZE = 1024; |
8907 | const float kEps = 1e-6; |
8908 | Double* eps_ptr = IrBuilder::create<Double>(kEps); |
8909 | |
8910 | std::vector<int64_t> input_shape{8, 56, NORM_SIZE}; |
8911 | std::vector<int64_t> norm_shape{NORM_SIZE}; |
8912 | |
8913 | auto input = makeContigTensor(input_shape.size()); |
8914 | fusion.addInput(input); |
8915 | auto result = rms_norm(input, norm_shape, nullptr, eps_ptr); |
8916 | |
8917 | fusion.addOutput(result.output); |
8918 | fusion.addOutput(result.invstd); |
8919 | |
8920 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8921 | at::Tensor aten_input = at::randn(input_shape, options); |
8922 | c10::optional<at::Tensor> aten_weight = c10::nullopt; |
8923 | |
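// Reference RMSNorm: scale by rsqrt(mean(x^2) + eps); unlike layer norm,
// no mean is subtracted.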
8924 | auto pow2 = at::pow(aten_input, 2); |
8925 | |
8926 | auto sum = at::sum(pow2, -1, true); |
8927 | auto var = at::mul(sum, 1.0 / NORM_SIZE); |
8928 | auto invstd = at::pow(at::add(var, kEps), -0.5); |
8929 | auto output = at::mul(aten_input, invstd); |
// Check reduction axis is same for all reductions
// Generate Launch Parameters
8932 | auto reduction_params = getPersistentHeuristics(&fusion, {aten_input}); |
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
8934 | |
8935 | FusionExecutorCache fec(std::move(fusion_ptr)); |
8936 | auto cg_outputs = fec.runFusionWithInputs({aten_input}); |
8937 | |
8938 | testValidate( |
8939 | &fusion, |
8940 | cg_outputs, |
8941 | {aten_input}, |
8942 | {output, invstd}, |
8943 | __LINE__, |
__FILE__,
"");
8946 | } |
8947 | |
8948 | TEST_F(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) { |
8949 | if (!deviceMajorMinorCheck(7)) { |
GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
8951 | return; |
8952 | } |
8953 | auto fusion = std::make_unique<Fusion>(); |
8954 | FusionGuard fg(fusion.get()); |
8955 | |
8956 | const float kMomentum = 0.1; |
8957 | const float kEps = 1e-5; |
8958 | const bool kTraining = true; |
8959 | std::vector<int64_t> input_shape{20, 100, 35, 45}; |
8960 | |
8961 | auto input = makeSymbolicTensor(input_shape.size()); |
8962 | auto weight = makeSymbolicTensor(1); |
8963 | auto bias = makeSymbolicTensor(1); |
8964 | auto running_mean = makeSymbolicTensor(1); |
8965 | auto running_var = makeSymbolicTensor(1); |
8966 | fusion->addInput(input); |
8967 | fusion->addInput(weight); |
8968 | fusion->addInput(bias); |
8969 | fusion->addInput(running_mean); |
8970 | fusion->addInput(running_var); |
8971 | |
8972 | Double* momentum = IrBuilder::create<Double>(kMomentum); |
8973 | Double* eps = IrBuilder::create<Double>(kEps); |
8974 | |
8975 | auto result = batch_norm( |
8976 | input, weight, bias, running_mean, running_var, kTraining, momentum, eps); |
8977 | |
8978 | fusion->addOutput(result.output); |
8979 | fusion->addOutput(result.mean); |
8980 | fusion->addOutput(result.invstd); |
8981 | |
8982 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
8983 | auto at_input = at::randn(input_shape, options); |
8984 | auto at_weight = at::ones({input_shape[1]}, options); |
8985 | auto at_bias = at::zeros({input_shape[1]}, options); |
8986 | auto at_run_mean = at::zeros({input_shape[1]}, options); |
8987 | auto at_run_var = at::ones({input_shape[1]}, options); |
8988 | |
8989 | std::vector<IValue> aten_inputs = { |
8990 | at_input, at_weight, at_bias, at_run_mean, at_run_var}; |
8991 | |
8992 | FusionExecutorCache executor_cache(std::move(fusion)); |
8993 | |
8994 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
8995 | |
8996 | auto aten_outputs = at::native_batch_norm( |
8997 | at_input, |
8998 | c10::optional<at::Tensor>(at_weight), |
8999 | c10::optional<at::Tensor>(at_bias), |
9000 | c10::optional<at::Tensor>(at_run_mean), |
9001 | c10::optional<at::Tensor>(at_run_var), |
9002 | kTraining, |
9003 | kMomentum, |
9004 | kEps); |
9005 | |
9006 | testValidate( |
9007 | executor_cache.fusion(), |
9008 | cg_outputs, |
9009 | aten_inputs, |
9010 | {std::get<0>(aten_outputs), |
9011 | std::get<1>(aten_outputs), |
9012 | std::get<2>(aten_outputs)}, |
9013 | __LINE__, |
__FILE__,
"");
9016 | } |
9017 | |
9018 | TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalization_CUDA) { |
9019 | if (!deviceMajorMinorCheck(7)) { |
GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
9021 | return; |
9022 | } |
9023 | auto fusion = std::make_unique<Fusion>(); |
9024 | FusionGuard fg(fusion.get()); |
9025 | |
9026 | const float kMomentum = 0.1; |
9027 | const float kEps = 1e-5; |
9028 | const bool kUseInputStats = true; |
9029 | std::vector<int64_t> input_shape{20, 100, 35, 45}; |
9030 | |
9031 | auto input = makeSymbolicTensor(input_shape.size()); |
9032 | auto weight = makeSymbolicTensor(1); |
9033 | auto bias = makeSymbolicTensor(1); |
9034 | auto running_mean = makeSymbolicTensor(1); |
9035 | auto running_var = makeSymbolicTensor(1); |
9036 | fusion->addInput(input); |
9037 | fusion->addInput(weight); |
9038 | fusion->addInput(bias); |
9039 | fusion->addInput(running_mean); |
9040 | fusion->addInput(running_var); |
9041 | |
9042 | Double* momentum = IrBuilder::create<Double>(kMomentum); |
9043 | Double* eps = IrBuilder::create<Double>(kEps); |
9044 | |
9045 | auto result = instance_norm( |
9046 | input, |
9047 | weight, |
9048 | bias, |
9049 | running_mean, |
9050 | running_var, |
9051 | kUseInputStats, |
9052 | momentum, |
9053 | eps); |
9054 | |
9055 | fusion->addOutput(result.output); |
9056 | // fusion->addOutput(result.mean); |
9057 | // fusion->addOutput(result.invstd); |
9058 | |
9059 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9060 | auto at_input = at::randn(input_shape, options); |
9061 | auto at_weight = at::ones({input_shape[1]}, options); |
9062 | auto at_bias = at::zeros({input_shape[1]}, options); |
9063 | auto at_run_mean = at::zeros({input_shape[1]}, options); |
9064 | auto at_run_var = at::ones({input_shape[1]}, options); |
9065 | |
9066 | std::vector<IValue> aten_inputs = { |
9067 | at_input, at_weight, at_bias, at_run_mean, at_run_var}; |
9068 | |
9069 | FusionExecutorCache executor_cache(std::move(fusion)); |
9070 | |
9071 | auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); |
9072 | auto cg_outputs_full = {at_run_mean, at_run_var, cg_outputs[0]}; |
9073 | |
9074 | auto aten_outputs = at::instance_norm( |
9075 | at_input, |
9076 | c10::optional<at::Tensor>(at_weight), |
9077 | c10::optional<at::Tensor>(at_bias), |
9078 | c10::optional<at::Tensor>(at_run_mean), |
9079 | c10::optional<at::Tensor>(at_run_var), |
9080 | kUseInputStats, |
9081 | kMomentum, |
9082 | kEps, |
9083 | false); |
9084 | |
9085 | testValidate( |
9086 | executor_cache.fusion(), |
9087 | cg_outputs, |
9088 | aten_inputs, |
9089 | // TODO: can run_mean/run_var be checked here? |
9090 | // fusion_outputs.size() == aten_outputs.size() && aten_outputs.size() == |
9091 | // fusion->outputs().size() - output_alias_indices.size() |
9092 | {aten_outputs}, |
9093 | __LINE__, |
__FILE__,
"");
9096 | } |
9097 | |
9098 | TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalizationBackward_CUDA) { |
9099 | if (!deviceMajorMinorCheck(7)) { |
GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
9101 | return; |
9102 | } |
9103 | auto fusion_forward = std::make_unique<Fusion>(); |
9104 | FusionGuard fg_forward(fusion_forward.get()); |
9105 | |
9106 | const float kMomentum = 0.1; |
9107 | const float kEps = 1e-5; |
9108 | const bool kUseInputStats = true; |
9109 | const bool channels_last = true; |
9110 | const int B = 2; |
9111 | const int C = 5; |
9112 | const int S = 3; |
9113 | std::vector<int64_t> input_shape{B, C, S, S, S}; |
9114 | // explicit channels-last for NVFuser |
9115 | std::vector<int64_t> nvfuser_input_shape{B, S, S, S, C}; |
9116 | |
9117 | auto input = makeContigTensor(input_shape.size()); |
9118 | auto weight = makeContigTensor(1); |
9119 | auto bias = makeContigTensor(1); |
9120 | fusion_forward->addInput(input); |
9121 | fusion_forward->addInput(weight); |
9122 | fusion_forward->addInput(bias); |
9123 | |
9124 | Double* momentum = IrBuilder::create<Double>(kMomentum); |
9125 | Double* eps = IrBuilder::create<Double>(kEps); |
9126 | auto result_forward = instance_norm( |
9127 | input, |
9128 | weight, |
9129 | bias, |
9130 | nullptr, |
9131 | nullptr, |
9132 | kUseInputStats, |
9133 | momentum, |
9134 | eps, |
9135 | channels_last); |
9136 | fusion_forward->addOutput(result_forward.output); |
9137 | fusion_forward->addOutput(result_forward.mean); |
9138 | fusion_forward->addOutput(result_forward.invstd); |
9139 | |
9140 | FusionExecutorCache executor_cache_forward(std::move(fusion_forward)); |
9141 | |
9142 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9143 | auto at_input = at::randn(input_shape, options) |
9144 | .to(at::MemoryFormat::ChannelsLast3d) |
9145 | .set_requires_grad(true); |
9146 | auto at_input_nvfuser = at_input.clone().detach().permute({0, 2, 3, 4, 1}); |
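// NVFuser takes channels-last as an explicitly permuted (N, D, H, W, C)
// tensor, matching nvfuser_input_shape above, rather than relying on the
// ATen memory-format strides.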
9147 | auto at_weight = at::ones({input_shape[1]}, options).set_requires_grad(true); |
9148 | auto at_weight_nvfuser = at_weight.clone().detach(); |
9149 | auto at_bias = at::zeros({input_shape[1]}, options).set_requires_grad(true); |
9150 | auto at_bias_nvfuser = at_bias.clone().detach(); |
9151 | std::vector<torch::jit::IValue> aten_inputs_forward = { |
9152 | at_input_nvfuser, at_weight_nvfuser, at_bias_nvfuser}; |
9153 | // out, mean, invstd |
9154 | auto outputs_forward = |
9155 | executor_cache_forward.runFusionWithInputs(aten_inputs_forward); |
9156 | auto at_out = at::instance_norm( |
9157 | at_input, |
9158 | c10::optional<at::Tensor>(at_weight), |
9159 | c10::optional<at::Tensor>(at_bias), |
9160 | c10::optional<at::Tensor>(c10::nullopt), |
9161 | c10::optional<at::Tensor>(c10::nullopt), |
9162 | kUseInputStats, |
9163 | kMomentum, |
9164 | kEps, |
9165 | false); |
9166 | auto at_grad = |
9167 | at::randn(input_shape, options).to(at::MemoryFormat::ChannelsLast3d); |
9168 | auto at_grad_nvfuser = at_grad.clone().detach().permute({0, 2, 3, 4, 1}); |
9169 | at_out.backward(at_grad); |
9170 | auto fusion_backward = std::make_unique<Fusion>(); |
9171 | FusionGuard fg_backward(fusion_backward.get()); |
9172 | |
9173 | input = makeContigTensor(input_shape.size()); |
9174 | auto grad_output = makeContigTensor(input_shape.size()); |
9175 | weight = makeContigTensor(1); |
9176 | auto save_mean = makeContigTensor(2); |
9177 | auto save_invstd = makeContigTensor(2); |
9178 | auto dummy = makeContigTensor(0); |
9179 | |
9180 | fusion_backward->addInput(input); |
9181 | fusion_backward->addInput(grad_output); |
9182 | fusion_backward->addInput(weight); |
9183 | fusion_backward->addInput(dummy); // dummy for run_mean |
9184 | fusion_backward->addInput(dummy); // dummy for run_var |
9185 | fusion_backward->addInput(save_mean); |
9186 | fusion_backward->addInput(save_invstd); |
9187 | |
9188 | auto result_backward = instance_norm_backward( |
9189 | input, |
9190 | grad_output, |
9191 | weight, |
9192 | nullptr, |
9193 | nullptr, |
9194 | save_mean, |
9195 | save_invstd, |
9196 | kUseInputStats, |
9197 | eps, |
9198 | {true, true, true}, |
9199 | channels_last); |
9200 | |
9201 | fusion_backward->addOutput(result_backward.grad_input); |
9202 | fusion_backward->addOutput(result_backward.grad_weight); |
9203 | fusion_backward->addOutput(result_backward.grad_bias); |
9204 | |
9205 | FusionExecutorCache executor_cache_backward(std::move(fusion_backward)); |
9206 | std::vector<torch::jit::IValue> aten_inputs_backward = { |
9207 | at_input_nvfuser, |
9208 | at_grad_nvfuser, |
9209 | at_weight_nvfuser, |
9210 | at::empty({}), |
9211 | at::empty({}), |
9212 | outputs_forward[1], |
9213 | outputs_forward[2]}; |
9214 | auto outputs_backward = |
9215 | executor_cache_backward.runFusionWithInputs(aten_inputs_backward); |
9216 | outputs_backward[0] = outputs_backward[0].permute({0, 4, 1, 2, 3}); |
9217 | testValidate( |
9218 | executor_cache_backward.fusion(), |
9219 | outputs_backward, |
9220 | aten_inputs_backward, |
9221 | {at_input.grad(), at_weight.grad(), at_bias.grad()}, |
9222 | __LINE__, |
__FILE__,
"");
9225 | } |
9226 | |
9227 | TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalShared_CUDA) { |
9228 | Fusion fusion; |
9229 | FusionGuard fg(&fusion); |
9230 | |
9231 | const int pixels_per_thread = 64; |
9232 | const int TIDX = 128; |
9233 | const int static_size = pixels_per_thread * TIDX; |
9234 | |
9235 | TensorView* sx = makeConcreteTensor({-1, static_size}); |
9236 | TensorView* dx = makeSymbolicTensor(2); |
9237 | fusion.addInput(sx); |
9238 | fusion.addInput(dx); |
9239 | |
9240 | TensorView* max_sx = reductionOp( |
9241 | BinaryOpType::Max, |
9242 | {-1}, |
9243 | IrBuilder::create<Double>(std::numeric_limits<float>::lowest()), |
9244 | sx); // (M) |
9245 | TensorView* max_dx = reductionOp( |
9246 | BinaryOpType::Max, |
9247 | {-1}, |
9248 | IrBuilder::create<Double>(std::numeric_limits<float>::lowest()), |
9249 | dx); // (M) |
9250 | |
9251 | // Reduction => merge local and shared memory TensorViews |
9252 | TensorView* max_val = binaryOp(BinaryOpType::Max, max_sx, max_dx); |
9253 | TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B) |
9254 | |
9255 | TensorView* sx_max_sub = sub(sx, bcast_max); // (M, N) |
9256 | TensorView* dx_max_sub = sub(dx, bcast_max); // (M, N) |
9257 | |
9258 | TensorView* sx_exp = unaryOp(UnaryOpType::Exp, sx_max_sub); // (M, N) |
9259 | TensorView* dx_exp = unaryOp(UnaryOpType::Exp, dx_max_sub); // (M, N) |
9260 | |
9261 | TensorView* sx_sum_exp = sum(sx_exp, {-1}); // (M, R) |
9262 | TensorView* dx_sum_exp = sum(dx_exp, {-1}); // (M, R) |
9263 | |
9264 | // Reduction => merge local and shared memory TensorViews |
9265 | TensorView* sum_exp = binaryOp(BinaryOpType::Add, sx_sum_exp, dx_sum_exp); |
9266 | TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B) |
9267 | |
9268 | TensorView* sx_softmax = div(sx_exp, bcast_sum); // (M, N) |
9269 | TensorView* dx_softmax = div(dx_exp, bcast_sum); // (M, N) |
9270 | fusion.addOutput(sx_softmax); |
9271 | fusion.addOutput(dx_softmax); |
9272 | |
9273 | auto sx_cache = sx->cacheAfter(); |
9274 | auto dx_cache = dx->cacheAfter(); |
9275 | dx_cache->setMemoryType(MemoryType::Shared); |
9276 | dx_exp->setMemoryType(MemoryType::Shared); |
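// Only the dynamically sized tensors are kept in shared memory; the
// statically sized half stays in (register-backed) local memory.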
9277 | |
9278 | // Reduction and Broadcast Tensors common to both memory TVs |
9279 | std::vector<TensorView*> common_tensors( |
9280 | {max_val, sum_exp, bcast_max, bcast_sum}); |
9281 | |
9282 | // Static Local Memory TVs |
9283 | std::vector<TensorView*> static_tensors( |
9284 | {sx, sx_cache, max_sx, sx_max_sub, sx_exp, sx_sum_exp, sx_softmax}); |
9285 | |
9286 | // Dynamic Local Memory TVs |
9287 | std::vector<TensorView*> dynamic_tensors( |
9288 | {dx, dx_cache, max_dx, dx_max_sub, dx_exp, dx_sum_exp, dx_softmax}); |
9289 | |
9290 | std::vector<TensorView*> all_tensors; |
9291 | all_tensors.insert( |
9292 | all_tensors.end(), common_tensors.begin(), common_tensors.end()); |
9293 | all_tensors.insert( |
9294 | all_tensors.end(), static_tensors.begin(), static_tensors.end()); |
9295 | all_tensors.insert( |
9296 | all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end()); |
9297 | |
9298 | // M => M |
9299 | // M, N => M, N/128, 128 |
9300 | for (auto tensor : all_tensors) { |
9301 | if (tensor->nDims() > 1) { |
9302 | tensor->split(-1, TIDX); |
9303 | } |
9304 | } |
9305 | |
9306 | auto sx_sum_exp_rf = sx_sum_exp->rFactor({1}); |
9307 | auto dx_sum_exp_rf = dx_sum_exp->rFactor({1}); |
9308 | all_tensors.push_back(sx_sum_exp_rf); |
9309 | all_tensors.push_back(dx_sum_exp_rf); |
9310 | |
9311 | // computeAt |
9312 | sx->computeAt(sx_max_sub, 1); |
9313 | dx->computeAt(dx_max_sub, 1); |
9314 | |
9315 | sx_exp->computeAt(sx_softmax, 1); |
9316 | dx_exp->computeAt(dx_softmax, 1); |
9317 | |
9318 | sx_max_sub->computeAt(sx_exp, 2); |
9319 | dx_max_sub->computeAt(dx_exp, 2); |
9320 | |
9321 | sx_softmax->axis(0)->parallelize(ParallelType::BIDx); |
9322 | dx_softmax->axis(0)->parallelize(ParallelType::BIDx); |
9323 | for (auto tensor : all_tensors) { |
9324 | if (tensor->nDims() > 1) { |
9325 | tensor->axis(-1)->parallelize(ParallelType::TIDx); |
9326 | } |
9327 | } |
9328 | |
9329 | const int64_t dimx = 1024; |
9330 | const int64_t dimy = 16384; |
9331 | |
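// Rough estimate of the shared memory this schedule needs per block:
// the dynamically sized row chunk plus one TIDX-wide buffer.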
9332 | auto properties = at::cuda::getDeviceProperties(0); |
9333 | const size_t required_smem_size = |
9334 | (dimy - static_size) * sizeof(float) + TIDX * sizeof(float); |
9335 | if (properties->sharedMemPerBlockOptin < required_smem_size) { |
9336 | GTEST_SKIP() << "not enough shared memory space on device to run test: " |
<< properties->sharedMemPerBlockOptin;
9338 | } |
9339 | |
9340 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9341 | at::Tensor aten_input = at::randn({dimx, dimy}, options); |
9342 | at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size); |
9343 | at::Tensor aten_dynamic_in = |
9344 | aten_input.narrow(1, static_size, dimy - static_size); |
9345 | |
9346 | at::Tensor out = at::zeros({dimx, dimy}, options); |
9347 | at::Tensor cg_static_out = out.narrow(1, 0, static_size); |
9348 | at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size); |
9349 | |
9352 | auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); |
9353 | at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size); |
9354 | at::Tensor aten_dynamic_out = |
9355 | aten_output.narrow(1, static_size, dimy - static_size); |
9356 | |
9357 | torch::jit::fuser::cuda::FusionExecutor fe; |
9358 | fe.compileFusion(&fusion, {aten_static_in, aten_dynamic_in}); |
9359 | fe.runFusion( |
9360 | {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); |
9361 | |
9362 | testValidate( |
9363 | &fusion, |
9364 | {cg_static_out, cg_dynamic_out}, |
9365 | {aten_static_in, aten_dynamic_in}, |
{aten_static_out, aten_dynamic_out},
9367 | __LINE__, |
9368 | __FILE__); |
9369 | } |
9370 | |
9371 | TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { |
9372 | Fusion fusion; |
9373 | FusionGuard fg(&fusion); |
9374 | |
9375 | const int pixels_per_thread = 64; |
9376 | const int TIDX = 128; |
9377 | const int static_size = pixels_per_thread * TIDX; |
9378 | |
9379 | TensorView* sx = makeConcreteTensor({-1, static_size}); |
9380 | TensorView* dx = makeSymbolicTensor(2); |
9381 | fusion.addInput(sx); |
9382 | fusion.addInput(dx); |
9383 | |
9384 | Double* gamma = IrBuilder::create<Double>(); |
9385 | Double* beta = IrBuilder::create<Double>(); |
9386 | Double* eps = IrBuilder::create<Double>(); |
9387 | Int* N = IrBuilder::create<Int>(); |
9388 | fusion.addInput(gamma); |
9389 | fusion.addInput(beta); |
9390 | fusion.addInput(eps); |
9391 | fusion.addInput(N); |
9392 | |
9393 | // Reduction |
9394 | auto sx_sum = sum(sx, {-1}); // (M, R) |
9395 | auto dx_sum = sum(dx, {-1}); // (M, R) |
9396 | // Reduction => merge local and shared memory TensorViews |
9397 | auto x_sum = binaryOp(BinaryOpType::Add, sx_sum, dx_sum); |
9398 | |
9399 | // Broadcast |
9400 | auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B) |
9401 | // Pwise |
9402 | auto x_mean = div(x_sum_bcast, N); // (M, B) |
9403 | |
9404 | auto sx_mean_sub = sub(sx, x_mean); // (M, N) |
9405 | auto dx_mean_sub = sub(dx, x_mean); // (M, N) |
9406 | |
9407 | auto sx_mean_sub_pow = mul(sx_mean_sub, sx_mean_sub); // (M, N) |
9408 | auto dx_mean_sub_pow = mul(dx_mean_sub, dx_mean_sub); // (M, N) |
9409 | |
9410 | // Reduction |
9411 | auto sx_var_sum = sum(sx_mean_sub_pow, {-1}); // (M, R) |
9412 | auto dx_var_sum = sum(dx_mean_sub_pow, {-1}); // (M, R) |
9413 | // Reduction => merge local and shared memory TensorViews |
9414 | auto var_sum = binaryOp(BinaryOpType::Add, sx_var_sum, dx_var_sum); |
9415 | |
9416 | // Broadcast |
9417 | auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B) |
9418 | // Pwise |
9419 | auto var = div(var_sum_bcast, N); // (M, B) |
9420 | auto var_eps = add(var, eps); // (M, B) |
9421 | auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B) |
9422 | |
9423 | auto sx_norm = mul(sx_mean_sub, rvar); |
9424 | auto dx_norm = mul(dx_mean_sub, rvar); |
9425 | |
9426 | auto sx_norm_gamma = mul(sx_norm, gamma); |
9427 | auto dx_norm_gamma = mul(dx_norm, gamma); |
9428 | |
9429 | auto sx_norm_gamma_beta = add(sx_norm_gamma, beta); |
9430 | auto dx_norm_gamma_beta = add(dx_norm_gamma, beta); |
9431 | |
9432 | fusion.addOutput(sx_norm_gamma_beta); |
9433 | fusion.addOutput(dx_norm_gamma_beta); |
9434 | |
9435 | sx_norm_gamma_beta->setContiguity(false); |
9436 | dx_norm_gamma_beta->setContiguity(false); |
9437 | |
9438 | // Read Input into Shared Memory |
9439 | // Read Input minus Input_Mean into Shared Memory |
9440 | auto sx_cache = sx->cacheAfter(); |
9441 | auto dx_cache = dx->cacheAfter(); |
9442 | dx_cache->setMemoryType(MemoryType::Shared); |
9443 | dx_mean_sub->setMemoryType(MemoryType::Shared); |
9444 | |
9445 | std::vector<TensorView*> common_tensors( |
9446 | {x_sum, x_sum_bcast, x_mean, var_sum, var_sum_bcast, var, var_eps, rvar}); |
9447 | |
9448 | std::vector<TensorView*> static_tensors( |
9449 | {sx, |
9450 | sx_cache, |
9451 | sx_sum, |
9452 | sx_mean_sub, |
9453 | sx_mean_sub_pow, |
9454 | sx_var_sum, |
9455 | sx_norm, |
9456 | sx_norm_gamma, |
9457 | sx_norm_gamma_beta}); |
9458 | |
9459 | std::vector<TensorView*> dynamic_tensors( |
9460 | {dx, |
9461 | dx_cache, |
9462 | dx_sum, |
9463 | dx_mean_sub, |
9464 | dx_mean_sub_pow, |
9465 | dx_var_sum, |
9466 | dx_norm, |
9467 | dx_norm_gamma, |
9468 | dx_norm_gamma_beta}); |
9469 | |
9470 | std::vector<TensorView*> all_tensors; |
9471 | all_tensors.insert( |
9472 | all_tensors.end(), common_tensors.begin(), common_tensors.end()); |
9473 | all_tensors.insert( |
9474 | all_tensors.end(), static_tensors.begin(), static_tensors.end()); |
9475 | all_tensors.insert( |
9476 | all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end()); |
9477 | |
9478 | // M => M |
9479 | // M, N => M, N/128, 128 |
9480 | for (auto tensor : all_tensors) { |
9481 | if (tensor->nDims() > 1) { |
9482 | tensor->split(-1, TIDX); |
9483 | } |
9484 | } |
9485 | |
9486 | // Local Sum => Block Broadcast |
9487 | TensorView* sx_sum_rf = sx_sum->rFactor({1}); |
9488 | TensorView* sx_var_sum_rf = sx_var_sum->rFactor({1}); |
9489 | TensorView* dx_sum_rf = dx_sum->rFactor({1}); |
9490 | TensorView* dx_var_sum_rf = dx_var_sum->rFactor({1}); |
9491 | all_tensors.push_back(sx_sum_rf); |
9492 | all_tensors.push_back(sx_var_sum_rf); |
9493 | all_tensors.push_back(dx_sum_rf); |
9494 | all_tensors.push_back(dx_var_sum_rf); |
9495 | |
9496 | // ComputeAt |
9497 | sx->computeAt(sx_mean_sub_pow, 1); |
9498 | dx->computeAt(dx_mean_sub_pow, 1); |
9499 | |
9500 | var_sum->computeAt(rvar, 1); |
9501 | |
9502 | sx_mean_sub_pow->computeAt(sx_var_sum_rf, 2); |
9503 | dx_mean_sub_pow->computeAt(dx_var_sum_rf, 2); |
9504 | |
9505 | sx_norm->computeAt(sx_norm_gamma_beta, 2); |
9506 | dx_norm->computeAt(dx_norm_gamma_beta, 2); |
9507 | |
9508 | sx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx); |
9509 | dx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx); |
9510 | for (auto tensor : all_tensors) { |
9511 | if (tensor->nDims() > 1) { |
9512 | tensor->axis(-1)->parallelize(ParallelType::TIDx); |
9513 | } |
9514 | } |
9515 | |
9516 | const int dimx = 1024; |
9517 | const int dimy = 16384; |
9518 | const float kGamma = 1.0f; |
9519 | const float kBeta = 0.0f; |
9520 | const float kEps = 1e-5; |
9521 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9522 | |
9523 | auto properties = at::cuda::getDeviceProperties(0); |
9524 | const size_t required_smem_size = |
9525 | (dimy - static_size) * sizeof(float) + TIDX * sizeof(float); |
9526 | if (properties->sharedMemPerBlockOptin < required_smem_size) { |
9527 | GTEST_SKIP() << "not enough shared memory space on device to run test: " |
9528 | << properties->sharedMemPerBlock; |
9529 | } |
9530 | |
9531 | at::Tensor aten_input = at::randn({dimx, dimy}, options); |
9532 | at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size); |
9533 | at::Tensor aten_dynamic_in = |
9534 | aten_input.narrow(1, static_size, dimy - static_size); |
9535 | |
9536 | at::Tensor out = at::zeros({dimx, dimy}, options); |
9537 | at::Tensor cg_static_out = out.narrow(1, 0, static_size); |
9538 | at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size); |
9539 | |
9540 | std::vector<IValue> aten_inputs = { |
9541 | aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; |
9542 | |
9543 | torch::jit::fuser::cuda::FusionExecutor fe; |
9544 | fe.compileFusion(&fusion, aten_inputs); |
9545 | |
9546 | fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); |
9547 | |
9548 | auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); |
9549 | auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); |
9550 | auto at_rvar = at::rsqrt(at::add(at_var, kEps)); |
9551 | auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar); |
9552 | auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta); |
9553 | at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size); |
9554 | at::Tensor aten_dynamic_out = |
9555 | aten_output.narrow(1, static_size, dimy - static_size); |
9556 | |
9557 | testValidate( |
9558 | &fusion, |
9559 | {cg_static_out, cg_dynamic_out}, |
9560 | aten_inputs, |
9561 | {aten_static_out, aten_dynamic_out}, |
9562 | __LINE__, |
9563 | __FILE__); |
9564 | } |
9565 | |
9566 | TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { |
9567 | Fusion fusion; |
9568 | FusionGuard fg(&fusion); |
9569 | |
9570 | // Set up your input tensor views |
9571 | auto x = makeSymbolicTensor(2); |
9572 | Double* gamma = IrBuilder::create<Double>(); |
9573 | Double* beta = IrBuilder::create<Double>(); |
9574 | Double* eps = IrBuilder::create<Double>(); |
9575 | Int* N = IrBuilder::create<Int>(); |
9576 | fusion.addInput(x); |
9577 | fusion.addInput(gamma); |
9578 | fusion.addInput(beta); |
9579 | fusion.addInput(eps); |
9580 | fusion.addInput(N); |
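  // Assemble a layer-norm style normalization from explicit reduction,
  // broadcast, and pointwise ops:
  //   out = (x - mean(x)) * rsqrt(var(x) + eps) * gamma + beta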
9581 | |
9582 | // Reduction |
9583 | auto x_sum = sum(x, {-1}); // (M, R) |
9584 | // Broadcast |
9585 | auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B) |
9586 | // Pwise |
9587 | auto x_mean = div(x_sum_bcast, N); // (M, B) |
9588 | auto x_mean_sub = sub(x, x_mean); // (M, N) |
9589 | auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); // (M, N) |
9590 | // Reduction |
9591 | auto var_sum = sum(x_mean_sub_pow, {-1}); // (M, R) |
9592 | // Broadcast |
9593 | auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B) |
9594 | // Pwise |
9595 | auto var = div(var_sum_bcast, N); // (M, B) |
9596 | auto var_eps = add(var, eps); // (M, B) |
9597 | auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B) |
9598 | auto norm = mul(x_mean_sub, rvar); |
9599 | auto norm_gamma = mul(norm, gamma); |
9600 | auto norm_gamma_beta = add(norm_gamma, beta); |
9601 | fusion.addOutput(norm_gamma_beta); |
9602 | |
9603 | // Read Input into Shared Memory |
9604 | // Read Input minus Input_Mean into Shared Memory |
9605 | auto cache_x = x->cacheAfter(); |
9606 | cache_x->setMemoryType(MemoryType::Shared); |
9607 | x_mean_sub->setMemoryType(MemoryType::Shared); |
9608 | |
9609 | std::vector<TensorView*> all_tensors( |
9610 | {x_sum, |
9611 | x_mean, |
9612 | cache_x, |
9613 | x_sum_bcast, |
9614 | x_mean_sub, |
9615 | x_mean_sub_pow, |
9616 | var_sum, |
9617 | var_sum_bcast, |
9618 | var, |
9619 | var_eps, |
9620 | rvar, |
9621 | norm, |
9622 | norm_gamma, |
9623 | norm_gamma_beta}); |
9624 | |
9625 | auto tidx = IrBuilder::create<Int>(); |
9626 | fusion.addInput(tidx); |
9627 | |
9628 | for (auto tensor : all_tensors) { |
9629 | tensor->split(-1, tidx); |
9630 | } |
9631 | |
9632 | // Local Sum => Block Broadcast |
9633 | TensorView* x_sum_rf = x_sum->rFactor({1}); |
9634 | TensorView* var_sum_rf = var_sum->rFactor({1}); |
9635 | all_tensors.push_back(x_sum_rf); |
9636 | all_tensors.push_back(var_sum_rf); |
9637 | |
9638 | // ComputeAt |
9639 | x->computeAt(x_mean_sub_pow, 1); |
9640 | var_sum->computeAt(rvar, 1); |
9641 | x_mean_sub_pow->computeAt(var_sum_rf, 2); |
9642 | norm->computeAt(norm_gamma_beta, 2); |
9643 | |
9644 | for (auto tv : all_tensors) { |
9645 | tv->axis(0)->parallelize(ParallelType::BIDx); |
9646 | tv->axis(-1)->parallelize(ParallelType::TIDx); |
9647 | } |
9648 | |
9649 | const int dimx = 128; |
9650 | const int dimy = 2048; |
9651 | const float kGamma = 1.0f; |
9652 | const float kBeta = 0.0f; |
9653 | const float kEps = 1e-5; |
9654 | const int TIDX = 128; |
9655 | |
9656 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9657 | at::Tensor aten_input = at::randn({dimx, dimy}, options); |
9658 | auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); |
  // Use the biased variance (divide by N) to match the fusion's div(..., N)
  auto at_var =
      at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1);
9660 | auto at_rvar = at::rsqrt(at::add(at_var, kEps)); |
9661 | auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar); |
9662 | auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta); |
9663 | |
9664 | std::vector<IValue> aten_inputs = { |
9665 | aten_input, kGamma, kBeta, kEps, dimy, TIDX}; |
9666 | |
9667 | torch::jit::fuser::cuda::FusionExecutor fe; |
9668 | fe.compileFusion(&fusion, aten_inputs); |
9669 | auto cg_outputs = fe.runFusion(aten_inputs); |
9670 | |
9671 | testValidate( |
9672 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
9673 | } |
9674 | |
9675 | TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { |
9676 | Fusion fusion; |
9677 | FusionGuard fg(&fusion); |
9678 | |
9679 | // Set up your input tensor views |
9680 | TensorView* tv0 = makeSymbolicTensor(2); |
9681 | TensorView* tv1 = |
9682 | reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0); |
9683 | fusion.addInput(tv0); |
9684 | fusion.addOutput(tv1); |
9685 | // tv1[I0, R1] = tv0[I0, I1] |
9686 | |
  // The interface should just be a direct split with a parallel type; if we
  // did that, the parallelize call could be folded into the split.
  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
  // tv1[I0, R1o, R1i{TIDx}] = tv0[I0, I1]
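  // NamedScalar::getParallelDim(ParallelType::TIDx) is the symbolic
  // blockDim.x, so the inner reduction extent is tied directly to the launch
  // configuration (runtime_threadIdx_dim == 128 below).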
9691 | |
9692 | TensorView* tv2 = tv1->rFactor({2}); |
9693 | tv2->setMemoryType(MemoryType::Shared); |
  // tv2[I0, Ir1o, R1i{TIDx}] = tv0[I0, I1]
  // tv1[I0, R1o] = tv2[I0, Ir1o, R1i{TIDx}]
9696 | |
9697 | tv0->computeAt(tv1, 1); |
9698 | |
9699 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
9700 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
9701 | |
9702 | constexpr int numel_x = 65000, numel_y = 1024; |
9703 | |
9704 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9705 | at::Tensor aten_input = at::randn({numel_x, numel_y}, options); |
9706 | auto aten_output = aten_input.to(at::kDouble).sum({1}); |
9707 | |
9708 | // How many threads to use for the block reduction |
9709 | constexpr int runtime_threadIdx_dim = 128; |
9710 | |
9711 | LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); |
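  // LaunchParams(gdimx, gdimy, gdimz, bdimx, bdimy, bdimz): -1 leaves a
  // dimension to be inferred by the executor, so only blockDim.x is pinned
  // here, to the 128-thread block-reduction width.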
9712 | |
9713 | FusionExecutor fe; |
9714 | fe.compileFusion(&fusion, {aten_input}, lparams); |
9715 | auto cg_outputs = fe.runFusion({aten_input}, lparams); |
9716 | |
9717 | testValidate( |
9718 | &fusion, |
9719 | cg_outputs, |
9720 | {aten_input}, |
9721 | {aten_output}, |
9722 | __LINE__, |
9723 | __FILE__, |
9724 | "" , |
9725 | lparams); |
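  // The lowering inserts WAR __syncthreads() only where a shared-memory
  // buffer could be overwritten before all threads finish reading it; this
  // schedule is expected to need none.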
9726 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); |
9727 | } |
9728 | |
9729 | TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { |
9730 | Fusion fusion; |
9731 | FusionGuard fg(&fusion); |
9732 | |
9733 | // Algorithm |
9734 | Int* sym_bsx = IrBuilder::create<Int>(); |
9735 | TensorView* tv0 = makeSymbolicTensor(3); // M, K, N |
9736 | fusion.addInput(tv0); |
9737 | fusion.addInput(sym_bsx); |
9738 | |
9739 | TensorView* tv1 = sum(tv0, {1}); // M, R, N |
9740 | fusion.addOutput(tv1); |
9741 | |
9742 | TensorView* tv2 = tv0->cacheAfter(); |
9743 | tv2->setMemoryType(MemoryType::Shared); |
9744 | |
9745 | // Schedule |
9746 | constexpr int BSX = 32; |
9747 | tv1->split(2, BSX); |
9748 | tv1->split(1, sym_bsx); |
9749 | tv1->split(0, BSX); |
9750 | // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX |
9751 | tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); |
9752 | TensorView* tv3 = tv1->rFactor({-2}); |
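  // After the {old -> new} reorder the layout is [Mo, No, Mi, Ni, Ko, Ki];
  // rFactor({-2}) gives tv3 the serial Ko reduction and leaves tv1 with the
  // Ki block reduction (extent sym_bsx, bound to TIDx below).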
9753 | |
9754 | tv0->computeAt(tv1, -2); |
9755 | tv0->computeAt(tv3, -2); |
9756 | |
9757 | // Thread and Block binding |
9758 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
9759 | tv1->axis(1)->parallelize(ParallelType::BIDy); |
9760 | tv1->axis(-1)->parallelize(ParallelType::TIDx); |
9761 | // Manual Binding |
9762 | tv2->axis(-1)->parallelize(ParallelType::TIDx); |
9763 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
9764 | |
9765 | constexpr int M = 154, K = 45, N = 1524; |
9766 | |
9767 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9768 | at::Tensor aten_input = at::randn({M, K, N}, options); |
9769 | at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}); |
9770 | |
9771 | // How many threads to use for the block reduction |
9772 | constexpr int runtime_threadIdx_dim = 128; |
9773 | |
9774 | auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); |
9775 | |
9776 | FusionExecutor fe; |
9777 | fe.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); |
9778 | auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams); |
9779 | |
9780 | testValidate( |
9781 | &fusion, |
9782 | cg_outputs, |
9783 | {aten_input, runtime_threadIdx_dim}, |
9784 | {aten_output}, |
9785 | __LINE__, |
9786 | __FILE__, |
9787 | "" , |
9788 | lparams); |
9789 | |
9790 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); |
9791 | } |
9792 | |
9793 | TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { |
9794 | Fusion fusion; |
9795 | FusionGuard fg(&fusion); |
9796 | |
9797 | Int* sym_bsx = IrBuilder::create<Int>(); |
9798 | TensorView* tv0 = makeSymbolicTensor(2); // (M, K) |
9799 | TensorView* tv1 = makeSymbolicTensor(2); // (K, N) |
9800 | TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) |
9801 | TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) |
9802 | TensorView* tv4 = mul(tv2, tv3); // M, K, N |
9803 | fusion.addInput(tv0); |
9804 | fusion.addInput(tv1); |
9805 | fusion.addInput(sym_bsx); |
9806 | fusion.addOutput(tv4); |
9807 | // Algorithm |
9808 | |
9809 | tv2->setMemoryType(MemoryType::Shared); |
9810 | tv3->setMemoryType(MemoryType::Shared); |
9811 | |
9812 | constexpr int BSX = 32; |
9813 | tv4->split(2, BSX); |
9814 | tv4->split(1, sym_bsx); |
9815 | tv4->split(0, BSX); |
9816 | // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX |
9817 | tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); |
9818 | // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX |
9819 | |
9820 | tv0->computeAt(tv4, 3); |
9821 | tv1->computeAt(tv4, 3); |
9822 | // Schedule |
9823 | |
9824 | tv4->axis(0)->parallelize(ParallelType::BIDx); |
9825 | tv4->axis(2)->parallelize(ParallelType::BIDy); |
9826 | // Manual Binding |
9827 | tv2->axis(-2)->parallelize(ParallelType::TIDx); |
9828 | tv3->axis(-1)->parallelize(ParallelType::TIDx); |
9829 | // Thread and Block binding |
9830 | |
9831 | constexpr int M = 128, K = 457, N = 1024; |
9832 | |
9833 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9834 | at::Tensor t0 = at::randn({M, K}, options); |
9835 | at::Tensor t1 = at::randn({K, N}, options); |
9836 | at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); |
9837 | std::vector<IValue> aten_inputs = {t0, t1, BSX}; |
9838 | |
9839 | LaunchParams lparams(-1, -1, -1, BSX, -1, -1); |
9840 | |
9841 | FusionExecutor fe; |
9842 | fe.compileFusion(&fusion, aten_inputs, lparams); |
9843 | auto cg_outputs = fe.runFusion(aten_inputs, lparams); |
9844 | |
9845 | testValidate( |
9846 | &fusion, |
9847 | cg_outputs, |
9848 | aten_inputs, |
9849 | {aten_output}, |
9850 | __LINE__, |
9851 | __FILE__, |
9852 | "" , |
9853 | lparams); |
9854 | |
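  // Exactly one WAR sync is expected here: the shared-memory tiles tv2/tv3
  // are re-filled on each outer tile iteration, so a __syncthreads() must
  // separate the last read of a tile from the write that overwrites it.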
9855 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); |
9856 | } |
9857 | |
9858 | TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { |
9859 | Fusion fusion; |
9860 | FusionGuard fg(&fusion); |
9861 | |
9862 | // Symbolic integers we will use for runtime tiling |
9863 | Int* symbolic_m_tile_dim = IrBuilder::create<Int>(); // bound to threadIdx.z |
  Int* symbolic_split_k_tile_dim =
      IrBuilder::create<Int>(); // bound to threadIdx.x (see parallelization)
  Int* symbolic_block_k_tile_dim =
      IrBuilder::create<Int>(); // bound to blockIdx.x
9868 | // Compile-time integer for tiling |
9869 | int n_smem_tile = 8; // bound to threadIdx.y |
9870 | |
9871 | // Symbolic 2D tensors TV0[M, K], TV1[K, N] |
9872 | TensorView* tv0 = makeSymbolicTensor(2); |
9873 | TensorView* tv1 = makeSymbolicTensor(2); |
9874 | |
9875 | // Broadcast tv0 to [M, K, *] |
9876 | TensorView* tv2 = broadcast(tv0, {false, false, true}); |
9877 | // Broadcast tv1 to [*, K, N] |
9878 | TensorView* tv3 = broadcast(tv1, {true, false, false}); |
9879 | |
  // Pointwise multiplication resulting in tv4[M, K, N]
9881 | TensorView* tv4 = mul(tv2, tv3); |
9882 | |
9883 | // Turn the K-dimension of tv4 into a reduction dimension |
9884 | TensorView* tv5 = sum(tv4, {1}); |
9885 | |
9886 | // Register inputs and outputs |
9887 | fusion.addInput(tv0); |
9888 | fusion.addInput(tv1); |
9889 | fusion.addOutput(tv5); |
9890 | |
9891 | // Register runtime tile dims as inputs |
9892 | fusion.addInput(symbolic_m_tile_dim); |
9893 | fusion.addInput(symbolic_split_k_tile_dim); |
9894 | fusion.addInput(symbolic_block_k_tile_dim); |
9895 | |
  // Make a 3D tile from a mix of symbolic and constant factors; split in
  // reverse order because each split inserts a new dimension
9898 | // [M, K, N] |
9899 | tv5->split(2, n_smem_tile); |
9900 | tv5->split(1, symbolic_block_k_tile_dim); |
9901 | tv5->split(1, symbolic_split_k_tile_dim); |
9902 | tv5->split(0, symbolic_m_tile_dim); |
9903 | // [Mo, Mi, Koo, Koi, Ki, No, Ni] |
9904 | |
9905 | // Reorder so all outer tiles are in the leftmost 3 positions |
9906 | tv5->reorder({{1, 5}, {5, 1}}); |
9907 | // [Mo, No, Koo, Koi, Ki, Mi, Ni] |
9908 | |
  // Factor out the outer reduction IterDomain: tv6 performs the serial Koo
  // reduction, and tv5 then performs the remaining inter-CTA and intra-CTA
  // reductions
9911 | auto tv6 = tv5->rFactor({2}); |
  // tv6: [Mo, No, rKoo, Koi, Ki, Mi, Ni]
  // tv5: [Mo, No, rKoi, rKi, Mi, Ni]
9914 | |
9915 | // Scope computations |
9916 | tv6->computeAt(tv5, 2); |
  // tv6: [Mo, No, rKoo, Koi, Ki, Mi, Ni]
  // tv5: [Mo, No, rKoi, rKi, Mi, Ni]
9919 | |
9920 | // Setup compute at schedule |
9921 | tv0->computeAt(tv6, 3); |
9922 | tv1->computeAt(tv6, 3); |
9923 | tv4->computeAt(tv6, -1); |
9924 | // |
9925 | // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) |
9926 | // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) |
9927 | // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] |
9928 | // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] |
9929 | // T5[ Mo, No, rKoi, rKii, Mi, Ni] |
9930 | |
9931 | // Cache smem tiles |
9932 | tv2->setMemoryType(MemoryType::Shared); |
9933 | tv3->setMemoryType(MemoryType::Shared); |
9934 | tv4->setMemoryType(MemoryType::Local); |
9935 | tv6->setMemoryType(MemoryType::Local); |
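  // tv2/tv3 hold the operand tiles in shared memory, while tv4 (the
  // elementwise products) and tv6 (the serial-K accumulator) live in
  // registers: the classic smem-tiled GEMM data flow.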
9936 | |
9937 | tv5->axis(0)->parallelize(ParallelType::BIDz); |
9938 | tv5->axis(1)->parallelize(ParallelType::BIDy); |
9939 | |
9940 | std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6}; |
9941 | for (auto tv : tv_list) { |
9942 | tv->axis(-2)->parallelize(ParallelType::TIDz); |
9943 | tv->axis(-1)->parallelize(ParallelType::TIDy); |
9944 | } |
9945 | tv2->axis(3)->parallelize(ParallelType::TIDx); |
9946 | tv3->axis(3)->parallelize(ParallelType::TIDx); |
9947 | tv4->axis(3)->parallelize(ParallelType::TIDx); |
9948 | tv6->axis(3)->parallelize(ParallelType::TIDx); |
9949 | tv5->axis(2)->parallelize(ParallelType::TIDx); |
9950 | |
9951 | tv2->axis(4)->parallelize(ParallelType::BIDx); |
9952 | tv3->axis(4)->parallelize(ParallelType::BIDx); |
9953 | tv4->axis(4)->parallelize(ParallelType::BIDx); |
9954 | tv6->axis(4)->parallelize(ParallelType::BIDx); |
9955 | tv5->axis(3)->parallelize(ParallelType::BIDx); |
9956 | |
9957 | constexpr int M = 31, K = 65, N = 33; |
9958 | |
9959 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
9960 | at::Tensor t0 = at::randn({M, K}, options); |
9961 | at::Tensor t1 = at::randn({K, N}, options); |
9962 | |
  // Runtime tiling
  int m_tile = 4; // bound to threadIdx.z
  int split_k = 7; // bound to threadIdx.x
  int intra_cta = 8; // bound to blockIdx.x
9967 | |
9968 | std::vector<IValue> aten_inputs = {t0, t1, m_tile, split_k, intra_cta}; |
9969 | at::Tensor aten_output = |
9970 | mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); |
9971 | |
9972 | FusionExecutor fe; |
9973 | // Generate CUDA and compile with nvRTC |
9974 | fe.compileFusion(&fusion, aten_inputs); |
9975 | auto cg_outputs = fe.runFusion(aten_inputs); |
9976 | |
9977 | testValidate( |
9978 | &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
9979 | |
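  // As in the WAR test above, the smem tiles are rewritten on every outer-K
  // iteration, so exactly one WAR __syncthreads() is expected.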
9980 | TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); |
9981 | } |
9982 | |
9983 | } // namespace jit |
9984 | } // namespace torch |
9985 | #endif // #if defined(USE_CUDA) |
9986 | |