1 | #if defined(USE_CUDA) |
2 | #include <gmock/gmock-matchers.h> |
3 | #include <gtest/gtest.h> |
4 | |
5 | #include <arith.h> |
6 | #include <codegen.h> |
7 | #include <disjoint_set.h> |
8 | #include <executor.h> |
9 | #include <executor_launch_params.h> |
10 | #include <expr_evaluator.h> |
11 | #include <fusion.h> |
12 | #include <fusion_segmenter.h> |
13 | #include <inlining.h> |
14 | #include <ir_all_nodes.h> |
15 | #include <ir_builder.h> |
16 | #include <ir_graphviz.h> |
17 | #include <ir_iostream.h> |
18 | #include <ir_utils.h> |
19 | #include <iter_visitor.h> |
20 | #include <kernel_cache.h> |
21 | #include <kernel_expr_evaluator.h> |
22 | #include <kernel_ir.h> |
23 | #include <kernel_ir_dispatch.h> |
24 | #include <lower2device.h> |
25 | #include <lower_divisible_split.h> |
26 | #include <mutator.h> |
27 | #include <ops/all_ops.h> |
28 | #include <register_interface.h> |
29 | #include <root_domain_map.h> |
30 | #include <scheduler/all_schedulers.h> |
31 | #include <scheduler/reduction_utils.h> |
32 | #include <scheduler/utils.h> |
33 | #include <test/test_gpu_validator.h> |
34 | #include <test/test_utils.h> |
35 | #include <transform_replay.h> |
36 | #include <transform_rfactor.h> |
37 | |
38 | // fuser and IR parser |
39 | #include <parser.h> |
40 | #include <torch/csrc/jit/ir/irparser.h> |
41 | |
42 | #include <ATen/cuda/CUDAContext.h> |
43 | #include <ATen/cuda/Exceptions.h> |
44 | #include <c10/cuda/CUDAStream.h> |
45 | |
46 | #include <algorithm> |
47 | #include <iostream> |
48 | |
49 | // Tests go in torch::jit |
50 | namespace torch { |
51 | namespace jit { |
52 | |
53 | using namespace torch::jit::fuser::cuda; |
54 | using namespace at::indexing; |
55 | |
56 | TEST_F(NVFuserTest, FusionViewDtypeSameSizeOutput_CUDA) { |
57 | Fusion fusion; |
58 | FusionGuard fg(&fusion); |
59 | |
60 | std::vector<int64_t> input_shape{2, 10, 40}; |
61 | |
62 | TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Float); |
63 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
64 | fusion.addInput(x); |
65 | fusion.addInput(bias); |
66 | |
67 | auto x_add_bias = add(x, bias); |
68 | auto x_view = view(x_add_bias, DataType::Int32); |
69 | fusion.addOutput(x_view); |
70 | |
71 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
72 | at::Tensor at_x = at::randn(input_shape, options); |
73 | at::Tensor at_bias = at::randn(input_shape, options); |
74 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
75 | |
76 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
77 | |
78 | FusionExecutor fe; |
79 | fe.compileFusion(&fusion, aten_inputs, lparams); |
80 | auto outputs = fe.runFusion(aten_inputs, lparams); |
81 | |
82 | auto at_x_add_bias = at_x + at_bias; |
83 | auto at_x_view = at_x_add_bias.view(at::ScalarType::Int); |
84 | |
85 | testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__); |
86 | } |
87 | |
88 | TEST_F(NVFuserTest, FusionViewDtypeFailMismatchSize_CUDA) { |
89 | Fusion fusion; |
90 | FusionGuard fg(&fusion); |
91 | |
92 | std::vector<int64_t> input_shape{2, 10, 40}; |
93 | |
94 | TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Float); |
95 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
96 | fusion.addInput(x); |
97 | fusion.addInput(bias); |
98 | |
99 | auto x_add_bias = add(x, bias); |
100 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
101 | ASSERT_ANY_THROW(view(x_add_bias, DataType::Int)); |
102 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
103 | ASSERT_ANY_THROW(view(x_add_bias, DataType::Half)); |
104 | } |
105 | |
// view_as_real: a complex input is reinterpreted as a real tensor with an
// extra innermost axis of extent 2, then combined with a real input of
// that output shape. Scheduling is done manually (see TODO below).
TEST_F(NVFuserTest, FusionViewAsRealOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // TODO: We should modify our schedulers to correctly handle
  // view_as_real. And test these schedulers.
  std::vector<int64_t> input_shape{512};
  std::vector<int64_t> output_shape{512, 2};

  TensorView* x =
      makeSymbolicTensor(input_shape.size(), DataType::ComplexFloat);
  TensorView* bias =
      makeSymbolicTensor(input_shape.size(), DataType::ComplexFloat);
  fusion.addInput(x);
  fusion.addInput(bias);

  // Real-valued input whose shape matches the view_as_real output.
  TensorView* y = makeSymbolicTensor(output_shape.size());
  fusion.addInput(y);

  auto y_plus_1 = add(y, IrBuilder::create<Double>(1));

  auto x_add_bias = add(x, bias);
  auto x_view = view_as_real(x_add_bias);
  auto out = add(y_plus_1, x_view);
  fusion.addOutput(out);

  // Manual schedule: parallelize the outer axis and inline the producers
  // into the output.
  out->axis(0)->parallelize(ParallelType::TIDx);
  x_add_bias->computeAt(out, -1);
  y->computeAt(out, -1);

  auto in_options =
      at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0);
  auto out_options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, in_options);
  at::Tensor at_bias = at::randn(input_shape, in_options);
  at::Tensor at_y = at::randn(output_shape, out_options);
  std::vector<IValue> aten_inputs = {at_x, at_bias, at_y};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  // ATen reference computation.
  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at::view_as_real(at_x_add_bias);
  auto at_y_plus_1 = at_y + 1.0;
  auto at_out = at_y_plus_1 + at_x_view;

  testValidate(&fusion, outputs, aten_inputs, {at_out}, __LINE__, __FILE__);
}
155 | |
156 | TEST_F(NVFuserTest, FusionViewRfactorExtentReplacement_CUDA) { |
157 | auto fusion = std::make_unique<Fusion>(); |
158 | FusionGuard fg(fusion.get()); |
159 | |
160 | auto tv0 = makeSymbolicTensor(2); |
161 | fusion->addInput(tv0); |
162 | auto tv1 = makeContigTensor(2); |
163 | fusion->addInput(tv1); |
164 | |
165 | auto tv2 = view(tv0, {12, 8}, {4, 3, 8}); |
166 | auto tv3 = sum(tv2, {-1}); |
167 | auto tv4 = add(tv3, IrBuilder::create<Double>(1)); |
168 | auto tv5 = add(tv1, tv4); |
169 | fusion->addOutput(tv5); |
170 | |
171 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
172 | at::manual_seed(0); |
173 | auto t0 = at::randn({12, 8}, options); |
174 | auto t1 = at::randn({4, 3}, options); |
175 | |
176 | FusionExecutorCache executor_cache(std::move(fusion)); |
177 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); |
178 | |
179 | auto ref = at::native::view(t0, {4, 3, 8}).sum({-1}) + 1 + t1; |
180 | |
181 | testValidate( |
182 | executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
183 | } |
184 | |
185 | TEST_F(NVFuserTest, FusionViewOutput_CUDA) { |
186 | Fusion fusion; |
187 | FusionGuard fg(&fusion); |
188 | |
189 | std::vector<int64_t> input_shape{2, 10, 40}; |
190 | std::vector<int64_t> output_shape{2, 10, 4, 10}; |
191 | |
192 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
193 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
194 | fusion.addInput(x); |
195 | fusion.addInput(bias); |
196 | |
197 | auto x_add_bias = add(x, bias); |
198 | auto x_view = view(x_add_bias, input_shape, output_shape); |
199 | fusion.addOutput(x_view); |
200 | |
201 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
202 | at::Tensor at_x = at::randn(input_shape, options); |
203 | at::Tensor at_bias = at::randn(input_shape, options); |
204 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
205 | |
206 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
207 | |
208 | FusionExecutor fe; |
209 | fe.compileFusion(&fusion, aten_inputs, lparams); |
210 | auto outputs = fe.runFusion(aten_inputs, lparams); |
211 | |
212 | auto at_x_add_bias = at_x + at_bias; |
213 | auto at_x_view = at::native::view(at_x_add_bias, output_shape); |
214 | |
215 | testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__); |
216 | } |
217 | |
218 | TEST_F(NVFuserTest, FusionViewFailMismatchSize_CUDA) { |
219 | Fusion fusion; |
220 | FusionGuard fg(&fusion); |
221 | |
222 | // The number of elements in input and output shapes do not match, |
223 | // so this view transformation is invalid. |
224 | // 2 * 10 * 40 != 2 * 50 * 4 * 10 |
225 | |
226 | std::vector<int64_t> input_shape{2, 10, 40}; |
227 | std::vector<int64_t> output_shape{2, 50, 4, 10}; |
228 | |
229 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
230 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
231 | fusion.addInput(x); |
232 | fusion.addInput(bias); |
233 | |
234 | auto x_add_bias = add(x, bias); |
235 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
236 | ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); |
237 | } |
238 | |
239 | TEST_F(NVFuserTest, FusionViewFailMulitDimInference_CUDA) { |
240 | Fusion fusion; |
241 | FusionGuard fg(&fusion); |
242 | |
243 | // Only one dimension can be inferred in the output shape. |
244 | // Otherwise, the size of the dimensions is ambiguous. |
245 | std::vector<int64_t> input_shape{2, 10, 40}; |
246 | std::vector<int64_t> output_shape{2, -1, 4, -1}; |
247 | |
248 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
249 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
250 | fusion.addInput(x); |
251 | fusion.addInput(bias); |
252 | |
253 | auto x_add_bias = add(x, bias); |
254 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
255 | ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); |
256 | } |
257 | |
258 | void reductionViewAddFusion( |
259 | std::vector<int64_t>& input_shape, |
260 | std::vector<int64_t>& output_shape, |
261 | bool view_before_reduction) { |
262 | constexpr int kReductionAxis = -1; |
263 | |
264 | // Drop size for reduction axis from view_shape |
265 | std::vector<int64_t> view_shape; |
266 | { |
267 | const auto kAxis = (kReductionAxis < 0) |
268 | ? (kReductionAxis + input_shape.size()) |
269 | : kReductionAxis; |
270 | for (auto i : c10::irange(input_shape.size())) { |
271 | if (view_before_reduction || i != kAxis) { |
272 | view_shape.push_back(input_shape[i]); |
273 | } |
274 | } |
275 | } |
276 | |
277 | auto bias_shape = (view_before_reduction) ? input_shape : output_shape; |
278 | for (auto has_implicit_broadcast : {false, true}) { |
279 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
280 | Fusion& fusion = *fusion_ptr.get(); |
281 | FusionGuard fg(&fusion); |
282 | |
283 | TensorView* x = (has_implicit_broadcast) |
284 | ? makeConcreteTensor(input_shape) |
285 | : makeSymbolicTensor(input_shape.size()); |
286 | TensorView* bias = (has_implicit_broadcast) |
287 | ? makeConcreteTensor(bias_shape) |
288 | : makeSymbolicTensor(bias_shape.size()); |
289 | fusion.addInput(x); |
290 | fusion.addInput(bias); |
291 | |
292 | auto tv1 = |
293 | (view_before_reduction) ? add(x, bias) : sum(x, {kReductionAxis}); |
294 | auto x_view = view(tv1, view_shape, output_shape); |
295 | auto y = (view_before_reduction) ? sum(x_view, {kReductionAxis}) |
296 | : add(x_view, bias); |
297 | fusion.addOutput(y); |
298 | |
299 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
300 | at::Tensor at_x = at::randn(input_shape, options); |
301 | at::Tensor at_bias = at::randn(bias_shape, options); |
302 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
303 | |
304 | FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); |
305 | auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); |
306 | |
307 | auto at_tv1 = (view_before_reduction) ? (at_x + at_bias) |
308 | : at::sum(at_x, kReductionAxis); |
309 | auto at_x_view = at::native::view(at_tv1, output_shape); |
310 | auto at_y = (view_before_reduction) ? at::sum(at_x_view, kReductionAxis) |
311 | : at::add(at_x_view, at_bias); |
312 | |
313 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
314 | } |
315 | } |
316 | |
// Shorthand used by the view "shmoo" tests: a shape is a list of
// dimension sizes, and a view_example pairs an input shape with the
// output shape it is viewed as. `using` replaces the legacy typedefs
// (clang-tidy modernize-use-using).
using shape = std::vector<int64_t>;
using view_example = std::pair<shape, shape>;
319 | |
// TODO: View examples with just 333 elements are failing validation in
// normalization. This might just be because our tolerances aren't tuned well
// for small sizes and the parallelization could be limited which could be
// detected as a validation issue, though it might not actually be a correctness
// issue. Using 3333 instead of 333 in those cases but should validate what's
// going on in the 333 case.
//
// Shared {input shape, output shape} pairs used by the view "shmoo" tests
// below. A -1 entry marks a dimension whose extent must be inferred; many
// pairs appear twice, once with -1 and once fully specified.
std::vector<view_example> all_view_examples = {
    {{1, 19, 1, 3 * 4, 7, 1, 99}, {1, 19, -1, 3, 4 * 7 * 99}},
    {{1, 19, 1, 3 * 4, 7, 1, 99}, {1, 19, 1, 3, 4 * 7 * 99}},
    {{19, 3 * 4, 7, 99}, {19, 3, 4 * 7 * 99}},

    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, -1}},
    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, 10}},
    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 2, 4, 1, 10}},

    {{3, 17, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}},
    {{3, 17, 2 * 4 * 10, 1, 9}, {3 * 17, 1, 2, 4, 10, 9}},
    {{3, 17, 2 * 4 * 10, 1, 9}, {3 * 17, 2, 4, 1, 10, 9}},

    {{2, 3, 2 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}},

    {{22, 11 * 2, 2}, {22, -1, 1, 1, 2 * 2}},
    {{22, 1, 22, 1}, {-1}},
    {{22, 11 * 2, 2}, {22, 11, 1, 1, 2 * 2}},
    {{22, 1, 22, 1}, {22 * 22}},

    {{37, 9, 7, 3 * 2, 5 * 2}, {37 * 9, 2, -1, 3, 7 * 5}},
    {{37, 9, 7, 3 * 2, 5 * 2}, {37 * 9, 2, 2, 3, 7 * 5}},

    // 3333-element cases (see TODO above re: 333).
    {{1, 1, 3333, 1}, {1, 1, -1, 1}},
    // Disabled for now due to non-deterministic nan issue (#1920)
    // {{1, 1111 * 3}, {1, 1, 1, -1, 1, 3}},
    {{1, 3333, 1}, {-1}},
    {{1, 1, 3333, 1}, {1, 1, 3333, 1}},
    {{1, 303 * 11, 1}, {1, 303, -1, 1}},
    {{1, 3333, 1}, {1, 303, 11, 1}},
    // Disabled for now due to non-deterministic nan issue (#1920)
    // {{1, 3333}, {1, 1, 1, 1111, 1, 3}},
    {{1, 3333, 1}, {3333}},

    {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, -1}},
    {{1, 3922 * 2, 1, 7}, {1, -1, 2}},
    {{1, 3922 * 7, 2}, {1, 3922 * 2, 7}},
    {{1, 3922 * 2, 1, 7}, {1, 3922 * 7, 2}},
    {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, 7}},

    {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, -1}},
    {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1, 8}},

    {{2, 3, 2 * 2, 5}, {1, 6, 1, 2, 2, 5, 1}},
};
371 | |
372 | TEST_F(NVFuserTest, FusionViewReductionShmoo_CUDA) { |
373 | for (auto e : all_view_examples) { |
374 | reductionViewAddFusion(e.first, e.second, true /* view_before_reduction */); |
375 | } |
376 | std::vector<view_example> view_after_reduce_examples = { |
377 | {{19, 12, 7, 99}, {19, 3, 28}}, |
378 | {{1, 19, 1, 12, 7, 1, 99}, {1, 19, 1, 3, 28}}, |
379 | {{3, 17, 80, 1}, {51, 1, 2, 4, 10}}, |
380 | {{3, 17, 80, 1, 9}, {51, 1, 2, 4, 10}}, |
381 | {{2, 3, 4, 5}, {1, 6, 1, 2, 2, 1}}, |
382 | {{22, 22, 2}, {22, 11, 1, 1, 2}}, |
383 | {{37, 9, 7, 6, 10}, {333, 2, 21}}, |
384 | {{1, 1, 333, 1}, {1, 1, 333, 1}}, |
385 | {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1}}, |
386 | {{1, 333, 1}, {1, 37, 9, 1}}, |
387 | {{22, 1, 22, 1}, {484}}, |
388 | {{1, 333, 1}, {333}}, |
389 | {{1, 27454, 1, 2}, {1, 3922, 1, 7}}, |
390 | {{1, 7844, 1, 7}, {1, 1961, 4}}}; |
391 | |
392 | for (auto e : view_after_reduce_examples) { |
393 | reductionViewAddFusion( |
394 | e.first, e.second, false /* view_before_reduction */); |
395 | } |
396 | } |
397 | |
// Builds and runs a fusion combining a pointwise add, a view, and a
// softmax (a persistent normalization kernel), validating against the
// equivalent ATen computation.
//
// If view_before_persistent is true the graph is add -> view -> softmax;
// otherwise softmax -> view -> add. Shapes may contain a single -1,
// resolved by inferViewShapes. Each configuration is run twice: with
// symbolic inputs and with concrete inputs (implicit broadcast).
void persistentViewAddFusion(
    std::vector<int64_t>& input_shape,
    std::vector<int64_t>& output_shape,
    bool view_before_persistent) {
  constexpr int kAxis = -1;

  // Support -1 sizes in the inputs
  auto inferred_shapes = inferViewShapes(input_shape, output_shape);
  auto inferred_input = inferred_shapes.first;
  auto inferred_output = inferred_shapes.second;

  // The bias must match the tensor it is added to: pre-view shape when
  // adding first, post-view shape otherwise.
  auto bias_shape = view_before_persistent ? inferred_input : inferred_output;
  for (auto has_implicit_broadcast : {false, true}) {
    std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
    Fusion& fusion = *fusion_ptr.get();
    FusionGuard fg(&fusion);

    TensorView* x = (has_implicit_broadcast)
        ? makeConcreteTensor(inferred_input)
        : makeSymbolicTensor(inferred_input.size());
    TensorView* bias = (has_implicit_broadcast)
        ? makeConcreteTensor(bias_shape)
        : makeSymbolicTensor(bias_shape.size());
    fusion.addInput(x);
    fusion.addInput(bias);

    auto tv1 = (view_before_persistent) ? add(x, bias) : softmax(x, kAxis);
    auto x_view = view(tv1, inferred_input, inferred_output);
    auto y =
        (view_before_persistent) ? softmax(x_view, kAxis) : add(x_view, bias);
    fusion.addOutput(y);

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor at_x = at::randn(inferred_input, options);
    at::Tensor at_bias = at::randn(bias_shape, options);
    std::vector<IValue> aten_inputs = {at_x, at_bias};

    FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
    auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);

    // ATen reference mirroring the fusion graph.
    auto at_tv1 = (view_before_persistent)
        ? (at_x + at_bias)
        : at::_softmax(at_x, kAxis, false /* half_to_float */);
    auto at_x_view = at::native::view(at_tv1, inferred_output);
    auto at_y = (view_before_persistent)
        ? at::_softmax(at_x_view, kAxis, false /* half_to_float */)
        : at::add(at_x_view, at_bias);

    testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
  }
}
449 | |
450 | TEST_F(NVFuserTest, FusionViewPersistentShmoo_CUDA) { |
451 | for (auto e : all_view_examples) { |
452 | persistentViewAddFusion( |
453 | e.first, e.second, true /* view_before_persistent */); |
454 | } |
455 | |
456 | for (auto e : all_view_examples) { |
457 | persistentViewAddFusion( |
458 | e.first, e.second, false /* view_before_persistent */); |
459 | } |
460 | } |
461 | |
462 | void addViewGeluFusion( |
463 | std::vector<int64_t>& input_shape, |
464 | std::vector<int64_t>& output_shape) { |
465 | for (auto has_implicit_broadcast : {false, true}) { |
466 | Fusion fusion; |
467 | FusionGuard fg(&fusion); |
468 | |
469 | TensorView* x = (has_implicit_broadcast) |
470 | ? makeConcreteTensor(input_shape) |
471 | : makeSymbolicTensor(input_shape.size()); |
472 | TensorView* bias = (has_implicit_broadcast) |
473 | ? makeConcreteTensor(input_shape) |
474 | : makeSymbolicTensor(input_shape.size()); |
475 | fusion.addInput(x); |
476 | fusion.addInput(bias); |
477 | |
478 | auto x_add_bias = add(x, bias); |
479 | auto x_view = view(x_add_bias, input_shape, output_shape); |
480 | auto y = gelu(x_view); |
481 | fusion.addOutput(y); |
482 | |
483 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
484 | at::Tensor at_x = at::randn(input_shape, options); |
485 | at::Tensor at_bias = at::randn(input_shape, options); |
486 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
487 | |
488 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
489 | |
490 | FusionExecutor fe; |
491 | fe.compileFusion(&fusion, aten_inputs, lparams); |
492 | auto outputs = fe.runFusion(aten_inputs, lparams); |
493 | |
494 | auto at_x_add_bias = at_x + at_bias; |
495 | auto at_x_view = at::native::view(at_x_add_bias, output_shape); |
496 | auto at_y = at::gelu(at_x_view); |
497 | |
498 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
499 | } |
500 | } |
501 | |
502 | TEST_F(NVFuserTest, FusionViewSplit_CUDA) { |
503 | std::vector<int64_t> input_shape{80}; |
504 | std::vector<int64_t> output_shape{2, 4, 10}; |
505 | addViewGeluFusion(input_shape, output_shape); |
506 | } |
507 | |
508 | TEST_F(NVFuserTest, FusionViewBroadcast_CUDA) { |
509 | std::vector<int64_t> input_shape{80}; |
510 | std::vector<int64_t> output_shape{1, 80}; |
511 | addViewGeluFusion(input_shape, output_shape); |
512 | } |
513 | |
514 | TEST_F(NVFuserTest, FusionViewMerge_CUDA) { |
515 | std::vector<int64_t> input_shape{2, 40, 7}; |
516 | std::vector<int64_t> output_shape{560}; |
517 | addViewGeluFusion(input_shape, output_shape); |
518 | } |
519 | |
520 | TEST_F(NVFuserTest, FusionViewAllShmoo_CUDA) { |
521 | for (auto e : all_view_examples) { |
522 | addViewGeluFusion(e.first, e.second); |
523 | } |
524 | } |
525 | |
526 | void geluViewAddFusion( |
527 | std::vector<int64_t> input_shape, |
528 | std::vector<int64_t> output_shape) { |
529 | // Support -1 sizes in the inputs |
530 | auto inferred_shapes = inferViewShapes(input_shape, output_shape); |
531 | auto inferred_input = inferred_shapes.first; |
532 | auto inferred_output = inferred_shapes.second; |
533 | |
534 | for (auto hasImplicitBroadcast : {false, true}) { |
535 | Fusion fusion; |
536 | FusionGuard fg(&fusion); |
537 | |
538 | TensorView* x = (hasImplicitBroadcast) |
539 | ? makeConcreteTensor(inferred_input) |
540 | : makeSymbolicTensor(inferred_input.size()); |
541 | TensorView* bias = (hasImplicitBroadcast) |
542 | ? makeConcreteTensor(inferred_output) |
543 | : makeSymbolicTensor(inferred_output.size()); |
544 | fusion.addInput(x); |
545 | fusion.addInput(bias); |
546 | |
547 | auto x_gelu = gelu(x); |
548 | auto x_view = view(x_gelu, inferred_input, inferred_output); |
549 | auto y = add(x_view, bias); |
550 | fusion.addOutput(y); |
551 | |
552 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
553 | at::Tensor at_x = at::randn(inferred_input, options); |
554 | at::Tensor at_bias = at::randn(inferred_output, options); |
555 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
556 | |
557 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
558 | |
559 | FusionExecutor fe; |
560 | fe.compileFusion(&fusion, aten_inputs, lparams); |
561 | auto outputs = fe.runFusion(aten_inputs, lparams); |
562 | |
563 | auto at_x_gelu = at::gelu(at_x); |
564 | auto at_x_view = at::native::view(at_x_gelu, inferred_output); |
565 | auto at_y = at_x_view + at_bias; |
566 | |
567 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
568 | } |
569 | } |
570 | |
571 | TEST_F(NVFuserTest, FusionViewStride_CUDA) { |
572 | for (const auto& e : all_view_examples) { |
573 | geluViewAddFusion(e.first, e.second); |
574 | } |
575 | } |
576 | |
577 | void geluViewBinaryAddFusion( |
578 | std::vector<int64_t> input_shape1, |
579 | std::vector<int64_t> input_shape2, |
580 | std::vector<int64_t> output_shape) { |
581 | for (auto hasImplicitBroadcast : {false, true}) { |
582 | Fusion fusion; |
583 | FusionGuard fg(&fusion); |
584 | |
585 | TensorView* x = (hasImplicitBroadcast) |
586 | ? makeConcreteTensor(input_shape1) |
587 | : makeSymbolicTensor(input_shape1.size()); |
588 | TensorView* bias = (hasImplicitBroadcast) |
589 | ? makeConcreteTensor(input_shape2) |
590 | : makeSymbolicTensor(input_shape2.size()); |
591 | fusion.addInput(x); |
592 | fusion.addInput(bias); |
593 | |
594 | auto x_gelu = gelu(x); |
595 | auto x_view = view(x_gelu, input_shape1, output_shape); |
596 | auto bias_view = view(bias, input_shape2, output_shape); |
597 | auto y = add(x_view, bias_view); |
598 | fusion.addOutput(y); |
599 | |
600 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
601 | at::Tensor at_x = at::randn(input_shape1, options); |
602 | at::Tensor at_bias = at::randn(input_shape2, options); |
603 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
604 | |
605 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
606 | |
607 | FusionExecutor fe; |
608 | fe.compileFusion(&fusion, aten_inputs, lparams); |
609 | auto outputs = fe.runFusion(aten_inputs, lparams); |
610 | |
611 | auto at_x_gelu = at::gelu(at_x); |
612 | auto at_x_view = at::native::view(at_x_gelu, output_shape); |
613 | auto at_bias_view = at::native::view(at_bias, output_shape); |
614 | auto at_y = at_x_view + at_bias_view; |
615 | |
616 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
617 | } |
618 | } |
619 | |
620 | TEST_F(NVFuserTest, FusionViewBinary_CUDA) { |
621 | geluViewBinaryAddFusion({27454, 2}, {54908}, {7844, 7}); |
622 | } |
623 | |
// Repro of issue #1493
// A view ({2, 3} -> {6}) feeding a broadcast and an add, manually
// scheduled by merging the output to 1D and fully inlining the inputs;
// exercises concrete-domain resolution when a view meets a broadcast.
TEST_F(NVFuserTest, FusionViewConcreteDomain_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = makeContigTensor(2);
  fusion.addInput(tv1);

  auto tv2 = view(tv0, {2, 3}, {6});
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
  auto tv4 = broadcast(tv3, {true, false});
  auto tv5 = add(tv4, tv1);

  fusion.addOutput(tv5);

  // Manual schedule: flatten the output and inline both inputs into it.
  tv5->merge(0);
  tv0->computeAt(tv5, -1);
  tv1->computeAt(tv5, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  auto t0 = at::randn({2, 3}, options);
  auto t1 = at::randn({1, 6}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t1});
  auto cg_outputs = fe.runFusion({t0, t1});

  // ATen reference mirroring the fusion graph.
  auto ref = (at::native::view(t0, {6}) + 1).unsqueeze(0) + t1;

  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
}
658 | |
659 | TEST_F(NVFuserTest, FusionViewConcreteDomain2_CUDA) { |
660 | constexpr int kAxis = -1; |
661 | std::vector<int64_t> input_shape = {19, 12, 7, 99}; |
662 | std::vector<int64_t> output_shape = {19, 3, 2772}; |
663 | |
664 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
665 | Fusion& fusion = *fusion_ptr.get(); |
666 | FusionGuard fg(&fusion); |
667 | |
668 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
669 | TensorView* bias = makeSymbolicTensor(output_shape.size()); |
670 | fusion.addInput(x); |
671 | fusion.addInput(bias); |
672 | |
673 | auto tv1 = softmax(x, kAxis); |
674 | auto x_view = view(tv1, input_shape, output_shape); |
675 | auto y = add(x_view, bias); |
676 | fusion.addOutput(y); |
677 | |
678 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
679 | at::Tensor at_x = at::randn(input_shape, options); |
680 | at::Tensor at_bias = at::randn(output_shape, options); |
681 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
682 | |
683 | FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); |
684 | auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); |
685 | |
686 | auto at_tv1 = at::_softmax(at_x, kAxis, false /* half_to_float */); |
687 | auto at_x_view = at::native::view(at_tv1, output_shape); |
688 | auto at_y = at::add(at_x_view, at_bias); |
689 | |
690 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
691 | } |
692 | |
// Repro of issue #1608
// Two views produce the same output shape from different input shapes
// ({14, 12, 8, 100} and {14, 100, 96} -> {14, 3, 3200}); the first view's
// producer includes a broadcast add. Their outputs are added, so the view
// domains must be mapped consistently.
TEST_F(NVFuserTest, FusionViewConcreteDomain3_CUDA) {
  std::vector<int64_t> input_shape = {14, 12, 8, 100};
  std::vector<int64_t> bcast_shape = {14, 12, 8, 1};
  std::vector<int64_t> other_shape = {14, 100, 96};
  std::vector<int64_t> output_shape = {14, 3, 3200};

  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  TensorView* x = makeSymbolicTensor(input_shape.size());
  TensorView* y = makeConcreteTensor(bcast_shape);
  TensorView* z = makeSymbolicTensor(other_shape.size());
  fusion.addInput(x);
  fusion.addInput(y);
  fusion.addInput(z);

  // x + y broadcasts over the trailing size-1 axis of y.
  auto tv1 = add(x, y);
  auto tv2 = view(tv1, input_shape, output_shape);
  auto tv3 = view(z, other_shape, output_shape);
  auto output = add(tv2, tv3);
  fusion.addOutput(output);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_y = at::randn(bcast_shape, options);
  at::Tensor at_z = at::randn(other_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_y, at_z};

  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
  auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);

  // ATen reference mirroring the fusion graph.
  auto at_tv1 = at::add(at_x, at_y);
  auto at_tv2 = at::native::view(at_tv1, output_shape);
  auto at_tv3 = at::native::view(at_z, output_shape);
  auto at_output = at::add(at_tv2, at_tv3);

  testValidate(&fusion, outputs, aten_inputs, {at_output}, __LINE__, __FILE__);
}
733 | |
// A broadcast add viewed to 1D and fully inlined; checks that the concrete
// ID chosen for the 1D output maps exactly with both the view output and
// its consumer (see comment before the ComputeAtMap checks).
TEST_F(NVFuserTest, FusionViewConcreteDomain4_CUDA) {
  std::vector<int64_t> shape1 = {3, 4, 5};
  std::vector<int64_t> shape2 = {3 * 4 * 5};

  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  // 2D input broadcast up to 3D before the add.
  auto tv0 = makeSymbolicTensor(shape1.size() - 1);
  fusion.addInput(tv0);

  auto tv1 = makeSymbolicTensor(shape1.size());
  fusion.addInput(tv1);

  auto tv2 = broadcast(tv0, {true, false, false});
  auto tv3 = add(tv1, tv2);
  auto tv4 = view(tv3, shape1, shape2);
  auto tv5 = set(tv4);
  fusion.addOutput(tv5);

  tv0->computeAt(tv5, -1);
  tv1->computeAt(tv5, -1);

  TORCH_CHECK(tv5->nDims() == 1);

  // The concrete domain of tv5, which is 1D, with permissive or loop mapping
  // needs to be either the domain of tv4 or tv5, both of which have the three
  // concrete root domains of tv1. In other words, it must map with tv4 and tv5
  // with the exact mapping.
  ComputeAtMap map(&fusion);
  auto concrete_id =
      map.getConcreteMappedID(tv5->axis(0), IdMappingMode::PERMISSIVE);
  TORCH_CHECK(
      map.areMapped(concrete_id, tv5->axis(0), IdMappingMode::EXACT),
      "Invalid concrete ID: " ,
      concrete_id->toString());
  TORCH_CHECK(
      map.areMapped(concrete_id, tv4->axis(0), IdMappingMode::EXACT),
      "Invalid concrete ID: " ,
      concrete_id->toString());
}
775 | |
// Two consumer paths of one cached input — a 1D->2D view and a broadcast
// + add + 2D->3D view — are built in both orders; the ComputeAtMap must
// pick a valid loop concrete ID either way (regression for #1544).
TEST_F(NVFuserTest, FusionViewConcreteDomain5_CUDA) {
  const std::vector<int64_t> shape1 = {12};
  const std::vector<int64_t> shape2 = {4, 3};
  const std::vector<int64_t> shape3 = {12, 5};
  const std::vector<int64_t> shape4 = {4, 3, 5};

  // `order` selects which path is added to the fusion first.
  for (auto order : {true, false}) {
    std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
    Fusion& fusion = *fusion_ptr.get();
    FusionGuard fg(&fusion);

    auto tv0 = makeSymbolicTensor(1);
    fusion.addInput(tv0);

    auto tv1 = makeSymbolicTensor(2);
    fusion.addInput(tv1);

    auto tv0_cache = set(tv0);

    // Path 1: view the cached input as 2D ({12} -> {4, 3}).
    auto path1 = [&]() {
      auto view_2d = view(tv0_cache, shape1, shape2);
      auto view_2d_copy = set(view_2d);
      fusion.addOutput(view_2d_copy);
      return view_2d_copy;
    };

    // Path 2: broadcast the cached input, add tv1, then view as 3D
    // ({12, 5} -> {4, 3, 5}).
    auto path2 = [&]() {
      auto tv0_bc = broadcast(tv0_cache, {false, true});
      auto tv0_bc_plus_tv1 = add(tv0_bc, tv1);
      auto view_3d = view(tv0_bc_plus_tv1, shape3, shape4);
      auto view_3d_copy = set(view_3d);
      fusion.addOutput(view_3d_copy);
      return view_3d_copy;
    };

    TensorView* path1_out = nullptr;
    TensorView* path2_out = nullptr;

    if (order) {
      // Fails before #1544. Concrete ID is picked from path1_out, which
      // doesn't have the second root domain of tv1
      path2_out = path2();
      path1_out = path1();
    } else {
      // Works fine
      path1_out = path1();
      path2_out = path2();
    }

    // Flatten path2's 3D output to 1D and inline both inputs into it.
    path2_out->merge(-2, -1);
    path2_out->merge(-2, -1);

    tv0->computeAt(path2_out, -1);
    tv1->computeAt(path2_out, -1);

    TORCH_CHECK(path1_out->nDims() == 1);
    TORCH_CHECK(path2_out->nDims() == 1);

    ComputeAtMap map(&fusion);

    // Make sure the two output tensors are mapped. Note both are 1D.
    TORCH_CHECK(map.areMapped(
        path1_out->axis(0), path2_out->axis(0), IdMappingMode::LOOP));

    auto concrete_id =
        map.getConcreteMappedID(path2_out->axis(0), IdMappingMode::LOOP);
    TORCH_CHECK(
        path2_out->axis(0) == concrete_id,
        "Incorrect concrete ID: " ,
        concrete_id->toString());
  }
}
848 | |
// Flatten applied right after unsqueeze: the trailing size-1 dim added by
// unsqueeze is collapsed back, so the output matches the 1D input shape.
// Also exercises a manual split + computeAt schedule on the flattened output.
TEST_F(NVFuserTest, FusionFlattenAfterUnsqueezeOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> input_shape{512};

  TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Double);
  TensorView* bias = makeSymbolicTensor(input_shape.size(), DataType::Double);
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  // {512} -> {512, 1} -> {512}
  auto x_unsqueeze = unsqueeze(x_add_bias, -1);
  auto x_view = flatten(x_unsqueeze);
  fusion.addOutput(x_view);

  auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_bias = at::randn(input_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_bias};

  // Manual schedule: split the flattened axis, inline the add at position 1,
  // and parallelize the outer axis over threads.
  x_view->split(0, 4);
  x_add_bias->computeAt(x_view, 1);
  x_view->axis(0)->parallelize(ParallelType::TIDx);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at_x_add_bias.unsqueeze(-1).flatten();

  testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__);
}
883 | |
// When a tensor feeds both a reduction+broadcast path and a view path,
// ComputeAtRootDomainMap must report the reduced/viewed axis as unmappable.
TEST_F(NVFuserTest, FusionComputeAtRootDomainMapWithView_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int64_t> input_shape1{10, 12};
  const std::vector<int64_t> input_shape2{10, 3, 4};

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));

  // reduction followed by broadcast
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true, true});

  // Path with a view
  auto tv4 = view(tv1, input_shape1, input_shape2);

  // Join the reduction+broadcast and view paths together
  auto tv5 = add(tv3, tv4);
  fusion.addOutput(tv5);

  ComputeAtRootDomainMap map;
  map.build();

  // It's not possible to compute tv1 at the -1 position of
  // tv2. ComputeAtRootDomainMap should tell that by not mapping the
  // second axis.
  auto tv1_tv2_mappable_dims =
      map.getMappableDims(tv1->domain(), tv2->domain());
  TORCH_CHECK(
      tv1_tv2_mappable_dims.find(tv1->axis(1)) == tv1_tv2_mappable_dims.end(),
      "Invalid ComputeAtRootDomainMap. Domain should not be mappable: " ,
      tv1->axis(1)->toString());
}
920 | |
// Repro for expand_as: output allocation depends on tv1's runtime extents.
// Runs the same fusion twice to also cover the cached-output-allocation path.
TEST_F(NVFuserTest, FusionExpandRepro_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int64_t> input_shape1{4, 1, 1};
  const std::vector<int64_t> input_shape2{4, 3, 2};

  // Trailing dims are concrete broadcasts (extent 1) to be expanded.
  auto tv0 = makeConcreteTensor({-1, 1, 1});
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(3);
  fusion.addInput(tv1);

  auto tv2 = expand_as(tv0, tv1);
  fusion.addOutput(tv2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape1, options);
  at::Tensor at_y = at::randn(input_shape2, options);
  std::vector<IValue> aten_inputs = {at_x, at_y};

  FusionExecutor fe;
  // Note: compiled without runtime inputs, so extents stay symbolic until run.
  fe.compileFusion(&fusion);
  LaunchParams l_params;
  auto outputs = fe.runFusion(aten_inputs, {}, l_params, 0);

  auto out = at_x.expand_as(at_y);

  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);

  // second run to verify cached output allocation
  outputs = fe.runFusion(aten_inputs, {}, l_params, 0);
  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);
}
954 | |
955 | TEST_F(NVFuserTest, FusionExpandView1_CUDA) { |
956 | auto fusion = std::make_unique<Fusion>(); |
957 | FusionGuard fg(fusion.get()); |
958 | |
959 | auto tv0 = makeConcreteTensor({4, 1, 8}); |
960 | fusion->addInput(tv0); |
961 | |
962 | auto tv1 = makeConcreteTensor({12, 8}); |
963 | fusion->addInput(tv1); |
964 | |
965 | auto tv2 = expand( |
966 | tv0, |
967 | {IrBuilder::create<Int>(4), |
968 | IrBuilder::create<Int>(3), |
969 | IrBuilder::create<Int>(8)}); |
970 | |
971 | auto tv3 = view(tv2, {4, 3, 8}, {12, 8}); |
972 | auto tv4 = add(tv3, tv1); |
973 | fusion->addOutput(tv4); |
974 | |
975 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
976 | at::manual_seed(0); |
977 | auto t0 = at::randn({4, 1, 8}, options); |
978 | auto t1 = at::randn({12, 8}, options); |
979 | |
980 | FusionExecutorCache executor_cache(std::move(fusion)); |
981 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); |
982 | |
983 | auto ref = at::reshape(t0.expand({4, 3, 8}), {12, 8}) + t1; |
984 | |
985 | testValidate( |
986 | executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
987 | } |
988 | |
989 | TEST_F(NVFuserTest, FusionExpandView2_CUDA) { |
990 | auto fusion = std::make_unique<Fusion>(); |
991 | FusionGuard fg(fusion.get()); |
992 | |
993 | auto tv0 = makeConcreteTensor({1, 8}); |
994 | fusion->addInput(tv0); |
995 | |
996 | auto tv1 = makeConcreteTensor({3, 4, 8}); |
997 | fusion->addInput(tv1); |
998 | |
999 | auto tv2 = |
1000 | expand(tv0, {IrBuilder::create<Int>(12), IrBuilder::create<Int>(8)}); |
1001 | |
1002 | auto tv3 = view(tv2, {12, 8}, {3, 4, 8}); |
1003 | auto tv4 = add(tv3, tv1); |
1004 | fusion->addOutput(tv4); |
1005 | |
1006 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1007 | at::manual_seed(0); |
1008 | auto t0 = at::randn({1, 8}, options); |
1009 | auto t1 = at::randn({3, 4, 8}, options); |
1010 | |
1011 | FusionExecutorCache executor_cache(std::move(fusion)); |
1012 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); |
1013 | |
1014 | auto ref = at::reshape(t0.expand({12, 8}), {3, 4, 8}) + t1; |
1015 | |
1016 | testValidate( |
1017 | executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
1018 | } |
1019 | |
// Checks which pairs of view problems produce the same analyzeViewConstraint
// result, i.e. which views can share a cached transform sequence.
TEST_F(NVFuserTest, FusionViewTransformCache_CUDA) {
  // Asserts that two (input-shape, output-shape) view problems analyze to the
  // same constraint (and hence could share a cached view).
  auto assert_matches = [](view_example example_0, view_example example_1) {
    TORCH_INTERNAL_ASSERT(
        analyzeViewConstraint(example_0.first, example_0.second) ==
            analyzeViewConstraint(example_1.first, example_1.second),
        "View: " ,
        example_0.first,
        " -> " ,
        example_0.second,
        " Does not match:" ,
        example_1.first,
        " -> " ,
        example_1.second);
  };

  // Asserts the opposite: the two view problems must NOT share a constraint.
  auto assert_does_not_match = [](view_example example_0,
                                  view_example example_1) {
    TORCH_INTERNAL_ASSERT(
        !(analyzeViewConstraint(example_0.first, example_0.second) ==
          analyzeViewConstraint(example_1.first, example_1.second)),
        "View: " ,
        example_0.first,
        " -> " ,
        example_0.second,
        " Should not match:" ,
        example_1.first,
        " -> " ,
        example_1.second);
  };

  // Splits are done as splitting out left hand side, so left hand side
  // split changes can't reuse view, but right hand side split changes can.
  // Merges, since they don't bury hard values in the transforms, can always
  // be reshared. Need to make sure trivial reduction, and broadcast changes
  // don't try to reuse view. What matches and what doesn't is very specific
  // to the implementation of how the splits/merges are generated. This could
  // be changed over time as there isn't a single set of transformations to
  // potentially make a view. For example we could always merge all dimensions,
  // then split out all dimensions. This would always be valid but would not be
  // efficient for indexing.

  // "Same"
  assert_matches(
      {{1, 1, 3333, 1}, {1, 1, 3333, 1}}, {{1, 1, 3333, 1}, {1, 1, -1, 1}});
  assert_matches(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, 8}},
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, -1}});

  // Trivial reduce matching
  assert_matches({{1, 3333, 1}, {-1}}, {{1, 24, 1}, {-1}});

  // Trivial reduce not matching
  assert_does_not_match({{1, 3333, 1}, {-1}}, {{1, 3333}, {-1}});

  // Broadcast matching
  assert_matches({{3333}, {1, -1, 1}}, {{24}, {1, -1, 1}});

  // Broadcast not matching
  assert_does_not_match({{3333}, {1, -1, 1}}, {{24}, {1, -1}});

  // RHS split
  assert_matches(
      {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, -1}},
      {{3, 17, 2 * 4 * 10 * 7, 1}, {3 * 17, 1, 2, 4, -1}});
  assert_matches(
      {{1, 303 * 11, 1}, {1, 303, -1, 1}},
      {{1, 303 * 11 * 4, 1}, {1, 303, -1, 1}});
  assert_matches(
      {{2, 3, 2 * 2 * 3, 5}, {1, 2 * 3, 1, 2, -1, 5, 1}},
      {{2, 3, 2 * 2 * 4, 5}, {1, 2 * 3, 1, 2, -1, 5, 1}});
  assert_matches(
      {{22, 11 * 2, 2}, {22, 11, 1, 1, -1}},
      {{22, 11 * 2 * 4, 2 * 3}, {22, 11, 1, 1, -1}});
  assert_matches(
      {{1, 1111 * 3}, {1, 1, 1, 1111, 1, -1}},
      {{1, 1111 * 3 * 7}, {1, 1, 1, 1111, 1, -1}});
  assert_matches(
      {{1, 303 * 11 * 2, 1}, {1, 303, -1, 1}},
      {{1, 303 * 11 * 3, 1}, {1, 303, -1, 1}});
  assert_matches(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, -1, 1, 8}},
      {{8, 1, 1, 2 * 4 * 6, 1, 8}, {8, 2, -1, 1, 8}});

  // LHS split not matching
  assert_does_not_match(
      {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, -1, 10}},
      {{3, 17, 2 * 4 * 3 * 10, 1}, {3 * 17, 1, 2, -1, 10}});
  assert_does_not_match(
      {{1, 303 * 11, 1}, {1, -1, 11, 1}},
      {{1, 303 * 11 * 2, 1}, {1, -1, 11, 1}});
  assert_does_not_match(
      {{2, 3, 2 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}},
      {{2, 3, 3 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}});
  assert_does_not_match(
      {{22, (11 + 1) * 2, 2}, {22, -1, 1, 1, 2 * 2}},
      {{22, 11 * 2, 2}, {22, -1, 1, 1, 2 * 2}});
  assert_does_not_match(
      {{1, 1111 * 3}, {1, 1, 1, -1, 1, 3}},
      {{1, 1111 * 2 * 3}, {1, 1, 1, -1, 1, 3}});
  assert_does_not_match(
      {{1, 303 * 11, 1}, {1, -1, 11, 1}},
      {{1, (303 + 1) * 11, 1}, {1, -1, 11, 1}});
  assert_does_not_match(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, -1, 4, 1, 8}},
      {{8, 1, 1, 3 * 4, 1, 8}, {8, -1, 4, 1, 8}});

  // Merge matching
  assert_matches(
      {{3, 17, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}},
      {{4, 18, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}});
  assert_matches({{22, 1, 23, 1}, {-1, 1}}, {{23, 1, 22, 1}, {-1, 1}});

  // Merge not matching
  assert_does_not_match({{2, 3, 4}, {-1, 4}}, {{2, 3, 4}, {2, -1}});
  assert_does_not_match(
      {{22, 1, 23, 1, 24}, {-1, 24}}, {{22, 1, 23, 1, 24}, {22, -1}});

  // Split->Merge matching
  assert_matches(
      {{22, 11 * 2, 3}, {22, 11, 1, 1, -1}},
      {{22, 11 * 3, 2}, {22, 11, 1, 1, -1}});
  assert_matches(
      {{1, 3922 * 3 * 7, 1, 2 * 2}, {1, 3922 * 2, 1, -1}},
      {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, -1}});

  // Split->Merge not matching
  assert_does_not_match(
      {{22, 11 * 2, 2}, {22, -1, 1, 1, 4}},
      {{22, 11 * 2 * 3, 2}, {22, -1, 1, 1, 4}});
  assert_does_not_match(
      {{1, 3922 * 7, 1, 2}, {1, -1, 1, 7}},
      {{1, 3922 * 7 * 2, 1, 2}, {1, -1, 1, 7}});

  // Merge->Split matching
  assert_matches(
      {{1, 3922 * 2, 1, 7}, {1, 3922 * 7, -1}},
      {{1, 3922 * 2 * 3, 1, 7}, {1, 3922 * 7, -1}});
  assert_matches(
      {{19, 3 * 4, 7, 99}, {19, 3, -1}}, {{19, 3 * 3, 8, 10}, {19, 3, -1}});

  // Merge->Split not matching
  assert_does_not_match(
      {{1, 3922 * 2, 1, 7}, {1, -1, 2}}, {{1, 3922, 1, 7}, {1, -1, 2}});
  assert_does_not_match(
      {{19, 3 * 4, 7, 99}, {19, -1, 3}}, {{19, 3 * 5, 7, 99}, {19, -1, 3}});
}
1166 | |
1167 | TEST_F(NVFuserTest, FusionViewIdGraph_CUDA) { |
1168 | Fusion fusion; |
1169 | FusionGuard fg(&fusion); |
1170 | |
1171 | int w = 2, x = 3, y = 4, z = 5; |
1172 | |
1173 | auto tv0 = makeConcreteTensor({w, x, y, z}); |
1174 | fusion.addInput(tv0); |
1175 | |
1176 | auto tv1 = sin(tv0); |
1177 | |
1178 | auto tv2 = view(tv1, {w, x, y, z}, {w, y, x * z}); |
1179 | fusion.addOutput(tv2); |
1180 | |
1181 | auto tv3 = makeConcreteTensor({w, x, y, z}); |
1182 | fusion.addInput(tv3); |
1183 | |
1184 | auto tv4 = view(tv3, {w, x, y, z}, {w, y, x * z}); |
1185 | fusion.addOutput(tv4); |
1186 | |
1187 | // Link 0 and 3 together for view analysis done based on before the views |
1188 | // actually happened. |
1189 | auto tv5 = add(tv0, tv3); |
1190 | fusion.addOutput(tv5); |
1191 | |
1192 | auto tv6 = makeConcreteTensor({w, x, x, y, z}); |
1193 | |
1194 | auto tv7 = sum(tv6, {2}); |
1195 | auto tv8 = broadcast(tv7, {false, true, false, true, false, false}); |
1196 | |
1197 | auto tv9 = makeConcreteTensor({w, 6, x, 7, y, z}); |
1198 | fusion.addInput(tv9); |
1199 | auto tv10 = add(tv8, tv9); |
1200 | fusion.addOutput(tv10); |
1201 | |
1202 | auto tv12 = view(tv8, {w, 1, x, 1, y, z}, {w, y, x * z}); |
1203 | fusion.addOutput(tv12); |
1204 | |
1205 | // Link the views after the views happen |
1206 | auto t13 = add(tv12, tv4); |
1207 | fusion.addOutput(t13); |
1208 | |
1209 | // Grab the trivial reduced tensor from t12's view. |
1210 | auto tv11 = ir_utils::producerTvsOf(tv12)[0]; |
1211 | |
1212 | // Start from the exact iter domain graph of the fusion |
1213 | IterDomainGraph id_graph(&fusion); |
1214 | auto disjoint_view_ids = id_graph.exactNodes(); |
1215 | |
1216 | TORCH_CHECK( |
1217 | id_graph.exactNodes().strictAreMapped(tv2->axis(1), tv4->axis(1))); |
1218 | TORCH_CHECK( |
1219 | id_graph.exactNodes().strictAreMapped(tv2->axis(2), tv4->axis(2))); |
1220 | |
1221 | TORCH_CHECK(id_graph.exactNodes().strictAreMapped( |
1222 | tv2->getRootDomain()[1], tv12->getRootDomain()[1])); |
1223 | TORCH_CHECK(id_graph.exactNodes().strictAreMapped( |
1224 | tv2->getRootDomain()[2], tv12->getRootDomain()[2])); |
1225 | TORCH_CHECK(id_graph.exactNodes().strictAreMapped( |
1226 | tv2->getRootDomain()[3], tv12->getRootDomain()[3])); |
1227 | } |
1228 | |
1229 | TEST_F(NVFuserTest, FusionViewVectorize_CUDA) { |
1230 | Fusion fusion; |
1231 | FusionGuard fg(&fusion); |
1232 | |
1233 | auto tv0 = makeContigTensor(3); |
1234 | fusion.addInput(tv0); |
1235 | auto tv1 = flatten(tv0, 1, 2); |
1236 | auto tv2 = flatten(tv0, 1, 2); |
1237 | auto tv3 = sin(tv1); |
1238 | auto tv4 = sin(tv2); |
1239 | fusion.addOutput(tv3); |
1240 | fusion.addOutput(tv4); |
1241 | |
1242 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1243 | at::Tensor input = at::randn({256, 1024, 1024}, options); |
1244 | |
1245 | auto lparams = schedulePointwise(&fusion, {input}); |
1246 | |
1247 | auto hasVectorization = [](TensorView* tv) -> bool { |
1248 | for (auto i : tv->domain()->domain()) { |
1249 | if (i->getParallelType() == ParallelType::Vectorize) { |
1250 | return true; |
1251 | } |
1252 | } |
1253 | return false; |
1254 | }; |
1255 | |
1256 | for (auto o : fusion.outputs()) { |
1257 | TORCH_CHECK(hasVectorization(o->as<TensorView>())); |
1258 | } |
1259 | for (auto i : fusion.inputs()) { |
1260 | for (auto c : ir_utils::consumerTvsOf(i->as<TensorView>())) { |
1261 | TORCH_CHECK(hasVectorization(c)); |
1262 | } |
1263 | } |
1264 | |
1265 | FusionExecutor fe; |
1266 | fe.compileFusion(&fusion, {input}, lparams); |
1267 | auto outputs = fe.runFusion({input}, lparams); |
1268 | |
1269 | auto tv_ref = input.flatten(1, 2).sin(); |
1270 | |
1271 | testValidate(&fusion, outputs, {input}, {tv_ref, tv_ref}, __LINE__, __FILE__); |
1272 | } |
1273 | |
// Expand a trailing broadcast (extent 1 -> 8), flatten it into the middle
// dimension, and reduce over the flattened axis.
TEST_F(NVFuserTest, FusionExpandFlatten_CUDA) {
#ifdef FBCODE_CAFFE2
  GTEST_SKIP() << "Fails accuracy on V100 32gb" ;
#endif
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  // Last dimension is a concrete broadcast (extent 1).
  auto tv0 = makeConcreteTensor({-1, -1, 1});
  fusion->addInput(tv0);
  auto tv1 = expand(
      tv0,
      {tv0->axis(0)->extent(),
       tv0->axis(1)->extent(),
       IrBuilder::create<Int>(8)});
  auto tv2 = flatten(tv1, 1, 2);
  auto tv3 = sum(tv2, {1});
  fusion->addOutput(tv3);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({256, 1024, 1}, options);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({input});

  auto aten_out = input.expand({256, 1024, 8}).flatten(1, 2).sum(1);

  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      {input},
      {aten_out},
      __LINE__,
      __FILE__);
}
1308 | |
// flatten must reject an end_dim that falls on a reduced axis: after
// sum(tv0, {1}) the second axis is a reduction domain, so flatten(tv1, 0, 1)
// is expected to throw with "Invalid end_dim".
TEST_F(NVFuserTest, FusionIllegalReductionFlatten_CUDA) {
  EXPECT_THAT(
      []() {
        auto fusion = std::make_unique<Fusion>();
        FusionGuard fg(fusion.get());

        auto tv0 = makeConcreteTensor({2, 3});
        fusion->addInput(tv0);

        auto tv1 = sum(tv0, {1});
        auto tv2 = flatten(tv1, 0, 1);
        fusion->addOutput(tv2);
      },
      testing::ThrowsMessage<c10::Error>(
          testing::HasSubstr("Invalid end_dim" )));
}
1325 | |
1326 | TEST_F(NVFuserTest, FusionReductionFlatten1_CUDA) { |
1327 | auto fusion = std::make_unique<Fusion>(); |
1328 | FusionGuard fg(fusion.get()); |
1329 | |
1330 | auto tv0 = makeConcreteTensor({2, 3, 5}); |
1331 | fusion->addInput(tv0); |
1332 | |
1333 | auto tv1 = sum(tv0, {1}); |
1334 | auto tv2 = flatten(tv1, 0, 1); |
1335 | fusion->addOutput(tv2); |
1336 | |
1337 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1338 | at::manual_seed(0); |
1339 | auto t0 = at::randn({2, 3, 5}, options); |
1340 | auto ref = t0.sum({1}).flatten(0, 1); |
1341 | |
1342 | FusionExecutorCache executor_cache(std::move(fusion)); |
1343 | auto cg_outputs = executor_cache.runFusionWithInputs({t0}); |
1344 | |
1345 | testValidate( |
1346 | executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
1347 | } |
1348 | |
// Manually schedule a fusion containing two matching views plus a non-view
// path: propagate the view transforms, flatten the reference to 1D, split,
// parallelize, propagate again, and inline.
TEST_F(NVFuserTest, FusionPwiseViewSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
  // Replay tv4's view transforms onto the rest of the fusion.
  {
    TransformPropagator propagator(tv4);
    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
  }

  // Flatten the reference tv5 to 1D (loop index intentionally unused).
  for (auto i : c10::irange(tv5->nDims() - 1)) {
    tv5->merge(0);
  }
  tv5->split(0, 32);
  tv5->split(0, 4);
  tv5->axis(0)->parallelize(ParallelType::BIDx);
  tv5->axis(1)->parallelize(ParallelType::Unroll);
  tv5->axis(2)->parallelize(ParallelType::TIDx);

  // Propagate the schedule from tv5 everywhere, then inline.
  {
    TransformPropagator propagator(tv5);
    MaxRootDomainInfoSpanningTree spanning_tree(tv5);
    spanning_tree.traverse(&propagator);
    scheduler_utils::parallelizeAllLike(tv5);

    // Inline the schedule
    inlineMost();
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
}
1414 | |
// Like FusionPwiseViewSchedule, but the second view feeds a reduction that is
// scheduled with an rFactor before propagation and inlining.
TEST_F(NVFuserTest, FusionSumViewSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  auto tv5 = sum(tv4, {1});
  fusion.addOutput(tv5);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv6 = add(tv0, tv3);
  fusion.addOutput(tv6);

  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
  // Replay tv4's view transforms onto the rest of the fusion.
  {
    TransformPropagator propagator(tv4);
    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
  }

  // Split the reduction axis, rFactor the inner splits, and parallelize.
  tv5->split(1, 128);
  tv5->split(1, 4);

  auto tv5_rf = tv5->rFactor({1, 2});
  tv5_rf->axis(0)->parallelize(ParallelType::BIDx);
  tv5_rf->axis(2)->parallelize(ParallelType::Unroll);
  tv5_rf->axis(3)->parallelize(ParallelType::TIDx);

  // Propagate the schedule from the rFactor tensor, then inline.
  {
    TransformPropagator propagator(tv5_rf);
    MaxRootDomainInfoSpanningTree spanning_tree(tv5_rf);
    spanning_tree.traverse(&propagator);
    scheduler_utils::parallelizeAllLike(tv5_rf);

    // Inline the schedule
    inlineMost();
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t4.sum({1});
  auto t6 = t0 + t3;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__);
}
1481 | |
// Make sure matching views are segmented into the same kernel (the runtime
// must report an unsegmented fusion).
TEST_F(NVFuserTest, FusionViewMagicSchedule1_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3});
  // Both views must land in a single kernel.
  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
}
1524 | |
// Make sure views of views are correct
TEST_F(NVFuserTest, FusionViewMagicSchedule2_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  // Chain of views that ends back at the original 3D shape, so the
  // reference result is just sin(t0).
  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  auto tv3 = view(tv2, {x, y * z}, {x * y, z});
  auto tv4 = view(tv3, {x * y, z}, {y, x * z});
  auto tv5 = view(tv4, {y, x * z}, {x, y, z});
  fusion.addOutput(tv5);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  auto aten_out = sin(t0);

  // For now pointwise scheduler only accepts a single view at a time, so this
  // will be broken up into multiple kernels. This is due to the reference check
  // looking for all mappings to all input IDs.
  // TODO: Fix the reference check for this case
  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});

  testValidate(&fusion, cg_outputs, {t0}, {aten_out}, __LINE__, __FILE__);
}
1558 | |
// Make sure broadcasts not on the view path that don't interfere with view are
// segmented in one kernel and correctly trigger 2D pointwise scheduling
TEST_F(NVFuserTest, FusionViewMagicSchedule3_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  // Broadcast on another branch to drive the pointwise reference to not be on
  // the view paths.

  auto tv6 = makeConcreteTensor({w, x, y, z});
  fusion.addInput(tv6);
  auto tv7 = broadcast(tv0, {true, false, false, false});
  auto tv8 = add(tv6, tv7);
  // tv8 should be the reference for the pointwise fusion. This broadcast
  // pattern doesn't interfere with the views, so this should also be scheduled
  // as 2D.
  fusion.addOutput(tv8);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;
  at::Tensor t6 = at::randn({w, x, y, z}, options);
  auto t8 = t6.add(t0);

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  // Collect the heuristic params
  executor_cache.profile(true);
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6});

  // One kernel, pointwise-scheduled with a 2D break point.
  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
  TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
                  .params->isA<PointwiseParams>());
  auto pparams =
      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
  TORCH_CHECK(pparams->break_point == 1);

  testValidate(
      &fusion, cg_outputs, {t0, t3, t6}, {t2, t4, t5, t8}, __LINE__, __FILE__);
}
1625 | |
1626 | // Make sure broadcasts through views when not conflicting with view are |
1627 | // segmented into one kernel and trigger 2D pointwise scheduler. |
1628 | TEST_F(NVFuserTest, FusionViewMagicSchedule4_CUDA) { |
1629 | auto fusion_ptr = std::make_unique<Fusion>(); |
1630 | Fusion& fusion = *fusion_ptr.get(); |
1631 | FusionGuard fg(&fusion); |
1632 | |
1633 | int w = 15, x = 31, y = 49, z = 65; |
1634 | |
1635 | auto tv0 = makeConcreteTensor({x, y, z}); |
1636 | fusion.addInput(tv0); |
1637 | |
1638 | auto tv1 = sin(tv0); |
1639 | |
1640 | auto tv2 = view(tv1, {x, y, z}, {x, y * z}); |
1641 | fusion.addOutput(tv2); |
1642 | |
1643 | auto tv3 = makeConcreteTensor({x, y, z}); |
1644 | fusion.addInput(tv3); |
1645 | |
1646 | auto tv4 = makeConcreteTensor({x, 1, 1}); |
1647 | fusion.addInput(tv4); |
1648 | |
1649 | auto tv5 = add(tv4, tv3); |
1650 | |
1651 | auto tv6 = view(tv5, {x, y, z}, {x, y * z}); |
1652 | fusion.addOutput(tv6); |
1653 | |
1654 | // Link 0 and 3 together for view analysis done based on before the views |
1655 | // actually happened. |
1656 | auto tv7 = add(tv0, tv3); |
1657 | fusion.addOutput(tv7); |
1658 | |
1659 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1660 | |
1661 | at::Tensor t0 = at::randn({x, y, z}, options); |
1662 | at::Tensor t3 = at::randn({x, y, z}, options); |
1663 | at::Tensor t4 = at::randn({x, 1, 1}, options); |
1664 | auto t1 = sin(t0); |
1665 | auto t2 = at::native::view(t1, {x, y * z}); |
1666 | auto t5 = t4 + t3; |
1667 | auto t6 = at::native::view(t5, {x, y * z}); |
1668 | auto t7 = t0 + t3; |
1669 | |
1670 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
1671 | // Collect the heuristic params |
1672 | executor_cache.profile(true); |
1673 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t4}); |
1674 | |
1675 | TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented()); |
1676 | TORCH_CHECK(executor_cache.getMostRecentExecutorInfo() |
1677 | .params->isA<PointwiseParams>()); |
1678 | auto pparams = |
1679 | executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>(); |
1680 | TORCH_CHECK(pparams->break_point == 1); |
1681 | |
1682 | testValidate( |
1683 | &fusion, cg_outputs, {t0, t3, t4}, {t2, t6, t7}, __LINE__, __FILE__); |
1684 | } |
1685 | |
1686 | // Make sure different views that are consumed by the reference are segmented |
1687 | // into a single kernel. |
1688 | TEST_F(NVFuserTest, FusionViewMagicSchedule5_CUDA) { |
1689 | auto fusion_ptr = std::make_unique<Fusion>(); |
1690 | Fusion& fusion = *fusion_ptr.get(); |
1691 | FusionGuard fg(&fusion); |
1692 | |
1693 | int w = 15, x = 31, y = 49, z = 65; |
1694 | |
1695 | auto tv0 = makeConcreteTensor({w, x, y * z}); |
1696 | fusion.addInput(tv0); |
1697 | auto tv1 = sin(tv0); |
1698 | auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w}); |
1699 | |
1700 | auto tv3 = makeConcreteTensor({w, x * y, z}); |
1701 | fusion.addInput(tv3); |
1702 | auto tv4 = cos(tv3); |
1703 | auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w}); |
1704 | |
1705 | auto tv6 = add(tv2, tv5); |
1706 | fusion.addOutput(tv6); |
1707 | |
1708 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1709 | |
1710 | at::Tensor t0 = at::randn({w, x, y * z}, options); |
1711 | auto t1 = sin(t0); |
1712 | auto t2 = at::native::view(t1, {z, y, x, w}); |
1713 | at::Tensor t3 = at::randn({w, x * y, z}, options); |
1714 | auto t4 = cos(t3); |
1715 | auto t5 = at::native::view(t4, {z, y, x, w}); |
1716 | auto t6 = add(t2, t5); |
1717 | |
1718 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
1719 | // Collect the heuristic params |
1720 | executor_cache.profile(true); |
1721 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3}); |
1722 | |
1723 | TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented()); |
1724 | TORCH_CHECK(executor_cache.getMostRecentExecutorInfo() |
1725 | .params->isA<PointwiseParams>()); |
1726 | |
1727 | testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__); |
1728 | } |
1729 | |
// Check that a manually written schedule (merges/splits + parallelization)
// propagated from the output maps correctly across two differently shaped
// view ops. Unlike the MagicSchedule tests above, this compiles the fusion
// directly with FusionExecutor — no scheduler heuristics or segmentation.
TEST_F(NVFuserTest, FusionViewMapping_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  // First branch: sin followed by a 3D -> 4D view.
  auto tv0 = makeConcreteTensor({w, x, y * z});
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w});

  // Second branch: cos followed by a differently shaped 3D -> 4D view.
  auto tv3 = makeConcreteTensor({w, x * y, z});
  fusion.addInput(tv3);
  auto tv4 = cos(tv3);
  auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w});

  // Sum of the two viewed branches is the output and the scheduling reference.
  auto tv6 = add(tv2, tv5);
  fusion.addOutput(tv6);

  // Manual schedule on the output: flatten all 4 dims, then tile into
  // [BIDx, Unroll(4), TIDx(128)].
  tv6->merge(0);
  tv6->merge(0);
  tv6->merge(0);
  tv6->split(0, 128);
  tv6->split(0, 4);
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(1)->parallelize(ParallelType::Unroll);
  tv6->axis(2)->parallelize(ParallelType::TIDx);

  // Propagate the transforms from tv6 to every other tensor, then replicate
  // its parallelization — this must succeed through both view paths.
  TransformPropagator propagator(tv6);
  MaxRootDomainInfoSpanningTree spanning_tree(tv6);
  spanning_tree.traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv6);

  // Inline the schedule
  inlineMost();

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // ATen reference computation mirroring the fusion above.
  at::Tensor t0 = at::randn({w, x, y * z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {z, y, x, w});
  at::Tensor t3 = at::randn({w, x * y, z}, options);
  auto t4 = cos(t3);
  auto t5 = at::native::view(t4, {z, y, x, w});
  auto t6 = add(t2, t5);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__);
}
1785 | |
// Verify that getAllDivisibleSplits() identifies the splits known to be
// divisible by construction: the three view-replay splits (by z, y, x) are
// divisible, while the scheduling splits (by 4 and by 8) need not be.
TEST_F(NVFuserTest, FusionLowerDivisibleSplits_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  auto tv0 = makeContigTensor(4);
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  // The view's replay performs exact (divisible) splits by z, y, and x.
  auto tv2 = view(tv1, {w, x, y, z}, {z, y, x, w});

  fusion.addOutput(tv2);

  // Scheduling transforms on top of the view: flatten, then two splits that
  // are not necessarily divisible.
  tv2->merge(0)->merge(0)->merge(0)->split(0, 4)->split(0, 8, false);

  TransformPropagator propagator(tv2);
  MaxRootDomainInfoSpanningTree spanning_tree(tv2);
  spanning_tree.traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv2);

  // Inline the schedule
  inlineMost();

  auto divisible_splits = getAllDivisibleSplits(&fusion);

  // Operations on all tensors are basically:
  // [10] merge(0) [9]->outer->definition
  // [9] merge(0) [8]->outer->definition
  // [8] merge(0) [7]->in->definition
  // [7] split(0, z, false) [6]->in->definition
  // [6] split(1, y, false) [5]->in->definition
  // [5] split(2, x, false) [3]->inner->definition
  // RFactor of tv2
  // [4] merge(0) [3]->outer->definition
  // [3] merge(0) [2]->outer->definition
  // [2] merge(0) [1]->in->definition
  // [1] split(0, 4) [0]->in->definition
  // [0] split(0, 8, false) tv->axis(0)->definition

  // Walk the definition chain backwards from axis(0) of each tensor,
  // following the numbering in the table above, and check that the three
  // view-originated splits ([5], [6], [7]) are all reported as divisible.
  for (auto tv : std::vector<TensorView*>({tv2, tv1, tv0})) {
    auto transform_0 = tv->axis(0)->definition()->as<Split>();
    auto transform_1 = transform_0->in()->definition()->as<Split>();
    auto transform_2 = transform_1->in()->definition()->as<Merge>();
    auto transform_3 = transform_2->outer()->definition()->as<Merge>();

    // Cross from the scheduling transforms into the view-replay transforms.
    auto transform_5 = transform_3->inner()->definition()->as<Split>();
    auto transform_6 = transform_5->in()->definition()->as<Split>();
    auto transform_7 = transform_6->in()->definition()->as<Split>();

    TORCH_CHECK(
        divisible_splits.find(transform_5) != divisible_splits.end(),
        "Expecting: " ,
        transform_5->toString(),
        "\nFrom TV: " ,
        tv,
        "\nTo be a divisible split." );
    TORCH_CHECK(
        divisible_splits.find(transform_6) != divisible_splits.end(),
        "Expecting: " ,
        transform_6->toString(),
        "\nFrom TV: " ,
        tv,
        "\nTo be a divisible split." );
    TORCH_CHECK(
        divisible_splits.find(transform_7) != divisible_splits.end(),
        "Expecting: " ,
        transform_7->toString(),
        "\nFrom TV: " ,
        tv,
        "\nTo be a divisible split." );
  }
}
1859 | |
1860 | } // namespace jit |
1861 | } // namespace torch |
1862 | #endif // #if defined(USE_CUDA) |
1863 | |