1 | #if defined(USE_CUDA) |
2 | #include <gmock/gmock-matchers.h> |
3 | #include <gtest/gtest.h> |
4 | |
5 | #include <executor.h> |
6 | #include <inlining.h> |
7 | #include <kernel_cache.h> |
8 | #include <ops/all_ops.h> |
9 | #include <scheduler/all_schedulers.h> |
10 | #include <scheduler/transpose.h> |
11 | #include <scheduler/utils.h> |
12 | #include <test/test_gpu_validator.h> |
13 | #include <test/test_utils.h> |
14 | |
15 | // Tests go in torch::jit |
16 | namespace torch { |
17 | namespace jit { |
18 | |
19 | using namespace torch::jit::fuser::cuda; |
20 | |
21 | TEST_F(NVFuserTest, FusionTranspose1_CUDA) { |
22 | Fusion fusion; |
23 | FusionGuard fg(&fusion); |
24 | |
25 | constexpr int M = 10; |
26 | constexpr int N = 20; |
27 | |
28 | auto tv0 = makeSymbolicTensor(2); |
29 | auto tv1 = transpose(tv0); |
30 | fusion.addInput(tv0); |
31 | fusion.addOutput(tv1); |
32 | |
33 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
34 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
35 | |
36 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
37 | at::manual_seed(0); |
38 | at::Tensor t0 = at::randn({M, N}, options); |
39 | std::vector<IValue> aten_inputs = {t0}; |
40 | |
41 | FusionExecutor fe; |
42 | fe.compileFusion(&fusion, aten_inputs); |
43 | auto outputs = fe.runFusion(aten_inputs); |
44 | |
45 | at::Tensor aten_output = t0.t(); |
46 | |
47 | testValidate( |
48 | &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
49 | } |
50 | |
51 | TEST_F(NVFuserTest, FusionTranspose2_CUDA) { |
52 | Fusion fusion; |
53 | FusionGuard fg(&fusion); |
54 | |
55 | constexpr int M = 10; |
56 | constexpr int N = 20; |
57 | |
58 | auto tv0 = makeSymbolicTensor(2); |
59 | auto tv1 = transpose(tv0); |
60 | fusion.addInput(tv0); |
61 | fusion.addOutput(tv1); |
62 | |
63 | tv1->merge(0); |
64 | tv1->split(0, 32); |
65 | |
66 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
67 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
68 | |
69 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
70 | at::manual_seed(0); |
71 | at::Tensor t0 = at::randn({M, N}, options); |
72 | std::vector<IValue> aten_inputs = {t0}; |
73 | |
74 | FusionExecutor fe; |
75 | fe.compileFusion(&fusion, aten_inputs); |
76 | auto outputs = fe.runFusion(aten_inputs); |
77 | |
78 | at::Tensor aten_output = t0.t(); |
79 | |
80 | testValidate( |
81 | &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); |
82 | } |
83 | |
TEST_F(NVFuserTest, FusionTransposeWithSwizzle_CUDA) {
  // Manually schedule a 2D transpose through a shared-memory tile with a
  // swizzle to avoid smem bank conflicts, using a 2D (TIDy, TIDx) thread
  // mapping per tile.
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = transpose(tv0);
  fusion.addOutput(tv1);

  // tv0: [I0, I1]
  // tv1: [I1, I0]

  // Tile edge length (elements)
  const int BS = 32;

  // CTA tiling by BS*BS
  tv1->split(1, BS);
  tv1->split(0, BS);
  tv1->reorder({{1, 2}});
  // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)]

  // Create a smem buffer to cache each tile
  auto tv0_cache = tv0->cacheAfter();
  tv0_cache->setMemoryType(MemoryType::Shared);

  // Inline at position 2 so the cache is produced per CTA tile and picks up
  // tv1's tiling on its two inner axes.
  tv0->computeAt(tv1, 2);
  // tv0: [I0, I1]
  // tv0_cache: [I1/BS, I0/BS, BS(I1), BS(I0)]
  // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)]

  // Assign each thread block to a tile
  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(1)->parallelize(ParallelType::BIDx);

  // Thread mapping for each tile. For both of the input and output
  // tiles, map TIDx to the fastest-changing dimension to facilitate
  // coalesced gmem accesses.
  tv1->axis(2)->parallelize(ParallelType::TIDy);
  tv1->axis(3)->parallelize(ParallelType::TIDx);
  // Note that the fastest-changing axis is next to the inner-most
  // axis since computeAt reorders the axes as the output tensor.
  tv0_cache->axis(2)->parallelize(ParallelType::TIDx);
  tv0_cache->axis(3)->parallelize(ParallelType::TIDy);

  // Swizzles the smem cache to avoid bank conflicts
  tv0_cache->swizzle(SwizzleType::Transpose, {3, 2});

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // Extents deliberately not multiples of BS, exercising partial tiles.
  const int bx = 100;
  const int by = 200;
  at::Tensor t0 = at::randn({bx, by}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  auto aten_output = t0.t();

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}
145 | |
TEST_F(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) {
  // Shared-memory transpose with swizzle, but with each 32x32 tile
  // flattened and mapped onto a 1D thread block of BDIM threads.
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = transpose(tv0);
  fusion.addOutput(tv1);

  // tv0: [I0, I1]
  // tv1: [I1, I0]

  const int BS = 32; // tile edge length
  const int BDIM = 256; // threads per block

  // CTA tiling by BS*BS
  tv1->split(1, BS);
  tv1->split(0, BS);
  tv1->reorder({{1, 2}});
  // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)]

  // Create a smem buffer to cache each tile
  auto tv0_cache = tv0->cacheAfter();
  tv0_cache->setMemoryType(MemoryType::Shared);

  tv0->computeAt(tv1, 2);
  // At this point (before the 1D-flattening below):
  // tv0: [I0, I1]
  // tv0_cache: [I1/BS, I0/BS, BS(I1), BS(I0)]
  // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)]

  // Transform the tile axes for 1D thread mapping
  tv1->merge(-2, -1);
  tv1->split(-1, BDIM);
  // tv1: [I1/BS, I0/BS, BS*BS/BDIM, BDIM]

  // Transform the cache similarly but apply swizzle to the 2D tile axes.
  tv0_cache->reorder({{-2, -1}});
  tv0_cache->swizzle(SwizzleType::Transpose, {2, 3});
  tv0_cache->merge(-2, -1);
  tv0_cache->split(-1, BDIM);
  // tv0_cache: [I1/BS, I0/BS, BS*BS/BDIM, BDIM]

  // Assign each thread block to a tile
  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(1)->parallelize(ParallelType::BIDx);

  // Thread mapping for each tile.
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // Extents deliberately not multiples of BS, exercising partial tiles.
  const int bx = 100;
  const int by = 200;
  at::Tensor t0 = at::randn({bx, by}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  auto aten_output = t0.t();

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}
211 | |
212 | // x->sin->transpose->cos->y |
213 | TEST_F(NVFuserTest, FusionScheduleTransposeSimple_CUDA) { |
214 | Fusion fusion; |
215 | FusionGuard fg(&fusion); |
216 | |
217 | auto tv0 = makeContigTensor(3); |
218 | fusion.addInput(tv0); |
219 | auto tv1 = sin(tv0); |
220 | auto tv2 = transpose(tv1, 1, 2); |
221 | auto tv3 = cos(tv2); |
222 | fusion.addOutput(tv3); |
223 | |
224 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
225 | at::Tensor input = at::randn({256, 1024, 1024}, options); |
226 | |
227 | auto lparams = scheduleTranspose(&fusion, {input}); |
228 | |
229 | FusionExecutor fe; |
230 | fe.compileFusion(&fusion, {input}, lparams); |
231 | auto outputs = fe.runFusion({input}, lparams); |
232 | |
233 | auto tv_ref = input.sin().transpose(1, 2).cos(); |
234 | |
235 | testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); |
236 | } |
237 | |
// x->transpose->sin->transpose->cos->y
239 | TEST_F(NVFuserTest, FusionScheduleTransposeSinTransposeCos_CUDA) { |
240 | Fusion fusion; |
241 | FusionGuard fg(&fusion); |
242 | |
243 | auto tv0 = makeContigTensor(3); |
244 | fusion.addInput(tv0); |
245 | auto tv1 = transpose(tv0, 0, 2); |
246 | auto tv2 = sin(tv1); |
247 | auto tv3 = transpose(tv2, 1, 2); |
248 | auto tv4 = cos(tv3); |
249 | fusion.addOutput(tv4); |
250 | |
251 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
252 | at::Tensor input = at::randn({256, 1024, 1024}, options); |
253 | |
254 | auto lparams = scheduleTranspose(&fusion, {input}); |
255 | |
256 | FusionExecutor fe; |
257 | fe.compileFusion(&fusion, {input}, lparams); |
258 | auto outputs = fe.runFusion({input}, lparams); |
259 | |
260 | auto tv_ref = input.transpose(0, 2).sin().transpose(1, 2).cos(); |
261 | |
262 | testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); |
263 | } |
264 | |
265 | /* |
266 | * t0->transpose--. |
267 | * \ |
268 | * t1->transpose---add-->sin->t5 |
269 | */ |
270 | TEST_F(NVFuserTest, FusionScheduleTransposeMultipleInput_CUDA) { |
271 | Fusion fusion; |
272 | FusionGuard fg(&fusion); |
273 | |
274 | auto tv0 = makeContigTensor(3); |
275 | auto tv1 = makeContigTensor(3); |
276 | fusion.addInput(tv0); |
277 | fusion.addInput(tv1); |
278 | auto tv2 = transpose(tv0, 0, 2); |
279 | auto tv3 = transpose(tv1, 0, 2); |
280 | auto tv4 = add(tv2, tv3); |
281 | auto tv5 = sin(tv4); |
282 | fusion.addOutput(tv5); |
283 | |
284 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
285 | at::Tensor input0 = at::randn({256, 1024, 1024}, options); |
286 | at::Tensor input1 = at::randn({256, 1024, 1024}, options); |
287 | |
288 | auto lparams = scheduleTranspose(&fusion, {input0, input1}); |
289 | |
290 | FusionExecutor fe; |
291 | fe.compileFusion(&fusion, {input0, input1}, lparams); |
292 | auto outputs = fe.runFusion({input0, input1}, lparams); |
293 | |
294 | auto tv_ref = (input0.transpose(0, 2) + input1.transpose(0, 2)).sin(); |
295 | |
296 | testValidate( |
297 | &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); |
298 | } |
299 | |
300 | // t0->sin->transpose->t5 |
301 | // `->cos->transpose->t6 |
302 | TEST_F(NVFuserTest, FusionScheduleTransposeMultipleOutput_CUDA) { |
303 | Fusion fusion; |
304 | FusionGuard fg(&fusion); |
305 | |
306 | auto tv0 = makeContigTensor(3); |
307 | fusion.addInput(tv0); |
308 | auto tv2 = sin(tv0); |
309 | auto tv3 = cos(tv0); |
310 | auto tv5 = transpose(tv2, 0, 2); |
311 | auto tv6 = transpose(tv3, 0, 2); |
312 | fusion.addOutput(tv5); |
313 | fusion.addOutput(tv6); |
314 | |
315 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
316 | at::Tensor input = at::randn({256, 1024, 1024}, options); |
317 | |
318 | auto lparams = scheduleTranspose(&fusion, {input}); |
319 | |
320 | FusionExecutor fe; |
321 | fe.compileFusion(&fusion, {input}, lparams); |
322 | auto outputs = fe.runFusion({input}, lparams); |
323 | |
324 | auto tv_ref1 = input.sin().transpose(0, 2); |
325 | auto tv_ref2 = input.cos().transpose(0, 2); |
326 | |
327 | testValidate( |
328 | &fusion, outputs, {input}, {tv_ref1, tv_ref2}, __LINE__, __FILE__); |
329 | } |
330 | |
331 | /* |
332 | * t0->transpose->sin->t3 |
333 | * \_.-->cos->t5 |
334 | * / |
335 | * t1 |
336 | */ |
337 | TEST_F(NVFuserTest, FusionScheduleTransposeMultipleInputOutput_CUDA) { |
338 | Fusion fusion; |
339 | FusionGuard fg(&fusion); |
340 | |
341 | auto tv0 = makeContigTensor(3); |
342 | auto tv1 = makeContigTensor(3); |
343 | fusion.addInput(tv0); |
344 | fusion.addInput(tv1); |
345 | auto tv2 = transpose(tv0, 0, 2); |
346 | auto tv3 = sin(tv2); |
347 | fusion.addOutput(tv3); |
348 | auto tv4 = add(tv0, tv1); |
349 | auto tv5 = cos(tv4); |
350 | fusion.addOutput(tv5); |
351 | |
352 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
353 | at::Tensor input0 = at::randn({256, 1024, 1024}, options); |
354 | at::Tensor input1 = at::randn({256, 1024, 1024}, options); |
355 | |
356 | auto lparams = scheduleTranspose(&fusion, {input0, input1}); |
357 | |
358 | FusionExecutor fe; |
359 | fe.compileFusion(&fusion, {input0, input1}, lparams); |
360 | auto outputs = fe.runFusion({input0, input1}, lparams); |
361 | |
362 | auto tv_ref1 = input0.transpose(0, 2).sin(); |
363 | auto tv_ref2 = (input0 + input1).cos(); |
364 | |
365 | testValidate( |
366 | &fusion, |
367 | outputs, |
368 | {input0, input1}, |
369 | {tv_ref1, tv_ref2}, |
370 | __LINE__, |
371 | __FILE__); |
372 | } |
373 | |
374 | /* |
375 | * .------>sin------>z |
376 | * x->transpose->transpose->add->y |
377 | * \_______________________/ |
378 | */ |
379 | TEST_F(NVFuserTest, FusionScheduleTransposeMatchingSkipConnection_CUDA) { |
380 | Fusion fusion; |
381 | FusionGuard fg(&fusion); |
382 | |
383 | auto tv0 = makeContigTensor(3); |
384 | fusion.addInput(tv0); |
385 | auto tv1 = transpose(tv0, 0, 2); |
386 | auto tv2 = transpose(tv1, 0, 2); |
387 | auto tv3 = add(tv0, tv2); |
388 | fusion.addOutput(tv3); |
389 | auto tv4 = sin(tv1); |
390 | fusion.addOutput(tv4); |
391 | |
392 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
393 | at::Tensor input = at::randn({256, 1024, 1024}, options); |
394 | |
395 | auto lparams = scheduleTranspose(&fusion, {input}); |
396 | |
397 | FusionExecutor fe; |
398 | fe.compileFusion(&fusion, {input}, lparams); |
399 | auto outputs = fe.runFusion({input}, lparams); |
400 | |
401 | auto tv_ref1 = input.transpose(0, 2).transpose(0, 2) + input; |
402 | auto tv_ref2 = input.transpose(0, 2).sin(); |
403 | |
404 | testValidate( |
405 | &fusion, outputs, {input}, {tv_ref1, tv_ref2}, __LINE__, __FILE__); |
406 | } |
407 | |
408 | // x->transpose--add->z |
409 | // y->broadcast-/ |
410 | TEST_F(NVFuserTest, FusionScheduleTransposeBroadcast_CUDA) { |
411 | Fusion fusion; |
412 | FusionGuard fg(&fusion); |
413 | |
414 | auto tv0 = makeContigTensor(3); |
415 | auto tv1 = makeContigTensor(2); |
416 | fusion.addInput(tv0); |
417 | fusion.addInput(tv1); |
418 | auto tv2 = transpose(tv0, 1, 2); |
419 | auto tv3 = broadcast(tv1, {false, false, true}); |
420 | auto tv4 = add(tv2, tv3); |
421 | fusion.addOutput(tv4); |
422 | |
423 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
424 | at::Tensor input0 = at::randn({1024, 256, 1024}, options); |
425 | at::Tensor input1 = at::randn({1024, 1024}, options); |
426 | |
427 | auto lparams = scheduleTranspose(&fusion, {input0, input1}); |
428 | // auto lparams = schedulePointwise(&fusion, {input0, input1}); |
429 | |
430 | FusionExecutor fe; |
431 | fe.compileFusion(&fusion, {input0, input1}, lparams); |
432 | auto outputs = fe.runFusion({input0, input1}, lparams); |
433 | |
434 | auto tv_ref = input0.transpose(1, 2) + input1.unsqueeze(2); |
435 | |
436 | testValidate( |
437 | &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); |
438 | } |
439 | |
440 | // x->broadcast--add->z |
441 | // y->broadcast-/ |
442 | TEST_F(NVFuserTest, FusionScheduleTransposeNoReference_CUDA) { |
443 | Fusion fusion; |
444 | FusionGuard fg(&fusion); |
445 | |
446 | auto tv0 = makeContigTensor(2); |
447 | auto tv1 = makeContigTensor(2); |
448 | fusion.addInput(tv0); |
449 | fusion.addInput(tv1); |
450 | auto tv2 = broadcast(tv0, {false, true, false}); |
451 | auto tv3 = broadcast(tv1, {false, false, true}); |
452 | auto tv4 = add(tv2, tv3); |
453 | fusion.addOutput(tv4); |
454 | |
455 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
456 | at::Tensor input0 = at::randn({1024, 256}, options); |
457 | at::Tensor input1 = at::randn({1024, 1024}, options); |
458 | |
459 | EXPECT_THAT( |
460 | [&]() { |
461 | scheduleTranspose(&fusion, {input0, input1}); |
462 | }, |
463 | testing::ThrowsMessage<c10::Error>( |
464 | testing::HasSubstr("reference tensor" ))); |
465 | } |
466 | |
467 | // x->broadcast--add->z |
468 | // y->broadcast-/ |
469 | TEST_F(NVFuserTest, FusionScheduleBroadcastOnly_CUDA) { |
470 | for (bool contig0 : {true, false}) { |
471 | for (bool contig1 : {true, false}) { |
472 | Fusion fusion; |
473 | FusionGuard fg(&fusion); |
474 | auto tv0 = contig0 ? makeContigConcreteTensor({-1, 1, -1}) |
475 | : makeConcreteTensor({-1, 1, -1}); |
476 | auto tv1 = contig1 ? makeContigConcreteTensor({-1, -1, 1}) |
477 | : makeConcreteTensor({-1, -1, 1}); |
478 | fusion.addInput(tv0); |
479 | fusion.addInput(tv1); |
480 | auto tv2 = add(tv0, tv1); |
481 | fusion.addOutput(tv2); |
482 | |
483 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
484 | at::Tensor input0 = at::randn({1024, 1, 256}, options); |
485 | at::Tensor input1 = at::randn({1024, 1024, 1}, options); |
486 | |
487 | auto lparams = scheduleTranspose(&fusion, {input0, input1}); |
488 | |
489 | FusionExecutor fe; |
490 | fe.compileFusion(&fusion, {input0, input1}, lparams); |
491 | auto outputs = fe.runFusion({input0, input1}, lparams); |
492 | |
493 | auto tv_ref = input0 + input1; |
494 | |
495 | testValidate( |
496 | &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); |
497 | } |
498 | } |
499 | } |
500 | |
501 | // mermaid graph: |
502 | // ```mermaid |
503 | // %%{ |
504 | // init: { |
505 | // 'theme': 'base', |
506 | // 'themeVariables': { 'fontSize': '30px', 'fontFamily': 'times'}} |
507 | // }%% |
508 | // graph TD |
509 | // T0("T0(M, N, K)") |
510 | // T1("T1(N, M, K)") |
511 | // T2("T2(M, K, N)") |
512 | // T0 --> A("transpose(1, 2)") --> T3("T3(M, K, N)") |
513 | // T1 ---> sigmoid --> T5("T5(N, M, K)") |
514 | // T5 --> B("transpose(0, 2)") --> T7("T7(K, M, N)") |
515 | // T2 ----> C("add") |
516 | // T3 --> C --> T6("T6(M, K, N)") |
517 | // T6 --> D("transpose(0, 1)") --> T11("T11(K, M, N)") |
518 | // T11 --> E("add") -->T12("T12(K, M, N)") |
519 | // T7 --> E |
520 | // T1 ---> F("transpose(0, 1)") --> T4("T4(M, N, K)") |
521 | // T0 --> G("add") --> T8("T8(M, N, K)") --> relu ---> T9("T9(M, N, K)") |
522 | // T4 --> G |
523 | // T6 ---> sin ---> T10("T10(M, K, N)") |
524 | // style T0 fill:lightgreen |
525 | // style T1 fill:lightgreen |
526 | // style T2 fill:lightgreen |
527 | // style T12 fill:lightblue |
528 | // style T9 fill:lightblue |
529 | // style T10 fill:lightblue |
530 | // ``` |
TEST_F(NVFuserTest, FusionScheduleTransposeComplexDAG1_CUDA) {
  // Auto-schedule the DAG drawn in the mermaid diagram above; tvN below
  // corresponds to TN in the diagram (tv0..tv2 inputs, tv9/tv10/tv12
  // outputs).
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(3);
  auto tv1 = makeContigTensor(3);
  auto tv2 = makeContigTensor(3);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  auto tv3 = transpose(tv0, 1, 2);
  auto tv4 = transpose(tv1, 0, 1);
  auto tv5 = sigmoid(tv1);
  auto tv6 = add(tv2, tv3);
  auto tv7 = transpose(tv5, 0, 2);
  auto tv8 = add(tv4, tv0);
  auto tv9 = relu(tv8);
  fusion.addOutput(tv9);
  auto tv10 = sin(tv6);
  fusion.addOutput(tv10);
  auto tv11 = transpose(tv6, 0, 1);
  auto tv12 = add(tv7, tv11);
  fusion.addOutput(tv12);

  // Shapes chosen so M=512, N=1024, K=256 as in the diagram.
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input0 = at::randn({512, 1024, 256}, options);
  at::Tensor input1 = at::randn({1024, 512, 256}, options);
  at::Tensor input2 = at::randn({512, 256, 1024}, options);

  auto lparams = scheduleTranspose(&fusion, {input0, input1, input2});

  FusionExecutor fe;
  fe.compileFusion(&fusion, {input0, input1, input2}, lparams);
  auto outputs = fe.runFusion({input0, input1, input2}, lparams);

  // ATen reference, mirroring the fusion op-by-op.
  auto t3 = input0.transpose(1, 2);
  auto t4 = input1.transpose(0, 1);
  auto t5 = input1.sigmoid();
  auto t6 = input2 + t3;
  auto t7 = t5.transpose(0, 2);
  auto t8 = t4 + input0;
  auto t9 = t8.relu();
  auto t10 = t6.sin();
  auto t11 = t6.transpose(0, 1);
  auto t12 = t7 + t11;

  testValidate(
      &fusion,
      outputs,
      {input0, input1, input2},
      {t9, t10, t12},
      __LINE__,
      __FILE__);
}
585 | |
586 | // mermaid graph: |
587 | // ```mermaid |
588 | // %%{ |
589 | // init: { |
590 | // 'theme': 'base', |
591 | // 'themeVariables': { 'fontSize': '30px', 'fontFamily': 'times'}} |
592 | // }%% |
593 | // graph TD |
594 | // T0("T0(M, N, K)") |
595 | // T1("T1(N, M, K)") |
596 | // T2("T2(M, K, N)") |
597 | // T0 --> A("transpose(1, 2)") --> T3("T3(M, K, N)") |
598 | // T1 ---> sigmoid --> T5("T5(N, M, K)") |
599 | // T5 --> B("transpose(0, 2)") --> T7("T7(K, M, N)") |
600 | // T2 ----> C("add") |
601 | // T3 --> C --> T6("T6(M, K, N)") |
602 | // T6 --> D("transpose(0, 1)") --> T11("T11(K, M, N)") |
603 | // T11 --> E("add") -->T12("T12(K, M, N)") |
604 | // T7 --> E |
605 | // T1 ---> F("transpose(0, 1)") --> T4("T4(M, N, K)") |
606 | // T0 --> G("add") --> T8("T8(M, N, K)") --> relu ---> T9("T9(M, N, K)") |
607 | // T4 --> G |
608 | // T6 ---> sin ---> T10("T10(M, K, N)") |
609 | // style T0 fill:lightgreen |
610 | // style T1 fill:lightgreen |
611 | // style T2 fill:lightgreen |
612 | // style T12 fill:lightblue |
613 | // style T9 fill:lightblue |
614 | // style T10 fill:lightblue |
615 | // ``` |
TEST_F(NVFuserTest, FusionManualScheduleTransposeComplexDAG1_CUDA) {
  // Manually scheduled version of the DAG in the mermaid diagram above
  // (same IR as FusionScheduleTransposeComplexDAG1). Tensors are split
  // into two groups by innermost dimension; group 2 goes through shared
  // memory so that both groups can read/write gmem coalesced.
  // achieved: 833.526 GB/s on RTX 3090 (theoretical bandwidth: 936 GB/s)
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(3);
  auto tv1 = makeContigTensor(3);
  auto tv2 = makeContigTensor(3);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  auto tv3 = transpose(tv0, 1, 2);
  auto tv4 = transpose(tv1, 0, 1);
  auto tv5 = sigmoid(tv1);
  auto tv6 = add(tv2, tv3);
  auto tv7 = transpose(tv5, 0, 2);
  auto tv8 = add(tv4, tv0);
  auto tv9 = relu(tv8);
  fusion.addOutput(tv9);
  auto tv10 = sin(tv6);
  fusion.addOutput(tv10);
  auto tv11 = transpose(tv6, 0, 1);
  auto tv12 = add(tv7, tv11);
  fusion.addOutput(tv12);

  // group 1: tv0, tv1, *tv9, innermost dim K
  // group 2: tv2, *tv10, tv12, innermost dim N

  // cache inputs and outputs
  auto tv0_cache = tv0->cacheAfter();
  auto tv1_cache = tv1->cacheAfter();
  auto tv2_cache = tv2->cacheAfter();
  auto tv9_cache = tv9->cacheBefore();
  auto tv10_cache = tv10->cacheBefore();
  auto tv12_cache = tv12->cacheBefore();

  // Step 1: Make 32x32 tiles, schedule outer dimensions
  {
    // Pick an arbitrary tensor as a reference tensor for this step. There is no
    // requirement on which group this reference tensor should belong to. Here
    // we pick tv9, which belongs to group 1.

    // Make 32x32 tile:
    // [M, N, K]
    tv9->split(1, 32);
    tv9->reorder({{2, -1}});
    tv9->split(2, 32);
    tv9->reorder({{3, -1}});
    // [M, N/32, K/32, 32(N), 32(K)]

    // merge outer dims, parallelize on BIDx, and unswitch
    tv9->merge(0);
    tv9->merge(0);
    tv9->split(0, 1);
    // [M * N/32 * K/32, 1, 32(N), 32(K)]
    tv9->axis(0)->parallelize(ParallelType::BIDx);
    tv9->axis(1)->parallelize(ParallelType::Unswitch);
    // [BIDx, Unswitch, 32(N), 32(K)]

    // propagate to the entire DAG
    MaxRootDomainInfoSpanningTree entire_dag(tv9);
    TransformPropagator tp(tv9);
    entire_dag.traverse(&tp);
    scheduler_utils::parallelizeAllLike(tv9);
  }

  constexpr int threads_per_block = 128;

  // Step 2, schedule group 2
  {
    // group 2: tv2, *tv10, tv12, innermost dim N

    // Group-2 caches live in shared memory so the two access orders
    // (N-innermost vs K-innermost) can both be serviced efficiently.
    tv2_cache->setMemoryType(MemoryType::Shared);
    tv10_cache->setMemoryType(MemoryType::Shared);
    tv12_cache->setMemoryType(MemoryType::Shared);

    // pick tv10 as reference tensor for group 2
    // [BIDx, Unswitch, 32(N), 32(K)]
    tv10->reorder({{-1, -2}});
    // [BIDx, Unswitch, 32(K), 32(N)]
    tv10->merge(2);
    tv10->split(2, 4);
    tv10->split(2, threads_per_block);
    tv10->axis(-1)->parallelize(ParallelType::Vectorize);
    tv10->axis(-2)->parallelize(ParallelType::TIDx);
    tv10->axis(-3)->parallelize(ParallelType::Unroll);
    // [BIDx, Unswitch, Unroll, TIDx, Vectorize]

    // Propagate to group 2 and its cache. Note that group 2 and its cache are
    // not connected, so we need to borrow other tensors of the DAG to be able
    // to propagate. The transformations on borrowed tensors will be overwritten
    // in the next step. We can not borrow the reference tensor of group 1.
    auto all_tvs_except_ref1 = ir_utils::allTvsExcept(&fusion, {tv9});
    auto all_tvs_except_ref1_set = std::unordered_set<TensorView*>(
        all_tvs_except_ref1.begin(), all_tvs_except_ref1.end());
    SetSelector selector(all_tvs_except_ref1_set);
    MaxRootDomainInfoSpanningTree tree(tv10, &selector);
    TransformPropagator tp(tv10);
    tree.traverse(&tp);
    scheduler_utils::parallelizeAllLike(
        tv10, {tv2_cache, tv10, tv12}, {ParallelType::TIDx});
    scheduler_utils::parallelizeAllLike(
        tv10,
        {tv2_cache, tv10, tv12},
        {ParallelType::Vectorize, ParallelType::Unroll});
  }

  // Step 3, schedule group 1
  {
    // group 1: tv0, tv1, *tv9, innermost dim K
    // [BIDx, Unswitch, 32(N), 32(K)]
    tv9->merge(2);
    tv9->split(2, 4);
    tv9->split(2, threads_per_block);
    tv9->axis(-1)->parallelize(ParallelType::Vectorize);
    tv9->axis(-2)->parallelize(ParallelType::TIDx);
    tv9->axis(-3)->parallelize(ParallelType::Unroll);
    // [BIDx, Unswitch, Unroll, TIDx, Vectorize]

    // Propagate to the entire DAG except for group 2 and its cached inputs
    auto all_tvs_except2 =
        ir_utils::allTvsExcept(&fusion, {tv2, tv2_cache, tv10, tv12});
    auto all_tvs_except2_set = std::unordered_set<TensorView*>(
        all_tvs_except2.begin(), all_tvs_except2.end());
    SetSelector selector(all_tvs_except2_set);
    MaxRootDomainInfoSpanningTree tree(tv9, &selector);
    TransformPropagator tp(tv9);
    tree.traverse(&tp);
    scheduler_utils::parallelizeAllLike(
        tv9, all_tvs_except2, {ParallelType::TIDx});
    scheduler_utils::parallelizeAllLike(
        tv9,
        {tv0_cache, tv1_cache, tv9},
        {ParallelType::Vectorize, ParallelType::Unroll});
  }

  // inline
  inlineMost();

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input0 = at::randn({512, 1024, 256}, options);
  at::Tensor input1 = at::randn({1024, 512, 256}, options);
  at::Tensor input2 = at::randn({512, 256, 1024}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {input0, input1, input2});
  auto outputs = fe.runFusion({input0, input1, input2});

  // ATen reference, mirroring the fusion op-by-op.
  auto t3 = input0.transpose(1, 2);
  auto t4 = input1.transpose(0, 1);
  auto t5 = input1.sigmoid();
  auto t6 = input2 + t3;
  auto t7 = t5.transpose(0, 2);
  auto t8 = t4 + input0;
  auto t9 = t8.relu();
  auto t10 = t6.sin();
  auto t11 = t6.transpose(0, 1);
  auto t12 = t7 + t11;

  testValidate(
      &fusion,
      outputs,
      {input0, input1, input2},
      {t9, t10, t12},
      __LINE__,
      __FILE__);
}
783 | |
784 | // x->view->y |
785 | TEST_F(NVFuserTest, FusionViewNoTranspose_CUDA) { |
786 | Fusion fusion; |
787 | FusionGuard fg(&fusion); |
788 | |
789 | auto tv0 = makeContigTensor(3); |
790 | fusion.addInput(tv0); |
791 | auto tv1 = flatten(tv0, 1, 2); |
792 | fusion.addOutput(tv1); |
793 | |
794 | TORCH_CHECK(!hasAtLeastTwoValidGroups(&fusion)); |
795 | } |
796 | |
797 | TEST_F(NVFuserTest, FusionTransposeSelfMapping_CUDA) { |
798 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
799 | Fusion& fusion = *fusion_ptr.get(); |
800 | FusionGuard fg(&fusion); |
801 | |
802 | auto tv0 = makeContigTensor(2); |
803 | fusion.addInput(tv0); |
804 | auto tv1 = transpose(tv0, 0, 1); |
805 | auto tv2 = add(tv0, tv1); |
806 | fusion.addOutput(tv2); |
807 | |
808 | EXPECT_THAT( |
809 | [&]() { IterDomainGraph(fusion_ptr.get()); }, |
810 | testing::ThrowsMessage<c10::Error>( |
811 | testing::HasSubstr("Unsupported domain mapping detected" ))); |
812 | |
813 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
814 | auto t0 = at::randn({5, 5}, options); |
815 | |
816 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
817 | auto cg_outputs = executor_cache.runFusionWithInputs({t0}); |
818 | |
819 | auto ref = t0.transpose(0, 1) + t0; |
820 | |
821 | testValidate( |
822 | executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
823 | } |
824 | |
#if 0
// Disabled: currently produces a silent wrong result when a transpose and a
// view of the same input feed one expression (presumably the same kind of
// self mapping as FusionTransposeSelfMapping, but through a view — confirm
// before re-enabling).
TEST_F(NVFuserTest, FusionTransposeViewSelfMapping_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(2);
  fusion.addInput(tv0);
  auto tv1 = transpose(tv0, 0, 1);
  auto tv2 = view(tv0, {2, 3}, {3, 2});
  auto tv3 = add(tv1, tv2);
  fusion.addOutput(tv3);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({2, 3}, options);

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});

  auto ref = t0.transpose(0, 1) + t0.view({3, 2});

  testValidate(
      executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}
#endif
851 | |
852 | // t0------------. |
853 | // t2->broadcast->sub->mul->relu->t6 |
854 | // t1------------------' |
855 | TEST_F(NVFuserTest, FusionScheduleTransposeMissingDim_CUDA) { |
856 | Fusion fusion; |
857 | FusionGuard fg(&fusion); |
858 | |
859 | auto tv0 = makeContigTensor(3); |
860 | auto tv1 = makeContigConcreteTensor({1, -1, 1}); |
861 | auto tv2 = makeContigTensor(1); |
862 | fusion.addInput(tv0); |
863 | fusion.addInput(tv1); |
864 | fusion.addInput(tv2); |
865 | auto tv3 = broadcast(tv2, {true, false, true}); |
866 | auto tv4 = sub(tv0, tv3); |
867 | auto tv5 = mul(tv4, tv1); |
868 | auto tv6 = relu(tv5); |
869 | fusion.addOutput(tv6); |
870 | |
871 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
872 | at::Tensor input0 = at::randn({512, 1024, 512}, options); |
873 | at::Tensor input1 = at::randn({1, 1024, 1}, options); |
874 | at::Tensor input2 = at::randn({1024}, options); |
875 | |
876 | auto lparams = scheduleTranspose(&fusion, {input0, input1, input2}); |
877 | |
878 | FusionExecutor fe; |
879 | fe.compileFusion(&fusion, {input0, input1, input2}, lparams); |
880 | auto outputs = fe.runFusion({input0, input1, input2}, lparams); |
881 | |
882 | auto t3 = input2.unsqueeze(0).unsqueeze(-1); |
883 | auto t4 = input0 - t3; |
884 | auto t5 = t4 * input1; |
885 | auto t6 = at::relu(t5); |
886 | |
887 | testValidate( |
888 | &fusion, outputs, {input0, input1, input2}, {t6}, __LINE__, __FILE__); |
889 | } |
890 | |
891 | // x->sin->transpose->cos->y |
892 | TEST_F(NVFuserTest, FusionScheduleTransposeSmall_CUDA) { |
893 | Fusion fusion; |
894 | FusionGuard fg(&fusion); |
895 | |
896 | auto tv0 = makeContigTensor(3); |
897 | fusion.addInput(tv0); |
898 | auto tv1 = sin(tv0); |
899 | auto tv2 = transpose(tv1, 1, 2); |
900 | auto tv3 = cos(tv2); |
901 | fusion.addOutput(tv3); |
902 | |
903 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
904 | at::Tensor input = at::randn({1024, 2, 2}, options); |
905 | |
906 | auto lparams = scheduleTranspose(&fusion, {input}); |
907 | |
908 | FusionExecutor fe; |
909 | fe.compileFusion(&fusion, {input}, lparams); |
910 | auto outputs = fe.runFusion({input}, lparams); |
911 | |
912 | auto tv_ref = input.sin().transpose(1, 2).cos(); |
913 | |
914 | testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); |
915 | } |
916 | |
917 | // x->sin->transpose->cos->y |
918 | TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize1_CUDA) { |
919 | Fusion fusion; |
920 | FusionGuard fg(&fusion); |
921 | |
922 | auto tv0 = makeContigTensor(3); |
923 | fusion.addInput(tv0); |
924 | auto tv1 = sin(tv0); |
925 | auto tv2 = transpose(tv1, 1, 2); |
926 | auto tv3 = cos(tv2); |
927 | fusion.addOutput(tv3); |
928 | |
929 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
930 | at::Tensor input = at::randn({64 * 1024 * 1024, 2, 2}, options); |
931 | |
932 | auto lparams = scheduleTranspose(&fusion, {input}); |
933 | |
934 | FusionExecutor fe; |
935 | fe.compileFusion(&fusion, {input}, lparams); |
936 | auto outputs = fe.runFusion({input}, lparams); |
937 | |
938 | auto tv_ref = input.sin().transpose(1, 2).cos(); |
939 | |
940 | testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); |
941 | } |
942 | |
943 | // x->sin->transpose->cos->y |
944 | TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize2_CUDA) { |
945 | Fusion fusion; |
946 | FusionGuard fg(&fusion); |
947 | |
948 | auto tv0 = makeContigTensor(3); |
949 | fusion.addInput(tv0); |
950 | auto tv1 = sin(tv0); |
951 | auto tv2 = transpose(tv1, 0, 2); |
952 | auto tv3 = cos(tv2); |
953 | fusion.addOutput(tv3); |
954 | |
955 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
956 | at::Tensor input = at::randn({2, 64 * 1024 * 1024, 2}, options); |
957 | |
958 | auto lparams = scheduleTranspose(&fusion, {input}); |
959 | |
960 | FusionExecutor fe; |
961 | fe.compileFusion(&fusion, {input}, lparams); |
962 | auto outputs = fe.runFusion({input}, lparams); |
963 | |
964 | auto tv_ref = input.sin().transpose(0, 2).cos(); |
965 | |
966 | testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); |
967 | } |
968 | |
969 | // x->sin->transpose->cos->y |
970 | TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) { |
971 | Fusion fusion; |
972 | FusionGuard fg(&fusion); |
973 | |
974 | auto tv0 = makeContigTensor(8); |
975 | fusion.addInput(tv0); |
976 | auto tv1 = sin(tv0); |
977 | auto tv2 = transpose(tv1, 4, 7); |
978 | auto tv3 = cos(tv2); |
979 | fusion.addOutput(tv3); |
980 | |
981 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
982 | at::Tensor input = at::randn({1024 * 1024, 2, 2, 2, 2, 2, 2, 2}, options); |
983 | |
984 | auto lparams = scheduleTranspose(&fusion, {input}); |
985 | |
986 | FusionExecutor fe; |
987 | fe.compileFusion(&fusion, {input}, lparams); |
988 | auto outputs = fe.runFusion({input}, lparams); |
989 | |
990 | auto tv_ref = input.sin().transpose(4, 7).cos(); |
991 | |
992 | testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); |
993 | } |
994 | |
995 | // x->sin->transpose->cos->y |
996 | TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) { |
997 | std::array<std::vector<int64_t>, 2> shapes{ |
998 | std::vector<int64_t>{1024 * 1024 * 128, 2}, |
999 | std::vector<int64_t>{2, 1024 * 1024 * 128}}; |
1000 | for (const auto& shape : shapes) { |
1001 | Fusion fusion; |
1002 | FusionGuard fg(&fusion); |
1003 | |
1004 | auto tv0 = makeContigTensor(2); |
1005 | fusion.addInput(tv0); |
1006 | auto tv1 = sin(tv0); |
1007 | auto tv2 = transpose(tv1, 0, 1); |
1008 | auto tv3 = cos(tv2); |
1009 | fusion.addOutput(tv3); |
1010 | |
1011 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1012 | at::Tensor input = at::randn(shape, options); |
1013 | |
1014 | auto lparams = scheduleTranspose(&fusion, {input}); |
1015 | |
1016 | FusionExecutor fe; |
1017 | fe.compileFusion(&fusion, {input}, lparams); |
1018 | auto outputs = fe.runFusion({input}, lparams); |
1019 | |
1020 | auto tv_ref = input.sin().transpose(0, 1).cos(); |
1021 | |
1022 | testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); |
1023 | } |
1024 | } |
1025 | |
1026 | TEST_F(NVFuserTest, FusionTransposeBankConflict1_CUDA) { |
1027 | Fusion fusion; |
1028 | FusionGuard fg(&fusion); |
1029 | |
1030 | auto tv0 = makeConcreteTensor({32, 32}); |
1031 | fusion.addInput(tv0); |
1032 | auto tv1 = set(tv0); |
1033 | auto tv2 = transpose(tv1, 0, 1); |
1034 | auto tv3 = set(tv2); |
1035 | fusion.addOutput(tv3); |
1036 | |
1037 | tv1->setMemoryType(MemoryType::Shared); |
1038 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
1039 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
1040 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
1041 | |
1042 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1043 | |
1044 | TORCH_CHECK(!bank_conflict_info.empty()); |
1045 | for (auto info : bank_conflict_info) { |
1046 | std::pair<int, int> expect{32, 0}; |
1047 | TORCH_CHECK(info.second == expect); |
1048 | } |
1049 | } |
1050 | |
1051 | TEST_F(NVFuserTest, FusionTransposeBankConflict2_CUDA) { |
1052 | Fusion fusion; |
1053 | FusionGuard fg(&fusion); |
1054 | |
1055 | auto tv0 = makeConcreteTensor({32, 32}); |
1056 | fusion.addInput(tv0); |
1057 | auto tv1 = set(tv0); |
1058 | auto tv2 = transpose(tv1, 0, 1); |
1059 | auto tv3 = set(tv2); |
1060 | fusion.addOutput(tv3); |
1061 | |
1062 | tv1->setMemoryType(MemoryType::Shared); |
1063 | tv1->axis(0)->parallelize(ParallelType::TIDx); |
1064 | tv2->axis(0)->parallelize(ParallelType::TIDx); |
1065 | tv3->axis(0)->parallelize(ParallelType::TIDx); |
1066 | |
1067 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1068 | |
1069 | TORCH_CHECK(!bank_conflict_info.empty()); |
1070 | for (auto info : bank_conflict_info) { |
1071 | std::pair<int, int> expect{0, 32}; |
1072 | TORCH_CHECK(info.second == expect); |
1073 | } |
1074 | } |
1075 | |
1076 | TEST_F(NVFuserTest, FusionTransposeBankConflict3_CUDA) { |
1077 | Fusion fusion; |
1078 | FusionGuard fg(&fusion); |
1079 | |
1080 | auto tv0 = makeConcreteTensor({32, 32}, DataType::Bool); |
1081 | fusion.addInput(tv0); |
1082 | auto tv1 = set(tv0); |
1083 | auto tv2 = transpose(tv1, 0, 1); |
1084 | auto tv3 = set(tv2); |
1085 | fusion.addOutput(tv3); |
1086 | |
1087 | tv1->setMemoryType(MemoryType::Shared); |
1088 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
1089 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
1090 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
1091 | |
1092 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1093 | |
1094 | TORCH_CHECK(!bank_conflict_info.empty()); |
1095 | for (auto info : bank_conflict_info) { |
1096 | std::pair<int, int> expect{8, 0}; |
1097 | TORCH_CHECK(info.second == expect); |
1098 | } |
1099 | } |
1100 | |
1101 | TEST_F(NVFuserTest, FusionTransposeBankConflict4_CUDA) { |
1102 | Fusion fusion; |
1103 | FusionGuard fg(&fusion); |
1104 | |
1105 | auto tv0 = makeConcreteTensor({32, 32}); |
1106 | fusion.addInput(tv0); |
1107 | auto tv1 = set(tv0); |
1108 | auto tv2 = transpose(tv1, 0, 1); |
1109 | auto tv3 = set(tv2); |
1110 | fusion.addOutput(tv3); |
1111 | |
1112 | tv1->setMemoryType(MemoryType::Shared); |
1113 | tv1->merge(0); |
1114 | tv1->split(0, 4); |
1115 | tv1->split(0, 8); |
1116 | tv1->axis(-1)->parallelize(ParallelType::Vectorize); |
1117 | tv1->axis(0)->parallelize(ParallelType::TIDx); |
1118 | // T1 [TIDx(32), 8, V(4)] |
1119 | |
1120 | tv2->setMemoryType(MemoryType::Shared); |
1121 | tv2->merge(0); |
1122 | tv2->split(0, 4); |
1123 | tv2->split(0, 32); |
1124 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
1125 | // T2 [8, TIDx(32), 4] |
1126 | |
1127 | tv3->merge(0); |
1128 | tv3->split(0, 2); |
1129 | tv3->split(0, 32); |
1130 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
1131 | // T3 [16, TIDx(32), 2] |
1132 | |
1133 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1134 | |
1135 | TORCH_CHECK(!bank_conflict_info.empty()); |
1136 | for (auto info : bank_conflict_info) { |
1137 | std::pair<int, int> expect1{0, 8}; |
1138 | std::pair<int, int> expect2{8, 4}; |
1139 | std::pair<int, int> expect3{2, 0}; |
1140 | TORCH_CHECK( |
1141 | info.second == expect1 || info.second == expect2 || |
1142 | info.second == expect3); |
1143 | } |
1144 | } |
1145 | |
1146 | TEST_F(NVFuserTest, FusionTransposeBankConflict5_CUDA) { |
1147 | Fusion fusion; |
1148 | FusionGuard fg(&fusion); |
1149 | |
1150 | auto tv0 = makeConcreteTensor({1024, 32, 32}); |
1151 | fusion.addInput(tv0); |
1152 | auto tv1 = set(tv0); |
1153 | auto tv2 = transpose(tv1, 1, 2); |
1154 | auto tv3 = set(tv2); |
1155 | fusion.addOutput(tv3); |
1156 | |
1157 | tv1->setMemoryType(MemoryType::Shared); |
1158 | tv1->axis(2)->parallelize(ParallelType::TIDx); |
1159 | tv2->axis(2)->parallelize(ParallelType::TIDx); |
1160 | tv3->axis(2)->parallelize(ParallelType::TIDx); |
1161 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
1162 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
1163 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1164 | |
1165 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1166 | |
1167 | TORCH_CHECK(!bank_conflict_info.empty()); |
1168 | for (auto info : bank_conflict_info) { |
1169 | std::pair<int, int> expect{32, 0}; |
1170 | TORCH_CHECK(info.second == expect); |
1171 | } |
1172 | } |
1173 | |
1174 | TEST_F(NVFuserTest, FusionTransposeBankConflict6_CUDA) { |
1175 | Fusion fusion; |
1176 | FusionGuard fg(&fusion); |
1177 | |
1178 | auto tv0 = makeConcreteTensor({1024, 32, 32}); |
1179 | fusion.addInput(tv0); |
1180 | auto tv1 = set(tv0); |
1181 | auto tv2 = transpose(tv1, 1, 2); |
1182 | auto tv3 = set(tv2); |
1183 | fusion.addOutput(tv3); |
1184 | |
1185 | tv1->setMemoryType(MemoryType::Shared); |
1186 | tv1->axis(2)->parallelize(ParallelType::TIDy); |
1187 | tv2->axis(2)->parallelize(ParallelType::TIDy); |
1188 | tv3->axis(2)->parallelize(ParallelType::TIDy); |
1189 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
1190 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
1191 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1192 | |
1193 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1194 | |
1195 | TORCH_CHECK(!bank_conflict_info.empty()); |
1196 | for (auto info : bank_conflict_info) { |
1197 | std::pair<int, int> expect{32, 0}; |
1198 | TORCH_CHECK(info.second == expect); |
1199 | } |
1200 | } |
1201 | |
1202 | TEST_F(NVFuserTest, FusionTransposeBankConflict7_CUDA) { |
1203 | Fusion fusion; |
1204 | FusionGuard fg(&fusion); |
1205 | |
1206 | auto tv0 = makeConcreteTensor({1024, 8, 8}); |
1207 | fusion.addInput(tv0); |
1208 | auto tv1 = set(tv0); |
1209 | auto tv2 = transpose(tv1, 1, 2); |
1210 | auto tv3 = set(tv2); |
1211 | fusion.addOutput(tv3); |
1212 | |
1213 | tv1->setMemoryType(MemoryType::Shared); |
1214 | tv1->axis(1)->parallelize(ParallelType::TIDx); |
1215 | tv2->axis(1)->parallelize(ParallelType::TIDx); |
1216 | tv3->axis(1)->parallelize(ParallelType::TIDx); |
1217 | tv1->axis(2)->parallelize(ParallelType::TIDy); |
1218 | tv2->axis(2)->parallelize(ParallelType::TIDy); |
1219 | tv3->axis(2)->parallelize(ParallelType::TIDy); |
1220 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
1221 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
1222 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1223 | |
1224 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1225 | |
1226 | TORCH_CHECK(!bank_conflict_info.empty()); |
1227 | for (auto info : bank_conflict_info) { |
1228 | std::pair<int, int> expect{0, 2}; |
1229 | TORCH_CHECK(info.second == expect); |
1230 | } |
1231 | } |
1232 | |
1233 | TEST_F(NVFuserTest, FusionTransposeBankConflict8_CUDA) { |
1234 | Fusion fusion; |
1235 | FusionGuard fg(&fusion); |
1236 | |
1237 | auto tv0 = makeConcreteTensor({1024, 8, 8}); |
1238 | fusion.addInput(tv0); |
1239 | auto tv1 = set(tv0); |
1240 | auto tv2 = transpose(tv1, 1, 2); |
1241 | auto tv3 = set(tv2); |
1242 | fusion.addOutput(tv3); |
1243 | |
1244 | tv1->setMemoryType(MemoryType::Shared); |
1245 | tv1->axis(2)->parallelize(ParallelType::TIDx); |
1246 | tv2->axis(2)->parallelize(ParallelType::TIDy); |
1247 | tv3->axis(2)->parallelize(ParallelType::TIDy); |
1248 | tv1->axis(0)->parallelize(ParallelType::BIDx); |
1249 | tv2->axis(0)->parallelize(ParallelType::BIDx); |
1250 | tv3->axis(0)->parallelize(ParallelType::BIDx); |
1251 | |
1252 | auto bank_conflict_info = fusion.bankConflictInfo(); |
1253 | |
1254 | // no bank confliction |
1255 | TORCH_CHECK(bank_conflict_info.empty()); |
1256 | } |
1257 | |
1258 | } // namespace jit |
1259 | } // namespace torch |
1260 | #endif // #if defined(USE_CUDA) |
1261 | |