1 | #if defined(USE_CUDA) |
2 | #include <gmock/gmock-matchers.h> |
3 | #include <gtest/gtest.h> |
4 | |
5 | #include <codegen.h> |
6 | #include <executor.h> |
7 | #include <fusion.h> |
8 | #include <ir_all_nodes.h> |
9 | #include <ir_iostream.h> |
10 | #include <kernel_cache.h> |
11 | #include <ops/all_ops.h> |
12 | #include <test/test_gpu_validator.h> |
13 | #include <test/test_utils.h> |
14 | |
15 | // Tests go in torch::jit |
16 | namespace torch { |
17 | namespace jit { |
18 | |
19 | using namespace torch::jit::fuser::cuda; |
20 | |
21 | TEST_F(NVFuserTest, FusionStandaloneFull_CUDA) { |
22 | auto sizes = {0, 1, 10, 17, 1024}; |
23 | auto dtypes = { |
24 | kBool, |
25 | kFloat, |
26 | kLong, |
27 | kDouble, |
28 | kHalf, |
29 | kBFloat16, |
30 | kInt, |
31 | kComplexFloat, |
32 | kComplexDouble}; |
33 | |
34 | auto fusion = std::make_unique<Fusion>(); |
35 | FusionGuard fg(fusion.get()); |
36 | |
37 | Val* size = IrBuilder::create<Int>(); |
38 | Val* fill_val1 = IrBuilder::create<Int>(); |
39 | Val* fill_val2 = IrBuilder::create<Int>(); |
40 | Val* fill_val3 = IrBuilder::create<Int>(); |
41 | fusion->addInput(size); |
42 | fusion->addInput(fill_val1); |
43 | fusion->addInput(fill_val2); |
44 | fusion->addInput(fill_val3); |
45 | for (auto dtype : dtypes) { |
46 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
47 | continue; |
48 | } |
49 | auto out_tv = full({size}, fill_val1, aten_to_data_type(dtype)); |
50 | fusion->addOutput(out_tv); |
51 | out_tv = full({size, size}, fill_val2, aten_to_data_type(dtype)); |
52 | fusion->addOutput(out_tv); |
53 | out_tv = full_like(out_tv, fill_val3); |
54 | fusion->addOutput(out_tv); |
55 | } |
56 | |
57 | FusionExecutorCache executor_cache(std::move(fusion)); |
58 | |
59 | for (auto size : sizes) { |
60 | std::vector<at::Tensor> expect; |
61 | expect.reserve(dtypes.size()); |
62 | for (auto dtype : dtypes) { |
63 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
64 | continue; |
65 | } |
66 | const auto options = |
67 | at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); |
68 | expect.emplace_back(at::full({size}, 11, options)); |
69 | expect.emplace_back(at::full({size, size}, 12, options)); |
70 | expect.emplace_back(at::full({size, size}, 13, options)); |
71 | } |
72 | auto cg_outputs = executor_cache.runFusionWithInputs({size, 11, 12, 13}); |
73 | |
74 | testValidate( |
75 | executor_cache.fusion(), |
76 | cg_outputs, |
77 | {size, 11, 12, 13}, |
78 | expect, |
79 | __LINE__, |
80 | __FILE__); |
81 | } |
82 | } |
83 | |
84 | TEST_F(NVFuserTest, FusionStandaloneZeros_CUDA) { |
85 | auto sizes = {0, 1, 10, 17, 1024}; |
86 | auto dtypes = { |
87 | kBool, |
88 | kFloat, |
89 | kLong, |
90 | kDouble, |
91 | kHalf, |
92 | kBFloat16, |
93 | kInt, |
94 | kComplexFloat, |
95 | kComplexDouble}; |
96 | |
97 | auto fusion = std::make_unique<Fusion>(); |
98 | FusionGuard fg(fusion.get()); |
99 | |
100 | Val* size = IrBuilder::create<Int>(); |
101 | fusion->addInput(size); |
102 | for (auto dtype : dtypes) { |
103 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
104 | continue; |
105 | } |
106 | auto out_tv = zeros({size}, aten_to_data_type(dtype)); |
107 | fusion->addOutput(out_tv); |
108 | out_tv = zeros({size, size}, aten_to_data_type(dtype)); |
109 | fusion->addOutput(out_tv); |
110 | out_tv = zeros_like(out_tv); |
111 | fusion->addOutput(out_tv); |
112 | } |
113 | |
114 | FusionExecutorCache executor_cache(std::move(fusion)); |
115 | |
116 | for (auto size : sizes) { |
117 | std::vector<at::Tensor> expect; |
118 | expect.reserve(dtypes.size()); |
119 | for (auto dtype : dtypes) { |
120 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
121 | continue; |
122 | } |
123 | const auto options = |
124 | at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); |
125 | expect.emplace_back(at::zeros({size}, options)); |
126 | expect.emplace_back(at::zeros({size, size}, options)); |
127 | expect.emplace_back(at::zeros({size, size}, options)); |
128 | } |
129 | auto cg_outputs = executor_cache.runFusionWithInputs({size}); |
130 | |
131 | testValidate( |
132 | executor_cache.fusion(), |
133 | cg_outputs, |
134 | {size}, |
135 | expect, |
136 | __LINE__, |
137 | __FILE__); |
138 | } |
139 | } |
140 | |
141 | TEST_F(NVFuserTest, FusionStandaloneOnes_CUDA) { |
142 | auto sizes = {0, 1, 10, 17, 1024}; |
143 | auto dtypes = { |
144 | kBool, |
145 | kFloat, |
146 | kLong, |
147 | kDouble, |
148 | kHalf, |
149 | kBFloat16, |
150 | kInt, |
151 | kComplexFloat, |
152 | kComplexDouble}; |
153 | |
154 | auto fusion = std::make_unique<Fusion>(); |
155 | FusionGuard fg(fusion.get()); |
156 | |
157 | Val* size = IrBuilder::create<Int>(); |
158 | fusion->addInput(size); |
159 | for (auto dtype : dtypes) { |
160 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
161 | continue; |
162 | } |
163 | auto out_tv = ones({size}, aten_to_data_type(dtype)); |
164 | fusion->addOutput(out_tv); |
165 | out_tv = ones({size, size}, aten_to_data_type(dtype)); |
166 | fusion->addOutput(out_tv); |
167 | out_tv = ones_like(out_tv); |
168 | fusion->addOutput(out_tv); |
169 | } |
170 | |
171 | FusionExecutorCache executor_cache(std::move(fusion)); |
172 | |
173 | for (auto size : sizes) { |
174 | std::vector<at::Tensor> expect; |
175 | expect.reserve(dtypes.size()); |
176 | for (auto dtype : dtypes) { |
177 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
178 | continue; |
179 | } |
180 | const auto options = |
181 | at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); |
182 | expect.emplace_back(at::ones({size}, options)); |
183 | expect.emplace_back(at::ones({size, size}, options)); |
184 | expect.emplace_back(at::ones({size, size}, options)); |
185 | } |
186 | auto cg_outputs = executor_cache.runFusionWithInputs({size}); |
187 | |
188 | testValidate( |
189 | executor_cache.fusion(), |
190 | cg_outputs, |
191 | {size}, |
192 | expect, |
193 | __LINE__, |
194 | __FILE__); |
195 | } |
196 | } |
197 | |
198 | TEST_F(NVFuserTest, FusionStandaloneARange_CUDA) { |
199 | auto starts_ends = {-1., 0., 10.3, 1024. * 256}; |
200 | auto steps = {-1.5, 1., 2.}; |
201 | auto dtypes = {kFloat, kLong, kDouble}; |
202 | |
203 | for (auto dtype : dtypes) { |
204 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
205 | continue; |
206 | } |
207 | |
208 | auto fusion = std::make_unique<Fusion>(); |
209 | FusionGuard fg(fusion.get()); |
210 | |
211 | Val* start_int = IrBuilder::create<Int>(); |
212 | Val* end_int = IrBuilder::create<Int>(); |
213 | Val* step_int = IrBuilder::create<Int>(); |
214 | Val* start_double = IrBuilder::create<Double>(); |
215 | Val* end_double = IrBuilder::create<Double>(); |
216 | Val* step_double = IrBuilder::create<Double>(); |
217 | fusion->addInput(start_int); |
218 | fusion->addInput(end_int); |
219 | fusion->addInput(step_int); |
220 | fusion->addInput(start_double); |
221 | fusion->addInput(end_double); |
222 | fusion->addInput(step_double); |
223 | auto tv0 = arange(start_int, end_int, step_int, aten_to_data_type(dtype)); |
224 | auto tv1 = |
225 | arange(start_double, end_double, step_double, aten_to_data_type(dtype)); |
226 | auto tv2 = |
227 | arange(start_int, end_double, step_double, aten_to_data_type(dtype)); |
228 | auto tv3 = |
229 | arange(start_double, end_double, step_int, aten_to_data_type(dtype)); |
230 | fusion->addOutput(tv0); |
231 | fusion->addOutput(tv1); |
232 | fusion->addOutput(tv2); |
233 | fusion->addOutput(tv3); |
234 | |
235 | FusionExecutorCache executor_cache(std::move(fusion)); |
236 | |
237 | const auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); |
238 | |
239 | for (auto start : starts_ends) { |
240 | for (auto end : starts_ends) { |
241 | for (auto step : steps) { |
242 | if (std::signbit(end - start) != std::signbit(step)) { |
243 | continue; |
244 | } |
245 | |
246 | at::Tensor a = |
247 | at::arange((int64_t)start, (int64_t)end, (int64_t)step, options); |
248 | at::Tensor b = |
249 | at::arange((double)start, (double)end, (double)step, options); |
250 | at::Tensor c = |
251 | at::arange((int64_t)start, (double)end, (double)step, options); |
252 | at::Tensor d = |
253 | at::arange((double)start, (double)end, (int64_t)step, options); |
254 | |
255 | auto cg_outputs = executor_cache.runFusionWithInputs( |
256 | {(int64_t)start, |
257 | (int64_t)end, |
258 | (int64_t)step, |
259 | (double)start, |
260 | (double)end, |
261 | (double)step}); |
262 | |
263 | testValidate( |
264 | executor_cache.fusion(), |
265 | cg_outputs, |
266 | {(int64_t)start, |
267 | (int64_t)end, |
268 | (int64_t)step, |
269 | (double)start, |
270 | (double)end, |
271 | (double)step}, |
272 | {a, b, c, d}, |
273 | __LINE__, |
274 | __FILE__); |
275 | } |
276 | } |
277 | } |
278 | } |
279 | } |
280 | |
281 | TEST_F(NVFuserTest, FusionStandaloneEye_CUDA) { |
282 | auto sizes = {0, 1, 10, 17, 1024}; |
283 | auto dtypes = { |
284 | kBool, |
285 | kFloat, |
286 | kLong, |
287 | kDouble, |
288 | kHalf, |
289 | kBFloat16, |
290 | kInt, |
291 | kComplexFloat, |
292 | kComplexDouble}; |
293 | |
294 | auto fusion = std::make_unique<Fusion>(); |
295 | FusionGuard fg(fusion.get()); |
296 | |
297 | Val* size = IrBuilder::create<Int>(); |
298 | Val* maybe_m = IrBuilder::create<Int>(); |
299 | fusion->addInput(size); |
300 | fusion->addInput(maybe_m); |
301 | for (auto dtype : dtypes) { |
302 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
303 | continue; |
304 | } |
305 | auto out_tv1 = eye(size, aten_to_data_type(dtype)); |
306 | fusion->addOutput(out_tv1); |
307 | auto out_tv2 = eye(size, maybe_m, aten_to_data_type(dtype)); |
308 | fusion->addOutput(out_tv2); |
309 | } |
310 | |
311 | FusionExecutorCache executor_cache(std::move(fusion)); |
312 | |
313 | for (auto size : sizes) { |
314 | std::vector<at::Tensor> expect; |
315 | expect.reserve(dtypes.size()); |
316 | for (auto dtype : dtypes) { |
317 | if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) { |
318 | continue; |
319 | } |
320 | const auto options = |
321 | at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); |
322 | expect.emplace_back(at::eye(size, options)); |
323 | expect.emplace_back(at::eye(size, 15, options)); |
324 | } |
325 | auto cg_outputs = executor_cache.runFusionWithInputs({size, 15}); |
326 | |
327 | testValidate( |
328 | executor_cache.fusion(), |
329 | cg_outputs, |
330 | {size, 15}, |
331 | expect, |
332 | __LINE__, |
333 | __FILE__); |
334 | } |
335 | } |
336 | |
337 | } // namespace jit |
338 | } // namespace torch |
339 | #endif // #if defined(USE_CUDA) |
340 | |