1 | |
#include <utils.h>

#include <c10/util/string_view.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <sstream>
#include <unordered_map>
#include <utility>
#include <vector>
9 | |
10 | namespace torch { |
11 | namespace jit { |
12 | namespace fuser { |
13 | namespace cuda { |
14 | |
15 | namespace { |
16 | |
17 | auto parseDebugDumpOptions() { |
18 | std::unordered_map<DebugDumpOption, bool> options_map = { |
19 | {DebugDumpOption::FusionIr, false}, |
20 | {DebugDumpOption::FusionIrMath, false}, |
21 | {DebugDumpOption::FusionIrPresched, false}, |
22 | {DebugDumpOption::KernelIr, false}, |
23 | {DebugDumpOption::ComputeAtMap, false}, |
24 | {DebugDumpOption::CudaKernel, false}, |
25 | {DebugDumpOption::CudaFull, false}, |
26 | {DebugDumpOption::CudaToFile, false}, |
27 | {DebugDumpOption::DebugInfo, false}, |
28 | {DebugDumpOption::LaunchParam, false}, |
29 | {DebugDumpOption::FusionSegments, false}, |
30 | {DebugDumpOption::FusionSegmenterLog, false}, |
31 | {DebugDumpOption::FusionArgs, false}, |
32 | {DebugDumpOption::KernelArgs, false}, |
33 | {DebugDumpOption::EffectiveBandwidth, false}, |
34 | {DebugDumpOption::FusionSegmentsDrawing, false}, |
35 | {DebugDumpOption::PrintPtxasLog, false}, |
36 | {DebugDumpOption::BufferReuseInfo, false}, |
37 | {DebugDumpOption::SchedulerDebug, false}, |
38 | {DebugDumpOption::ParallelDimensions, false}, |
39 | {DebugDumpOption::Halo, false}, |
40 | {DebugDumpOption::PerfDebugVerbose, false}, |
41 | {DebugDumpOption::PythonDefinition, false}, |
42 | {DebugDumpOption::PythonFrontendDebug, false}, |
43 | {DebugDumpOption::TransformPropagator, false}, |
44 | {DebugDumpOption::Cubin, false}, |
45 | {DebugDumpOption::Ptx, false}, |
46 | {DebugDumpOption::BankConflictInfo, false}, |
47 | {DebugDumpOption::SyncMap, false}}; |
48 | |
49 | if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP" )) { |
50 | c10::string_view options_view(dump_options); |
51 | while (!options_view.empty()) { |
52 | const auto end_pos = options_view.find_first_of(','); |
53 | const auto token = options_view.substr(0, end_pos); |
54 | if (token == "fusion_ir" ) { |
55 | options_map[DebugDumpOption::FusionIr] = true; |
56 | } else if (token == "fusion_ir_math" ) { |
57 | options_map[DebugDumpOption::FusionIrMath] = true; |
58 | } else if (token == "fusion_ir_presched" ) { |
59 | options_map[DebugDumpOption::FusionIrPresched] = true; |
60 | } else if (token == "kernel_ir" ) { |
61 | options_map[DebugDumpOption::KernelIr] = true; |
62 | } else if (token == "ca_map" ) { |
63 | options_map[DebugDumpOption::ComputeAtMap] = true; |
64 | } else if (token == "cuda_kernel" ) { |
65 | options_map[DebugDumpOption::CudaKernel] = true; |
66 | } else if (token == "cuda_full" ) { |
67 | options_map[DebugDumpOption::CudaFull] = true; |
68 | } else if (token == "cuda_to_file" ) { |
69 | options_map[DebugDumpOption::CudaToFile] = true; |
70 | } else if (token == "debug_info" ) { |
71 | options_map[DebugDumpOption::DebugInfo] = true; |
72 | } else if (token == "launch_param" ) { |
73 | options_map[DebugDumpOption::LaunchParam] = true; |
74 | } else if (token == "segmented_fusion" ) { |
75 | options_map[DebugDumpOption::FusionSegments] = true; |
76 | } else if (token == "segmenter_logging" ) { |
77 | options_map[DebugDumpOption::FusionSegmenterLog] = true; |
78 | } else if (token == "fusion_args" ) { |
79 | options_map[DebugDumpOption::FusionArgs] = true; |
80 | } else if (token == "kernel_args" ) { |
81 | options_map[DebugDumpOption::KernelArgs] = true; |
82 | } else if (token == "dump_eff_bandwidth" ) { |
83 | options_map[DebugDumpOption::EffectiveBandwidth] = true; |
84 | } else if (token == "draw_segmented_fusion" ) { |
85 | options_map[DebugDumpOption::FusionSegmentsDrawing] = true; |
86 | } else if (token == "ptxas_verbose" ) { |
87 | options_map[DebugDumpOption::PrintPtxasLog] = true; |
88 | } else if (token == "buffer_reuse_verbose" ) { |
89 | options_map[DebugDumpOption::BufferReuseInfo] = true; |
90 | } else if (token == "scheduler_params" ) { |
91 | options_map[DebugDumpOption::SchedulerDebug] = true; |
92 | } else if (token == "parallel_dimensions" ) { |
93 | options_map[DebugDumpOption::ParallelDimensions] = true; |
94 | } else if (token == "halo" ) { |
95 | options_map[DebugDumpOption::Halo] = true; |
96 | } else if (token == "perf_debug_verbose" ) { |
97 | options_map[DebugDumpOption::PerfDebugVerbose] = true; |
98 | } else if (token == "python_definition" ) { |
99 | options_map[DebugDumpOption::PythonDefinition] = true; |
100 | } else if (token == "python_frontend_debug" ) { |
101 | options_map[DebugDumpOption::PythonFrontendDebug] = true; |
102 | } else if (token == "transform_propagator" ) { |
103 | options_map[DebugDumpOption::TransformPropagator] = true; |
104 | } else if (token == "cubin" ) { |
105 | options_map[DebugDumpOption::Cubin] = true; |
106 | } else if (token == "ptx" ) { |
107 | options_map[DebugDumpOption::Ptx] = true; |
108 | } else if (token == "bank_conflict" ) { |
109 | options_map[DebugDumpOption::BankConflictInfo] = true; |
110 | } else if (token == "sync_map" ) { |
111 | options_map[DebugDumpOption::SyncMap] = true; |
112 | } else { |
113 | TORCH_CHECK( |
114 | false, |
115 | "Invalid debug dump option: '" , |
116 | token, |
117 | "'\nAvailable options:\n" , |
118 | "\tfusion_ir, fusion_ir_math, fusion_ir_presched, kernel_ir, ca_map,\n" , |
119 | "\tcuda_kernel, cuda_full, cuda_to_file, debug_info, launch_param,\n" , |
120 | "\tsegmented_fusion, fusion_args, kernel_args, dump_eff_bandwidth,\n" , |
121 | "\tdraw_segmented_fusion, scheduler_params, parallel_dimensions,\n" , |
122 | "\tbuffer_reuse_verbose, ptxas_verbose, halo, segmenter_logging,\n" , |
123 | "\tperf_debug_verbose, python_definition, python_frontend_debug,\n" , |
124 | "\ttransform_propagator, cubin, ptx, bank_conflict, sync_map\n" ); |
125 | } |
126 | options_view = (end_pos != c10::string_view::npos) |
127 | ? options_view.substr(end_pos + 1) |
128 | : "" ; |
129 | } |
130 | } |
131 | |
132 | return options_map; |
133 | } |
134 | |
135 | auto parseDisableOptions() { |
136 | std::unordered_map<DisableOption, bool> options_map = { |
137 | {DisableOption::ArchCheck, false}, |
138 | {DisableOption::CompileToSass, false}, |
139 | {DisableOption::Fallback, false}, |
140 | {DisableOption::Fma, false}, |
141 | {DisableOption::IndexHoist, false}, |
142 | {DisableOption::Nvtx, false}, |
143 | {DisableOption::PredicateElimination, false}}; |
144 | |
145 | if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DISABLE" )) { |
146 | c10::string_view options_view(dump_options); |
147 | while (!options_view.empty()) { |
148 | const auto end_pos = options_view.find_first_of(','); |
149 | const auto token = options_view.substr(0, end_pos); |
150 | if (token == "arch_check" ) { |
151 | options_map[DisableOption::ArchCheck] = true; |
152 | } else if (token == "compile_to_sass" ) { |
153 | options_map[DisableOption::CompileToSass] = true; |
154 | } else if (token == "fallback" ) { |
155 | options_map[DisableOption::Fallback] = true; |
156 | } else if (token == "fma" ) { |
157 | TORCH_WARN( |
158 | "fmad is disabled for nvrtc, which could negatively affect performance. Try removing `fma` from env variable PYTORCH_NVFUSER_DISABLE for optimal performance." ); |
159 | options_map[DisableOption::Fma] = true; |
160 | } else if (token == "index_hoist" ) { |
161 | options_map[DisableOption::IndexHoist] = true; |
162 | } else if (token == "nvtx" ) { |
163 | options_map[DisableOption::Nvtx] = true; |
164 | } else if (token == "predicate_elimination" ) { |
165 | options_map[DisableOption::PredicateElimination] = true; |
166 | } else { |
167 | TORCH_CHECK( |
168 | false, |
169 | "Invalid disable option: '" , |
170 | token, |
171 | "'\nAvailable options:\n" , |
172 | "\tarch_check, fallback, fma, index_hoist, nvtx, predicate_elimination\n" ); |
173 | } |
174 | options_view = (end_pos != c10::string_view::npos) |
175 | ? options_view.substr(end_pos + 1) |
176 | : "" ; |
177 | } |
178 | } |
179 | |
180 | return options_map; |
181 | } |
182 | |
183 | auto parseEnableOptions() { |
184 | std::unordered_map<EnableOption, bool> options_map = { |
185 | {EnableOption::Complex, false}, |
186 | {EnableOption::KernelProfile, false}, |
187 | {EnableOption::LinearDecomposition, false}, |
188 | {EnableOption::ConvDecomposition, false}}; |
189 | |
190 | if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE" )) { |
191 | c10::string_view options_view(dump_options); |
192 | while (!options_view.empty()) { |
193 | const auto end_pos = options_view.find_first_of(','); |
194 | const auto token = options_view.substr(0, end_pos); |
195 | if (token == "complex" ) { |
196 | options_map[EnableOption::Complex] = true; |
197 | } else if (token == "kernel_profile" ) { |
198 | options_map[EnableOption::KernelProfile] = true; |
199 | } else if (token == "linear_decomposition" ) { |
200 | options_map[EnableOption::LinearDecomposition] = true; |
201 | } else if (token == "conv_decomposition" ) { |
202 | options_map[EnableOption::ConvDecomposition] = true; |
203 | } else { |
204 | TORCH_CHECK( |
205 | false, |
206 | "Invalid enable option: '" , |
207 | token, |
208 | "'\nAvailable options:\n" , |
209 | "\tcomplex, kernel_profile, linear_decomposition," , |
210 | "conv_decomposition" ); |
211 | } |
212 | options_view = (end_pos != c10::string_view::npos) |
213 | ? options_view.substr(end_pos + 1) |
214 | : "" ; |
215 | } |
216 | } |
217 | |
218 | return options_map; |
219 | } |
220 | |
221 | } // namespace |
222 | |
223 | #pragma clang diagnostic push |
224 | #pragma clang diagnostic ignored "-Wunused-function" |
225 | void debugPrint(const c10::TensorTypePtr& type) { |
226 | std::stringstream sizes_s; |
227 | if (auto sizes = type->symbolic_sizes().sizes()) { |
228 | for (const auto& shape_symbol : *sizes) { |
229 | if (shape_symbol.is_static()) { |
230 | sizes_s << shape_symbol.static_size() << ", " ; |
231 | } else { |
232 | sizes_s << "s(" << *reinterpret_cast<const int64_t*>(&shape_symbol) |
233 | << "), " ; |
234 | } |
235 | } |
236 | } else { |
237 | sizes_s << "no size available" ; |
238 | } |
239 | std::cout << "sizes:" << sizes_s.str() << std::endl; |
240 | if (const auto& stride_properties = type->stride_properties().sizes()) { |
241 | std::stringstream stride_s; |
242 | std::stringstream index_s; |
243 | std::stringstream contig_s; |
244 | |
245 | for (const auto& stride_property : *stride_properties) { |
246 | if (stride_property.has_value() && stride_property->stride_.has_value()) { |
247 | stride_s << *stride_property->stride_ << ", " ; |
248 | } else { |
249 | stride_s << "?, " ; |
250 | } |
251 | if (stride_property.has_value() && |
252 | stride_property->stride_index_.has_value()) { |
253 | index_s << *stride_property->stride_index_ << ", " ; |
254 | } else { |
255 | index_s << "?, " ; |
256 | } |
257 | if (stride_property.has_value() && |
258 | stride_property->contiguous_.has_value()) { |
259 | contig_s << *stride_property->contiguous_ << ", " ; |
260 | } else { |
261 | contig_s << "?, " ; |
262 | } |
263 | } |
264 | std::cout << "stride: " << stride_s.str() << std::endl; |
265 | std::cout << "stride index: " << index_s.str() << std::endl; |
266 | std::cout << "contiguous: " << contig_s.str() << std::endl; |
267 | } else { |
268 | std::cout << "no stride properties available" << std::endl; |
269 | } |
270 | } |
271 | #pragma clang diagnostic pop |
272 | |
273 | bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) { |
274 | return tensor_type && tensor_type->dim().has_value() && |
275 | tensor_type->dim().value() == 0; |
276 | } |
277 | |
278 | bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) { |
279 | auto opt_sizes = tensor_type->sizes().concrete_sizes(); |
280 | if (opt_sizes.has_value()) { |
281 | auto sizes = opt_sizes.value(); |
282 | for (const auto& size : sizes) { |
283 | if (size == 0) { |
284 | return true; |
285 | } |
286 | } |
287 | } |
288 | return false; |
289 | } |
290 | |
291 | bool is_cpu_scalar(const at::Tensor& tensor) { |
292 | return tensor.device().is_cpu() && tensor.numel() == 1 && tensor.dim() == 0; |
293 | } |
294 | |
295 | bool is_cpu_scalar(const c10::TensorType& tensor_type) { |
296 | auto opt_device = tensor_type.device(); |
297 | auto opt_dim = tensor_type.dim(); |
298 | auto opt_numel = tensor_type.numel(); |
299 | return opt_device.has_value() && opt_device->is_cpu() && |
300 | opt_dim.has_value() && opt_numel.has_value() && opt_dim.value() == 0 && |
301 | opt_numel.value() == 1; |
302 | } |
303 | |
304 | // Check device of TensorType in all inputs ensure all tensors are on cuda |
305 | // devices. |
306 | // return common device index (or -1 if device differs). |
307 | int getCommonDeviceCUDA(const at::ArrayRef<IValue>& inputs) { |
308 | int index = -1; |
309 | for (const auto& input : inputs) { |
310 | if (!input.isTensor()) { |
311 | continue; |
312 | } |
313 | const auto& device = input.toTensor().device(); |
314 | // skip cpu scalar tensor as they'll be promoted to scalar later |
315 | if (device.is_cpu() && is_cpu_scalar(input.toTensor())) { |
316 | continue; |
317 | } |
318 | TORCH_CHECK(device.is_cuda(), "nvfuser only supports cuda device" ); |
319 | auto cur_index = device.index(); |
320 | if (index != -1 && index != cur_index) { |
321 | return -1; |
322 | } |
323 | index = (int)cur_index; // NOLINT |
324 | } |
325 | return index; |
326 | } |
327 | |
328 | KernelIndexMode collectIndexMode(const at::ArrayRef<at::IValue>& inputs) { |
329 | // Save 1 more bit besides the sign bit to be conservative |
330 | constexpr int64_t most_positive_int32_index = |
331 | std::numeric_limits<int>::max() / 2; |
332 | constexpr int64_t most_negative_int32_index = |
333 | std::numeric_limits<int>::min() / 2; |
334 | |
335 | // Check all runtime inputs, and if any one of |
336 | // the input's index exceeds max_int32 will |
337 | // fall back to int64 indexing |
338 | for (auto ivalue_input : inputs) { |
339 | if (ivalue_input.isTensor()) { |
340 | auto tensor_input = ivalue_input.toTensor(); |
341 | int64_t tensor_most_positive_index = 0; |
342 | int64_t tensor_most_negative_index = 0; |
343 | for (auto dim_i = 0; dim_i < tensor_input.ndimension(); dim_i++) { |
344 | // Ignore broadcast dimensions |
345 | if (tensor_input.size(dim_i) > 1) { |
346 | // accumulate based on the sign of stride |
347 | if (tensor_input.stride(dim_i) > 0) { |
348 | // Acuumulate positive stride |
349 | tensor_most_positive_index += |
350 | (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i); |
351 | } else { |
352 | // Acuumulate negative stride |
353 | tensor_most_negative_index += |
354 | (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i); |
355 | } |
356 | } |
357 | } |
358 | |
359 | // Fall back to int64 if it can be either too positive |
360 | // or too negative. |
361 | if (tensor_most_positive_index > most_positive_int32_index || |
362 | tensor_most_negative_index < most_negative_int32_index) { |
363 | return KernelIndexMode::INT64; |
364 | } |
365 | } |
366 | } |
367 | // return index mode as int32 |
368 | return KernelIndexMode::INT32; |
369 | } |
370 | |
371 | bool isDebugDumpEnabled(DebugDumpOption option) { |
372 | const static auto dump_options = parseDebugDumpOptions(); |
373 | return dump_options.at(option); |
374 | } |
375 | |
376 | bool isOptionDisabled(DisableOption option) { |
377 | const static auto options = parseDisableOptions(); |
378 | return options.at(option); |
379 | } |
380 | |
381 | bool isOptionEnabled(EnableOption option) { |
382 | const static auto options = parseEnableOptions(); |
383 | return options.at(option); |
384 | } |
385 | |
386 | bool useFallback() { |
387 | // Keep this env var for compatibility |
388 | const char* disable_fb_env = getenv("PYTORCH_NVFUSER_DISABLE_FALLBACK" ); |
389 | bool fallback_disabled = disable_fb_env ? atoi(disable_fb_env) : false; |
390 | fallback_disabled = |
391 | fallback_disabled || isOptionDisabled(DisableOption::Fallback); |
392 | |
393 | return !fallback_disabled; |
394 | } |
395 | |
396 | std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type) { |
397 | TORCH_INTERNAL_ASSERT(tensor_type != nullptr, "Input must be a Tensor." ); |
398 | auto optional_sizes = tensor_type->sizes().concrete_sizes(); |
399 | TORCH_INTERNAL_ASSERT( |
400 | optional_sizes.has_value(), "Missing size information for the tensor." ); |
401 | return optional_sizes.value(); |
402 | } |
403 | |
404 | } // namespace cuda |
405 | } // namespace fuser |
406 | } // namespace jit |
407 | } // namespace torch |
408 | |