#include <utils.h>

#include <c10/util/string_view.h>

#include <cstdlib>
#include <iostream>
#include <unordered_map>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

namespace {

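// Parses the comma-separated PYTORCH_NVFUSER_DUMP environment variable into a
// map from DebugDumpOption to a boolean flag. Every option defaults to false
// and is switched on when its token appears in the variable, e.g.
//   PYTORCH_NVFUSER_DUMP=fusion_ir,cuda_kernel
// Unrecognized tokens abort with a TORCH_CHECK listing the valid options.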
auto parseDebugDumpOptions() {
  std::unordered_map<DebugDumpOption, bool> options_map = {
      {DebugDumpOption::FusionIr, false},
      {DebugDumpOption::FusionIrMath, false},
      {DebugDumpOption::FusionIrPresched, false},
      {DebugDumpOption::KernelIr, false},
      {DebugDumpOption::ComputeAtMap, false},
      {DebugDumpOption::CudaKernel, false},
      {DebugDumpOption::CudaFull, false},
      {DebugDumpOption::CudaToFile, false},
      {DebugDumpOption::DebugInfo, false},
      {DebugDumpOption::LaunchParam, false},
      {DebugDumpOption::FusionSegments, false},
      {DebugDumpOption::FusionSegmenterLog, false},
      {DebugDumpOption::FusionArgs, false},
      {DebugDumpOption::KernelArgs, false},
      {DebugDumpOption::EffectiveBandwidth, false},
      {DebugDumpOption::FusionSegmentsDrawing, false},
      {DebugDumpOption::PrintPtxasLog, false},
      {DebugDumpOption::BufferReuseInfo, false},
      {DebugDumpOption::SchedulerDebug, false},
      {DebugDumpOption::ParallelDimensions, false},
      {DebugDumpOption::Halo, false},
      {DebugDumpOption::PerfDebugVerbose, false},
      {DebugDumpOption::PythonDefinition, false},
      {DebugDumpOption::PythonFrontendDebug, false},
      {DebugDumpOption::TransformPropagator, false},
      {DebugDumpOption::Cubin, false},
      {DebugDumpOption::Ptx, false},
      {DebugDumpOption::BankConflictInfo, false},
      {DebugDumpOption::SyncMap, false}};

  if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP")) {
    c10::string_view options_view(dump_options);
    while (!options_view.empty()) {
      const auto end_pos = options_view.find_first_of(',');
      const auto token = options_view.substr(0, end_pos);
      if (token == "fusion_ir") {
        options_map[DebugDumpOption::FusionIr] = true;
      } else if (token == "fusion_ir_math") {
        options_map[DebugDumpOption::FusionIrMath] = true;
      } else if (token == "fusion_ir_presched") {
        options_map[DebugDumpOption::FusionIrPresched] = true;
      } else if (token == "kernel_ir") {
        options_map[DebugDumpOption::KernelIr] = true;
      } else if (token == "ca_map") {
        options_map[DebugDumpOption::ComputeAtMap] = true;
      } else if (token == "cuda_kernel") {
        options_map[DebugDumpOption::CudaKernel] = true;
      } else if (token == "cuda_full") {
        options_map[DebugDumpOption::CudaFull] = true;
      } else if (token == "cuda_to_file") {
        options_map[DebugDumpOption::CudaToFile] = true;
      } else if (token == "debug_info") {
        options_map[DebugDumpOption::DebugInfo] = true;
      } else if (token == "launch_param") {
        options_map[DebugDumpOption::LaunchParam] = true;
      } else if (token == "segmented_fusion") {
        options_map[DebugDumpOption::FusionSegments] = true;
      } else if (token == "segmenter_logging") {
        options_map[DebugDumpOption::FusionSegmenterLog] = true;
      } else if (token == "fusion_args") {
        options_map[DebugDumpOption::FusionArgs] = true;
      } else if (token == "kernel_args") {
        options_map[DebugDumpOption::KernelArgs] = true;
      } else if (token == "dump_eff_bandwidth") {
        options_map[DebugDumpOption::EffectiveBandwidth] = true;
      } else if (token == "draw_segmented_fusion") {
        options_map[DebugDumpOption::FusionSegmentsDrawing] = true;
      } else if (token == "ptxas_verbose") {
        options_map[DebugDumpOption::PrintPtxasLog] = true;
      } else if (token == "buffer_reuse_verbose") {
        options_map[DebugDumpOption::BufferReuseInfo] = true;
      } else if (token == "scheduler_params") {
        options_map[DebugDumpOption::SchedulerDebug] = true;
      } else if (token == "parallel_dimensions") {
        options_map[DebugDumpOption::ParallelDimensions] = true;
      } else if (token == "halo") {
        options_map[DebugDumpOption::Halo] = true;
      } else if (token == "perf_debug_verbose") {
        options_map[DebugDumpOption::PerfDebugVerbose] = true;
      } else if (token == "python_definition") {
        options_map[DebugDumpOption::PythonDefinition] = true;
      } else if (token == "python_frontend_debug") {
        options_map[DebugDumpOption::PythonFrontendDebug] = true;
      } else if (token == "transform_propagator") {
        options_map[DebugDumpOption::TransformPropagator] = true;
      } else if (token == "cubin") {
        options_map[DebugDumpOption::Cubin] = true;
      } else if (token == "ptx") {
        options_map[DebugDumpOption::Ptx] = true;
      } else if (token == "bank_conflict") {
        options_map[DebugDumpOption::BankConflictInfo] = true;
      } else if (token == "sync_map") {
        options_map[DebugDumpOption::SyncMap] = true;
      } else {
        TORCH_CHECK(
            false,
            "Invalid debug dump option: '",
            token,
            "'\nAvailable options:\n",
            "\tfusion_ir, fusion_ir_math, fusion_ir_presched, kernel_ir, ca_map,\n",
            "\tcuda_kernel, cuda_full, cuda_to_file, debug_info, launch_param,\n",
            "\tsegmented_fusion, fusion_args, kernel_args, dump_eff_bandwidth,\n",
            "\tdraw_segmented_fusion, scheduler_params, parallel_dimensions,\n",
            "\tbuffer_reuse_verbose, ptxas_verbose, halo, segmenter_logging,\n",
            "\tperf_debug_verbose, python_definition, python_frontend_debug,\n",
            "\ttransform_propagator, cubin, ptx, bank_conflict, sync_map\n");
      }
      options_view = (end_pos != c10::string_view::npos)
          ? options_view.substr(end_pos + 1)
          : "";
    }
  }

  return options_map;
}

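// Parses the comma-separated PYTORCH_NVFUSER_DISABLE environment variable,
// e.g. PYTORCH_NVFUSER_DISABLE=fallback,fma, returning a map from
// DisableOption to whether that feature has been disabled.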
auto parseDisableOptions() {
  std::unordered_map<DisableOption, bool> options_map = {
      {DisableOption::ArchCheck, false},
      {DisableOption::CompileToSass, false},
      {DisableOption::Fallback, false},
      {DisableOption::Fma, false},
      {DisableOption::IndexHoist, false},
      {DisableOption::Nvtx, false},
      {DisableOption::PredicateElimination, false}};

  if (const char* disable_options = std::getenv("PYTORCH_NVFUSER_DISABLE")) {
    c10::string_view options_view(disable_options);
    while (!options_view.empty()) {
      const auto end_pos = options_view.find_first_of(',');
      const auto token = options_view.substr(0, end_pos);
      if (token == "arch_check") {
        options_map[DisableOption::ArchCheck] = true;
      } else if (token == "compile_to_sass") {
        options_map[DisableOption::CompileToSass] = true;
      } else if (token == "fallback") {
        options_map[DisableOption::Fallback] = true;
      } else if (token == "fma") {
        TORCH_WARN(
            "fmad is disabled for nvrtc, which could negatively affect performance. Try removing `fma` from env variable PYTORCH_NVFUSER_DISABLE for optimal performance.");
        options_map[DisableOption::Fma] = true;
      } else if (token == "index_hoist") {
        options_map[DisableOption::IndexHoist] = true;
      } else if (token == "nvtx") {
        options_map[DisableOption::Nvtx] = true;
      } else if (token == "predicate_elimination") {
        options_map[DisableOption::PredicateElimination] = true;
      } else {
        TORCH_CHECK(
            false,
            "Invalid disable option: '",
            token,
            "'\nAvailable options:\n",
            "\tarch_check, compile_to_sass, fallback, fma, index_hoist,\n",
            "\tnvtx, predicate_elimination\n");
      }
      options_view = (end_pos != c10::string_view::npos)
          ? options_view.substr(end_pos + 1)
          : "";
    }
  }

  return options_map;
}

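// Parses the comma-separated PYTORCH_NVFUSER_ENABLE environment variable,
// e.g. PYTORCH_NVFUSER_ENABLE=complex,kernel_profile, returning a map from
// EnableOption to whether that opt-in feature has been requested.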
auto parseEnableOptions() {
  std::unordered_map<EnableOption, bool> options_map = {
      {EnableOption::Complex, false},
      {EnableOption::KernelProfile, false},
      {EnableOption::LinearDecomposition, false},
      {EnableOption::ConvDecomposition, false}};

  if (const char* enable_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) {
    c10::string_view options_view(enable_options);
    while (!options_view.empty()) {
      const auto end_pos = options_view.find_first_of(',');
      const auto token = options_view.substr(0, end_pos);
      if (token == "complex") {
        options_map[EnableOption::Complex] = true;
      } else if (token == "kernel_profile") {
        options_map[EnableOption::KernelProfile] = true;
      } else if (token == "linear_decomposition") {
        options_map[EnableOption::LinearDecomposition] = true;
      } else if (token == "conv_decomposition") {
        options_map[EnableOption::ConvDecomposition] = true;
      } else {
        TORCH_CHECK(
            false,
            "Invalid enable option: '",
            token,
            "'\nAvailable options:\n",
            "\tcomplex, kernel_profile, linear_decomposition, conv_decomposition\n");
      }
      options_view = (end_pos != c10::string_view::npos)
          ? options_view.substr(end_pos + 1)
          : "";
    }
  }

  return options_map;
}

} // namespace

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-function"
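// Prints the symbolic sizes and stride properties recorded in a TensorType to
// stdout; unknown entries are printed as "?". Intended for ad-hoc debugging,
// hence the -Wunused-function suppression above.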
void debugPrint(const c10::TensorTypePtr& type) {
  std::stringstream sizes_s;
  if (auto sizes = type->symbolic_sizes().sizes()) {
    for (const auto& shape_symbol : *sizes) {
      if (shape_symbol.is_static()) {
        sizes_s << shape_symbol.static_size() << ", ";
      } else {
        sizes_s << "s(" << *reinterpret_cast<const int64_t*>(&shape_symbol)
                << "), ";
      }
    }
  } else {
    sizes_s << "no size available";
  }
  std::cout << "sizes:" << sizes_s.str() << std::endl;
  if (const auto& stride_properties = type->stride_properties().sizes()) {
    std::stringstream stride_s;
    std::stringstream index_s;
    std::stringstream contig_s;

    for (const auto& stride_property : *stride_properties) {
      if (stride_property.has_value() && stride_property->stride_.has_value()) {
        stride_s << *stride_property->stride_ << ", ";
      } else {
        stride_s << "?, ";
      }
      if (stride_property.has_value() &&
          stride_property->stride_index_.has_value()) {
        index_s << *stride_property->stride_index_ << ", ";
      } else {
        index_s << "?, ";
      }
      if (stride_property.has_value() &&
          stride_property->contiguous_.has_value()) {
        contig_s << *stride_property->contiguous_ << ", ";
      } else {
        contig_s << "?, ";
      }
    }
    std::cout << "stride: " << stride_s.str() << std::endl;
    std::cout << "stride index: " << index_s.str() << std::endl;
    std::cout << "contiguous: " << contig_s.str() << std::endl;
  } else {
    std::cout << "no stride properties available" << std::endl;
  }
}
#pragma clang diagnostic pop

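// Returns true when the tensor type's rank is known and equal to zero.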
bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) {
  return tensor_type && tensor_type->dim().has_value() &&
      tensor_type->dim().value() == 0;
}

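// Returns true when concrete sizes are known and any dimension has extent 0,
// i.e. the tensor holds no elements.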
bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) {
  auto opt_sizes = tensor_type->sizes().concrete_sizes();
  if (opt_sizes.has_value()) {
    auto sizes = opt_sizes.value();
    for (const auto& size : sizes) {
      if (size == 0) {
        return true;
      }
    }
  }
  return false;
}

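// A "CPU scalar" is a zero-dim, single-element tensor resident on the CPU;
// such inputs are tolerated by getCommonDeviceCUDA below since they are later
// promoted to scalars.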
bool is_cpu_scalar(const at::Tensor& tensor) {
  return tensor.device().is_cpu() && tensor.numel() == 1 && tensor.dim() == 0;
}

bool is_cpu_scalar(const c10::TensorType& tensor_type) {
  auto opt_device = tensor_type.device();
  auto opt_dim = tensor_type.dim();
  auto opt_numel = tensor_type.numel();
  return opt_device.has_value() && opt_device->is_cpu() &&
      opt_dim.has_value() && opt_numel.has_value() && opt_dim.value() == 0 &&
      opt_numel.value() == 1;
}

// Check the devices of all tensor inputs to ensure every (non-CPU-scalar)
// tensor is on a CUDA device, and return the common device index, or -1 if
// the devices differ.
int getCommonDeviceCUDA(const at::ArrayRef<IValue>& inputs) {
  int index = -1;
  for (const auto& input : inputs) {
    if (!input.isTensor()) {
      continue;
    }
    const auto& device = input.toTensor().device();
    // Skip CPU scalar tensors as they'll be promoted to scalars later
    if (device.is_cpu() && is_cpu_scalar(input.toTensor())) {
      continue;
    }
    TORCH_CHECK(device.is_cuda(), "nvfuser only supports cuda device");
    auto cur_index = device.index();
    if (index != -1 && index != cur_index) {
      return -1;
    }
    index = (int)cur_index; // NOLINT
  }
  return index;
}

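// Decides whether kernels can use 32-bit indexing by bounding the most
// positive and most negative linear offsets reachable in any input tensor.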
KernelIndexMode collectIndexMode(const at::ArrayRef<at::IValue>& inputs) {
  // Save 1 more bit besides the sign bit to be conservative
  constexpr int64_t most_positive_int32_index =
      std::numeric_limits<int>::max() / 2;
  constexpr int64_t most_negative_int32_index =
      std::numeric_limits<int>::min() / 2;

  // Check all runtime inputs; if any input's index range exceeds the int32
  // limits, fall back to int64 indexing
  for (auto ivalue_input : inputs) {
    if (ivalue_input.isTensor()) {
      auto tensor_input = ivalue_input.toTensor();
      int64_t tensor_most_positive_index = 0;
      int64_t tensor_most_negative_index = 0;
      for (auto dim_i = 0; dim_i < tensor_input.ndimension(); dim_i++) {
        // Ignore broadcast dimensions
        if (tensor_input.size(dim_i) > 1) {
          // Accumulate based on the sign of the stride
          if (tensor_input.stride(dim_i) > 0) {
            // Accumulate positive stride
            tensor_most_positive_index +=
                (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
          } else {
            // Accumulate negative stride
            tensor_most_negative_index +=
                (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
          }
        }
      }

      // Fall back to int64 if the index can be either too positive
      // or too negative.
      if (tensor_most_positive_index > most_positive_int32_index ||
          tensor_most_negative_index < most_negative_int32_index) {
        return KernelIndexMode::INT64;
      }
    }
  }
  // Return index mode as int32
  return KernelIndexMode::INT32;
}

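// The option-query helpers below parse their environment variable once (via a
// function-local static) and then serve lookups from the cached map.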
bool isDebugDumpEnabled(DebugDumpOption option) {
  const static auto dump_options = parseDebugDumpOptions();
  return dump_options.at(option);
}

bool isOptionDisabled(DisableOption option) {
  const static auto options = parseDisableOptions();
  return options.at(option);
}

bool isOptionEnabled(EnableOption option) {
  const static auto options = parseEnableOptions();
  return options.at(option);
}

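// Fallback is enabled by default; it can be turned off either via the legacy
// PYTORCH_NVFUSER_DISABLE_FALLBACK variable or via
// PYTORCH_NVFUSER_DISABLE=fallback.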
bool useFallback() {
  // Keep this env var for compatibility
  const char* disable_fb_env = std::getenv("PYTORCH_NVFUSER_DISABLE_FALLBACK");
  bool fallback_disabled = disable_fb_env ? std::atoi(disable_fb_env) : false;
  fallback_disabled =
      fallback_disabled || isOptionDisabled(DisableOption::Fallback);

  return !fallback_disabled;
}

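// Returns the concrete sizes of a tensor type, asserting that the type is
// non-null and that complete size information is available.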
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type) {
  TORCH_INTERNAL_ASSERT(tensor_type != nullptr, "Input must be a Tensor.");
  auto optional_sizes = tensor_type->sizes().concrete_sizes();
  TORCH_INTERNAL_ASSERT(
      optional_sizes.has_value(), "Missing size information for the tensor.");
  return optional_sizes.value();
}

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch