1 | |
#include <utils.h>

#include <c10/util/string_view.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <sstream>
#include <unordered_map>
#include <utility>
#include <vector>
9 | |
10 | namespace torch { |
11 | namespace jit { |
12 | namespace fuser { |
13 | namespace cuda { |
14 | |
15 | namespace { |
16 | |
17 | auto parseDebugDumpOptions() { |
18 | std::unordered_map<DebugDumpOption, bool> options_map = { |
19 | {DebugDumpOption::FusionIr, false}, |
20 | {DebugDumpOption::FusionIrMath, false}, |
21 | {DebugDumpOption::FusionIrPresched, false}, |
22 | {DebugDumpOption::KernelIr, false}, |
23 | {DebugDumpOption::ComputeAtMap, false}, |
24 | {DebugDumpOption::CudaKernel, false}, |
25 | {DebugDumpOption::CudaFull, false}, |
26 | {DebugDumpOption::CudaToFile, false}, |
27 | {DebugDumpOption::DebugInfo, false}, |
28 | {DebugDumpOption::LaunchParam, false}, |
29 | {DebugDumpOption::FusionSegments, false}, |
30 | {DebugDumpOption::FusionSegmenterLog, false}, |
31 | {DebugDumpOption::FusionArgs, false}, |
32 | {DebugDumpOption::KernelArgs, false}, |
33 | {DebugDumpOption::EffectiveBandwidth, false}, |
34 | {DebugDumpOption::FusionSegmentsDrawing, false}, |
35 | {DebugDumpOption::PrintPtxasLog, false}, |
36 | {DebugDumpOption::BufferReuseInfo, false}, |
37 | {DebugDumpOption::SchedulerDebug, false}, |
38 | {DebugDumpOption::ParallelDimensions, false}, |
39 | {DebugDumpOption::Halo, false}, |
40 | {DebugDumpOption::PerfDebugVerbose, false}, |
41 | {DebugDumpOption::PythonDefinition, false}, |
42 | {DebugDumpOption::PythonFrontendDebug, false}, |
43 | {DebugDumpOption::TransformPropagator, false}, |
44 | {DebugDumpOption::Cubin, false}, |
45 | {DebugDumpOption::Ptx, false}, |
46 | {DebugDumpOption::BankConflictInfo, false}, |
47 | {DebugDumpOption::SyncMap, false}}; |
48 | |
49 | if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP" )) { |
50 | c10::string_view options_view(dump_options); |
51 | while (!options_view.empty()) { |
52 | const auto end_pos = options_view.find_first_of(','); |
53 | const auto token = options_view.substr(0, end_pos); |
54 | if (token == "fusion_ir" ) { |
55 | options_map[DebugDumpOption::FusionIr] = true; |
56 | } else if (token == "fusion_ir_math" ) { |
57 | options_map[DebugDumpOption::FusionIrMath] = true; |
58 | } else if (token == "fusion_ir_presched" ) { |
59 | options_map[DebugDumpOption::FusionIrPresched] = true; |
60 | } else if (token == "kernel_ir" ) { |
61 | options_map[DebugDumpOption::KernelIr] = true; |
62 | } else if (token == "ca_map" ) { |
63 | options_map[DebugDumpOption::ComputeAtMap] = true; |
64 | } else if (token == "cuda_kernel" ) { |
65 | options_map[DebugDumpOption::CudaKernel] = true; |
66 | } else if (token == "cuda_full" ) { |
67 | options_map[DebugDumpOption::CudaFull] = true; |
68 | } else if (token == "cuda_to_file" ) { |
69 | options_map[DebugDumpOption::CudaToFile] = true; |
70 | } else if (token == "debug_info" ) { |
71 | options_map[DebugDumpOption::DebugInfo] = true; |
72 | } else if (token == "launch_param" ) { |
73 | options_map[DebugDumpOption::LaunchParam] = true; |
74 | } else if (token == "segmented_fusion" ) { |
75 | options_map[DebugDumpOption::FusionSegments] = true; |
76 | } else if (token == "segmenter_logging" ) { |
77 | options_map[DebugDumpOption::FusionSegmenterLog] = true; |
78 | } else if (token == "fusion_args" ) { |
79 | options_map[DebugDumpOption::FusionArgs] = true; |
80 | } else if (token == "kernel_args" ) { |
81 | options_map[DebugDumpOption::KernelArgs] = true; |
82 | } else if (token == "dump_eff_bandwidth" ) { |
83 | options_map[DebugDumpOption::EffectiveBandwidth] = true; |
84 | } else if (token == "draw_segmented_fusion" ) { |
85 | options_map[DebugDumpOption::FusionSegmentsDrawing] = true; |
86 | } else if (token == "ptxas_verbose" ) { |
87 | options_map[DebugDumpOption::PrintPtxasLog] = true; |
88 | } else if (token == "buffer_reuse_verbose" ) { |
89 | options_map[DebugDumpOption::BufferReuseInfo] = true; |
90 | } else if (token == "scheduler_params" ) { |
91 | options_map[DebugDumpOption::SchedulerDebug] = true; |
92 | } else if (token == "parallel_dimensions" ) { |
93 | options_map[DebugDumpOption::ParallelDimensions] = true; |
94 | } else if (token == "halo" ) { |
95 | options_map[DebugDumpOption::Halo] = true; |
96 | } else if (token == "perf_debug_verbose" ) { |
97 | options_map[DebugDumpOption::PerfDebugVerbose] = true; |
98 | } else if (token == "python_definition" ) { |
99 | options_map[DebugDumpOption::PythonDefinition] = true; |
100 | } else if (token == "python_frontend_debug" ) { |
101 | options_map[DebugDumpOption::PythonFrontendDebug] = true; |
102 | } else if (token == "transform_propagator" ) { |
103 | options_map[DebugDumpOption::TransformPropagator] = true; |
104 | } else if (token == "cubin" ) { |
105 | options_map[DebugDumpOption::Cubin] = true; |
106 | } else if (token == "ptx" ) { |
107 | options_map[DebugDumpOption::Ptx] = true; |
108 | } else if (token == "bank_conflict" ) { |
109 | options_map[DebugDumpOption::BankConflictInfo] = true; |
110 | } else if (token == "sync_map" ) { |
111 | options_map[DebugDumpOption::SyncMap] = true; |
112 | } else { |
113 | TORCH_CHECK( |
114 | false, |
115 | "Invalid debug dump option: '" , |
116 | token, |
117 | "'\nAvailable options:\n" , |
118 | "\tfusion_ir, fusion_ir_math, fusion_ir_presched, kernel_ir, ca_map,\n" , |
119 | "\tcuda_kernel, cuda_full, cuda_to_file, debug_info, launch_param,\n" , |
120 | "\tsegmented_fusion, fusion_args, kernel_args, dump_eff_bandwidth,\n" , |
121 | "\tdraw_segmented_fusion, scheduler_params, parallel_dimensions,\n" , |
122 | "\tbuffer_reuse_verbose, ptxas_verbose, halo, segmenter_logging,\n" , |
123 | "\tperf_debug_verbose, python_definition, python_frontend_debug,\n" , |
124 | "\ttransform_propagator, cubin, ptx, bank_conflict, sync_map\n" ); |
125 | } |
126 | options_view = (end_pos != c10::string_view::npos) |
127 | ? options_view.substr(end_pos + 1) |
128 | : "" ; |
129 | } |
130 | } |
131 | |
132 | return options_map; |
133 | } |
134 | |
135 | auto parseDisableOptions() { |
136 | std::unordered_map<DisableOption, bool> options_map = { |
137 | {DisableOption::ArchCheck, false}, |
138 | {DisableOption::CompileToSass, false}, |
139 | {DisableOption::Fallback, false}, |
140 | {DisableOption::Fma, false}, |
141 | {DisableOption::IndexHoist, false}, |
142 | {DisableOption::Nvtx, false}, |
143 | {DisableOption::PredicateElimination, false}}; |
144 | |
145 | if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DISABLE" )) { |
146 | c10::string_view options_view(dump_options); |
147 | while (!options_view.empty()) { |
148 | const auto end_pos = options_view.find_first_of(','); |
149 | const auto token = options_view.substr(0, end_pos); |
150 | if (token == "arch_check" ) { |
151 | options_map[DisableOption::ArchCheck] = true; |
152 | } else if (token == "compile_to_sass" ) { |
153 | options_map[DisableOption::CompileToSass] = true; |
154 | } else if (token == "fallback" ) { |
155 | options_map[DisableOption::Fallback] = true; |
156 | } else if (token == "fma" ) { |
157 | TORCH_WARN( |
158 | "fmad is disabled for nvrtc, which could negatively affect performance. Try removing `fma` from env variable PYTORCH_NVFUSER_DISABLE for optimal performance." ); |
159 | options_map[DisableOption::Fma] = true; |
160 | } else if (token == "index_hoist" ) { |
161 | options_map[DisableOption::IndexHoist] = true; |
162 | } else if (token == "nvtx" ) { |
163 | options_map[DisableOption::Nvtx] = true; |
164 | } else if (token == "predicate_elimination" ) { |
165 | options_map[DisableOption::PredicateElimination] = true; |
166 | } else { |
167 | TORCH_CHECK( |
168 | false, |
169 | "Invalid disable option: '" , |
170 | token, |
171 | "'\nAvailable options:\n" , |
172 | "\tarch_check, fallback, fma, index_hoist, nvtx, predicate_elimination\n" ); |
173 | } |
174 | options_view = (end_pos != c10::string_view::npos) |
175 | ? options_view.substr(end_pos + 1) |
176 | : "" ; |
177 | } |
178 | } |
179 | |
180 | return options_map; |
181 | } |
182 | |
183 | auto parseEnableOptions() { |
184 | std::unordered_map<EnableOption, bool> options_map = { |
185 | {EnableOption::Complex, false}, |
186 | {EnableOption::KernelProfile, false}, |
187 | {EnableOption::LinearDecomposition, false}, |
188 | {EnableOption::ConvDecomposition, false}}; |
189 | |
190 | if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE" )) { |
191 | c10::string_view options_view(dump_options); |
192 | while (!options_view.empty()) { |
193 | const auto end_pos = options_view.find_first_of(','); |
194 | const auto token = options_view.substr(0, end_pos); |
195 | if (token == "complex" ) { |
196 | options_map[EnableOption::Complex] = true; |
197 | } else if (token == "kernel_profile" ) { |
198 | options_map[EnableOption::KernelProfile] = true; |
199 | } else if (token == "linear_decomposition" ) { |
200 | options_map[EnableOption::LinearDecomposition] = true; |
201 | } else if (token == "conv_decomposition" ) { |
202 | options_map[EnableOption::ConvDecomposition] = true; |
203 | } else { |
204 | TORCH_CHECK( |
205 | false, |
206 | "Invalid enable option: '" , |
207 | token, |
208 | "'\nAvailable options:\n" , |
209 | "\tcomplex, kernel_profile, linear_decomposition," , |
210 | "conv_decomposition" ); |
211 | } |
212 | options_view = (end_pos != c10::string_view::npos) |
213 | ? options_view.substr(end_pos + 1) |
214 | : "" ; |
215 | } |
216 | } |
217 | |
218 | return options_map; |
219 | } |
220 | |
221 | } // namespace |
222 | |
223 | #pragma clang diagnostic push |
224 | #pragma clang diagnostic ignored "-Wunused-function" |
225 | void debugPrint(const c10::TensorTypePtr& type) { |
226 | std::stringstream sizes_s; |
227 | if (auto sizes = type->symbolic_sizes().sizes()) { |
228 | for (const auto& shape_symbol : *sizes) { |
229 | if (shape_symbol.is_static()) { |
230 | sizes_s << shape_symbol.static_size() << ", " ; |
231 | } else { |
232 | sizes_s << "s(" << *reinterpret_cast<const int64_t*>(&shape_symbol) |
233 | << "), " ; |
234 | } |
235 | } |
236 | } else { |
237 | sizes_s << "no size available" ; |
238 | } |
239 | std::cout << "sizes:" << sizes_s.str() << std::endl; |
240 | if (const auto& stride_properties = type->stride_properties().sizes()) { |
241 | std::stringstream stride_s; |
242 | std::stringstream index_s; |
243 | std::stringstream contig_s; |
244 | |
245 | for (const auto& stride_property : *stride_properties) { |
246 | if (stride_property.has_value() && stride_property->stride_.has_value()) { |
247 | stride_s << *stride_property->stride_ << ", " ; |
248 | } else { |
249 | stride_s << "?, " ; |
250 | } |
251 | if (stride_property.has_value() && |
252 | stride_property->stride_index_.has_value()) { |
253 | index_s << *stride_property->stride_index_ << ", " ; |
254 | } else { |
255 | index_s << "?, " ; |
256 | } |
257 | if (stride_property.has_value() && |
258 | stride_property->contiguous_.has_value()) { |
259 | contig_s << *stride_property->contiguous_ << ", " ; |
260 | } else { |
261 | contig_s << "?, " ; |
262 | } |
263 | } |
264 | std::cout << "stride: " << stride_s.str() << std::endl; |
265 | std::cout << "stride index: " << index_s.str() << std::endl; |
266 | std::cout << "contiguous: " << contig_s.str() << std::endl; |
267 | } else { |
268 | std::cout << "no stride properties available" << std::endl; |
269 | } |
270 | } |
271 | #pragma clang diagnostic pop |
272 | |
273 | bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) { |
274 | return tensor_type && tensor_type->dim().has_value() && |
275 | tensor_type->dim().value() == 0; |
276 | } |
277 | |
278 | bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type) { |
279 | auto opt_sizes = tensor_type->sizes().concrete_sizes(); |
280 | if (opt_sizes.has_value()) { |
281 | auto sizes = opt_sizes.value(); |
282 | for (const auto& size : sizes) { |
283 | if (size == 0) { |
284 | return true; |
285 | } |
286 | } |
287 | } |
288 | return false; |
289 | } |
290 | |
291 | bool is_cpu_scalar(const at::Tensor& tensor) { |
292 | return tensor.device().is_cpu() && tensor.numel() == 1 && tensor.dim() == 0; |
293 | } |
294 | |
295 | bool is_cpu_scalar(const c10::TensorType& tensor_type) { |
296 | auto opt_device = tensor_type.device(); |
297 | auto opt_dim = tensor_type.dim(); |
298 | auto opt_numel = tensor_type.numel(); |
299 | return opt_device.has_value() && opt_device->is_cpu() && |
300 | opt_dim.has_value() && opt_numel.has_value() && opt_dim.value() == 0 && |
301 | opt_numel.value() == 1; |
302 | } |
303 | |
304 | // Check device of TensorType in all inputs ensure all tensors are on cuda |
305 | // devices. |
306 | // return common device index (or -1 if device differs). |
307 | int getCommonDeviceCUDA(const at::ArrayRef<IValue>& inputs) { |
308 | int index = -1; |
309 | for (const auto& input : inputs) { |
310 | if (!input.isTensor()) { |
311 | continue; |
312 | } |
313 | const auto& device = input.toTensor().device(); |
314 | // skip cpu scalar tensor as they'll be promoted to scalar later |
315 | if (device.is_cpu() && is_cpu_scalar(input.toTensor())) { |
316 | continue; |
317 | } |
318 | TORCH_CHECK(device.is_cuda(), "nvfuser only supports cuda device" ); |
319 | auto cur_index = device.index(); |
320 | if (index != -1 && index != cur_index) { |
321 | return -1; |
322 | } |
323 | index = (int)cur_index; // NOLINT |
324 | } |
325 | return index; |
326 | } |
327 | |
328 | KernelIndexMode collectIndexMode(const at::ArrayRef<at::IValue>& inputs) { |
329 | // Save 1 more bit besides the sign bit to be conservative |
330 | constexpr int64_t most_positive_int32_index = |
331 | std::numeric_limits<int>::max() / 2; |
332 | constexpr int64_t most_negative_int32_index = |
333 | std::numeric_limits<int>::min() / 2; |
334 | |
335 | // Check all runtime inputs, and if any one of |
336 | // the input's index exceeds max_int32 will |
337 | // fall back to int64 indexing |
338 | for (auto ivalue_input : inputs) { |
339 | if (ivalue_input.isTensor()) { |
340 | auto tensor_input = ivalue_input.toTensor(); |
341 | int64_t tensor_most_positive_index = 0; |
342 | int64_t tensor_most_negative_index = 0; |
343 | for (auto dim_i = 0; dim_i < tensor_input.ndimension(); dim_i++) { |
344 | // Ignore broadcast dimensions |
345 | if (tensor_input.size(dim_i) > 1) { |
346 | // accumulate based on the sign of stride |
347 | if (tensor_input.stride(dim_i) > 0) { |
348 | // Acuumulate positive stride |
349 | tensor_most_positive_index += |
350 | (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i); |
351 | } else { |
352 | // Acuumulate negative stride |
353 | tensor_most_negative_index += |
354 | (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i); |
355 | } |
356 | } |
357 | } |
358 | |
359 | // Fall back to int64 if it can be either too positive |
360 | // or too negative. |
361 | if (tensor_most_positive_index > most_positive_int32_index || |
362 | tensor_most_negative_index < most_negative_int32_index) { |
363 | return KernelIndexMode::INT64; |
364 | } |
365 | } |
366 | } |
367 | // return index mode as int32 |
368 | return KernelIndexMode::INT32; |
369 | } |
370 | |
371 | bool isDebugDumpEnabled(DebugDumpOption option) { |
372 | const static auto dump_options = parseDebugDumpOptions(); |
373 | return dump_options.at(option); |
374 | } |
375 | |
376 | bool isOptionDisabled(DisableOption option) { |
377 | const static auto options = parseDisableOptions(); |
378 | return options.at(option); |
379 | } |
380 | |
381 | bool isOptionEnabled(EnableOption option) { |
382 | const static auto options = parseEnableOptions(); |
383 | return options.at(option); |
384 | } |
385 | |
386 | bool useFallback() { |
387 | // Keep this env var for compatibility |
388 | const char* disable_fb_env = getenv("PYTORCH_NVFUSER_DISABLE_FALLBACK" ); |
389 | bool fallback_disabled = disable_fb_env ? atoi(disable_fb_env) : false; |
390 | fallback_disabled = |
391 | fallback_disabled || isOptionDisabled(DisableOption::Fallback); |
392 | |
393 | return !fallback_disabled; |
394 | } |
395 | |
396 | std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type) { |
397 | TORCH_INTERNAL_ASSERT(tensor_type != nullptr, "Input must be a Tensor." ); |
398 | auto optional_sizes = tensor_type->sizes().concrete_sizes(); |
399 | TORCH_INTERNAL_ASSERT( |
400 | optional_sizes.has_value(), "Missing size information for the tensor." ); |
401 | return optional_sizes.value(); |
402 | } |
403 | |
404 | } // namespace cuda |
405 | } // namespace fuser |
406 | } // namespace jit |
407 | } // namespace torch |
408 | |