1 | #pragma once |
2 | |
3 | #include <ATen/ATen.h> |
4 | #include <c10/util/Exception.h> |
5 | #include <type.h> |
6 | #include <torch/csrc/jit/ir/ir.h> |
7 | |
8 | namespace torch { |
9 | namespace jit { |
10 | namespace fuser { |
11 | namespace cuda { |
12 | |
//! Debug-print helper for a (possibly incomplete) TensorType.
void debugPrint(const c10::TensorTypePtr& type);

//! Predicates on tensor types (declarations only; semantics implemented in
//! the .cpp). By the names: zero-dim = rank-0 scalar tensor; zero-sized =
//! at least one extent equal to 0 — TODO confirm against the definitions.
bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);

//! Overloads checking whether a tensor (or its type) is a scalar that
//! resides on the CPU rather than on a CUDA device.
bool is_cpu_scalar(const at::Tensor& tensor);
bool is_cpu_scalar(const c10::TensorType& tensor_type);

// TODO: merge these two
// check if input is compatible with 32b index mode
// NOTE(review): the next declaration spells the element type `IValue` while
// its sibling uses `at::IValue` — same type, but unify the qualification.
int getCommonDeviceCUDA(const at::ArrayRef<IValue>& inputs);
KernelIndexMode collectIndexMode(const at::ArrayRef<at::IValue>& inputs);
25 | |
//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  FusionIrPresched, //!< Dump the Fusion IR before it is scheduled
  KernelIr, //!< Dump the compiler Kernel IR
  ComputeAtMap, //!< Dump the computeAt map
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump CUDA strings to file
  DebugInfo, //!< Embed line info and debug info to compiled kernel, and dump
             //!< the full CUDA C++ code
  LaunchParam, //!< Dump the launch parameters of kernel
  FusionSegments, //!< Dump segmented fusion graph
  FusionSegmenterLog, //!< Dump detailed segmenter logging
  FusionArgs, //!< Print the runtime fusion arguments
  KernelArgs, //!< Print the runtime kernel arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print effective
                      //!< bandwidth
  FusionSegmentsDrawing, //!< Dump segmented fusion graph as a drawing
  PrintPtxasLog, //!< Print the ptxas verbose log including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer re-use
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo, //!< Halo information of tensors
  PerfDebugVerbose, //!< When running kernels, print verbose information
                    //!< associated with what's running
  PythonDefinition, //!< Python frontend fusion definition
  PythonFrontendDebug, //!< Python frontend debug information
  TransformPropagator, //!< When running TransformPropagator, print propagation
                       //!< path and replay result
  Cubin, //!< Dump compiled CUBIN
  Ptx, //!< Dump compiled PTX
  BankConflictInfo, //!< Dump shared memory bank conflict info
  SyncMap //!< RAW dependency info
};

//! Returns true if the given dump option was requested through the
//! `PYTORCH_NVFUSER_DUMP` environment variable.
TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
67 | |
//! Types of features to disable
//!
//! These can be set through the `PYTORCH_NVFUSER_DISABLE` environment variable
//!
enum class DisableOption {
  ArchCheck, //!< Disable hardware-specific checks to enable cross arch debug
  CompileToSass, //!< Disable direct compilation to sass so the ptx can be
                 //!< examined
  Fallback, //!< Disable fallback (see useFallback() below)
  Fma, //!< Disable FMA instructions
  IndexHoist, //!< Disable index hoisting
  Nvtx, //!< Disable NVTX instrumentation
  PredicateElimination //!< Disable predicate elimination
};

//! Returns true if the given feature was disabled through the
//! `PYTORCH_NVFUSER_DISABLE` environment variable.
TORCH_CUDA_CU_API bool isOptionDisabled(DisableOption option);
84 | |
//! Types of features to enable
//!
//! These can be set through the `PYTORCH_NVFUSER_ENABLE` environment variable
//!
enum class EnableOption {
  Complex, //!< Enable complex support on python
  KernelProfile, //!< Enable intra-kernel performance profiling
  LinearDecomposition, //!< Enable linear-bias decomposition
  ConvDecomposition, //!< Enable conv-bias decomposition
};

//! Returns true if the given feature was enabled through the
//! `PYTORCH_NVFUSER_ENABLE` environment variable.
TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
97 | |
// Check if fallback path should be used which will dispatch to eagermode if any
// errors are encountered. Helpful for debugging.
// Can be turned off via DisableOption::Fallback (`PYTORCH_NVFUSER_DISABLE`).
bool useFallback();
101 | |
102 | //! Ceil integer division |
103 | constexpr int64_t ceilDiv(int64_t a, int64_t b) { |
104 | return (a + b - 1) / b; |
105 | } |
106 | |
//! Simple mixin for suppressing copy & move operations, ex:
//!
//!  class Foo : public NonCopyable {
//!   ...
//!  };
//!
class NonCopyable {
 public:
  NonCopyable() = default;

  // No copy/move semantics. The move operations are deleted explicitly
  // (previously they were only implicitly suppressed by the deleted copy
  // operations) so the intent is visible and compiler diagnostics name the
  // right special member. Net behavior is unchanged: moves were already
  // non-viable.
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
  NonCopyable(NonCopyable&&) = delete;
  NonCopyable& operator=(NonCopyable&&) = delete;
};
121 | |
//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  // Replacement for static_cast<T*>(ptr): ptr->as<T>()
  // (checked in DEBUG builds)
  //
  // In release builds (NDEBUG defined) this is a plain static_cast with zero
  // runtime cost; in debug builds the downcast is verified via dynamic_cast
  // so an invalid cast fails loudly instead of yielding a bogus pointer.
  template <class T>
  T* as() {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<T*>(this);
#else
    auto downcast_ptr = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  // Const overload of as<T>(): same release/debug behavior as above, but
  // preserves const-ness through the downcast.
  template <class T>
  const T* as() const {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<const T*>(this);
#else
    auto downcast_ptr = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  //! Check if the runtime type is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Instead, use:
  //!
  //!  if (auto t = dynamic_cast<T>(p)) { ... }
  //!
  //! instead of:
  //!
  //!  if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
168 | |
//! Packs a pair of enum values into a single unsigned int so a pair of cases
//! can be dispatched on with one switch statement: t1 occupies the high 16
//! bits, t2 the low 16 bits.
//!
//! \note Assumes both underlying values fit in 16 bits; values >= 2^16 would
//! bleed into the other half and produce colliding keys.
template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  // Named per constant convention; replaces C-style casts with static_cast.
  constexpr unsigned int kWordShift = 16;
  return (static_cast<unsigned int>(t1) << kWordShift) +
      static_cast<unsigned int>(t2);
}
174 | |
//! Extracts the concrete dimension extents from a TensorType.
//! NOTE(review): declaration only — confirm in the .cpp how incomplete or
//! symbolic shapes are handled (error vs. placeholder values).
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);
176 | |
177 | } // namespace cuda |
178 | } // namespace fuser |
179 | } // namespace jit |
180 | } // namespace torch |
181 | |