1 | #include <c10/cuda/CUDADeviceAssertionHost.h> |
2 | #include <c10/cuda/CUDAException.h> |
3 | #include <c10/util/Backtrace.h> |
4 | #include <c10/util/Exception.h> |
5 | #include <c10/util/irange.h> |
6 | #include <cuda_runtime.h> |
7 | |
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <utility>
#include <vector>
15 | |
16 | #define C10_CUDA_CHECK_WO_DSA(EXPR) \ |
17 | do { \ |
18 | const cudaError_t __err = EXPR; \ |
19 | c10::cuda::c10_cuda_check_implementation( \ |
20 | static_cast<int32_t>(__err), \ |
21 | __FILE__, \ |
22 | __func__, /* Line number data type not well-defined between \ |
23 | compilers, so we perform an explicit cast */ \ |
24 | static_cast<uint32_t>(__LINE__), \ |
25 | false); \ |
26 | } while (0) |
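
// Example (mirroring the call sites below): check a raw CUDA runtime call
// without going through the device-side assertion machinery:
//
//   int device = -1;
//   C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device));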
27 | |
28 | namespace c10 { |
29 | namespace cuda { |
30 | |
31 | namespace { |
32 | |
33 | #ifdef TORCH_USE_CUDA_DSA |
34 | /// Get current device id |
35 | /// We need our own implementation of this function to prevent |
36 | /// an infinite initialization loop for CUDAKernelLaunchRegistry |
37 | int dsa_get_device_id() { |
38 | int device = -1; |
39 | C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device)); |
40 | return device; |
41 | } |
42 | |
/// Get a device's major compute capability - note that this dangerously
/// assumes that if one CUDA GPU supports device-side assertions they all do.
/// This is probably fine since the latest CUDA GPU that doesn't support UVM
/// is the K80, released 2014-11-17. Mixing that GPU with a newer one is
/// likely to be rare enough that defensive per-device handling isn't worth
/// the complexity.
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
50 | int dsa_get_device_compute_capability(const int device_num) { |
51 | int compute_capability = -1; |
52 | C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute( |
53 | &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num)); |
54 | return compute_capability; |
55 | } |
56 | #endif |
57 | |
58 | /// Get the number of CUDA devices |
59 | /// We need our own implementation of this function to prevent |
60 | /// an infinite initialization loop for CUDAKernelLaunchRegistry |
61 | int dsa_get_device_count() { |
62 | int device_count = -1; |
63 | C10_CUDA_CHECK_WO_DSA(cudaGetDeviceCount(&device_count)); |
64 | return device_count; |
65 | } |
66 | |
67 | bool dsa_check_if_all_devices_support_managed_memory() { |
68 | // It looks as though this'll work best on CUDA GPUs with Pascal |
69 | // architectures or newer, per |
70 | // https://developer.nvidia.com/blog/unified-memory-cuda-beginners/ |
71 | #ifdef TORCH_USE_CUDA_DSA |
72 | for (const auto i : c10::irange(dsa_get_device_count())) { |
73 | if (dsa_get_device_compute_capability(i) < 6) { |
74 | return false; |
75 | } |
76 | } |
77 | return true; |
78 | #else |
79 | return false; |
80 | #endif |
81 | } |
82 | |
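/// Returns true iff the given environment variable is set to any value other
/// than "0"; e.g. `PYTORCH_USE_CUDA_DSA=1` enables a flag, while
/// `PYTORCH_USE_CUDA_DSA=0` (or leaving it unset) disables it.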
bool env_flag_set(const char* env_var_name) {
  const char* const env_string = std::getenv(env_var_name);
  return (env_string != nullptr) && (std::strcmp(env_string, "0") != 0);
}
87 | |
88 | /// Deleter for UVM/managed memory pointers |
89 | void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) { |
90 | // Ignore error in destructor |
91 | if (uvm_assertions_ptr) { |
92 | C10_CUDA_IGNORE_ERROR(cudaFree(uvm_assertions_ptr)); |
93 | } |
94 | } |
95 | |
96 | } // namespace |
97 | |
/// Check whether kernels ran correctly by inspecting the message buffer. BLOCKING.
99 | std::string c10_retrieve_device_side_assertion_info() { |
100 | #ifdef TORCH_USE_CUDA_DSA |
101 | const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref(); |
  if (!launch_registry.enabled_at_runtime) {
    return "Device-side assertion tracking was not enabled by user.";
  } else if (!launch_registry.do_all_devices_support_managed_memory) {
    return "Device-side assertions disabled because not all devices support managed memory.";
  }
107 | |
  // Hack that saves a lot of challenging sync logic.
  // The GPU increments the number of errors it's observed and the CPU can see
  // that happening immediately, which means we can make it here before the
  // GPU is done writing information about those errors to memory.
  // A short pause gives it time to finish. Since something's gone wrong, this
  // pause shouldn't affect perf.
114 | std::this_thread::sleep_for(std::chrono::seconds(1)); |
115 | |
116 | // The snapshot causes a brief block. That's okay because this function only |
117 | // executes if something's gone wrong such that speed is no longer a priority. |
118 | const auto launch_data = launch_registry.snapshot(); |
119 | const auto& assertion_data = launch_data.first; |
120 | const auto& launch_infos = launch_data.second; |
121 | |
122 | std::stringstream oss; |
123 | |
  oss << "Looking for device-side assertion failure information...\n";
125 | |
126 | // Loop over each device that could be managed by the process |
127 | for (const auto device_num : c10::irange(assertion_data.size())) { |
128 | const auto& assertion_data_for_device = assertion_data.at(device_num); |
129 | |
130 | // Did anything fail? |
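    // The device-side counter keeps incrementing even after the fixed-size
    // assertion buffer is full, so clamp to the number actually recorded.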
131 | const auto failures_found = std::min( |
132 | assertion_data_for_device.assertion_count, |
133 | C10_CUDA_DSA_ASSERTION_COUNT); |
134 | if (failures_found == 0) { |
135 | continue; |
136 | } |
137 | |
138 | // Something failed, let's talk about that |
139 | oss << failures_found |
140 | << " CUDA device-side assertion failures were found on GPU #" |
141 | << device_num << "!" << std::endl; |
142 | if (assertion_data_for_device.assertion_count > |
143 | C10_CUDA_DSA_ASSERTION_COUNT) { |
144 | oss << "But at least " << assertion_data_for_device.assertion_count |
145 | << " assertion failures occurred on the device" << std::endl; |
146 | oss << "Adjust `C10_CUDA_DSA_ASSERTION_COUNT` if you need more assertion failure info" |
147 | << std::endl; |
148 | } |
149 | |
150 | for (const auto i : c10::irange(failures_found)) { |
151 | const auto& self = assertion_data_for_device.assertions[i]; |
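      // `caller` is the generation number of the kernel launch that tripped
      // this assertion; modulo maps it into the circular launch-info buffer.
      // Whether that slot still describes this launch is verified below by
      // comparing generation numbers.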
152 | const auto& launch_info = launch_infos[self.caller % launch_infos.size()]; |
153 | oss << "Assertion failure " << i << std::endl; |
154 | oss << " GPU assertion failure message = " << self.assertion_msg |
155 | << std::endl; |
156 | oss << " File containing assertion = " << self.filename << ":" |
157 | << self.line_number << std::endl; |
158 | oss << " Device function containing assertion = " << self.function_name |
159 | << std::endl; |
160 | oss << " Thread ID that failed assertion = [" << self.thread_id[0] << "," |
161 | << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl; |
162 | oss << " Block ID that failed assertion = [" << self.block_id[0] << "," |
163 | << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl; |
164 | if (launch_info.generation_number == self.caller) { |
165 | oss << " File containing kernel launch = " |
166 | << launch_info.launch_filename << ":" << launch_info.launch_linenum |
167 | << std::endl; |
168 | oss << " Function containing kernel launch = " |
169 | << launch_info.launch_function << std::endl; |
170 | oss << " Name of kernel launched that led to failure = " |
171 | << launch_info.kernel_name << std::endl; |
172 | oss << " Device that launched kernel = " << launch_info.device |
173 | << std::endl; |
174 | oss << " Stream kernel was launched on = " << launch_info.stream |
175 | << std::endl; |
        oss << "  Backtrace of kernel launch site = ";
        if (launch_registry.gather_launch_stacktrace) {
          oss << "\n" << launch_info.launch_stacktrace << std::endl;
        } else {
          oss << "Launch stacktracing disabled." << std::endl;
        }
182 | } else { |
183 | oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`." |
184 | << std::endl; |
185 | } |
186 | } |
187 | } |
188 | return oss.str(); |
189 | #else |
  return "Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n";
191 | #endif |
192 | } |
193 | |
194 | CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry() |
195 | : do_all_devices_support_managed_memory( |
196 | dsa_check_if_all_devices_support_managed_memory()), |
197 | gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()), |
198 | enabled_at_runtime(check_env_for_dsa_enabled()) { |
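  // One (initially null) managed-memory slot per device; the buffers
  // themselves are allocated lazily by
  // get_uvm_assertions_ptr_for_current_device().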
199 | for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) { |
200 | uvm_assertions.emplace_back(nullptr, uvm_deleter); |
201 | } |
202 | |
203 | kernel_launches.resize(max_kernel_launches); |
204 | } |
205 | |
206 | bool CUDAKernelLaunchRegistry::check_env_for_enable_launch_stacktracing() |
207 | const { |
  return env_flag_set("PYTORCH_CUDA_DSA_STACKTRACING");
209 | } |
210 | |
211 | bool CUDAKernelLaunchRegistry::check_env_for_dsa_enabled() const { |
  return env_flag_set("PYTORCH_USE_CUDA_DSA");
213 | } |
214 | |
215 | uint32_t CUDAKernelLaunchRegistry::insert( |
216 | const char* launch_filename, |
217 | const char* launch_function, |
218 | const uint32_t launch_linenum, |
219 | const char* kernel_name, |
220 | const int32_t stream_id) { |
221 | #ifdef TORCH_USE_CUDA_DSA |
222 | if (!enabled_at_runtime) { |
223 | return 0; |
224 | } |
225 | |
  const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : "";
227 | |
228 | const std::lock_guard<std::mutex> lock(read_write_mutex); |
229 | |
230 | const auto my_gen_number = generation_number++; |
  // The launch info goes into a fixed-size circular buffer; the generation
  // number both identifies this launch and selects its slot. Consumers detect
  // wraparound by comparing a slot's stored generation number against an
  // assertion's `caller` (see c10_retrieve_device_side_assertion_info). A
  // backtrace, if enabled, was already captured above.
233 | kernel_launches[my_gen_number % max_kernel_launches] = { |
234 | launch_filename, |
235 | launch_function, |
236 | launch_linenum, |
237 | backtrace, |
238 | kernel_name, |
239 | dsa_get_device_id(), |
240 | stream_id, |
241 | my_gen_number}; |
242 | return my_gen_number; |
243 | #else |
244 | return 0; |
245 | #endif |
246 | } |
247 | |
248 | std::pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>> |
249 | CUDAKernelLaunchRegistry::snapshot() const { |
250 | // This is likely to be the longest-lasting hold on the mutex, but |
251 | // we only expect it to be called in cases where we're already failing |
252 | // and speed is no longer important |
253 | const std::lock_guard<std::mutex> lock(read_write_mutex); |
254 | |
255 | std::vector<DeviceAssertionsData> device_assertions_data; |
256 | for (const auto& x : uvm_assertions) { |
257 | if (x) { |
258 | device_assertions_data.push_back(*x); |
259 | } else { |
260 | device_assertions_data.emplace_back(); |
261 | } |
262 | } |
263 | |
264 | return std::make_pair(device_assertions_data, kernel_launches); |
265 | } |
266 | |
267 | DeviceAssertionsData* CUDAKernelLaunchRegistry:: |
268 | get_uvm_assertions_ptr_for_current_device() { |
269 | #ifdef TORCH_USE_CUDA_DSA |
270 | if (!enabled_at_runtime) { |
271 | return nullptr; |
272 | } |
273 | |
274 | const auto device_num = dsa_get_device_id(); |
275 | |
276 | // If we've already set up this GPU with managed memory, return a pointer to |
277 | // the managed memory. This is a lock-free quick-return path. |
278 | if (uvm_assertions.at(device_num)) { |
279 | return uvm_assertions.at(device_num).get(); |
280 | } |
281 | |
  // Need a lock here so there's no race condition when creating the new
  // device assertions buffer
284 | const std::lock_guard<std::mutex> lock(gpu_alloc_mutex); |
285 | |
286 | // If we've already set up this GPU with managed memory, return a pointer to |
287 | // the managed memory. This locked path ensures that the device memory is |
288 | // allocated only once |
289 | if (uvm_assertions.at(device_num)) { |
290 | return uvm_assertions.at(device_num).get(); |
291 | } |
292 | |
293 | // Otherwise, set up the GPU to be able to use the device-side assertion |
294 | // system |
295 | DeviceAssertionsData* uvm_assertions_ptr = nullptr; |
296 | |
297 | C10_CUDA_CHECK_WO_DSA( |
298 | cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData))); |
299 | |
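  // Advise the driver to keep the assertion buffer resident on the host so
  // the CPU can still read it after a device-side failure.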
300 | C10_CUDA_CHECK_WO_DSA(cudaMemAdvise( |
301 | uvm_assertions_ptr, |
302 | sizeof(DeviceAssertionsData), |
303 | cudaMemAdviseSetPreferredLocation, |
304 | cudaCpuDeviceId)); |
305 | |
  // The GPU will establish a direct mapping to the data in CPU memory; no
  // page faults will be generated
308 | C10_CUDA_CHECK_WO_DSA(cudaMemAdvise( |
309 | uvm_assertions_ptr, |
310 | sizeof(DeviceAssertionsData), |
311 | cudaMemAdviseSetAccessedBy, |
312 | cudaCpuDeviceId)); |
313 | |
  // Initialize the memory from the CPU; otherwise, pages may have to be
  // created on demand. The UVM documentation seems to indicate that first
  // access may not honor the preferred location, which would be bad, if true,
  // because we want this memory on the host so we can access it
  // post-assertion. Initializing it on the CPU helps ensure that the memory
  // will live there.
319 | *uvm_assertions_ptr = DeviceAssertionsData(); |
320 | |
321 | // Ownership and lifetime management of `uvm_assertions_ptr` now passes to the |
322 | // uvm_assertions unique_ptr vector |
323 | uvm_assertions.at(device_num).reset(uvm_assertions_ptr); |
324 | |
325 | return uvm_assertions_ptr; |
326 | #else |
327 | return nullptr; |
328 | #endif |
329 | } |
330 | |
331 | CUDAKernelLaunchRegistry& CUDAKernelLaunchRegistry::get_singleton_ref() { |
332 | static CUDAKernelLaunchRegistry launch_registry; |
333 | return launch_registry; |
334 | } |
335 | |
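/// True if any device managed by this process has recorded at least one
/// device-side assertion failure. Reads the managed-memory counters directly,
/// without taking a lock.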
336 | bool CUDAKernelLaunchRegistry::has_failed() const { |
337 | for (const auto& x : uvm_assertions) { |
338 | if (x && x->assertion_count > 0) { |
339 | return true; |
340 | } |
341 | } |
342 | return false; |
343 | } |
344 | |
345 | } // namespace cuda |
346 | } // namespace c10 |
347 | |