#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAException.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>

#define C10_CUDA_CHECK_WO_DSA(EXPR)                                 \
  do {                                                              \
    const cudaError_t __err = EXPR;                                 \
    c10::cuda::c10_cuda_check_implementation(                       \
        static_cast<int32_t>(__err),                                \
        __FILE__,                                                   \
        __func__, /* Line number data type not well-defined between \
                     compilers, so we perform an explicit cast */   \
        static_cast<uint32_t>(__LINE__),                            \
        false);                                                     \
  } while (0)
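
// Usage sketch for the macro above, mirroring how it is used later in this
// file: it forwards the CUDA error code to c10_cuda_check_implementation with
// the trailing argument set to `false`, which (as the WO_DSA name suggests)
// keeps the device-side-assertion machinery out of the error path and so
// avoids re-entering CUDAKernelLaunchRegistry while it is still initializing.
//
//   int device = -1;
//   C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device));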

namespace c10 {
namespace cuda {

namespace {

#ifdef TORCH_USE_CUDA_DSA
/// Get current device id
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_id() {
  int device = -1;
  C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device));
  return device;
}

/// Get a device's compute capability - note that this dangerously assumes
/// that if one CUDA GPU supports device-side assertions they all do. This is
/// probably fine since the latest CUDA GPU that doesn't support UVM is the
/// K80 released 2014-11-17. Mixing that GPU with a newer one is likely to be
/// rare enough that this simplification is acceptable.
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_compute_capability(const int device_num) {
  int compute_capability = -1;
  C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute(
      &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num));
  return compute_capability;
}
#endif

/// Get the number of CUDA devices
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_count() {
  int device_count = -1;
  C10_CUDA_CHECK_WO_DSA(cudaGetDeviceCount(&device_count));
  return device_count;
}

bool dsa_check_if_all_devices_support_managed_memory() {
// It looks as though this'll work best on CUDA GPUs with Pascal
// architectures or newer, per
// https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
#ifdef TORCH_USE_CUDA_DSA
  for (const auto i : c10::irange(dsa_get_device_count())) {
    if (dsa_get_device_compute_capability(i) < 6) {
      return false;
    }
  }
  return true;
#else
  return false;
#endif
}

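/// Returns true iff the given environment variable is set to a value other
/// than "0" (e.g. `PYTORCH_USE_CUDA_DSA=1` enables a flag, while `=0` or
/// leaving the variable unset disables it).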
bool env_flag_set(const char* env_var_name) {
  const char* const env_string = std::getenv(env_var_name);
  return (env_string == nullptr) ? false : std::strcmp(env_string, "0");
}

/// Deleter for UVM/managed memory pointers
void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) {
  // Ignore error in destructor
  if (uvm_assertions_ptr) {
    C10_CUDA_IGNORE_ERROR(cudaFree(uvm_assertions_ptr));
  }
}

} // namespace

/// Check that kernels ran correctly by checking the message buffer. BLOCKING.
std::string c10_retrieve_device_side_assertion_info() {
#ifdef TORCH_USE_CUDA_DSA
  const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref();
  if (!launch_registry.enabled_at_runtime) {
    return "Device-side assertion tracking was not enabled by user.";
  } else if (!launch_registry.do_all_devices_support_managed_memory) {
    return "Device-side assertions disabled because not all devices support managed memory.";
  }

  // Hack that saves a lot of challenging sync logic.
  // The GPU increments the number of errors it's observed and the CPU can see
  // that happening immediately, which means we can make it here before the
  // GPU is done writing information about those errors to memory.
  // A short pause gives it time to finish. Since something's gone wrong, this
  // pause shouldn't affect perf.
  std::this_thread::sleep_for(std::chrono::seconds(1));

  // The snapshot causes a brief block. That's okay because this function only
  // executes if something's gone wrong such that speed is no longer a priority.
  const auto launch_data = launch_registry.snapshot();
  const auto& assertion_data = launch_data.first;
  const auto& launch_infos = launch_data.second;

  std::stringstream oss;

  oss << "Looking for device-side assertion failure information...\n";

  // Loop over each device that could be managed by the process
  for (const auto device_num : c10::irange(assertion_data.size())) {
    const auto& assertion_data_for_device = assertion_data.at(device_num);

    // Did anything fail?
    const auto failures_found = std::min(
        assertion_data_for_device.assertion_count,
        C10_CUDA_DSA_ASSERTION_COUNT);
    if (failures_found == 0) {
      continue;
    }

    // Something failed, let's talk about that
    oss << failures_found
        << " CUDA device-side assertion failures were found on GPU #"
        << device_num << "!" << std::endl;
    if (assertion_data_for_device.assertion_count >
        C10_CUDA_DSA_ASSERTION_COUNT) {
      oss << "But at least " << assertion_data_for_device.assertion_count
          << " assertion failures occurred on the device" << std::endl;
      oss << "Adjust `C10_CUDA_DSA_ASSERTION_COUNT` if you need more assertion failure info"
          << std::endl;
    }

    for (const auto i : c10::irange(failures_found)) {
      const auto& self = assertion_data_for_device.assertions[i];
      const auto& launch_info = launch_infos[self.caller % launch_infos.size()];
      oss << "Assertion failure " << i << std::endl;
      oss << "  GPU assertion failure message = " << self.assertion_msg
          << std::endl;
      oss << "  File containing assertion = " << self.filename << ":"
          << self.line_number << std::endl;
      oss << "  Device function containing assertion = " << self.function_name
          << std::endl;
      oss << "  Thread ID that failed assertion = [" << self.thread_id[0] << ","
          << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl;
      oss << "  Block ID that failed assertion = [" << self.block_id[0] << ","
          << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl;
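      // Launch records live in a fixed-size circular buffer indexed by
      // generation number (see CUDAKernelLaunchRegistry::insert below), so the
      // slot at `self.caller % launch_infos.size()` still describes the
      // offending launch only if its stored generation number matches
      // `self.caller`; otherwise the buffer has wrapped and the launch info
      // has been overwritten.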
      if (launch_info.generation_number == self.caller) {
        oss << "  File containing kernel launch = "
            << launch_info.launch_filename << ":" << launch_info.launch_linenum
            << std::endl;
        oss << "  Function containing kernel launch = "
            << launch_info.launch_function << std::endl;
        oss << "  Name of kernel launched that led to failure = "
            << launch_info.kernel_name << std::endl;
        oss << "  Device that launched kernel = " << launch_info.device
            << std::endl;
        oss << "  Stream kernel was launched on = " << launch_info.stream
            << std::endl;
        oss << "  Backtrace of kernel launch site = ";
        if (launch_registry.gather_launch_stacktrace) {
          oss << "\n" << launch_info.launch_stacktrace << std::endl;
        } else {
          oss << "Launch stacktracing disabled." << std::endl;
        }
      } else {
        oss << "  CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_kernel_launches`."
            << std::endl;
      }
    }
  }
  return oss.str();
#else
  return "Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n";
#endif
}

CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry()
    : do_all_devices_support_managed_memory(
          dsa_check_if_all_devices_support_managed_memory()),
      gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()),
      enabled_at_runtime(check_env_for_dsa_enabled()) {
  for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) {
    uvm_assertions.emplace_back(nullptr, uvm_deleter);
  }
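  // Note: the buffers themselves are not allocated here; each device's
  // DeviceAssertionsData block is created lazily (with cudaMallocManaged) the
  // first time get_uvm_assertions_ptr_for_current_device() runs on that
  // device.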

  kernel_launches.resize(max_kernel_launches);
}

bool CUDAKernelLaunchRegistry::check_env_for_enable_launch_stacktracing()
    const {
  return env_flag_set("PYTORCH_CUDA_DSA_STACKTRACING");
}

bool CUDAKernelLaunchRegistry::check_env_for_dsa_enabled() const {
  return env_flag_set("PYTORCH_USE_CUDA_DSA");
}

uint32_t CUDAKernelLaunchRegistry::insert(
    const char* launch_filename,
    const char* launch_function,
    const uint32_t launch_linenum,
    const char* kernel_name,
    const int32_t stream_id) {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return 0;
  }

  const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : "";

  const std::lock_guard<std::mutex> lock(read_write_mutex);

  const auto my_gen_number = generation_number++;
  // If stack-trace gathering is enabled, the backtrace captured above is
  // stored with the launch record so the failure report can point at the
  // launch site.
  kernel_launches[my_gen_number % max_kernel_launches] = {
      launch_filename,
      launch_function,
      launch_linenum,
      backtrace,
      kernel_name,
      dsa_get_device_id(),
      stream_id,
      my_gen_number};
  return my_gen_number;
#else
  return 0;
#endif
}
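
// A rough usage sketch for the registry (hypothetical; callers normally go
// through the kernel-launch/DSA macros rather than calling these methods
// directly):
//
//   auto& reg = c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();
//   const uint32_t gen =
//       reg.insert(__FILE__, __func__, __LINE__, "my_kernel", /*stream_id=*/0);
//   // Launch the kernel, passing it
//   // reg.get_uvm_assertions_ptr_for_current_device() and `gen` so that any
//   // device-side assertion failure can be attributed to this launch.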

std::pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
CUDAKernelLaunchRegistry::snapshot() const {
  // This is likely to be the longest-lasting hold on the mutex, but
  // we only expect it to be called in cases where we're already failing
  // and speed is no longer important
  const std::lock_guard<std::mutex> lock(read_write_mutex);

  std::vector<DeviceAssertionsData> device_assertions_data;
  for (const auto& x : uvm_assertions) {
    if (x) {
      device_assertions_data.push_back(*x);
    } else {
      device_assertions_data.emplace_back();
    }
  }

  return std::make_pair(device_assertions_data, kernel_launches);
}

DeviceAssertionsData* CUDAKernelLaunchRegistry::
    get_uvm_assertions_ptr_for_current_device() {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return nullptr;
  }

  const auto device_num = dsa_get_device_id();

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This is a lock-free quick-return path.
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // Need a lock here so there's no race condition on creating the new device
  // assertions buffer
  const std::lock_guard<std::mutex> lock(gpu_alloc_mutex);

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This locked path ensures that the device memory is
  // allocated only once
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // Otherwise, set up the GPU to be able to use the device-side assertion
  // system
  DeviceAssertionsData* uvm_assertions_ptr = nullptr;

  C10_CUDA_CHECK_WO_DSA(
      cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData)));

  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetPreferredLocation,
      cudaCpuDeviceId));

  // GPU will establish direct mapping of data in CPU memory, no page faults
  // will be generated
  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetAccessedBy,
      cudaCpuDeviceId));

  // Initialize the memory from the CPU; otherwise, pages may have to be
  // created on demand. The UVM documentation suggests that the first access
  // may not honor the preferred location, which would be a problem because we
  // want this memory on the host so we can read it after an assertion fires.
  // Initializing it on the CPU helps ensure the pages end up there.
  *uvm_assertions_ptr = DeviceAssertionsData();

  // Ownership and lifetime management of `uvm_assertions_ptr` now passes to
  // the uvm_assertions unique_ptr vector
  uvm_assertions.at(device_num).reset(uvm_assertions_ptr);

  return uvm_assertions_ptr;
#else
  return nullptr;
#endif
}

CUDAKernelLaunchRegistry& CUDAKernelLaunchRegistry::get_singleton_ref() {
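  // Function-local static: constructed on first use, and under C++11 "magic
  // statics" its initialization is thread-safe.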
  static CUDAKernelLaunchRegistry launch_registry;
  return launch_registry;
}

bool CUDAKernelLaunchRegistry::has_failed() const {
  for (const auto& x : uvm_assertions) {
    if (x && x->assertion_count > 0) {
      return true;
    }
  }
  return false;
}

} // namespace cuda
} // namespace c10