1 | #include <c10/cuda/CUDADeviceAssertionHost.h> |
2 | #include <c10/cuda/CUDAException.h> |
3 | #include <c10/util/Backtrace.h> |
4 | #include <c10/util/Exception.h> |
5 | #include <c10/util/irange.h> |
6 | #include <cuda_runtime.h> |
7 | |
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <utility>
#include <vector>
15 | |
16 | #define C10_CUDA_CHECK_WO_DSA(EXPR) \ |
17 | do { \ |
18 | const cudaError_t __err = EXPR; \ |
19 | c10::cuda::c10_cuda_check_implementation( \ |
20 | static_cast<int32_t>(__err), \ |
21 | __FILE__, \ |
22 | __func__, /* Line number data type not well-defined between \ |
23 | compilers, so we perform an explicit cast */ \ |
24 | static_cast<uint32_t>(__LINE__), \ |
25 | false); \ |
26 | } while (0) |
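
// Example (mirroring the call sites below): check a raw CUDA runtime call
// without going through the device-side assertion machinery:
//
//   int device = -1;
//   C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device));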
27 | |
28 | namespace c10 { |
29 | namespace cuda { |
30 | |
31 | namespace { |
32 | |
33 | #ifdef TORCH_USE_CUDA_DSA |
34 | /// Get current device id |
35 | /// We need our own implementation of this function to prevent |
36 | /// an infinite initialization loop for CUDAKernelLaunchRegistry |
37 | int dsa_get_device_id() { |
38 | int device = -1; |
39 | C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device)); |
40 | return device; |
41 | } |
42 | |
/// Get a device's major compute capability - note that this dangerously
/// assumes that if one CUDA GPU supports device-side assertions they all do.
/// This is probably fine since the latest CUDA GPU that doesn't support UVM
/// is the K80, released 2014-11-17. Mixing that GPU with a newer one is
/// likely to be rare enough that defensive per-device handling isn't worth
/// the complexity.
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
50 | int dsa_get_device_compute_capability(const int device_num) { |
51 | int compute_capability = -1; |
52 | C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute( |
53 | &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num)); |
54 | return compute_capability; |
55 | } |
56 | #endif |
57 | |
58 | /// Get the number of CUDA devices |
59 | /// We need our own implementation of this function to prevent |
60 | /// an infinite initialization loop for CUDAKernelLaunchRegistry |
61 | int dsa_get_device_count() { |
62 | int device_count = -1; |
63 | C10_CUDA_CHECK_WO_DSA(cudaGetDeviceCount(&device_count)); |
64 | return device_count; |
65 | } |
66 | |
67 | bool dsa_check_if_all_devices_support_managed_memory() { |
68 | // It looks as though this'll work best on CUDA GPUs with Pascal |
69 | // architectures or newer, per |
70 | // https://developer.nvidia.com/blog/unified-memory-cuda-beginners/ |
71 | #ifdef TORCH_USE_CUDA_DSA |
72 | for (const auto i : c10::irange(dsa_get_device_count())) { |
73 | if (dsa_get_device_compute_capability(i) < 6) { |
74 | return false; |
75 | } |
76 | } |
77 | return true; |
78 | #else |
79 | return false; |
80 | #endif |
81 | } |
82 | |
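/// Returns true iff the given environment variable is set to any value other
/// than "0"; e.g. `PYTORCH_USE_CUDA_DSA=1` enables a flag, while
/// `PYTORCH_USE_CUDA_DSA=0` (or leaving it unset) disables it.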
bool env_flag_set(const char* env_var_name) {
  const char* const env_string = std::getenv(env_var_name);
  return (env_string != nullptr) && (std::strcmp(env_string, "0") != 0);
}
87 | |
88 | /// Deleter for UVM/managed memory pointers |
89 | void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) { |
90 | // Ignore error in destructor |
91 | if (uvm_assertions_ptr) { |
92 | C10_CUDA_IGNORE_ERROR(cudaFree(uvm_assertions_ptr)); |
93 | } |
94 | } |
95 | |
96 | } // namespace |
97 | |
/// Check whether kernels ran correctly by inspecting the message buffer. BLOCKING.
99 | std::string c10_retrieve_device_side_assertion_info() { |
100 | #ifdef TORCH_USE_CUDA_DSA |
101 | const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref(); |
  if (!launch_registry.enabled_at_runtime) {
    return "Device-side assertion tracking was not enabled by user.";
  } else if (!launch_registry.do_all_devices_support_managed_memory) {
    return "Device-side assertions disabled because not all devices support managed memory.";
  }
107 | |
  // Hack that saves a lot of challenging sync logic.
  // The GPU increments the number of errors it's observed and the CPU can see
  // that happening immediately, which means we can make it here before the
  // GPU is done writing information about those errors to memory.
  // A short pause gives it time to finish. Since something's gone wrong, this
  // pause shouldn't affect perf.
114 | std::this_thread::sleep_for(std::chrono::seconds(1)); |
115 | |
116 | // The snapshot causes a brief block. That's okay because this function only |
117 | // executes if something's gone wrong such that speed is no longer a priority. |
118 | const auto launch_data = launch_registry.snapshot(); |
119 | const auto& assertion_data = launch_data.first; |
120 | const auto& launch_infos = launch_data.second; |
121 | |
122 | std::stringstream oss; |
123 | |
  oss << "Looking for device-side assertion failure information...\n";
125 | |
126 | // Loop over each device that could be managed by the process |
127 | for (const auto device_num : c10::irange(assertion_data.size())) { |
128 | const auto& assertion_data_for_device = assertion_data.at(device_num); |
129 | |
130 | // Did anything fail? |
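    // The device-side counter keeps incrementing even after the fixed-size
    // assertion buffer is full, so clamp to the number actually recorded.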
131 | const auto failures_found = std::min( |
132 | assertion_data_for_device.assertion_count, |
133 | C10_CUDA_DSA_ASSERTION_COUNT); |
134 | if (failures_found == 0) { |
135 | continue; |
136 | } |
137 | |
138 | // Something failed, let's talk about that |
139 | oss << failures_found |
140 | << " CUDA device-side assertion failures were found on GPU #" |
141 | << device_num << "!" << std::endl; |
142 | if (assertion_data_for_device.assertion_count > |
143 | C10_CUDA_DSA_ASSERTION_COUNT) { |
144 | oss << "But at least " << assertion_data_for_device.assertion_count |
145 | << " assertion failures occurred on the device" << std::endl; |
146 | oss << "Adjust `C10_CUDA_DSA_ASSERTION_COUNT` if you need more assertion failure info" |
147 | << std::endl; |
148 | } |
149 | |
150 | for (const auto i : c10::irange(failures_found)) { |
151 | const auto& self = assertion_data_for_device.assertions[i]; |
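      // `caller` is the generation number of the kernel launch that tripped
      // this assertion; modulo maps it into the circular launch-info buffer.
      // Whether that slot still describes this launch is verified below by
      // comparing generation numbers.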
152 | const auto& launch_info = launch_infos[self.caller % launch_infos.size()]; |
153 | oss << "Assertion failure " << i << std::endl; |
154 | oss << " GPU assertion failure message = " << self.assertion_msg |
155 | << std::endl; |
156 | oss << " File containing assertion = " << self.filename << ":" |
157 | << self.line_number << std::endl; |
158 | oss << " Device function containing assertion = " << self.function_name |
159 | << std::endl; |
160 | oss << " Thread ID that failed assertion = [" << self.thread_id[0] << "," |
161 | << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl; |
162 | oss << " Block ID that failed assertion = [" << self.block_id[0] << "," |
163 | << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl; |
164 | if (launch_info.generation_number == self.caller) { |
165 | oss << " File containing kernel launch = " |
166 | << launch_info.launch_filename << ":" << launch_info.launch_linenum |
167 | << std::endl; |
168 | oss << " Function containing kernel launch = " |
169 | << launch_info.launch_function << std::endl; |
170 | oss << " Name of kernel launched that led to failure = " |
171 | << launch_info.kernel_name << std::endl; |
172 | oss << " Device that launched kernel = " << launch_info.device |
173 | << std::endl; |
174 | oss << " Stream kernel was launched on = " << launch_info.stream |
175 | << std::endl; |
        oss << "  Backtrace of kernel launch site = ";
        if (launch_registry.gather_launch_stacktrace) {
          oss << "\n" << launch_info.launch_stacktrace << std::endl;
        } else {
          oss << "Launch stacktracing disabled." << std::endl;
        }
182 | } else { |
183 | oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`." |
184 | << std::endl; |
185 | } |
186 | } |
187 | } |
188 | return oss.str(); |
189 | #else |
  return "Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n";
191 | #endif |
192 | } |
193 | |
194 | CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry() |
195 | : do_all_devices_support_managed_memory( |
196 | dsa_check_if_all_devices_support_managed_memory()), |
197 | gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()), |
198 | enabled_at_runtime(check_env_for_dsa_enabled()) { |
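  // One (initially null) managed-memory slot per device; the buffers
  // themselves are allocated lazily by
  // get_uvm_assertions_ptr_for_current_device().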
199 | for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) { |
200 | uvm_assertions.emplace_back(nullptr, uvm_deleter); |
201 | } |
202 | |
203 | kernel_launches.resize(max_kernel_launches); |
204 | } |
205 | |
206 | bool CUDAKernelLaunchRegistry::check_env_for_enable_launch_stacktracing() |
207 | const { |
  return env_flag_set("PYTORCH_CUDA_DSA_STACKTRACING");
209 | } |
210 | |
211 | bool CUDAKernelLaunchRegistry::check_env_for_dsa_enabled() const { |
  return env_flag_set("PYTORCH_USE_CUDA_DSA");
213 | } |
214 | |
215 | uint32_t CUDAKernelLaunchRegistry::insert( |
216 | const char* launch_filename, |
217 | const char* launch_function, |
218 | const uint32_t launch_linenum, |
219 | const char* kernel_name, |
220 | const int32_t stream_id) { |
221 | #ifdef TORCH_USE_CUDA_DSA |
222 | if (!enabled_at_runtime) { |
223 | return 0; |
224 | } |
225 | |
  const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : "";
227 | |
228 | const std::lock_guard<std::mutex> lock(read_write_mutex); |
229 | |
230 | const auto my_gen_number = generation_number++; |
  // The launch info goes into a fixed-size circular buffer; the generation
  // number both identifies this launch and selects its slot. Consumers detect
  // wraparound by comparing a slot's stored generation number against an
  // assertion's `caller` (see c10_retrieve_device_side_assertion_info). A
  // backtrace, if enabled, was already captured above.
233 | kernel_launches[my_gen_number % max_kernel_launches] = { |
234 | launch_filename, |
235 | launch_function, |
236 | launch_linenum, |
237 | backtrace, |
238 | kernel_name, |
239 | dsa_get_device_id(), |
240 | stream_id, |
241 | my_gen_number}; |
242 | return my_gen_number; |
243 | #else |
244 | return 0; |
245 | #endif |
246 | } |
247 | |
248 | std::pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>> |
249 | CUDAKernelLaunchRegistry::snapshot() const { |
250 | // This is likely to be the longest-lasting hold on the mutex, but |
251 | // we only expect it to be called in cases where we're already failing |
252 | // and speed is no longer important |
253 | const std::lock_guard<std::mutex> lock(read_write_mutex); |
254 | |
255 | std::vector<DeviceAssertionsData> device_assertions_data; |
256 | for (const auto& x : uvm_assertions) { |
257 | if (x) { |
258 | device_assertions_data.push_back(*x); |
259 | } else { |
260 | device_assertions_data.emplace_back(); |
261 | } |
262 | } |
263 | |
264 | return std::make_pair(device_assertions_data, kernel_launches); |
265 | } |
266 | |
267 | DeviceAssertionsData* CUDAKernelLaunchRegistry:: |
268 | get_uvm_assertions_ptr_for_current_device() { |
269 | #ifdef TORCH_USE_CUDA_DSA |
270 | if (!enabled_at_runtime) { |
271 | return nullptr; |
272 | } |
273 | |
274 | const auto device_num = dsa_get_device_id(); |
275 | |
276 | // If we've already set up this GPU with managed memory, return a pointer to |
277 | // the managed memory. This is a lock-free quick-return path. |
278 | if (uvm_assertions.at(device_num)) { |
279 | return uvm_assertions.at(device_num).get(); |
280 | } |
281 | |
  // Need a lock here so there's no race condition when creating the new
  // device assertions buffer
284 | const std::lock_guard<std::mutex> lock(gpu_alloc_mutex); |
285 | |
286 | // If we've already set up this GPU with managed memory, return a pointer to |
287 | // the managed memory. This locked path ensures that the device memory is |
288 | // allocated only once |
289 | if (uvm_assertions.at(device_num)) { |
290 | return uvm_assertions.at(device_num).get(); |
291 | } |
292 | |
293 | // Otherwise, set up the GPU to be able to use the device-side assertion |
294 | // system |
295 | DeviceAssertionsData* uvm_assertions_ptr = nullptr; |
296 | |
297 | C10_CUDA_CHECK_WO_DSA( |
298 | cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData))); |
299 | |
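  // Advise the driver to keep the assertion buffer resident on the host so
  // the CPU can still read it after a device-side failure.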
300 | C10_CUDA_CHECK_WO_DSA(cudaMemAdvise( |
301 | uvm_assertions_ptr, |
302 | sizeof(DeviceAssertionsData), |
303 | cudaMemAdviseSetPreferredLocation, |
304 | cudaCpuDeviceId)); |
305 | |
  // The GPU will establish a direct mapping to the data in CPU memory; no
  // page faults will be generated
308 | C10_CUDA_CHECK_WO_DSA(cudaMemAdvise( |
309 | uvm_assertions_ptr, |
310 | sizeof(DeviceAssertionsData), |
311 | cudaMemAdviseSetAccessedBy, |
312 | cudaCpuDeviceId)); |
313 | |
  // Initialize the memory from the CPU; otherwise, pages may have to be
  // created on demand. The UVM documentation seems to indicate that first
  // access may not honor the preferred location, which would be bad, if true,
  // because we want this memory on the host so we can access it
  // post-assertion. Initializing it on the CPU helps ensure that the memory
  // will live there.
319 | *uvm_assertions_ptr = DeviceAssertionsData(); |
320 | |
321 | // Ownership and lifetime management of `uvm_assertions_ptr` now passes to the |
322 | // uvm_assertions unique_ptr vector |
323 | uvm_assertions.at(device_num).reset(uvm_assertions_ptr); |
324 | |
325 | return uvm_assertions_ptr; |
326 | #else |
327 | return nullptr; |
328 | #endif |
329 | } |
330 | |
331 | CUDAKernelLaunchRegistry& CUDAKernelLaunchRegistry::get_singleton_ref() { |
332 | static CUDAKernelLaunchRegistry launch_registry; |
333 | return launch_registry; |
334 | } |
335 | |
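/// True if any device managed by this process has recorded at least one
/// device-side assertion failure. Reads the managed-memory counters directly,
/// without taking a lock.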
336 | bool CUDAKernelLaunchRegistry::has_failed() const { |
337 | for (const auto& x : uvm_assertions) { |
338 | if (x && x->assertion_count > 0) { |
339 | return true; |
340 | } |
341 | } |
342 | return false; |
343 | } |
344 | |
345 | } // namespace cuda |
346 | } // namespace c10 |
347 | |