1 | #pragma once |
2 | |
3 | #include <c10/cuda/CUDADeviceAssertionHost.h> |
4 | #include <c10/cuda/CUDAMacros.h> |
5 | #include <c10/cuda/CUDAMiscFunctions.h> |
6 | #include <c10/macros/Macros.h> |
7 | #include <c10/util/Exception.h> |
8 | #include <c10/util/irange.h> |
9 | #include <cuda.h> |
10 | |
11 | // Note [CHECK macro] |
12 | // ~~~~~~~~~~~~~~~~~~ |
13 | // This is a macro so that AT_ERROR can get accurate __LINE__ |
14 | // and __FILE__ information. We could split this into a short |
15 | // macro and a function implementation if we pass along __LINE__ |
16 | // and __FILE__, but no one has found this worth doing. |
17 | |
18 | // Used to denote errors from CUDA framework. |
19 | // This needs to be declared here instead util/Exception.h for proper conversion |
20 | // during hipify. |
21 | namespace c10 { |
22 | class C10_CUDA_API CUDAError : public c10::Error { |
23 | using Error::Error; |
24 | }; |
25 | } // namespace c10 |
26 | |
27 | #define C10_CUDA_CHECK(EXPR) \ |
28 | do { \ |
29 | const cudaError_t __err = EXPR; \ |
30 | c10::cuda::c10_cuda_check_implementation( \ |
31 | static_cast<int32_t>(__err), \ |
32 | __FILE__, \ |
33 | __func__, /* Line number data type not well-defined between \ |
34 | compilers, so we perform an explicit cast */ \ |
35 | static_cast<uint32_t>(__LINE__), \ |
36 | true); \ |
37 | } while (0) |
38 | |
39 | #define C10_CUDA_CHECK_WARN(EXPR) \ |
40 | do { \ |
41 | const cudaError_t __err = EXPR; \ |
42 | if (C10_UNLIKELY(__err != cudaSuccess)) { \ |
43 | auto error_unused C10_UNUSED = cudaGetLastError(); \ |
44 | (void)error_unused; \ |
45 | TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \ |
46 | } \ |
47 | } while (0) |
48 | |
49 | // Indicates that a CUDA error is handled in a non-standard way |
50 | #define C10_CUDA_ERROR_HANDLED(EXPR) EXPR |
51 | |
52 | // Intentionally ignore a CUDA error |
53 | #define C10_CUDA_IGNORE_ERROR(EXPR) \ |
54 | do { \ |
55 | const cudaError_t __err = EXPR; \ |
56 | if (C10_UNLIKELY(__err != cudaSuccess)) { \ |
57 | cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ |
58 | (void)error_unused; \ |
59 | } \ |
60 | } while (0) |
61 | |
62 | // Clear the last CUDA error |
63 | #define C10_CUDA_CLEAR_ERROR() \ |
64 | do { \ |
65 | cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ |
66 | (void)error_unused; \ |
67 | } while (0) |
68 | |
69 | // This should be used directly after every kernel launch to ensure |
70 | // the launch happened correctly and provide an early, close-to-source |
71 | // diagnostic if it didn't. |
72 | #define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError()) |
73 | |
74 | /// Launches a CUDA kernel appending to it all the information need to handle |
75 | /// device-side assertion failures. Checks that the launch was successful. |
76 | #define TORCH_DSA_KERNEL_LAUNCH( \ |
77 | kernel, blocks, threads, shared_mem, stream, ...) \ |
78 | do { \ |
79 | auto& launch_registry = \ |
80 | c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref(); \ |
81 | kernel<<<blocks, threads, shared_mem, stream>>>( \ |
82 | __VA_ARGS__, \ |
83 | launch_registry.get_uvm_assertions_ptr_for_current_device(), \ |
84 | launch_registry.insert( \ |
85 | __FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \ |
86 | C10_CUDA_KERNEL_LAUNCH_CHECK(); \ |
87 | } while (0) |
88 | |
89 | namespace c10 { |
90 | namespace cuda { |
91 | |
92 | /// In the event of a CUDA failure, formats a nice error message about that |
93 | /// failure and also checks for device-side assertion failures |
94 | C10_CUDA_API void c10_cuda_check_implementation( |
95 | const int32_t err, |
96 | const char* filename, |
97 | const char* function_name, |
98 | const int line_number, |
99 | const bool include_device_assertions); |
100 | |
101 | } // namespace cuda |
102 | } // namespace c10 |
103 | |