1 | #include <torch/csrc/distributed/c10d/NCCLUtils.hpp> |
2 | |
3 | #include <c10/util/CallOnce.h> |
4 | |
5 | #ifdef USE_C10D_NCCL |
6 | |
7 | #include <mutex> |
8 | |
9 | namespace c10d { |
10 | |
11 | ncclComm_t NCCLComm::getNcclComm() { |
12 | std::unique_lock<std::mutex> lock(mutex_); |
13 | if (aborted_) { |
14 | auto commFailureMsg = commFailureReason_ != c10::nullopt |
15 | ? c10::str(" Original reason for failure was: " , *commFailureReason_) |
16 | : "" ; |
17 | TORCH_CHECK( |
18 | false, |
19 | c10::str( |
20 | "NCCL communicator was aborted on rank " , |
21 | rank_, |
22 | ". " , |
23 | commFailureMsg)); |
24 | } |
25 | return ncclComm_; |
26 | } |
27 | |
28 | std::string getNcclVersion() { |
29 | static c10::once_flag ncclGetVersionFlag; |
30 | static std::string versionString; |
31 | |
32 | c10::call_once(ncclGetVersionFlag, []() { |
33 | int version; |
34 | ncclResult_t status = ncclGetVersion(&version); |
35 | // can't compute the version if call did not return successfully or version |
36 | // code < 100 (corresponding to 0.1.0) |
37 | if (status != ncclSuccess || version < 100) { |
38 | versionString = "Unknown NCCL version" ; |
39 | } else { |
40 | // NCCL changed version coding starting 2.9 |
41 | const int majorBase = version < 2900 ? 1000 : 10000; |
42 | const int minorBase = 100; |
43 | auto ncclMajor = version / majorBase; |
44 | auto ncclMinor = (version % majorBase) / minorBase; |
45 | auto ncclPatch = |
46 | version % (ncclMajor * majorBase + ncclMinor * minorBase); |
47 | versionString = std::to_string(ncclMajor) + "." + |
48 | std::to_string(ncclMinor) + "." + std::to_string(ncclPatch); |
49 | } |
50 | }); |
51 | |
52 | return versionString; |
53 | } |
54 | |
55 | std::string ncclGetErrorWithVersion(ncclResult_t error) { |
56 | return std::string(ncclGetErrorString(error)) + ", NCCL version " + |
57 | getNcclVersion(); |
58 | } |
59 | |
60 | // Provides additional detail into NCCL error codes based on when these are |
61 | // thrown in the NCCL codebase. |
62 | std::string getNcclErrorDetailStr( |
63 | ncclResult_t error, |
64 | c10::optional<std::string> processGroupFailureReason /* = c10::nullopt */ |
65 | ) { |
66 | // Prioritize failure reason provided by PG NCCL first, as it can abort |
67 | // communicators when it encounters collective timeouts, etc. |
68 | if (processGroupFailureReason != c10::nullopt) { |
69 | return *processGroupFailureReason; |
70 | } |
71 | std::string interpret; |
72 | std::string err; |
73 | #ifdef ENABLE_NCCL_GET_LAST_ERROR |
74 | err = "\nLast error:\n" + std::string(ncclGetLastError(NULL)); |
75 | #endif |
76 | switch (error) { |
77 | case ncclUnhandledCudaError: |
78 | interpret = "ncclUnhandledCudaError: Call to CUDA function failed." ; |
79 | break; |
80 | case ncclSystemError: |
81 | interpret = |
82 | "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. " ; |
83 | #ifndef NCCL_REMOTE_ERROR |
84 | // Before ncclRemoteError was created, unexpected remote disconnect was |
85 | // categorized as ncclSystemError |
86 | interpret += "It can be also caused by unexpected exit of a remote peer." ; |
87 | #endif |
88 | break; |
89 | case ncclInternalError: |
90 | interpret = "ncclInternalError: Internal check failed." ; |
91 | break; |
92 | case ncclInvalidArgument: |
93 | interpret = "ncclInvalidArgument: Invalid value for an argument." ; |
94 | break; |
95 | case ncclInvalidUsage: |
96 | interpret = |
97 | "ncclInvalidUsage: This usually reflects invalid usage of NCCL library." ; |
98 | break; |
99 | #ifdef NCCL_REMOTE_ERROR |
100 | case ncclRemoteError: |
101 | interpret = |
102 | "ncclRemoteError: A call failed possibly due to a network error or a remote process exiting prematurely." ; |
103 | break; |
104 | #endif |
105 | default: |
106 | interpret = "Unknown NCCL error!" ; |
107 | } |
108 | return interpret + err; |
109 | } |
110 | |
111 | } // namespace c10d |
112 | |
113 | #endif // USE_C10D_NCCL |
114 | |