1#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
2
3#include <c10/util/CallOnce.h>
4
5#ifdef USE_C10D_NCCL
6
7#include <mutex>
8
9namespace c10d {
10
11ncclComm_t NCCLComm::getNcclComm() {
12 std::unique_lock<std::mutex> lock(mutex_);
13 if (aborted_) {
14 auto commFailureMsg = commFailureReason_ != c10::nullopt
15 ? c10::str(" Original reason for failure was: ", *commFailureReason_)
16 : "";
17 TORCH_CHECK(
18 false,
19 c10::str(
20 "NCCL communicator was aborted on rank ",
21 rank_,
22 ". ",
23 commFailureMsg));
24 }
25 return ncclComm_;
26}
27
28std::string getNcclVersion() {
29 static c10::once_flag ncclGetVersionFlag;
30 static std::string versionString;
31
32 c10::call_once(ncclGetVersionFlag, []() {
33 int version;
34 ncclResult_t status = ncclGetVersion(&version);
35 // can't compute the version if call did not return successfully or version
36 // code < 100 (corresponding to 0.1.0)
37 if (status != ncclSuccess || version < 100) {
38 versionString = "Unknown NCCL version";
39 } else {
40 // NCCL changed version coding starting 2.9
41 const int majorBase = version < 2900 ? 1000 : 10000;
42 const int minorBase = 100;
43 auto ncclMajor = version / majorBase;
44 auto ncclMinor = (version % majorBase) / minorBase;
45 auto ncclPatch =
46 version % (ncclMajor * majorBase + ncclMinor * minorBase);
47 versionString = std::to_string(ncclMajor) + "." +
48 std::to_string(ncclMinor) + "." + std::to_string(ncclPatch);
49 }
50 });
51
52 return versionString;
53}
54
55std::string ncclGetErrorWithVersion(ncclResult_t error) {
56 return std::string(ncclGetErrorString(error)) + ", NCCL version " +
57 getNcclVersion();
58}
59
60// Provides additional detail into NCCL error codes based on when these are
61// thrown in the NCCL codebase.
62std::string getNcclErrorDetailStr(
63 ncclResult_t error,
64 c10::optional<std::string> processGroupFailureReason /* = c10::nullopt */
65) {
66 // Prioritize failure reason provided by PG NCCL first, as it can abort
67 // communicators when it encounters collective timeouts, etc.
68 if (processGroupFailureReason != c10::nullopt) {
69 return *processGroupFailureReason;
70 }
71 std::string interpret;
72 std::string err;
73#ifdef ENABLE_NCCL_GET_LAST_ERROR
74 err = "\nLast error:\n" + std::string(ncclGetLastError(NULL));
75#endif
76 switch (error) {
77 case ncclUnhandledCudaError:
78 interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
79 break;
80 case ncclSystemError:
81 interpret =
82 "ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. ";
83#ifndef NCCL_REMOTE_ERROR
84 // Before ncclRemoteError was created, unexpected remote disconnect was
85 // categorized as ncclSystemError
86 interpret += "It can be also caused by unexpected exit of a remote peer.";
87#endif
88 break;
89 case ncclInternalError:
90 interpret = "ncclInternalError: Internal check failed.";
91 break;
92 case ncclInvalidArgument:
93 interpret = "ncclInvalidArgument: Invalid value for an argument.";
94 break;
95 case ncclInvalidUsage:
96 interpret =
97 "ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
98 break;
99#ifdef NCCL_REMOTE_ERROR
100 case ncclRemoteError:
101 interpret =
102 "ncclRemoteError: A call failed possibly due to a network error or a remote process exiting prematurely.";
103 break;
104#endif
105 default:
106 interpret = "Unknown NCCL error!";
107 }
108 return interpret + err;
109}
110
111} // namespace c10d
112
113#endif // USE_C10D_NCCL
114