NCCLUtils.cpp source code [pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp]

1	#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
2
3	#include <c10/util/CallOnce.h>
4
5	#ifdef USE_C10D_NCCL
6
7	#include <mutex>
8
9	namespace c10d {
10
11	ncclComm_t NCCLComm::getNcclComm() {
12	std::unique_lock<std::mutex> lock(mutex_);
13	if (aborted_) {
14	auto commFailureMsg = commFailureReason_ != c10::nullopt
15	? c10::str(" Original reason for failure was: ", *commFailureReason_)
16	: "";
17	TORCH_CHECK(
18	false,
19	c10::str(
20	"NCCL communicator was aborted on rank ",
21	rank_,
22	". ",
23	commFailureMsg));
24	}
25	return ncclComm_;
26	}
27
28	std::string getNcclVersion() {
29	static c10::once_flag ncclGetVersionFlag;
30	static std::string versionString;
31
32	c10::call_once(ncclGetVersionFlag, []() {
33	int version;
34	ncclResult_t status = ncclGetVersion(&version);
35	// can't compute the version if call did not return successfully or version
36	// code < 100 (corresponding to 0.1.0)
37	if (status != ncclSuccess \|\| version < `100`) {
38	versionString = "Unknown NCCL version";
39	} else {
40	// NCCL changed version coding starting 2.9
41	const int majorBase = version < `2900` ? `1000` : `10000`;
42	const int minorBase = `100`;
43	auto ncclMajor = version / majorBase;
44	auto ncclMinor = (version % majorBase) / minorBase;
45	auto ncclPatch =
46	version % (ncclMajor * majorBase + ncclMinor * minorBase);
47	versionString = std::to_string(ncclMajor) + "." +
48	std::to_string(ncclMinor) + "." + std::to_string(ncclPatch);
49	}
50	});
51
52	return versionString;
53	}
54
55	std::string ncclGetErrorWithVersion(ncclResult_t error) {
56	return std::string (ncclGetErrorString(error)) + ", NCCL version " +
57	getNcclVersion();
58	}
59
60	// Provides additional detail into NCCL error codes based on when these are
61	// thrown in the NCCL codebase.
62	std::string getNcclErrorDetailStr(
63	ncclResult_t error,
64	c10::optional<std::string> processGroupFailureReason / = c10::nullopt /
65	) {
66	// Prioritize failure reason provided by PG NCCL first, as it can abort
67	// communicators when it encounters collective timeouts, etc.
68	if (processGroupFailureReason != c10::nullopt) {
69	return *processGroupFailureReason;
70	}
71	std::string interpret;
72	std::string err;
73	#ifdef ENABLE_NCCL_GET_LAST_ERROR
74	err = "\nLast error:\n" + std::string (ncclGetLastError(NULL));
75	#endif
76	switch (error) {
77	case ncclUnhandledCudaError:
78	interpret = "ncclUnhandledCudaError: Call to CUDA function failed.";
79	break;
80	case ncclSystemError:
81	interpret =
82	"ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error. ";
83	#ifndef NCCL_REMOTE_ERROR
84	// Before ncclRemoteError was created, unexpected remote disconnect was
85	// categorized as ncclSystemError
86	interpret += "It can be also caused by unexpected exit of a remote peer.";
87	#endif
88	break;
89	case ncclInternalError:
90	interpret = "ncclInternalError: Internal check failed.";
91	break;
92	case ncclInvalidArgument:
93	interpret = "ncclInvalidArgument: Invalid value for an argument.";
94	break;
95	case ncclInvalidUsage:
96	interpret =
97	"ncclInvalidUsage: This usually reflects invalid usage of NCCL library.";
98	break;
99	#ifdef NCCL_REMOTE_ERROR
100	case ncclRemoteError:
101	interpret =
102	"ncclRemoteError: A call failed possibly due to a network error or a remote process exiting prematurely.";
103	break;
104	#endif
105	default:
106	interpret = "Unknown NCCL error!";
107	}
108	return interpret + err;
109	}
110
111	} // namespace c10d
112
113	#endif // USE_C10D_NCCL
114

Browse the source code of pytorch/torch/csrc/distributed/c10d/NCCLUtils.cpp