/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 14
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21403
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#ifdef __cplusplus
extern "C" {
#endif

/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;

#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

/* Error type */
typedef enum { ncclSuccess             = 0,
               ncclUnhandledCudaError  = 1,
               ncclSystemError         = 2,
               ncclInternalError       = 3,
               ncclInvalidArgument     = 4,
               ncclInvalidUsage        = 5,
               ncclRemoteError         = 6,
               ncclInProgress          = 7,
               ncclNumResults          = 8 } ncclResult_t;

/* Communicator configuration. Users can assign values to the attributes to
 * specify the behavior of a communicator. */
typedef struct ncclConfig_v21400 {
  /* attributes that users should never touch. */
  size_t size;
  unsigned int magic;
  unsigned int version;
  /* attributes that users are able to customize. */
  int blocking;
} ncclConfig_t;

/* The config initializer must be assigned when the config structure is created.
 * An uninitialized config will result in an NCCL error. */
#define NCCL_CONFIG_INITIALIZER {                                 \
  sizeof(ncclConfig_t),                             /* size */    \
  0xcafebeef,                                       /* magic */   \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
  1                                                 /* blocking */\
}

/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library
 */
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
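
/* Example (illustrative sketch, not part of the API): checking the runtime
 * library version against the headers this file was compiled with.
 *
 *   int version;
 *   ncclResult_t res = ncclGetVersion(&version);
 *   if (res != ncclSuccess) {
 *     printf("ncclGetVersion failed: %s\n", ncclGetErrorString(res));
 *   } else if (version < NCCL_VERSION(2,14,0)) {
 *     printf("NCCL runtime %d is older than the 2.14 headers\n", version);
 *   }
 */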

/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once and the Id should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);

/* Create a new communicator (multi thread/process version) with a configuration
 * set by users. */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
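
/* Example (illustrative sketch): creating a non-blocking communicator with a
 * user-supplied configuration. The config must start from NCCL_CONFIG_INITIALIZER;
 * `nranks`, `id` and `rank` are assumed to be provided by the caller.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.blocking = 0;   // calls may return ncclInProgress instead of blocking
 *   ncclComm_t comm;
 *   ncclCommInitRankConfig(&comm, nranks, id, rank, &config);
 *   // With blocking == 0, poll ncclCommGetAsyncError() until the state leaves
 *   // ncclInProgress before issuing collectives on this communicator.
 */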

/* Creates a new communicator (multi thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated to a CUDA device, which has to be set before calling
 * ncclCommInitRank.
 * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
 * called by different threads/processes or used within ncclGroupStart/ncclGroupEnd. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
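
/* Example (illustrative sketch, assuming one process per GPU and some external
 * mechanism, e.g. MPI or shared storage, to distribute the unique id):
 *
 *   ncclUniqueId id;
 *   if (rank == 0) ncclGetUniqueId(&id);
 *   // ... broadcast `id` from rank 0 to every other rank (not shown) ...
 *   cudaSetDevice(localDevice);   // select this rank's GPU before init
 *   ncclComm_t comm;
 *   ncclCommInitRank(&comm, nranks, id, rank);
 */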

/* Creates a clique of communicators (single process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
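
/* Example (illustrative sketch): initializing one communicator per visible GPU
 * from a single process.
 *
 *   int ndev;
 *   cudaGetDeviceCount(&ndev);
 *   ncclComm_t* comms = (ncclComm_t*)malloc(ndev * sizeof(ncclComm_t));
 *   ncclCommInitAll(comms, ndev, NULL);   // NULL devlist: use devices 0..ndev-1
 */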

/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
 * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
 * when the communicator is globally quiescent and related resources are freed; then,
 * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
 * itself) without blocking. */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm);
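
/* Example (illustrative sketch): tearing down a communicator without blocking
 * inside ncclCommDestroy.
 *
 *   ncclCommFinalize(comm);
 *   ncclResult_t state;
 *   do {
 *     ncclCommGetAsyncError(comm, &state);
 *   } while (state == ncclInProgress);   // wait for global quiescence
 *   ncclCommDestroy(comm);               // now only frees local resources
 */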

/* Frees local resources associated with communicator object. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);

/* Frees resources associated with communicator object and aborts any operations
 * that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);

/* Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

/* Returns a human-readable message of the last error that occurred.
 * comm is currently unused and can be set to NULL
 */
const char* ncclGetLastError(ncclComm_t comm);
const char* pncclGetError(ncclComm_t comm);

/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
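
/* Example (illustrative sketch): detecting and handling an asynchronous failure
 * on a communicator created with blocking == 0.
 *
 *   ncclResult_t state;
 *   do {
 *     ncclCommGetAsyncError(comm, &state);
 *   } while (state == ncclInProgress);
 *   if (state != ncclSuccess) {
 *     printf("NCCL failure: %s\n", ncclGetLastError(comm));
 *     ncclCommAbort(comm);   // give up on any operation still running on the device
 *   }
 */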

/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

/* Returns the CUDA device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
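
/* Example (illustrative sketch): querying basic communicator properties.
 *
 *   int nranks, device, rank;
 *   ncclCommCount(comm, &nranks);
 *   ncclCommCuDevice(comm, &device);
 *   ncclCommUserRank(comm, &rank);
 *   printf("rank %d of %d on CUDA device %d\n", rank, nranks, device);
 */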

/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum      = 0,
               ncclProd     = 1,
               ncclMax      = 2,
               ncclMin      = 3,
               ncclAvg      = 4,
               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                * serves as the least possible value for dynamic ncclRedOp_t's
                * as constructed by ncclRedOpCreate*** functions. */
               ncclNumOps   = 5,
               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                * It is defined to be the largest signed value (since compilers
                * are permitted to use signed enums) that won't grow
                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                * maintain ABI compatibility. */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
             } ncclRedOp_t;

/* Data types */
typedef enum { ncclInt8     = 0, ncclChar   = 0,
               ncclUint8    = 1,
               ncclInt32    = 2, ncclInt    = 2,
               ncclUint32   = 3,
               ncclInt64    = 4,
               ncclUint64   = 5,
               ncclFloat16  = 6, ncclHalf   = 6,
               ncclFloat32  = 7, ncclFloat  = 7,
               ncclFloat64  = 8, ncclDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16 = 9,
               ncclNumTypes = 10
#else
               ncclNumTypes = 9
#endif
} ncclDataType_t;

/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum {
  /* ncclScalarDevice: The scalar is in device-visible memory and will be
   * dereferenced while the collective is running. */
  ncclScalarDevice = 0,

  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
   * dereferenced before the ncclRedOpCreate***() function returns. */
  ncclScalarHostImmediate = 1
} ncclScalarResidence_t;

/*
 * ncclRedOpCreatePreMulSum
 *
 * Creates a new reduction operator which pre-multiplies input values by a given
 * scalar locally before reducing them with peer values via summation. For use
 * only with collectives launched against *comm* and *datatype*. The
 * *residence* argument indicates how/when the memory pointed to by *scalar*
 * will be dereferenced. Upon return, the newly created operator's handle
 * is stored in *op*.
 */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
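
/* Example (illustrative sketch): building a reduction that averages float values
 * by pre-multiplying each rank's contribution with 1/nranks before summing.
 * `sendbuff`, `recvbuff`, `count`, `comm` and `stream` are assumed to exist.
 *
 *   float scale = 1.0f / nranks;   // host scalar, consumed before the create call returns
 *   ncclRedOp_t premulSum;
 *   ncclRedOpCreatePreMulSum(&premulSum, &scale, ncclFloat, ncclScalarHostImmediate, comm);
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, premulSum, comm, stream);
 *   ncclRedOpDestroy(premulSum, comm);   // legal once the call above has returned
 */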

/*
 * ncclRedOpDestroy
 *
 * Destroys the reduction operator *op*. The operator must have been created by
 * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
 * destroyed as soon as the last NCCL function which is given that operator returns.
 */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);

/*
 * Collective communication operations
 *
 * Collective communication operations must be called separately for each
 * communicator in a communicator clique.
 *
 * They return when operations have been enqueued on the CUDA stream.
 *
 * Since they may perform inter-CPU synchronization, each call has to be made
 * from a different thread or process, or has to use Group Semantics (see
 * below).
 */

/*
 * Reduce
 *
 * Reduces data arrays of length count in sendbuff into recvbuff using op
 * operation.
 * recvbuff may be NULL on all calls except for root device.
 * root is the rank (not the CUDA device) where data will reside after the
 * operation is complete.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * This operation is implicitly in place.
 */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * All-Reduce
 *
 * Reduces data arrays of length count in sendbuff using op operation, and
 * leaves identical copies of result on each recvbuff.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
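
/* Example (illustrative sketch): summing `count` floats across all ranks; every
 * rank ends up with the same result in recvbuff.
 *
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
 *   cudaStreamSynchronize(stream);   // wait until the result is available on the GPU
 */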

/*
 * Reduce-Scatter
 *
 * Reduces data in sendbuff using op operation and leaves reduced result
 * scattered over the devices so that recvbuff on rank i will contain the i-th
 * block of the result.
 * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
 * should have a size of at least nranks*recvcount elements.
 *
 * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);

/*
 * All-Gather
 *
 * Each device gathers sendcount values from other GPUs into recvbuff,
 * receiving data from rank i at offset i*sendcount.
 * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
 * should have a size of at least nranks*sendcount elements.
 *
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
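
/* Example (illustrative sketch): each rank contributes `sendcount` floats and
 * recvbuff must hold nranks * sendcount elements.
 *
 *   // Out-of-place:
 *   ncclAllGather(sendbuff, recvbuff, sendcount, ncclFloat, comm, stream);
 *
 *   // In-place: this rank's data already sits in its own slot of recvbuff.
 *   float* slot = (float*)recvbuff + rank * sendcount;
 *   ncclAllGather(slot, recvbuff, sendcount, ncclFloat, comm, stream);
 */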

/*
 * Send
 *
 * Send data from sendbuff to rank peer.
 *
 * Rank peer needs to call ncclRecv with the same datatype and the same count from this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Receive
 *
 * Receive data from rank peer into recvbuff.
 *
 * Rank peer needs to call ncclSend with the same datatype and the same count to this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
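
/* Example (illustrative sketch): a ring exchange where every rank sends to its
 * right neighbor and receives from its left one. Fusing both calls in a group
 * lets them progress concurrently and avoids deadlock.
 *
 *   int right = (rank + 1) % nranks;
 *   int left  = (rank - 1 + nranks) % nranks;
 *   ncclGroupStart();
 *   ncclSend(sendbuff, count, ncclFloat, right, comm, stream);
 *   ncclRecv(recvbuff, count, ncclFloat, left,  comm, stream);
 *   ncclGroupEnd();
 */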

/*
 * Group semantics
 *
 * When managing multiple GPUs from a single thread, and since NCCL collective
 * calls may perform inter-CPU synchronization, we need to "group" calls for
 * different ranks/devices into a single call.
 *
 * Grouping NCCL calls as being part of the same collective operation is done
 * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
 * collective calls until the ncclGroupEnd call, which will wait for all calls
 * to be complete. Note that for collective communication, ncclGroupEnd only
 * guarantees that the operations are enqueued on the streams, not that
 * the operation is effectively done.
 *
 * Both collective communication and ncclCommInitRank can be used in conjunction
 * with ncclGroupStart/ncclGroupEnd, but not together.
 *
 * Group semantics also allow fusing multiple operations on the same device
 * to improve performance (for aggregated collective calls), or permitting
 * concurrent progress of multiple send/receive operations.
 */

/*
 * Group Start
 *
 * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 */
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();

/*
 * Group End
 *
 * End a group call. Start a fused NCCL operation consisting of all calls since
 * ncclGroupStart. Operations on the CUDA stream that depend on the NCCL
 * operations must be enqueued after ncclGroupEnd.
 */
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
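
/* Example (illustrative sketch): driving one all-reduce per GPU from a single
 * thread. Without the group, the first ncclAllReduce could block waiting for
 * peers that this thread has not launched yet.
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < ndev; i++) {
 *     ncclAllReduce(sendbuffs[i], recvbuffs[i], count, ncclFloat, ncclSum,
 *                   comms[i], streams[i]);
 *   }
 *   ncclGroupEnd();
 */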

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard