/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 14
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21403
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#ifdef __cplusplus
extern "C" {
#endif

/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;

#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

/* Error type */
typedef enum { ncclSuccess             = 0,
               ncclUnhandledCudaError  = 1,
               ncclSystemError         = 2,
               ncclInternalError       = 3,
               ncclInvalidArgument     = 4,
               ncclInvalidUsage        = 5,
               ncclRemoteError         = 6,
               ncclInProgress          = 7,
               ncclNumResults          = 8 } ncclResult_t;

/* Communicator configuration. Users can assign values to the attributes to
 * specify the behavior of a communicator. */
typedef struct ncclConfig_v21400 {
  /* attributes that users should never touch. */
  size_t size;
  unsigned int magic;
  unsigned int version;
  /* attributes that users are able to customize. */
  int blocking;
} ncclConfig_t;

/* The config initializer must be assigned when the config structure is created.
 * An uninitialized config will result in an NCCL error. */
#define NCCL_CONFIG_INITIALIZER {                                 \
  sizeof(ncclConfig_t),                             /* size */    \
  0xcafebeef,                                       /* magic */   \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
  1                                                 /* blocking */\
}

/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library
 */
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
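
/* Example (illustrative sketch, not part of the API): checking the runtime
 * library version against the headers this file was compiled with.
 *
 *   int version;
 *   ncclResult_t res = ncclGetVersion(&version);
 *   if (res != ncclSuccess) {
 *     printf("ncclGetVersion failed: %s\n", ncclGetErrorString(res));
 *   } else if (version < NCCL_VERSION(2,14,0)) {
 *     printf("NCCL runtime %d is older than the 2.14 headers\n", version);
 *   }
 */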

/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once and the Id should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);

/* Create a new communicator (multi thread/process version) with a configuration
 * set by users. */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
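
/* Example (illustrative sketch): creating a non-blocking communicator with a
 * user-supplied configuration. The config must start from NCCL_CONFIG_INITIALIZER;
 * `nranks`, `id` and `rank` are assumed to be provided by the caller.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.blocking = 0;   // calls may return ncclInProgress instead of blocking
 *   ncclComm_t comm;
 *   ncclCommInitRankConfig(&comm, nranks, id, rank, &config);
 *   // With blocking == 0, poll ncclCommGetAsyncError() until the state leaves
 *   // ncclInProgress before issuing collectives on this communicator.
 */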

/* Creates a new communicator (multi thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated to a CUDA device, which has to be set before calling
 * ncclCommInitRank.
 * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
 * called by different threads/processes or used within ncclGroupStart/ncclGroupEnd. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
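
/* Example (illustrative sketch, assuming one process per GPU and some external
 * mechanism, e.g. MPI or shared storage, to distribute the unique id):
 *
 *   ncclUniqueId id;
 *   if (rank == 0) ncclGetUniqueId(&id);
 *   // ... broadcast `id` from rank 0 to every other rank (not shown) ...
 *   cudaSetDevice(localDevice);   // select this rank's GPU before init
 *   ncclComm_t comm;
 *   ncclCommInitRank(&comm, nranks, id, rank);
 */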

/* Creates a clique of communicators (single process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
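
/* Example (illustrative sketch): initializing one communicator per visible GPU
 * from a single process.
 *
 *   int ndev;
 *   cudaGetDeviceCount(&ndev);
 *   ncclComm_t* comms = (ncclComm_t*)malloc(ndev * sizeof(ncclComm_t));
 *   ncclCommInitAll(comms, ndev, NULL);   // NULL devlist: use devices 0..ndev-1
 */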

/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
 * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
 * when the communicator is globally quiescent and related resources are freed; then,
 * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
 * itself) without blocking. */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm);
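
/* Example (illustrative sketch): tearing down a communicator without blocking
 * inside ncclCommDestroy.
 *
 *   ncclCommFinalize(comm);
 *   ncclResult_t state;
 *   do {
 *     ncclCommGetAsyncError(comm, &state);
 *   } while (state == ncclInProgress);   // wait for global quiescence
 *   ncclCommDestroy(comm);               // now only frees local resources
 */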

/* Frees local resources associated with communicator object. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);

/* Frees resources associated with communicator object and aborts any operations
 * that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);

/* Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

/* Returns a human-readable message of the last error that occurred.
 * comm is currently unused and can be set to NULL
 */
const char* ncclGetLastError(ncclComm_t comm);
const char* pncclGetError(ncclComm_t comm);

/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
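
/* Example (illustrative sketch): detecting and handling an asynchronous failure
 * on a communicator created with blocking == 0.
 *
 *   ncclResult_t state;
 *   do {
 *     ncclCommGetAsyncError(comm, &state);
 *   } while (state == ncclInProgress);
 *   if (state != ncclSuccess) {
 *     printf("NCCL failure: %s\n", ncclGetLastError(comm));
 *     ncclCommAbort(comm);   // give up on any operation still running on the device
 *   }
 */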

/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

/* Returns the CUDA device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
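
/* Example (illustrative sketch): querying basic communicator properties.
 *
 *   int nranks, device, rank;
 *   ncclCommCount(comm, &nranks);
 *   ncclCommCuDevice(comm, &device);
 *   ncclCommUserRank(comm, &rank);
 *   printf("rank %d of %d on CUDA device %d\n", rank, nranks, device);
 */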

/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum      = 0,
               ncclProd     = 1,
               ncclMax      = 2,
               ncclMin      = 3,
               ncclAvg      = 4,
               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                * serves as the least possible value for dynamic ncclRedOp_t's
                * as constructed by ncclRedOpCreate*** functions. */
               ncclNumOps   = 5,
               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                * It is defined to be the largest signed value (since compilers
                * are permitted to use signed enums) that won't grow
                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                * maintain ABI compatibility. */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
             } ncclRedOp_t;

/* Data types */
typedef enum { ncclInt8     = 0, ncclChar   = 0,
               ncclUint8    = 1,
               ncclInt32    = 2, ncclInt    = 2,
               ncclUint32   = 3,
               ncclInt64    = 4,
               ncclUint64   = 5,
               ncclFloat16  = 6, ncclHalf   = 6,
               ncclFloat32  = 7, ncclFloat  = 7,
               ncclFloat64  = 8, ncclDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16 = 9,
               ncclNumTypes = 10
#else
               ncclNumTypes = 9
#endif
} ncclDataType_t;

/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum {
  /* ncclScalarDevice: The scalar is in device-visible memory and will be
   * dereferenced while the collective is running. */
  ncclScalarDevice = 0,

  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
   * dereferenced before the ncclRedOpCreate***() function returns. */
  ncclScalarHostImmediate = 1
} ncclScalarResidence_t;

/*
 * ncclRedOpCreatePreMulSum
 *
 * Creates a new reduction operator which pre-multiplies input values by a given
 * scalar locally before reducing them with peer values via summation. For use
 * only with collectives launched against *comm* and *datatype*. The
 * *residence* argument indicates how/when the memory pointed to by *scalar*
 * will be dereferenced. Upon return, the newly created operator's handle
 * is stored in *op*.
 */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
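
/* Example (illustrative sketch): building a reduction that averages float values
 * by pre-multiplying each rank's contribution with 1/nranks before summing.
 * `sendbuff`, `recvbuff`, `count`, `comm` and `stream` are assumed to exist.
 *
 *   float scale = 1.0f / nranks;   // host scalar, consumed before the create call returns
 *   ncclRedOp_t premulSum;
 *   ncclRedOpCreatePreMulSum(&premulSum, &scale, ncclFloat, ncclScalarHostImmediate, comm);
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, premulSum, comm, stream);
 *   ncclRedOpDestroy(premulSum, comm);   // legal once the call above has returned
 */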

/*
 * ncclRedOpDestroy
 *
 * Destroys the reduction operator *op*. The operator must have been created by
 * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
 * destroyed as soon as the last NCCL function which is given that operator returns.
 */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);

/*
 * Collective communication operations
 *
 * Collective communication operations must be called separately for each
 * communicator in a communicator clique.
 *
 * They return when operations have been enqueued on the CUDA stream.
 *
 * Since they may perform inter-CPU synchronization, each call has to be made
 * from a different thread or process, or has to use Group Semantics (see
 * below).
 */

/*
 * Reduce
 *
 * Reduces data arrays of length count in sendbuff into recvbuff using op
 * operation.
 * recvbuff may be NULL on all calls except for root device.
 * root is the rank (not the CUDA device) where data will reside after the
 * operation is complete.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * This operation is implicitly in place.
 */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * All-Reduce
 *
 * Reduces data arrays of length count in sendbuff using op operation, and
 * leaves identical copies of result on each recvbuff.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
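
/* Example (illustrative sketch): summing `count` floats across all ranks; every
 * rank ends up with the same result in recvbuff.
 *
 *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
 *   cudaStreamSynchronize(stream);   // wait until the result is available on the GPU
 */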

/*
 * Reduce-Scatter
 *
 * Reduces data in sendbuff using op operation and leaves reduced result
 * scattered over the devices so that recvbuff on rank i will contain the i-th
 * block of the result.
 * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
 * should have a size of at least nranks*recvcount elements.
 *
 * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);

/*
 * All-Gather
 *
 * Each device gathers sendcount values from other GPUs into recvbuff,
 * receiving data from rank i at offset i*sendcount.
 * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
 * should have a size of at least nranks*sendcount elements.
 *
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
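
/* Example (illustrative sketch): each rank contributes `sendcount` floats and
 * recvbuff must hold nranks * sendcount elements.
 *
 *   // Out-of-place:
 *   ncclAllGather(sendbuff, recvbuff, sendcount, ncclFloat, comm, stream);
 *
 *   // In-place: this rank's data already sits in its own slot of recvbuff.
 *   float* slot = (float*)recvbuff + rank * sendcount;
 *   ncclAllGather(slot, recvbuff, sendcount, ncclFloat, comm, stream);
 */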

/*
 * Send
 *
 * Send data from sendbuff to rank peer.
 *
 * Rank peer needs to call ncclRecv with the same datatype and the same count from this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Receive
 *
 * Receive data from rank peer into recvbuff.
 *
 * Rank peer needs to call ncclSend with the same datatype and the same count to this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
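
/* Example (illustrative sketch): a ring exchange where every rank sends to its
 * right neighbor and receives from its left one. Fusing both calls in a group
 * lets them progress concurrently and avoids deadlock.
 *
 *   int right = (rank + 1) % nranks;
 *   int left  = (rank - 1 + nranks) % nranks;
 *   ncclGroupStart();
 *   ncclSend(sendbuff, count, ncclFloat, right, comm, stream);
 *   ncclRecv(recvbuff, count, ncclFloat, left,  comm, stream);
 *   ncclGroupEnd();
 */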

/*
 * Group semantics
 *
 * When managing multiple GPUs from a single thread, and since NCCL collective
 * calls may perform inter-CPU synchronization, we need to "group" calls for
 * different ranks/devices into a single call.
 *
 * Grouping NCCL calls as being part of the same collective operation is done
 * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
 * collective calls until the ncclGroupEnd call, which will wait for all calls
 * to be complete. Note that for collective communication, ncclGroupEnd only
 * guarantees that the operations are enqueued on the streams, not that
 * the operation is effectively done.
 *
 * Both collective communication and ncclCommInitRank can be used in conjunction
 * with ncclGroupStart/ncclGroupEnd, but not together.
 *
 * Group semantics also allow fusing multiple operations on the same device
 * to improve performance (for aggregated collective calls), or permitting
 * concurrent progress of multiple send/receive operations.
 */

/*
 * Group Start
 *
 * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 */
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();

/*
 * Group End
 *
 * End a group call. Start a fused NCCL operation consisting of all calls since
 * ncclGroupStart. Operations on the CUDA stream that depend on the NCCL
 * operations must be enqueued after ncclGroupEnd.
 */
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
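
/* Example (illustrative sketch): driving one all-reduce per GPU from a single
 * thread. Without the group, the first ncclAllReduce could block waiting for
 * peers that this thread has not launched yet.
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < ndev; i++) {
 *     ncclAllReduce(sendbuffs[i], recvbuffs[i], count, ncclFloat, ncclSum,
 *                   comms[i], streams[i]);
 *   }
 *   ncclGroupEnd();
 */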

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard