1 | /************************************************************************* |
2 | * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. |
3 | * |
4 | * See LICENSE.txt for license information |
5 | ************************************************************************/ |
6 | |
7 | #ifndef NCCL_H_ |
8 | #define NCCL_H_ |
9 | |
10 | #include <cuda_runtime.h> |
11 | #include <cuda_fp16.h> |
12 | #if CUDART_VERSION >= 11000 |
13 | #include <cuda_bf16.h> |
14 | #endif |
15 | |
#define NCCL_MAJOR 2
#define NCCL_MINOR 14
#define NCCL_PATCH 3
#define NCCL_SUFFIX ""

/* Integer version code of this header; equals
 * NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH). */
#define NCCL_VERSION_CODE 21403
/* Encode (X,Y,Z) as a single comparable integer. As the conditional shows,
 * versions up to 2.8 used an X*1000 + Y*100 + Z encoding while later versions
 * use X*10000 + Y*100 + Z; the macro picks the encoding matching the
 * requested version so comparisons against old codes keep working. */
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
23 | |
24 | #ifdef __cplusplus |
25 | extern "C" { |
26 | #endif |
27 | |
/* Opaque handle to communicator; the struct is defined inside the library. */
typedef struct ncclComm* ncclComm_t;

/* Fixed-size opaque identifier for a communicator clique. Produced by
 * ncclGetUniqueId on one rank and distributed to all ranks before
 * ncclCommInitRank (see the comments on those functions below). */
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
33 | |
/* Error type. Returned by nearly every NCCL entry point in this header. */
typedef enum { ncclSuccess = 0,            /* No error. */
               ncclUnhandledCudaError = 1, /* A CUDA call failed inside NCCL. */
               ncclSystemError = 2,        /* System-level failure (per name; see library docs). */
               ncclInternalError = 3,      /* Internal NCCL failure. */
               ncclInvalidArgument = 4,    /* An argument value was invalid. */
               ncclInvalidUsage = 5,       /* The API was used incorrectly. */
               ncclRemoteError = 6,        /* Failure reported by a remote rank. */
               ncclInProgress = 7,         /* Operation still in progress (see ncclCommFinalize). */
               ncclNumResults = 8 } ncclResult_t; /* Count of codes; not itself an error. */
44 | |
/* Communicator configuration. Users can assign value to attributes to specify the
 * behavior of a communicator. Always initialize with NCCL_CONFIG_INITIALIZER
 * (below) before customizing any field. */
typedef struct ncclConfig_v21400 {
  /* attributes that users should never touch. */
  size_t size;          /* sizeof(ncclConfig_t), set by the initializer. */
  unsigned int magic;   /* Sentinel (0xcafebeef in the initializer) marking a properly initialized struct. */
  unsigned int version; /* NCCL version code the struct was initialized against. */
  /* attributes that users are able to customize. */
  int blocking;         /* Initializer default is 1; presumably 0 requests non-blocking behavior -- verify against library docs. */
} ncclConfig_t;
55 | |
/* Config initializer: must be assigned to a config structure when it is created.
 * An uninitialized config will result in an NCCL error. */
#define NCCL_CONFIG_INITIALIZER {                           \
  sizeof(ncclConfig_t), /* size */                          \
  0xcafebeef, /* magic */                                   \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
  1 /* blocking */                                          \
}
64 | |
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library, using the encoding implemented by the NCCL_VERSION macro.
 * version: out parameter, must be non-NULL.
 */
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version); /* p-prefixed variant, same contract. */
71 | |
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once and the Id should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank.
 * uniqueId: out parameter, filled with an opaque NCCL_UNIQUE_ID_BYTES blob. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); /* p-prefixed variant, same contract. */
77 | |
/* Create a new communicator (multi thread/process version) with a configuration
 * set by users. config must have been set up with NCCL_CONFIG_INITIALIZER;
 * otherwise behaves like ncclCommInitRank below (same rank/nranks/commId rules). */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); /* p-prefixed variant, same contract. */
82 | |
/* Creates a new communicator (multi thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated to a CUDA device, which has to be set before calling
 * ncclCommInitRank.
 * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
 * called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
 * comm: out parameter receiving the new communicator handle. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); /* p-prefixed variant, same contract. */
91 | |
/* Creates a clique of communicators (single process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); /* p-prefixed variant, same contract. */
100 | |
/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
 * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
 * when the communicator is globally quiescent and related resources are freed; then,
 * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
 * itself) without blocking.
 * Poll progress via ncclCommGetAsyncError (declared below). */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm); /* p-prefixed variant, same contract. */
108 | |
/* Frees local resources associated with communicator object.
 * Pair with ncclCommFinalize (above) for a non-blocking teardown. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm); /* p-prefixed variant, same contract. */

/* Frees resources associated with communicator object and aborts any operations
 * that might still be running on the device. Unlike ncclCommDestroy, this does
 * not wait for outstanding operations to complete. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm); /* p-prefixed variant, same contract. */
117 | |
/* Returns a human-readable string for each error code. The returned string is
 * owned by the library; callers must not free it. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result); /* p-prefixed variant, same contract. */
121 | |
/* Returns a human-readable message of the last error that occurred.
 * comm is currently unused and can be set to NULL
 */
const char* ncclGetLastError(ncclComm_t comm);
/* NOTE(review): the p-prefixed twin is named pncclGetError, not
 * pncclGetLastError, breaking the pnccl<Name> pattern used by every other
 * declaration in this header. The name cannot be changed here without
 * breaking link compatibility with the shipped library -- verify which symbol
 * the library actually exports before relying on either spelling. */
const char* pncclGetError(ncclComm_t comm);
127 | |
/* Checks whether the comm has encountered any asynchronous errors.
 * asyncError: out parameter; receives ncclSuccess when none occurred
 * (also reports ncclInProgress during ncclCommFinalize, per its comment above). */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); /* p-prefixed variant, same contract. */

/* Gets the number of ranks in the communicator clique.
 * count: out parameter, must be non-NULL. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); /* p-prefixed variant, same contract. */

/* Returns the cuda device number associated with the communicator.
 * device: out parameter, must be non-NULL. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); /* p-prefixed variant, same contract. */

/* Returns the user-ordered "rank" associated with the communicator.
 * rank: out parameter, must be non-NULL. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); /* p-prefixed variant, same contract. */
143 | |
/* Reduction operation selector.
 * ncclRedOp_dummy_t exists only to pin the storage size used in the
 * ncclMaxRedOp computation below; it is not meant to be used directly. */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum = 0,  /* Sum of inputs. */
               ncclProd = 1, /* Product of inputs. */
               ncclMax = 2,  /* Elementwise maximum. */
               ncclMin = 3,  /* Elementwise minimum. */
               ncclAvg = 4,  /* Average (per name; see library docs for rounding details). */
               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                * serves as the least possible value for dynamic ncclRedOp_t's
                * as constructed by ncclRedOpCreate*** functions. */
               ncclNumOps = 5,
               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                * It is defined to be the largest signed value (since compilers
                * are permitted to use signed enums) that won't grow
                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                * maintain ABI compatibility. */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
             } ncclRedOp_t;
162 | |
/* Data types supported by the collectives. Several enumerators are aliases
 * sharing one value (e.g. ncclChar == ncclInt8, ncclInt == ncclInt32). */
typedef enum { ncclInt8 = 0, ncclChar = 0,
               ncclUint8 = 1,
               ncclInt32 = 2, ncclInt = 2,
               ncclUint32 = 3,
               ncclInt64 = 4,
               ncclUint64 = 5,
               ncclFloat16 = 6, ncclHalf = 6,
               ncclFloat32 = 7, ncclFloat = 7,
               ncclFloat64 = 8, ncclDouble = 8,
/* ncclBfloat16 only exists when cuda_bf16.h provided the type (included at the
 * top of this header for CUDART_VERSION >= 11000), so ncclNumTypes varies. */
#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16 = 9,
               ncclNumTypes = 10
#else
               ncclNumTypes = 9
#endif
} ncclDataType_t;
180 | |
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments
 * passed to the ncclRedOpCreate*** functions below. */
typedef enum {
  /* ncclScalarDevice: The scalar is in device-visible memory and will be
   * dereferenced while the collective is running. The memory must therefore
   * stay valid until the collective completes. */
  ncclScalarDevice = 0,

  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
   * dereferenced before the ncclRedOpCreate***() function returns, so the
   * caller may reuse or free the memory immediately afterwards. */
  ncclScalarHostImmediate = 1
} ncclScalarResidence_t;
191 | |
192 | /* |
193 | * ncclRedOpCreatePreMulSum |
194 | * |
195 | * Creates a new reduction operator which pre-multiplies input values by a given |
196 | * scalar locally before reducing them with peer values via summation. For use |
197 | * only with collectives launched against *comm* and *datatype*. The |
198 | * *residence* argument indicates how/when the memory pointed to by *scalar* |
199 | * will be dereferenced. Upon return, the newly created operator's handle |
200 | * is stored in *op*. |
201 | */ |
202 | ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); |
203 | ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); |
204 | |
205 | /* |
206 | * ncclRedOpDestroy |
207 | * |
208 | * Destroys the reduction operator *op*. The operator must have been created by |
209 | * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be |
210 | * destroyed as soon as the last NCCL function which is given that operator returns. |
211 | */ |
212 | ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); |
213 | ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); |
214 | |
215 | /* |
216 | * Collective communication operations |
217 | * |
218 | * Collective communication operations must be called separately for each |
219 | * communicator in a communicator clique. |
220 | * |
221 | * They return when operations have been enqueued on the CUDA stream. |
222 | * |
223 | * Since they may perform inter-CPU synchronization, each call has to be done |
 * from a different thread or process, or needs to use Group Semantics (see
225 | * below). |
226 | */ |
227 | |
228 | /* |
229 | * Reduce |
230 | * |
231 | * Reduces data arrays of length count in sendbuff into recvbuff using op |
232 | * operation. |
233 | * recvbuff may be NULL on all calls except for root device. |
234 | * root is the rank (not the CUDA device) where data will reside after the |
235 | * operation is complete. |
236 | * |
237 | * In-place operation will happen if sendbuff == recvbuff. |
238 | */ |
239 | ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, |
240 | ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); |
241 | ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, |
242 | ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); |
243 | |
244 | /* |
245 | * (deprecated) Broadcast (in-place) |
246 | * |
247 | * Copies count values from root to all other devices. |
248 | * root is the rank (not the CUDA device) where data resides before the |
249 | * operation is started. |
250 | * |
251 | * This operation is implicitely in place. |
252 | */ |
253 | ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, |
254 | ncclComm_t comm, cudaStream_t stream); |
255 | ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, |
256 | ncclComm_t comm, cudaStream_t stream); |
257 | |
258 | /* |
259 | * Broadcast |
260 | * |
261 | * Copies count values from root to all other devices. |
262 | * root is the rank (not the CUDA device) where data resides before the |
263 | * operation is started. |
264 | * |
265 | * In-place operation will happen if sendbuff == recvbuff. |
266 | */ |
267 | ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, |
268 | ncclComm_t comm, cudaStream_t stream); |
269 | ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, |
270 | ncclComm_t comm, cudaStream_t stream); |
271 | |
272 | /* |
273 | * All-Reduce |
274 | * |
275 | * Reduces data arrays of length count in sendbuff using op operation, and |
276 | * leaves identical copies of result on each recvbuff. |
277 | * |
278 | * In-place operation will happen if sendbuff == recvbuff. |
279 | */ |
280 | ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, |
281 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); |
282 | ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, |
283 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); |
284 | |
285 | /* |
286 | * Reduce-Scatter |
287 | * |
288 | * Reduces data in sendbuff using op operation and leaves reduced result |
289 | * scattered over the devices so that recvbuff on rank i will contain the i-th |
290 | * block of the result. |
291 | * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff |
292 | * should have a size of at least nranks*recvcount elements. |
293 | * |
294 | * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. |
295 | */ |
296 | ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, |
297 | size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, |
298 | cudaStream_t stream); |
299 | ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, |
300 | size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, |
301 | cudaStream_t stream); |
302 | |
303 | /* |
304 | * All-Gather |
305 | * |
306 | * Each device gathers sendcount values from other GPUs into recvbuff, |
307 | * receiving data from rank i at offset i*sendcount. |
308 | * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff |
309 | * should have a size of at least nranks*sendcount elements. |
310 | * |
311 | * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. |
312 | */ |
313 | ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, |
314 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); |
315 | ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, |
316 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); |
317 | |
318 | /* |
319 | * Send |
320 | * |
321 | * Send data from sendbuff to rank peer. |
322 | * |
323 | * Rank peer needs to call ncclRecv with the same datatype and the same count from this |
324 | * rank. |
325 | * |
326 | * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations |
327 | * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ |
328 | * ncclGroupEnd section. |
329 | */ |
330 | ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, |
331 | ncclComm_t comm, cudaStream_t stream); |
332 | ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, |
333 | ncclComm_t comm, cudaStream_t stream); |
334 | |
335 | /* |
336 | * Receive |
337 | * |
338 | * Receive data from rank peer into recvbuff. |
339 | * |
340 | * Rank peer needs to call ncclSend with the same datatype and the same count to this |
341 | * rank. |
342 | * |
343 | * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations |
344 | * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ |
345 | * ncclGroupEnd section. |
346 | */ |
347 | ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, |
348 | ncclComm_t comm, cudaStream_t stream); |
349 | ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, |
350 | ncclComm_t comm, cudaStream_t stream); |
351 | |
352 | /* |
353 | * Group semantics |
354 | * |
355 | * When managing multiple GPUs from a single thread, and since NCCL collective |
356 | * calls may perform inter-CPU synchronization, we need to "group" calls for |
357 | * different ranks/devices into a single call. |
358 | * |
359 | * Grouping NCCL calls as being part of the same collective operation is done |
360 | * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all |
361 | * collective calls until the ncclGroupEnd call, which will wait for all calls |
362 | * to be complete. Note that for collective communication, ncclGroupEnd only |
363 | * guarantees that the operations are enqueued on the streams, not that |
364 | * the operation is effectively done. |
365 | * |
 * Both collective communication and ncclCommInitRank can be used in conjunction
 * with ncclGroupStart/ncclGroupEnd, but not together.
368 | * |
 * Group semantics also allow fusing multiple operations on the same device
 * to improve performance (for aggregated collective calls), or permitting
 * concurrent progress of multiple send/receive operations.
372 | */ |
373 | |
374 | /* |
375 | * Group Start |
376 | * |
377 | * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into |
378 | * a single NCCL operation. Nothing will be started on the CUDA stream until |
379 | * ncclGroupEnd. |
380 | */ |
381 | ncclResult_t ncclGroupStart(); |
382 | ncclResult_t pncclGroupStart(); |
383 | |
384 | /* |
385 | * Group End |
386 | * |
387 | * End a group call. Start a fused NCCL operation consisting of all calls since |
388 | * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations |
389 | * need to be called after ncclGroupEnd. |
390 | */ |
391 | ncclResult_t ncclGroupEnd(); |
392 | ncclResult_t pncclGroupEnd(); |
393 | |
394 | #ifdef __cplusplus |
395 | } // end extern "C" |
396 | #endif |
397 | |
398 | #endif // end include guard |
399 | |