/**
 * Copyright (c) 2017-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <algorithm>
#include <atomic>
#include <mutex>

#include <cuda.h>
#include <cuda_runtime.h>

#include "gloo/algorithm.h"
#include "gloo/common/logging.h"
#include "gloo/config.h"

// Check that the configuration header was properly generated.
#if !GLOO_USE_CUDA
#error "Expected GLOO_USE_CUDA to be defined"
#endif

namespace gloo {

extern const cudaStream_t kStreamNotSet;
extern const int kInvalidDeviceId;

// Forward declarations
template <typename T>
class CudaDevicePointer;
template <typename T>
class CudaHostPointer;
template <typename T>
class CudaReductionFunction;

class CudaShared {
 public:
  // Get the mutex used to synchronize CUDA and NCCL operations
  static std::mutex& getMutex() {
    return *mutex_;
  }

  // Set the mutex used to synchronize CUDA and NCCL operations
  static void setMutex(std::mutex* m) {
    mutex_ = m;
  }

 private:
  static std::atomic<std::mutex*> mutex_;
};
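
// Example usage (an illustrative sketch; `cudaNcclMutex` is a hypothetical
// application-provided mutex installed before any synchronized launches):
//
//   static std::mutex cudaNcclMutex;
//   CudaShared::setMutex(&cudaNcclMutex);
//   {
//     std::lock_guard<std::mutex> guard(CudaShared::getMutex());
//     // ... CUDA/NCCL launches that must not interleave ...
//   }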

class CudaStream {
 public:
  explicit CudaStream(int deviceId, cudaStream_t stream = kStreamNotSet);

  // Move constructor
  CudaStream(CudaStream&& other) noexcept;

  ~CudaStream() noexcept(false);

  cudaStream_t operator*() const {
    return stream_;
  }

  int getDeviceID() const {
    return deviceId_;
  }

  cudaStream_t getStream() const {
    return stream_;
  }

  cudaEvent_t getEvent() const {
    return event_;
  }

  template <typename T>
  void copyAsync(CudaHostPointer<T>& dst, CudaDevicePointer<T>& src);
  template <typename T>
  void copyAsync(CudaHostPointer<T>& dst, CudaHostPointer<T>& src);
  template <typename T>
  void copyAsync(CudaDevicePointer<T>& dst, CudaDevicePointer<T>& src);
  template <typename T>
  void copyAsync(CudaDevicePointer<T>& dst, CudaHostPointer<T>& src);

  void record();

  void wait();

 protected:
  // Instances cannot be copied or copy-assigned
  CudaStream(const CudaStream&) = delete;
  CudaStream& operator=(const CudaStream&) = delete;

  // GPU that the stream belongs to.
  int deviceId_;

  // Operations are always run on a stream so that they can run
  // concurrently with other operations. The stream can be specified
  // at construction time if one has already been created outside this
  // library. If it is not specified, a new stream is created.
  cudaStream_t stream_;
  cudaEvent_t event_;

  // If no stream is specified at construction time, this class
  // allocates a new stream for operations against CUDA pointers.
  // Record whether or not this instance owns the stream, so that
  // the stream is destroyed when this instance is destructed.
  bool streamOwner_;
};
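
// Example usage (an illustrative sketch; assumes device 0 and float data):
//
//   CudaStream stream(0);                              // owns a new stream
//   auto src = CudaDevicePointer<float>::alloc(1024);
//   auto dst = CudaHostPointer<float>::alloc(1024);
//   stream.copyAsync(dst, src);  // enqueue device-to-host copy
//   stream.wait();               // block until the copy has completed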

template <typename T>
class CudaDevicePointer {
 public:
  static CudaDevicePointer<T> alloc(size_t count);

  static CudaDevicePointer<T> create(T* ptr, size_t count);

  static CudaDevicePointer<T> create(const CudaDevicePointer<T>& ptr) {
    return CudaDevicePointer<T>::create(*ptr, ptr.getCount());
  }

  CudaDevicePointer(CudaDevicePointer&&) noexcept;
  ~CudaDevicePointer() noexcept(false);

  // Default constructor creates an invalid instance
  CudaDevicePointer()
      : device_(nullptr),
        count_(0),
        owner_(false),
        deviceId_(kInvalidDeviceId) {}

  // Move assignment operator
  CudaDevicePointer& operator=(CudaDevicePointer&&);

  bool operator==(const CudaDevicePointer<T>& other) const {
    return device_ == other.device_ && count_ == other.count_;
  }

  T* operator*() const {
    return device_;
  }

  // Note: indexing dereferences device memory, which is only valid from
  // device code or for allocations that are accessible from the host
  // (e.g. managed memory).
  T& operator[](size_t index) const {
    return device_[index];
  }

  size_t getCount() const {
    return count_;
  }

  int getDeviceID() const {
    return deviceId_;
  }

  // Create a range into this pointer
  CudaDevicePointer<T> range(size_t offset, size_t count) const {
    GLOO_ENFORCE_LE(offset + count, count_);
    return CudaDevicePointer<T>(device_ + offset, count, false);
  }

 protected:
  // Instances must be created through the static functions above
  CudaDevicePointer(T* ptr, size_t count, bool owner);

  // Instances cannot be copied or copy-assigned
  CudaDevicePointer(const CudaDevicePointer&) = delete;
  CudaDevicePointer& operator=(const CudaDevicePointer&) = delete;

  // Device pointer
  T* device_;

  // Number of T elements behind the device pointer
  size_t count_;

  // Record whether or not this instance owns the pointer, so
  // that it is freed when this instance is destructed.
  bool owner_ = false;

  // GPU that the device pointer lives on
  int deviceId_;
};
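
// Example usage (an illustrative sketch; assumes device memory for 1024
// floats is available):
//
//   auto owned = CudaDevicePointer<float>::alloc(1024);    // owns the memory
//   auto alias = CudaDevicePointer<float>::create(owned);  // non-owning alias
//   auto tail = owned.range(512, 512);  // view of elements [512, 1024)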

template <typename T>
class CudaHostPointer {
 public:
  static CudaHostPointer<T> alloc(size_t count);

  static CudaHostPointer<T> create(T* ptr, size_t count) {
    return CudaHostPointer<T>(ptr, count, false);
  }

  CudaHostPointer(CudaHostPointer&&) noexcept;
  ~CudaHostPointer() noexcept(false);

  // Default constructor creates an invalid instance
  CudaHostPointer() : CudaHostPointer(nullptr, 0, false) {}

  // Move assignment operator
  CudaHostPointer& operator=(CudaHostPointer&&);

  bool operator==(const CudaHostPointer<T>& other) const {
    return host_ == other.host_ && count_ == other.count_;
  }

  T* operator*() const {
    return host_;
  }

  T& operator[](size_t index) const {
    return host_[index];
  }

  size_t getCount() const {
    return count_;
  }

  // Create a range into this pointer
  CudaHostPointer<T> range(size_t offset, size_t count) const {
    GLOO_ENFORCE_LE(offset + count, count_);
    return CudaHostPointer<T>(host_ + offset, count, false);
  }

 protected:
  // Instances must be created through the static functions above
  CudaHostPointer(T* ptr, size_t count, bool owner);

  // Instances cannot be copied or copy-assigned
  CudaHostPointer(const CudaHostPointer&) = delete;
  CudaHostPointer& operator=(const CudaHostPointer&) = delete;

  // Host pointer
  T* host_;

  // Number of T elements behind the host pointer
  size_t count_;

  // Record whether or not this instance owns the pointer, so
  // that it is freed when this instance is destructed.
  bool owner_ = false;
};
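
// Example usage (an illustrative sketch):
//
//   auto buf = CudaHostPointer<float>::alloc(16);  // owns the allocation
//   for (size_t i = 0; i < buf.getCount(); i++) {
//     buf[i] = 0.0f;  // host memory is directly addressable
//   }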

template <typename T, typename Src, typename Dst>
class CudaLocalMemcpy : public LocalOp<T> {
 public:
  CudaLocalMemcpy(
      CudaStream& stream,
      Src& src,
      Dst& dst,
      size_t offset,
      size_t count)
      : stream_(stream),
        src_(src.range(offset, count)),
        dst_(dst.range(offset, count)) {}

  virtual void runAsync() {
    stream_.copyAsync(dst_, src_);
  }

  virtual void wait() {
    stream_.wait();
  }

 protected:
  CudaStream& stream_;
  Src src_;
  Dst dst_;
};
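
// Example usage (an illustrative sketch): wrap a device-to-host copy of the
// first 256 elements in a LocalOp and run it asynchronously.
//
//   CudaStream stream(0);
//   auto src = CudaDevicePointer<float>::alloc(1024);
//   auto dst = CudaHostPointer<float>::alloc(1024);
//   CudaLocalMemcpy<float, CudaDevicePointer<float>, CudaHostPointer<float>>
//       op(stream, src, dst, 0, 256);
//   op.runAsync();  // enqueues the copy on the stream
//   op.wait();      // blocks until the copy has completed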

// Elementwise reduction primitives. Each computes, asynchronously on the
// given stream, x[i] = op(x[i], y[i]) for all i in [0, n).
template <typename T>
void cudaSum(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaProduct(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaMax(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaMin(T* x, const T* y, size_t n, const cudaStream_t stream);
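
// A device-side implementation could follow the usual grid-stride pattern,
// roughly as sketched below (illustrative only; `sumKernel` is a
// hypothetical name, and the real definitions live in a .cu file):
//
//   template <typename T>
//   __global__ void sumKernel(T* x, const T* y, size_t n) {
//     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//          i += blockDim.x * gridDim.x) {
//       x[i] += y[i];
//     }
//   }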

template <typename T>
class CudaReductionFunction {
  using DeviceFunction =
      void(T* x, const T* y, size_t n, const cudaStream_t stream);
  using HostFunction =
      void(T* x, const T* y, size_t n);

 public:
  static const CudaReductionFunction<T>* sum;
  static const CudaReductionFunction<T>* product;
  static const CudaReductionFunction<T>* min;
  static const CudaReductionFunction<T>* max;

  CudaReductionFunction(
      ReductionType type,
      DeviceFunction* deviceFn,
      HostFunction* hostFn)
      : type_(type),
        deviceFn_(deviceFn),
        hostFn_(hostFn) {}

  ReductionType type() const {
    return type_;
  }

  // Backwards compatibility.
  // Can be removed when all CUDA algorithms use CudaHostPointer.
  void call(T* x, const T* y, size_t n) const {
    hostFn_(x, y, n);
  }

  void call(
      CudaHostPointer<T>& dst,
      const CudaHostPointer<T>& src,
      size_t n,
      CudaStream& stream) const {
    // The specified stream may still have a memcpy in flight to
    // either of the CudaHostPointers. Wait on the stream to make sure
    // any such copies have finished before executing the reduction
    // function.
    stream.wait();
    hostFn_(*dst, *src, n);
  }

  void call(
      CudaDevicePointer<T>& dst,
      const CudaDevicePointer<T>& src,
      size_t n,
      CudaStream& stream) const {
    deviceFn_(*dst, *src, n, *stream);
    stream.record();
  }

 protected:
  const ReductionType type_;
  DeviceFunction* deviceFn_;
  HostFunction* hostFn_;

  friend class CudaDevicePointer<T>;
  friend class CudaHostPointer<T>;
};

template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::sum =
    new CudaReductionFunction<T>(
        SUM, &::gloo::cudaSum<T>, &::gloo::sum<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::product =
    new CudaReductionFunction<T>(
        PRODUCT, &::gloo::cudaProduct<T>, &::gloo::product<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::min =
    new CudaReductionFunction<T>(
        MIN, &::gloo::cudaMin<T>, &::gloo::min<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::max =
    new CudaReductionFunction<T>(
        MAX, &::gloo::cudaMax<T>, &::gloo::max<T>);
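
// Example usage (an illustrative sketch): reduce one device buffer into
// another with the built-in sum function.
//
//   CudaStream stream(0);
//   auto dst = CudaDevicePointer<float>::alloc(1024);
//   auto src = CudaDevicePointer<float>::alloc(1024);
//   CudaReductionFunction<float>::sum->call(dst, src, dst.getCount(), stream);
//   stream.wait();  // the reduction was recorded on the stream's event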

} // namespace gloo