1 | /** |
2 | * Copyright (c) 2017-present, Facebook, Inc. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under the BSD-style license found in the |
6 | * LICENSE file in the root directory of this source tree. |
7 | */ |
8 | |
9 | #pragma once |
10 | |
11 | #include <algorithm> |
12 | #include <atomic> |
13 | #include <mutex> |
14 | |
15 | #include <cuda.h> |
16 | #include <cuda_runtime.h> |
17 | |
18 | #include "gloo/algorithm.h" |
19 | #include "gloo/config.h" |
20 | #include "gloo/common/logging.h" |
21 | |
22 | // Check that configuration header was properly generated |
23 | #if !GLOO_USE_CUDA |
24 | #error "Expected GLOO_USE_CUDA to be defined" |
25 | #endif |
26 | |
27 | namespace gloo { |
28 | |
// Sentinel stream handle meaning "no stream specified". Passing it to
// CudaStream makes the instance create (and own) a fresh stream.
// Defined in the accompanying translation unit (extern).
extern const cudaStream_t kStreamNotSet;

// Sentinel device ordinal marking a default-constructed, invalid
// pointer wrapper. Defined in the accompanying translation unit.
extern const int kInvalidDeviceId;

// Forward declarations
template<typename T>
class CudaDevicePointer;
template <typename T>
class CudaHostPointer;
template<typename T>
class CudaReductionFunction;
39 | |
// Process-wide serialization point for CUDA and NCCL operations.
// The mutex object itself is owned by the caller; this class merely
// stores a pointer to it. The pointer is held in a std::atomic so
// that getMutex/setMutex are safe to call from concurrent threads.
class CudaShared {
 public:
  // Get the mutex used to synchronize CUDA and NCCL operations
  static std::mutex& getMutex() {
    // Explicit seq_cst load of the stored pointer, then dereference.
    return *mutex_.load();
  }

  // Set the mutex used to synchronize CUDA and NCCL operations
  static void setMutex(std::mutex* m) {
    mutex_.store(m);
  }

 private:
  // Atomically published pointer; the pointee is defined/owned elsewhere.
  static std::atomic<std::mutex*> mutex_;
};
55 | |
// Wraps a CUDA stream together with an event on a single device.
// Asynchronous operations issued through this class are queued on
// stream_; event_ is used to track their completion (see record/wait).
// Member function definitions live out-of-line in the implementation file.
class CudaStream {
 public:
  // Constructs a stream wrapper for the given device. If no stream
  // handle is passed (kStreamNotSet), a new stream owned by this
  // instance is created; otherwise the caller's stream is borrowed.
  explicit CudaStream(int deviceId, cudaStream_t stream = kStreamNotSet);

  // Move constructor
  CudaStream(CudaStream&& other) noexcept;

  // noexcept(false): teardown performs CUDA API calls whose failure is
  // surfaced as an exception rather than silently swallowed.
  ~CudaStream() noexcept(false);

  // Returns the raw stream handle (same as getStream()).
  cudaStream_t operator*() const {
    return stream_;
  }

  // Returns the ordinal of the GPU this stream belongs to.
  int getDeviceID() const {
    return deviceId_;
  }

  // Returns the raw stream handle.
  cudaStream_t getStream() const {
    return stream_;
  }

  // Returns the event used to track completion of queued work.
  cudaEvent_t getEvent() const {
    return event_;
  }

  // Asynchronously copy `src` into `dst` on this stream, for every
  // combination of host/device source and destination. Defined
  // out-of-line for each pointer-type pairing.
  template <typename T>
  void copyAsync(CudaHostPointer<T>& dst, CudaDevicePointer<T>& src);
  template <typename T>
  void copyAsync(CudaHostPointer<T>& dst, CudaHostPointer<T>& src);
  template <typename T>
  void copyAsync(CudaDevicePointer<T>& dst, CudaDevicePointer<T>& src);
  template <typename T>
  void copyAsync(CudaDevicePointer<T>& dst, CudaHostPointer<T>& src);

  // NOTE(review): presumably records event_ on stream_ to mark the
  // work queued so far — defined out-of-line; confirm in the .cu file.
  void record();

  // Blocks until previously recorded work has completed (defined
  // out-of-line).
  void wait();

 protected:
  // Instances cannot be copied or copy-assigned
  CudaStream(const CudaStream&) = delete;
  CudaStream& operator=(const CudaStream&) = delete;

  // GPU that the stream belongs to.
  int deviceId_;

  // Operations are always run on a stream such that they can run
  // concurrently with other operations. The stream can be specified
  // at construction time if one has already been created outside this
  // library. If it is not specified, a new stream is created.
  cudaStream_t stream_;
  cudaEvent_t event_;

  // If no stream is specified at construction time, this class
  // allocates a new stream for operations against CUDA pointers.
  // Record whether or not this instance is a stream's owner so that
  // it is destroyed when this instance is destructed.
  bool streamOwner_;
};
115 | |
// Non-copyable, movable wrapper around a typed device-memory pointer,
// remembering the element count and the owning GPU. Ownership of the
// underlying allocation is tracked so the destructor only frees memory
// this instance allocated (via alloc()).
template<typename T>
class CudaDevicePointer {
 public:
  // Allocates `count` elements of device memory owned by the returned
  // instance (defined out-of-line).
  static CudaDevicePointer<T> alloc(size_t count);

  // Wraps an existing device pointer without taking ownership
  // (defined out-of-line).
  static CudaDevicePointer<T> create(T* ptr, size_t count);

  // Creates a non-owning alias of another wrapper's pointer/count.
  static CudaDevicePointer<T> create(const CudaDevicePointer<T>& ptr) {
    return CudaDevicePointer<T>::create(*ptr, ptr.getCount());
  }

  CudaDevicePointer(CudaDevicePointer&&) noexcept;
  // noexcept(false): destruction may perform CUDA calls whose failure
  // is raised as an exception.
  ~CudaDevicePointer() noexcept(false);

  // Default constructor creates invalid instance
  CudaDevicePointer()
      : device_(nullptr),
        count_(0),
        owner_(false),
        deviceId_(kInvalidDeviceId) {}

  // Move assignment operator
  CudaDevicePointer& operator=(CudaDevicePointer&&);

  // Equality compares the raw pointer and element count only;
  // ownership and device id are ignored.
  bool operator ==(const CudaDevicePointer<T>& other) const {
    return device_ == other.device_ && count_ == other.count_;
  }

  // Returns the raw device pointer.
  T* operator*() const {
    return device_;
  }

  // NOTE(review): the returned reference aliases device memory and
  // must not be dereferenced on the host.
  T& operator[](size_t index) const {
    return device_[index];
  }

  // Number of elements. NOTE(review): narrows size_t count_ to int;
  // counts above INT_MAX would truncate.
  int getCount() const {
    return count_;
  }

  // Ordinal of the GPU the memory lives on.
  int getDeviceID() const {
    return deviceId_;
  }

  // Create range into this pointer. The result is a non-owning view;
  // the enforce guards against ranges past the end of this buffer.
  CudaDevicePointer<T> range(size_t offset, size_t count) const {
    GLOO_ENFORCE_LE(offset + count, count_);
    return CudaDevicePointer<T>(device_ + offset, count, false);
  }

 protected:
  // Instances must be created through static functions
  CudaDevicePointer(T* ptr, size_t count, bool owner);

  // Instances cannot be copied or copy-assigned
  CudaDevicePointer(const CudaDevicePointer&) = delete;
  CudaDevicePointer& operator=(const CudaDevicePointer&) = delete;

  // Device pointer
  T* device_;

  // Number of T elements in device pointer
  size_t count_;

  // Record whether or not this instance is this pointer's owner so
  // that it is freed when this instance is destructed.
  bool owner_ = false;

  // GPU that the device pointer lives on
  int deviceId_;
};
187 | |
// Non-copyable, movable wrapper around a typed host-memory pointer and
// its element count; the host-side counterpart of CudaDevicePointer.
// Ownership is tracked so the destructor only frees memory obtained
// through alloc().
template <typename T>
class CudaHostPointer {
 public:
  // Allocates `count` elements of host memory owned by the returned
  // instance (defined out-of-line).
  static CudaHostPointer<T> alloc(size_t count);

  // Wraps an existing host pointer without taking ownership.
  static CudaHostPointer<T> create(T* ptr, size_t count) {
    return CudaHostPointer<T>(ptr, count, false);
  }

  CudaHostPointer(CudaHostPointer&&) noexcept;
  // noexcept(false): destruction may perform CUDA calls whose failure
  // is raised as an exception.
  ~CudaHostPointer() noexcept(false);

  // Default constructor creates invalid instance
  CudaHostPointer() : CudaHostPointer(nullptr, 0, false) {}

  // Move assignment operator
  CudaHostPointer& operator=(CudaHostPointer&&);

  // Equality compares the raw pointer and element count only;
  // ownership is ignored.
  bool operator ==(const CudaHostPointer<T>& other) const {
    return host_ == other.host_ && count_ == other.count_;
  }

  // Returns the raw host pointer.
  T* operator*() const {
    return host_;
  }

  // Element access on the host side.
  T& operator[](size_t index) const {
    return host_[index];
  }

  // Number of elements. NOTE(review): narrows size_t count_ to int;
  // counts above INT_MAX would truncate.
  int getCount() const {
    return count_;
  }

  // Create range into this pointer. The result is a non-owning view;
  // the enforce guards against ranges past the end of this buffer.
  CudaHostPointer<T> range(size_t offset, size_t count) const {
    GLOO_ENFORCE_LE(offset + count, count_);
    return CudaHostPointer<T>(host_ + offset, count, false);
  }

 protected:
  // Instances must be created through static functions
  CudaHostPointer(T* ptr, size_t count, bool owner);

  // Instances cannot be copied or copy-assigned
  CudaHostPointer(const CudaHostPointer&) = delete;
  CudaHostPointer& operator=(const CudaHostPointer&) = delete;

  // Host pointer
  T* host_;

  // Number of T elements in host pointer
  size_t count_;

  // Record whether or not this instance is this pointer's owner so
  // that it is freed when this instance is destructed.
  bool owner_ = false;
};
246 | |
// LocalOp adapter that asynchronously copies a [offset, offset+count)
// sub-range from `src` to `dst` on the given stream. Src/Dst may be
// any pointer wrapper exposing range() and accepted by
// CudaStream::copyAsync (host or device pointers).
template <typename T, typename Src, typename Dst>
class CudaLocalMemcpy : public LocalOp<T> {
 public:
  // Captures non-owning range views into src and dst at construction;
  // the referenced stream must outlive this op.
  CudaLocalMemcpy(
      CudaStream& stream,
      Src& src,
      Dst& dst,
      size_t offset,
      size_t count)
      : stream_(stream),
        src_(src.range(offset, count)),
        dst_(dst.range(offset, count)) {}

  // Queues the copy on stream_; returns without waiting for completion.
  virtual void runAsync() {
    stream_.copyAsync(dst_, src_);
  }

  // Blocks until work queued on stream_ has completed.
  virtual void wait() {
    stream_.wait();
  }

 protected:
  CudaStream& stream_;
  Src src_;
  Dst dst_;
};
273 | |
// Device-side elementwise reduction primitives, each computing
// x[i] = op(x[i], y[i]) for i in [0, n) asynchronously on `stream`.
// Implemented in the corresponding .cu file for each supported T.
template <typename T>
void cudaSum(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaProduct(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaMax(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaMin(T* x, const T* y, size_t n, const cudaStream_t stream);
285 | |
// Pairs a host-side and a device-side implementation of one reduction
// (sum/product/min/max) so callers can reduce either host or device
// buffers through a single object. Singleton instances per type are
// exposed through the static members below.
template <typename T>
class CudaReductionFunction {
  // Device variant: runs asynchronously on the supplied stream.
  using DeviceFunction =
    void(T*, const T*, size_t n, const cudaStream_t stream);
  // Host variant: runs synchronously on the CPU.
  using HostFunction =
    void(T*, const T*, size_t n);

 public:
  // Per-operation singletons; defined at the bottom of this header.
  static const CudaReductionFunction<T>* sum;
  static const CudaReductionFunction<T>* product;
  static const CudaReductionFunction<T>* min;
  static const CudaReductionFunction<T>* max;

  CudaReductionFunction(
      ReductionType type,
      DeviceFunction* deviceFn,
      HostFunction* hostFn)
      : type_(type),
        deviceFn_(deviceFn),
        hostFn_(hostFn) {}

  // Which reduction this object implements (SUM/PRODUCT/MIN/MAX).
  ReductionType type() const {
    return type_;
  }

  // Backwards compatibility.
  // Can be removed when all CUDA algorithms use CudaHostPointer.
  void call(T* x, const T* y, size_t n) const {
    hostFn_(x, y, n);
  }

  // Host-buffer reduction: dst[i] = op(dst[i], src[i]) for i in [0, n),
  // executed synchronously on the CPU after draining the stream.
  void call(
      CudaHostPointer<T>& dst,
      const CudaHostPointer<T>& src,
      size_t n,
      CudaStream& stream) const {
    // The specified stream may still have a memcpy in flight to
    // either of the CudaHostPointers. Wait on the stream to make sure
    // they have finished before executing the reduction function.
    stream.wait();
    hostFn_(*dst, *src, n);
  }

  // Device-buffer reduction: queues the device kernel on the stream
  // and records the stream's event so callers can wait on completion.
  void call(
      CudaDevicePointer<T>& dst,
      const CudaDevicePointer<T>& src,
      size_t n,
      CudaStream& stream) const {
    deviceFn_(*dst, *src, n, *stream);
    stream.record();
  }

 protected:
  const ReductionType type_;
  DeviceFunction* deviceFn_;
  HostFunction* hostFn_;

  friend class CudaDevicePointer<T>;
  friend class CudaHostPointer<T>;
};
346 | |
// Definitions of the per-type reduction singletons, pairing each device
// kernel (cudaSum etc.) with its host counterpart (::gloo::sum etc.).
// Heap-allocated once per template instantiation and intentionally
// never freed: they live for the duration of the process.
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::sum =
  new CudaReductionFunction<T>(
    SUM, &::gloo::cudaSum<T>, &::gloo::sum<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::product =
  new CudaReductionFunction<T>(
    PRODUCT, &::gloo::cudaProduct<T>, &::gloo::product<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::min =
  new CudaReductionFunction<T>(
    MIN, &::gloo::cudaMin<T>, &::gloo::min<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::max =
  new CudaReductionFunction<T>(
    MAX, &::gloo::cudaMax<T>, &::gloo::max<T>);
363 | |
364 | } // namespace gloo |
365 | |