1 | /** |
2 | * Copyright (c) 2017-present, Facebook, Inc. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under the BSD-style license found in the |
6 | * LICENSE file in the root directory of this source tree. |
7 | */ |
8 | |
9 | #pragma once |
10 | |
11 | #include <algorithm> |
12 | #include <atomic> |
13 | #include <mutex> |
14 | |
15 | #include <cuda.h> |
16 | #include <cuda_runtime.h> |
17 | |
18 | #include "gloo/algorithm.h" |
19 | #include "gloo/config.h" |
20 | #include "gloo/common/logging.h" |
21 | |
22 | // Check that configuration header was properly generated |
23 | #if !GLOO_USE_CUDA |
24 | #error "Expected GLOO_USE_CUDA to be defined" |
25 | #endif |
26 | |
27 | namespace gloo { |
28 | |
// Sentinel stream handle meaning "no stream specified". Passing it to
// CudaStream makes the instance create (and own) a fresh stream.
// Defined in the accompanying translation unit (extern).
extern const cudaStream_t kStreamNotSet;

// Sentinel device ordinal marking a default-constructed, invalid
// pointer wrapper. Defined in the accompanying translation unit.
extern const int kInvalidDeviceId;

// Forward declarations
template<typename T>
class CudaDevicePointer;
template <typename T>
class CudaHostPointer;
template<typename T>
class CudaReductionFunction;
39 | |
// Process-wide serialization point for CUDA and NCCL operations.
// The mutex object itself is owned by the caller; this class merely
// stores a pointer to it. The pointer is held in a std::atomic so
// that getMutex/setMutex are safe to call from concurrent threads.
class CudaShared {
 public:
  // Get the mutex used to synchronize CUDA and NCCL operations
  static std::mutex& getMutex() {
    // Explicit seq_cst load of the stored pointer, then dereference.
    return *mutex_.load();
  }

  // Set the mutex used to synchronize CUDA and NCCL operations
  static void setMutex(std::mutex* m) {
    mutex_.store(m);
  }

 private:
  // Atomically published pointer; the pointee is defined/owned elsewhere.
  static std::atomic<std::mutex*> mutex_;
};
55 | |
// Wraps a CUDA stream together with an event on a single device.
// Asynchronous operations issued through this class are queued on
// stream_; event_ is used to track their completion (see record/wait).
// Member function definitions live out-of-line in the implementation file.
class CudaStream {
 public:
  // Constructs a stream wrapper for the given device. If no stream
  // handle is passed (kStreamNotSet), a new stream owned by this
  // instance is created; otherwise the caller's stream is borrowed.
  explicit CudaStream(int deviceId, cudaStream_t stream = kStreamNotSet);

  // Move constructor
  CudaStream(CudaStream&& other) noexcept;

  // noexcept(false): teardown performs CUDA API calls whose failure is
  // surfaced as an exception rather than silently swallowed.
  ~CudaStream() noexcept(false);

  // Returns the raw stream handle (same as getStream()).
  cudaStream_t operator*() const {
    return stream_;
  }

  // Returns the ordinal of the GPU this stream belongs to.
  int getDeviceID() const {
    return deviceId_;
  }

  // Returns the raw stream handle.
  cudaStream_t getStream() const {
    return stream_;
  }

  // Returns the event used to track completion of queued work.
  cudaEvent_t getEvent() const {
    return event_;
  }

  // Asynchronously copy `src` into `dst` on this stream, for every
  // combination of host/device source and destination. Defined
  // out-of-line for each pointer-type pairing.
  template <typename T>
  void copyAsync(CudaHostPointer<T>& dst, CudaDevicePointer<T>& src);
  template <typename T>
  void copyAsync(CudaHostPointer<T>& dst, CudaHostPointer<T>& src);
  template <typename T>
  void copyAsync(CudaDevicePointer<T>& dst, CudaDevicePointer<T>& src);
  template <typename T>
  void copyAsync(CudaDevicePointer<T>& dst, CudaHostPointer<T>& src);

  // NOTE(review): presumably records event_ on stream_ to mark the
  // work queued so far — defined out-of-line; confirm in the .cu file.
  void record();

  // Blocks until previously recorded work has completed (defined
  // out-of-line).
  void wait();

 protected:
  // Instances cannot be copied or copy-assigned
  CudaStream(const CudaStream&) = delete;
  CudaStream& operator=(const CudaStream&) = delete;

  // GPU that the stream belongs to.
  int deviceId_;

  // Operations are always run on a stream such that they can run
  // concurrently with other operations. The stream can be specified
  // at construction time if one has already been created outside this
  // library. If it is not specified, a new stream is created.
  cudaStream_t stream_;
  cudaEvent_t event_;

  // If no stream is specified at construction time, this class
  // allocates a new stream for operations against CUDA pointers.
  // Record whether or not this instance is a stream's owner so that
  // it is destroyed when this instance is destructed.
  bool streamOwner_;
};
115 | |
// Non-copyable, movable wrapper around a typed device-memory pointer,
// remembering the element count and the owning GPU. Ownership of the
// underlying allocation is tracked so the destructor only frees memory
// this instance allocated (via alloc()).
template<typename T>
class CudaDevicePointer {
 public:
  // Allocates `count` elements of device memory owned by the returned
  // instance (defined out-of-line).
  static CudaDevicePointer<T> alloc(size_t count);

  // Wraps an existing device pointer without taking ownership
  // (defined out-of-line).
  static CudaDevicePointer<T> create(T* ptr, size_t count);

  // Creates a non-owning alias of another wrapper's pointer/count.
  static CudaDevicePointer<T> create(const CudaDevicePointer<T>& ptr) {
    return CudaDevicePointer<T>::create(*ptr, ptr.getCount());
  }

  CudaDevicePointer(CudaDevicePointer&&) noexcept;
  // noexcept(false): destruction may perform CUDA calls whose failure
  // is raised as an exception.
  ~CudaDevicePointer() noexcept(false);

  // Default constructor creates invalid instance
  CudaDevicePointer()
      : device_(nullptr),
        count_(0),
        owner_(false),
        deviceId_(kInvalidDeviceId) {}

  // Move assignment operator
  CudaDevicePointer& operator=(CudaDevicePointer&&);

  // Equality compares the raw pointer and element count only;
  // ownership and device id are ignored.
  bool operator ==(const CudaDevicePointer<T>& other) const {
    return device_ == other.device_ && count_ == other.count_;
  }

  // Returns the raw device pointer.
  T* operator*() const {
    return device_;
  }

  // NOTE(review): the returned reference aliases device memory and
  // must not be dereferenced on the host.
  T& operator[](size_t index) const {
    return device_[index];
  }

  // Number of elements. NOTE(review): narrows size_t count_ to int;
  // counts above INT_MAX would truncate.
  int getCount() const {
    return count_;
  }

  // Ordinal of the GPU the memory lives on.
  int getDeviceID() const {
    return deviceId_;
  }

  // Create range into this pointer. The result is a non-owning view;
  // the enforce guards against ranges past the end of this buffer.
  CudaDevicePointer<T> range(size_t offset, size_t count) const {
    GLOO_ENFORCE_LE(offset + count, count_);
    return CudaDevicePointer<T>(device_ + offset, count, false);
  }

 protected:
  // Instances must be created through static functions
  CudaDevicePointer(T* ptr, size_t count, bool owner);

  // Instances cannot be copied or copy-assigned
  CudaDevicePointer(const CudaDevicePointer&) = delete;
  CudaDevicePointer& operator=(const CudaDevicePointer&) = delete;

  // Device pointer
  T* device_;

  // Number of T elements in device pointer
  size_t count_;

  // Record whether or not this instance is this pointer's owner so
  // that it is freed when this instance is destructed.
  bool owner_ = false;

  // GPU that the device pointer lives on
  int deviceId_;
};
187 | |
// Non-copyable, movable wrapper around a typed host-memory pointer and
// its element count; the host-side counterpart of CudaDevicePointer.
// Ownership is tracked so the destructor only frees memory obtained
// through alloc().
template <typename T>
class CudaHostPointer {
 public:
  // Allocates `count` elements of host memory owned by the returned
  // instance (defined out-of-line).
  static CudaHostPointer<T> alloc(size_t count);

  // Wraps an existing host pointer without taking ownership.
  static CudaHostPointer<T> create(T* ptr, size_t count) {
    return CudaHostPointer<T>(ptr, count, false);
  }

  CudaHostPointer(CudaHostPointer&&) noexcept;
  // noexcept(false): destruction may perform CUDA calls whose failure
  // is raised as an exception.
  ~CudaHostPointer() noexcept(false);

  // Default constructor creates invalid instance
  CudaHostPointer() : CudaHostPointer(nullptr, 0, false) {}

  // Move assignment operator
  CudaHostPointer& operator=(CudaHostPointer&&);

  // Equality compares the raw pointer and element count only;
  // ownership is ignored.
  bool operator ==(const CudaHostPointer<T>& other) const {
    return host_ == other.host_ && count_ == other.count_;
  }

  // Returns the raw host pointer.
  T* operator*() const {
    return host_;
  }

  // Element access on the host side.
  T& operator[](size_t index) const {
    return host_[index];
  }

  // Number of elements. NOTE(review): narrows size_t count_ to int;
  // counts above INT_MAX would truncate.
  int getCount() const {
    return count_;
  }

  // Create range into this pointer. The result is a non-owning view;
  // the enforce guards against ranges past the end of this buffer.
  CudaHostPointer<T> range(size_t offset, size_t count) const {
    GLOO_ENFORCE_LE(offset + count, count_);
    return CudaHostPointer<T>(host_ + offset, count, false);
  }

 protected:
  // Instances must be created through static functions
  CudaHostPointer(T* ptr, size_t count, bool owner);

  // Instances cannot be copied or copy-assigned
  CudaHostPointer(const CudaHostPointer&) = delete;
  CudaHostPointer& operator=(const CudaHostPointer&) = delete;

  // Host pointer
  T* host_;

  // Number of T elements in host pointer
  size_t count_;

  // Record whether or not this instance is this pointer's owner so
  // that it is freed when this instance is destructed.
  bool owner_ = false;
};
246 | |
// LocalOp adapter that asynchronously copies a [offset, offset+count)
// sub-range from `src` to `dst` on the given stream. Src/Dst may be
// any pointer wrapper exposing range() and accepted by
// CudaStream::copyAsync (host or device pointers).
template <typename T, typename Src, typename Dst>
class CudaLocalMemcpy : public LocalOp<T> {
 public:
  // Captures non-owning range views into src and dst at construction;
  // the referenced stream must outlive this op.
  CudaLocalMemcpy(
      CudaStream& stream,
      Src& src,
      Dst& dst,
      size_t offset,
      size_t count)
      : stream_(stream),
        src_(src.range(offset, count)),
        dst_(dst.range(offset, count)) {}

  // Queues the copy on stream_; returns without waiting for completion.
  virtual void runAsync() {
    stream_.copyAsync(dst_, src_);
  }

  // Blocks until work queued on stream_ has completed.
  virtual void wait() {
    stream_.wait();
  }

 protected:
  CudaStream& stream_;
  Src src_;
  Dst dst_;
};
273 | |
// Device-side elementwise reduction primitives, each computing
// x[i] = op(x[i], y[i]) for i in [0, n) asynchronously on `stream`.
// Implemented in the corresponding .cu file for each supported T.
template <typename T>
void cudaSum(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaProduct(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaMax(T* x, const T* y, size_t n, const cudaStream_t stream);

template <typename T>
void cudaMin(T* x, const T* y, size_t n, const cudaStream_t stream);
285 | |
// Pairs a host-side and a device-side implementation of one reduction
// (sum/product/min/max) so callers can reduce either host or device
// buffers through a single object. Singleton instances per type are
// exposed through the static members below.
template <typename T>
class CudaReductionFunction {
  // Device variant: runs asynchronously on the supplied stream.
  using DeviceFunction =
    void(T*, const T*, size_t n, const cudaStream_t stream);
  // Host variant: runs synchronously on the CPU.
  using HostFunction =
    void(T*, const T*, size_t n);

 public:
  // Per-operation singletons; defined at the bottom of this header.
  static const CudaReductionFunction<T>* sum;
  static const CudaReductionFunction<T>* product;
  static const CudaReductionFunction<T>* min;
  static const CudaReductionFunction<T>* max;

  CudaReductionFunction(
      ReductionType type,
      DeviceFunction* deviceFn,
      HostFunction* hostFn)
      : type_(type),
        deviceFn_(deviceFn),
        hostFn_(hostFn) {}

  // Which reduction this object implements (SUM/PRODUCT/MIN/MAX).
  ReductionType type() const {
    return type_;
  }

  // Backwards compatibility.
  // Can be removed when all CUDA algorithms use CudaHostPointer.
  void call(T* x, const T* y, size_t n) const {
    hostFn_(x, y, n);
  }

  // Host-buffer reduction: dst[i] = op(dst[i], src[i]) for i in [0, n),
  // executed synchronously on the CPU after draining the stream.
  void call(
      CudaHostPointer<T>& dst,
      const CudaHostPointer<T>& src,
      size_t n,
      CudaStream& stream) const {
    // The specified stream may still have a memcpy in flight to
    // either of the CudaHostPointers. Wait on the stream to make sure
    // they have finished before executing the reduction function.
    stream.wait();
    hostFn_(*dst, *src, n);
  }

  // Device-buffer reduction: queues the device kernel on the stream
  // and records the stream's event so callers can wait on completion.
  void call(
      CudaDevicePointer<T>& dst,
      const CudaDevicePointer<T>& src,
      size_t n,
      CudaStream& stream) const {
    deviceFn_(*dst, *src, n, *stream);
    stream.record();
  }

 protected:
  const ReductionType type_;
  DeviceFunction* deviceFn_;
  HostFunction* hostFn_;

  friend class CudaDevicePointer<T>;
  friend class CudaHostPointer<T>;
};
346 | |
// Definitions of the per-type reduction singletons, pairing each device
// kernel (cudaSum etc.) with its host counterpart (::gloo::sum etc.).
// Heap-allocated once per template instantiation and intentionally
// never freed: they live for the duration of the process.
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::sum =
  new CudaReductionFunction<T>(
    SUM, &::gloo::cudaSum<T>, &::gloo::sum<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::product =
  new CudaReductionFunction<T>(
    PRODUCT, &::gloo::cudaProduct<T>, &::gloo::product<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::min =
  new CudaReductionFunction<T>(
    MIN, &::gloo::cudaMin<T>, &::gloo::min<T>);
template <typename T>
const CudaReductionFunction<T>* CudaReductionFunction<T>::max =
  new CudaReductionFunction<T>(
    MAX, &::gloo::cudaMax<T>, &::gloo::max<T>);
363 | |
364 | } // namespace gloo |
365 | |