1 | #pragma once |
2 | |
#include <cstdint>
#include <functional>
#include <ostream>
#include <tuple>
#include <utility>

#include <cuda_runtime_api.h>

#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
12 | |
13 | /* |
14 | * Stream pool note. |
15 | * |
16 | * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams |
17 | * are backed by cuStreams, but they use several pools to minimize the costs |
18 | * associated with creating, retaining, and destroying cuStreams. |
19 | * |
20 | * There are three pools per device, and a device's pools are lazily created. |
21 | * |
22 | * The first pool contains only the default stream. When the default stream |
23 | * is requested it's returned. |
24 | * |
25 | * The second pool is the "low priority" or "default priority" streams. In |
26 | * HIP builds there is no distinction between streams in this pool and streams |
27 | * in the third pool (below). There are 32 of these streams per device, and |
28 | * when a stream is requested one of these streams is returned round-robin. |
29 | * That is, the first stream requested is at index 0, the second at index 1... |
30 | * to index 31, then index 0 again. |
31 | * |
32 | * This means that if 33 low priority streams are requested, the first and |
33 | * last streams requested are actually the same stream (under the covers) |
34 | * and kernels enqueued on them cannot run concurrently. |
35 | * |
36 | * The third pool is the "high priority" streams. The third pool acts like |
37 | * the second pool except the streams are created with a higher priority. |
38 | * |
39 | * These pools suggest that stream users should prefer many short-lived streams, |
40 | * as the cost of acquiring and releasing streams is effectively zero. If |
41 | * many longer-lived streams are required in performance critical scenarios |
42 | * then the functionality here may need to be extended to allow, for example, |
43 | * "reserving" a subset of the pool so that other streams do not accidentally |
44 | * overlap the performance critical streams. |
45 | * |
46 | * Note: although the notion of "current stream for device" is thread local |
47 | * (every OS thread has a separate current stream, as one might expect), |
48 | * the stream pool is global across all threads; stream 0 is always stream 0 |
49 | * no matter which thread you use it on. Multiple threads can synchronize |
50 | * on the same stream. Although the CUDA documentation is not very clear |
51 | * on the matter, streams are thread safe; e.g., it is safe to enqueue |
52 | * a kernel on the same stream from two different threads. |
53 | */ |
54 | |
55 | namespace c10 { |
56 | namespace cuda { |
57 | |
58 | // Value object representing a CUDA stream. This is just a wrapper |
59 | // around c10::Stream, but it comes with a little extra CUDA-specific |
60 | // functionality (conversion to cudaStream_t), and a guarantee that |
61 | // the wrapped c10::Stream really is a CUDA stream. |
62 | class C10_CUDA_API CUDAStream { |
63 | public: |
64 | enum Unchecked { UNCHECKED }; |
65 | |
66 | /// Construct a CUDAStream from a Stream. This construction is checked, |
67 | /// and will raise an error if the Stream is not, in fact, a CUDA stream. |
68 | explicit CUDAStream(Stream stream) : stream_(stream) { |
69 | TORCH_CHECK(stream_.device_type() == DeviceType::CUDA); |
70 | } |
71 | |
72 | /// Construct a CUDAStream from a Stream with no error checking. |
73 | /// This constructor uses the "named" constructor idiom, and can |
74 | /// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream) |
75 | explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {} |
76 | |
77 | bool operator==(const CUDAStream& other) const noexcept { |
78 | return unwrap() == other.unwrap(); |
79 | } |
80 | |
81 | bool operator!=(const CUDAStream& other) const noexcept { |
82 | return unwrap() != other.unwrap(); |
83 | } |
84 | |
85 | /// Implicit conversion to cudaStream_t. |
86 | operator cudaStream_t() const { |
87 | return stream(); |
88 | } |
89 | |
90 | /// Implicit conversion to Stream (a.k.a., forget that the stream is a |
91 | /// CUDA stream). |
92 | operator Stream() const { |
93 | return unwrap(); |
94 | } |
95 | |
96 | /// Used to avoid baking in device type explicitly to Python-side API. |
97 | DeviceType device_type() const { |
98 | return DeviceType::CUDA; |
99 | } |
100 | |
101 | /// Get the CUDA device index that this stream is associated with. |
102 | DeviceIndex device_index() const { |
103 | return stream_.device_index(); |
104 | } |
105 | |
106 | /// Get the full Device that this stream is associated with. The Device |
107 | /// is guaranteed to be a CUDA device. |
108 | Device device() const { |
109 | return Device(DeviceType::CUDA, device_index()); |
110 | } |
111 | |
112 | /// Return the stream ID corresponding to this particular stream. |
113 | StreamId id() const { |
114 | return stream_.id(); |
115 | } |
116 | |
117 | bool query() const { |
118 | DeviceGuard guard{stream_.device()}; |
119 | cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream())); |
120 | |
121 | if (err == cudaSuccess) { |
122 | return true; |
123 | } else if (err != cudaErrorNotReady) { |
124 | C10_CUDA_CHECK(err); |
125 | } else { |
126 | // ignore and clear the error if not ready |
127 | (void)cudaGetLastError(); |
128 | } |
129 | |
130 | return false; |
131 | } |
132 | |
133 | void synchronize() const { |
134 | DeviceGuard guard{stream_.device()}; |
135 | c10::cuda::stream_synchronize(stream()); |
136 | } |
137 | |
138 | int priority() const { |
139 | DeviceGuard guard{stream_.device()}; |
140 | int priority = 0; |
141 | C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority)); |
142 | return priority; |
143 | } |
144 | |
145 | /// Explicit conversion to cudaStream_t. |
146 | cudaStream_t stream() const; |
147 | |
148 | /// Explicit conversion to Stream. |
149 | Stream unwrap() const { |
150 | return stream_; |
151 | } |
152 | |
153 | /// Reversibly pack a CUDAStream into a struct representation. |
154 | /// Previously the stream's data was packed into a single int64_t, |
155 | /// as it was assumed the fields would not require more than |
156 | /// 64 bits of storage in total. |
157 | /// See https://github.com/pytorch/pytorch/issues/75854 |
158 | /// for more information regarding newer platforms that may violate |
159 | /// this assumption. |
160 | /// |
161 | /// The CUDAStream can be unpacked using unpack(). |
162 | struct c10::StreamData3 pack3() const { |
163 | return stream_.pack3(); |
164 | } |
165 | |
166 | // Unpack a CUDAStream from the 3 fields generated by pack(). |
167 | static CUDAStream unpack3( |
168 | StreamId stream_id, |
169 | DeviceIndex device_index, |
170 | DeviceType device_type) { |
171 | return CUDAStream(Stream::unpack3(stream_id, device_index, device_type)); |
172 | } |
173 | |
174 | static std::tuple<int, int> priority_range() { |
175 | // Note: this returns the range of priority **supported by PyTorch**, not |
176 | // the range of priority **supported by CUDA**. The former is a subset of |
177 | // the latter. Currently PyTorch only supports 0 and -1, which are "low" and |
178 | // "high" priority. |
179 | int least_priority, greatest_priority; |
180 | C10_CUDA_CHECK( |
181 | cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); |
182 | TORCH_INTERNAL_ASSERT( |
183 | least_priority >= 0, "Unexpected CUDA stream priority range" ); |
184 | TORCH_INTERNAL_ASSERT( |
185 | greatest_priority <= -1, "Unexpected CUDA stream priority range" ); |
186 | return std::make_tuple(0, -1); |
187 | } |
188 | |
189 | // Deleted for now; use CUDAEvent::block instead |
190 | // void synchronize_with(const CUDAEvent& event) const; |
191 | |
192 | private: |
193 | Stream stream_; |
194 | }; |
195 | |
196 | /** |
197 | * Get a new stream from the CUDA stream pool. You can think of this |
198 | * as "creating" a new stream, but no such creation actually happens; |
199 | * instead, streams are preallocated from the pool and returned in a |
200 | * round-robin fashion. |
201 | * |
202 | * You can request a stream from the high priority pool by setting |
203 | * isHighPriority to true, or a stream for a specific device by setting device |
204 | * (defaulting to the current CUDA stream.) |
205 | */ |
206 | C10_API CUDAStream |
207 | getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1); |
208 | |
209 | /** |
210 | * Get a CUDAStream from a externally allocated one. |
211 | * |
212 | * This is mainly for interoperability with different libraries where we |
213 | * want to operate on a non-torch allocated stream for data exchange or similar |
214 | * purposes |
215 | */ |
216 | C10_API CUDAStream |
217 | getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index); |
218 | |
219 | /** |
220 | * Get the default CUDA stream, for the passed CUDA device, or for the |
221 | * current device if no device index is passed. The default stream is |
222 | * where most computation occurs when you aren't explicitly using |
223 | * streams. |
224 | */ |
225 | C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1); |
226 | |
227 | /** |
228 | * Get the current CUDA stream, for the passed CUDA device, or for the |
229 | * current device if no device index is passed. The current CUDA stream |
230 | * will usually be the default CUDA stream for the device, but it may |
231 | * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard' |
232 | * or 'CUDAStreamGuard'. |
233 | */ |
234 | C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1); |
235 | |
236 | /** |
237 | * Set the current stream on the device of the passed in stream to be |
238 | * the passed in stream. Yes, you read that right: this function |
239 | * has *nothing* to do with the current device: it toggles the current |
240 | * stream of the device of the passed stream. |
241 | * |
242 | * Confused? Avoid using this function; prefer using 'CUDAStreamGuard' instead |
243 | * (which will switch both your current device and current stream in the way you |
244 | * expect, and reset it back to its original state afterwards). |
245 | */ |
246 | C10_API void setCurrentCUDAStream(CUDAStream stream); |
247 | |
248 | C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s); |
249 | |
250 | } // namespace cuda |
251 | } // namespace c10 |
252 | |
253 | namespace std { |
254 | template <> |
255 | struct hash<c10::cuda::CUDAStream> { |
256 | size_t operator()(c10::cuda::CUDAStream s) const noexcept { |
257 | return std::hash<c10::Stream>{}(s.unwrap()); |
258 | } |
259 | }; |
260 | } // namespace std |
261 | |