#pragma once
#ifdef USE_CUDA
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Logging.h>
#include <cuda_runtime_api.h>
#include <torch/csrc/Export.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
namespace torch {

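// Reclaims sent blocks whose shared reference counters have dropped to zero
// (see CudaIPCSentDataLimbo below). Presumably triggered through the
// FreeMemoryCallback that the CUDA caching allocator invokes when it runs low
// on device memory.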
TORCH_CUDA_CU_API bool CudaIPCCollect();

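// Receiver-side wrapper: owns the shared_ptr produced when an IPC handle is
// opened in this process, so the underlying mapping stays alive for as long
// as this object does.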
struct CudaIPCReceivedData final {
  CudaIPCReceivedData() = default;
  explicit CudaIPCReceivedData(std::shared_ptr<void> shared_ptr)
      : shared_ptr_(std::move(shared_ptr)) {}
  std::shared_ptr<void> shared_ptr_;
};

struct CudaIPCSentData final {
  std::string handle_;
  int64_t offset_;
  int64_t* counter_ptr_; // Reference counter shared memory block
  at::DataPtr original_ptr_; // Original memory allocation
  cudaEvent_t event_; // Event synchronized before cudaEventDestroy
  bool event_sync_required_;
  at::Device device_;

  CudaIPCSentData(
      std::string handle,
      int64_t offset,
      int64_t* counter_ptr,
      at::Device device);
  ~CudaIPCSentData();

  int64_t counter_value();
  std::string handle() {
    return handle_;
  }
  int64_t offset() {
    return offset_;
  }
  void set_original_ptr(at::DataPtr data_ptr) {
    original_ptr_ = std::move(data_ptr);
  }
};

TORCH_CUDA_CU_API at::DataPtr GetNewRefCountedSentData(
    void* data,
    at::Device device);
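// Illustrative producer-side sketch (not the actual call site; the real one
// lives in the storage sharing path):
//
//   // `ptr` is assumed to be a device pointer owned by a CUDA storage.
//   at::DataPtr sent = GetNewRefCountedSentData(ptr, device);
//   // `sent` wraps a CudaIPCSentData whose deleter defers freeing the block
//   // until the receiving process releases its shared reference count.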

namespace {

constexpr int64_t CUDA_IPC_REF_COUNTER_FILE_SIZE = 10000;
constexpr int64_t CUDA_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000;
// It was determined empirically that CUDA (v10.1 and below) has a limit on the
// number of recorded blocking interprocess events, around 22,000. To give
// ourselves plenty of leeway, we picked 1000, which is still enough events to
// share tensors effectively.
constexpr int64_t CUDA_IPC_MAXIMUM_EVENTS_TO_USE = 1000;

// All to-be-deleted data blocks with a non-zero reference counter go here.
struct CudaIPCSentDataLimbo final {
  ~CudaIPCSentDataLimbo();
  bool collect();
  void add(std::unique_ptr<CudaIPCSentData> shared_block);
  uint64_t size();

 private:
  // TODO: Could be changed to a FIFO in order to avoid a full traversal on
  // every collect().
  std::vector<std::unique_ptr<CudaIPCSentData>> shared_blocks_;
  std::mutex limbo_mutex_;
};
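// collect() presumably walks shared_blocks_ and destroys every entry whose
// counter has reached zero; blocks still referenced by a receiver stay in
// limbo, with a warning expected once more than
// CUDA_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO of them accumulate.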

struct CudaIPCRefCountersFile final {
  CudaIPCRefCountersFile(
      std::string handle,
      uint64_t size,
      at::DataPtr data_ptr)
      : next_offset_(0),
        size_(size),
        used_slots_(0),
        handle_(std::move(handle)),
        refcounted_shared_mem_(std::move(data_ptr)) {}

  int64_t* counter_ptr() {
    return static_cast<int64_t*>(refcounted_shared_mem_.get()) + next_offset_;
  }

  void set_counter(uint64_t value) {
    *counter_ptr() = value;
  }

  bool have_offsets() {
    return next_offset_ < size_;
  }

  bool offsets_in_use() {
    return used_slots_ > 0;
  }

  int64_t get_offset() {
    return next_offset_;
  }

  void rotate_offset() {
    next_offset_++;
    used_slots_++;
  }

  void return_offset(uint64_t offset /* unused */) {
    used_slots_--;
  }

  std::string handle() {
    return handle_;
  }

 private:
  uint64_t next_offset_;
  uint64_t size_;
  uint64_t used_slots_;
  std::string handle_;
  at::DataPtr refcounted_shared_mem_;
};
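// Offset bookkeeping implied by the accessors above: each newly shared block
// claims the slot at next_offset_ (counter_ptr() / set_counter(1)), then
// rotate_offset() advances to the next slot. Slots are not reused within one
// file, so once have_offsets() is false a fresh ref-counter file is presumably
// allocated, and this one can be retired when offsets_in_use() returns false.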

} // namespace
} // namespace torch

namespace c10 {
namespace {
class CudaIPCCollectCallback : public FreeMemoryCallback {
 public:
  bool Execute() override {
    return torch::CudaIPCCollect();
  }
};
} // namespace

} // namespace c10
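// Note: the callback above only takes effect once it is registered with the
// allocator's free-memory callback registry (presumably via
// REGISTER_FREE_MEMORY_CALLBACK in the accompanying .cpp); CudaIPCCollect()
// then runs whenever the CUDA caching allocator needs to free device memory.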

#endif