1 | /** |
2 | * Copyright (c) 2017-present, Facebook, Inc. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under the BSD-style license found in the |
6 | * LICENSE file in the root directory of this source tree. |
7 | */ |
8 | |
9 | #pragma once |
10 | |
#include <climits>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
15 | |
16 | #ifdef __linux__ |
17 | #include "gloo/common/linux.h" |
18 | #endif |
19 | #include "gloo/common/logging.h" |
20 | #include "gloo/cuda.h" |
21 | #include "gloo/transport/device.h" |
22 | |
23 | namespace gloo { |
24 | |
// Evaluates a CUDA runtime call and, if the result is anything other
// than cudaSuccess, fails via GLOO_ENFORCE_EQ with the call site
// (__FILE__:__LINE__) and the runtime's human-readable error string.
// Wrap every cudaXxx() runtime call in this macro.
#define CUDA_CHECK(condition) \
  do { \
    cudaError_t error = condition; \
    GLOO_ENFORCE_EQ( \
      error, \
      cudaSuccess, \
      "Error at: ", \
      __FILE__, \
      ":", \
      __LINE__, \
      ": ", \
      cudaGetErrorString(error)); \
  } while (0)
38 | |
// Returns the index of the CUDA device currently active on the
// calling thread. Fails (via CUDA_CHECK) if the runtime call errors.
inline int getCurrentGPUID() {
  int device = 0;
  CUDA_CHECK(cudaGetDevice(&device));
  return device;
}
44 | |
// Returns the index of the device that owns the memory behind `ptr`,
// as reported by the CUDA runtime's pointer introspection.
// Fails (via CUDA_CHECK) if the runtime rejects the query.
inline int getGPUIDForPointer(const void* ptr) {
  cudaPointerAttributes attributes;
  CUDA_CHECK(cudaPointerGetAttributes(&attributes, ptr));
  return attributes.device;
}
50 | |
// Returns the number of CUDA devices visible to this process.
// Fails (via CUDA_CHECK) if the runtime call errors.
inline int getDeviceCount() {
  int numDevices = 0;
  CUDA_CHECK(cudaGetDeviceCount(&numDevices));
  return numDevices;
}
56 | |
// Returns the PCI bus ID string of the given CUDA device.
// Defined out-of-line; it returns a reference, so the implementation
// presumably caches the strings — verify against the definition.
const std::string& getCudaPCIBusID(int device);
58 | |
// Returns the index into `ptrs` of a device pointer whose GPU is
// closest (by PCI distance, Linux only) to the transport device
// `dev`. Ties between equally-close pointers are broken uniformly at
// random. On non-Linux platforms every pointer is treated as
// equidistant, so the result is a uniformly random index.
//
// `ptrs` must be non-empty (enforced); previously an empty vector led
// to `rand() % 0`, which is undefined behavior.
//
// NOTE: uses rand(); call srand() if reproducible selection matters.
template<typename T>
int findCudaDevicePointerClosestToDevice(
    std::vector<CudaDevicePointer<T> >& ptrs,
    std::shared_ptr<transport::Device>& dev) {
  GLOO_ENFORCE_GT(ptrs.size(), 0, "ptrs must not be empty");
  // Compute the distance from every pointer's GPU to the device and
  // track the minimum plus how many pointers attain it.
  auto devBusID = dev->getPCIBusID();
  std::vector<int> distance(ptrs.size());
  int minDistance = INT_MAX;
  int minDistanceCount = 0;
  for (size_t i = 0; i < ptrs.size(); i++) {
#ifdef __linux__
    auto cudaBusID = getCudaPCIBusID(ptrs[i].getDeviceID());
    distance[i] = pciDistance(devBusID, cudaBusID);
#else
    // PCI topology introspection is only implemented on Linux.
    distance[i] = 0;
#endif
    if (distance[i] < minDistance) {
      minDistance = distance[i];
      minDistanceCount = 1;
    } else if (distance[i] == minDistance) {
      minDistanceCount++;
    }
  }
  // Choose a random pointer among those at minimum distance.
  auto minOffset = rand() % minDistanceCount;
  for (size_t i = 0; i < ptrs.size(); i++) {
    if (distance[i] == minDistance) {
      if (minOffset == 0) {
        return static_cast<int>(i);
      }
      minOffset--;
    }
  }
  // Unreachable: at least one entry attains minDistance.
  return 0;
}
96 | |
// RAII helper that records the CUDA device active at construction
// time and restores it on destruction. Typically a cudaSetDevice()
// call happens in between (see CudaDeviceScope below).
class CudaDeviceGuard {
 public:
  CudaDeviceGuard() : previous_(getCurrentGPUID()) {
  }

  // noexcept(false): CUDA_CHECK may throw if restoring the device
  // fails. Beware that throwing from a destructor during stack
  // unwinding terminates the program.
  ~CudaDeviceGuard() noexcept(false) {
    CUDA_CHECK(cudaSetDevice(previous_));
  }

 private:
  // Device that was active when the guard was constructed.
  int previous_;
};
109 | |
// Scoped device switch: makes `device` the active CUDA device for the
// lifetime of this object, then restores the previously active device
// on destruction. Destruction may throw (see CUDA_CHECK) if restoring
// the device fails.
class CudaDeviceScope {
 public:
  explicit CudaDeviceScope(int device) : previous_(getCurrentGPUID()) {
    CUDA_CHECK(cudaSetDevice(device));
  }

  ~CudaDeviceScope() noexcept(false) {
    CUDA_CHECK(cudaSetDevice(previous_));
  }

 private:
  // Device that was active before this scope was entered.
  int previous_;
};
119 | |
// Managed chunk of GPU memory.
// Convenience class used for tests and benchmarks.
// Move-only; the constructor/destructor are defined out-of-line, so
// the allocation/free details live in the corresponding source file.
template<typename T>
class CudaMemory {
 public:
  // Constructs storage for `elements` values of T.
  // (Presumably allocates on the current device — see definition.)
  explicit CudaMemory(size_t elements);
  CudaMemory(CudaMemory&&) noexcept;
  // noexcept(false): failure during cleanup is reported by throwing
  // rather than being silently ignored.
  ~CudaMemory() noexcept(false);

  // Returns the raw device pointer.
  T* operator*() const {
    return ptr_;
  }

  const size_t elements;  // number of T values
  const size_t bytes;     // size of the allocation in bytes

 protected:
  // Non-copyable: ownership of the allocation is move-only.
  CudaMemory(const CudaMemory&) = delete;
  CudaMemory& operator=(const CudaMemory&) = delete;

  int device_;  // device associated with the allocation (set out-of-line)
  T* ptr_;      // raw device pointer
};
143 | |
// Container class for a set of per-device streams.
// On construction, creates one CudaStream per visible CUDA device
// (passing the device index to the CudaStream constructor);
// operator[] returns the raw cudaStream_t for device i.
class CudaDeviceStreams {
 public:
  CudaDeviceStreams() {
    const int numDevices = getDeviceCount();
    streams_.reserve(numDevices);
    for (auto i = 0; i < numDevices; i++) {
      streams_.emplace_back(i);
    }
  }

  // Returns the stream for device `i`. Bounds are enforced
  // explicitly; previously a negative index was only rejected through
  // the implicit signed->unsigned conversion in the upper-bound check.
  cudaStream_t operator[](const int i) {
    GLOO_ENFORCE_GE(i, 0);
    GLOO_ENFORCE_LT(i, static_cast<int>(streams_.size()));
    return *streams_[i];
  }

 protected:
  CudaDeviceStreams(const CudaDeviceStreams&) = delete;
  CudaDeviceStreams& operator=(const CudaDeviceStreams&) = delete;

  std::vector<CudaStream> streams_;
};
165 | |
166 | } // namespace gloo |
167 | |