1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #ifndef GLOW_BACKENDS_DEVICEMANAGER_H |
17 | #define GLOW_BACKENDS_DEVICEMANAGER_H |
18 | |
#include "glow/Backend/CompiledFunction.h"
#include "glow/Base/DeviceTensorTransferManager.h"
#include "glow/ExecutionContext/ExecutionContext.h"
#include "glow/Graph/Graph.h"
#include "glow/Runtime/RuntimeTypes.h"
#include "glow/Runtime/StatsExporter.h"
#include "glow/Support/Error.h"

#include <atomic>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <vector>
32 | |
33 | namespace glow { |
34 | namespace runtime { |
35 | |
36 | /// Callback signalling success/failure of evicting a function from a Device. |
37 | using EvictFunctionCBTy = std::function<void(std::string functionName, Error)>; |
38 | |
39 | /// Callback signalling success/failure of loading a Module onto a device. |
40 | using ReadyCBTy = std::function<void(const Module *, Error)>; |
41 | |
42 | /// Map of Function name -> CompiledFunction, used when loading a network onto a |
43 | /// device. |
44 | using FunctionMapTy = std::map<std::string, CompiledFunction *>; |
45 | |
46 | /// Interface managing a specific instance of a device. |
47 | class DeviceManager : public DeviceTensorTransferManager { |
48 | protected: |
49 | /// Configuration object for the device. |
50 | DeviceConfig config_; |
51 | |
52 | /// Lock to protect allocations_ from being accessed concurrently. This can |
53 | /// occur when multiple networks are added concurrently. |
54 | std::mutex bufferLock_; |
55 | |
56 | /// String for logging available memory for the device. |
57 | const std::string availableMemoryKey_{"glow.device.available_memory.device" }; |
58 | |
59 | /// String for logging used memory for the device. |
60 | const std::string usedMemoryKey_{"glow.device.used_memory.device" }; |
61 | |
62 | /// Maximum available memory on the device. |
63 | std::atomic<uint64_t> maxMemoryBytes_{0}; |
64 | |
65 | /// Amount of memory used by all models. |
66 | std::atomic<uint64_t> usedMemoryBytes_{0}; |
67 | |
68 | /// Keeps the stats exporter registry object alive till destructor. |
69 | std::shared_ptr<StatsExporterRegistry> statsExporterRegistry_; |
70 | |
71 | /// Set of all buffer allocations, these should all be freed when the device |
72 | /// manager is destroyed. |
73 | std::set<void *> allocations_; |
74 | |
75 | /// Helper method to export memory usage counters. |
76 | void exportMemoryCounters() { |
77 | statsExporterRegistry_->setCounter(availableMemoryKey_, |
78 | maxMemoryBytes_ - usedMemoryBytes_); |
79 | statsExporterRegistry_->setCounter(usedMemoryKey_, usedMemoryBytes_); |
80 | } |
81 | |
82 | /// Helper method to zero out memory counters, used when a device is freed. |
83 | void zeroMemoryCounters() { |
84 | statsExporterRegistry_->setCounter(availableMemoryKey_, 0); |
85 | statsExporterRegistry_->setCounter(usedMemoryKey_, 0); |
86 | } |
87 | |
88 | public: |
89 | DeviceManager(const DeviceConfig &config) |
90 | : config_(config), |
91 | availableMemoryKey_("glow.device.available_memory.device" + |
92 | std::to_string(config_.deviceID)), |
93 | usedMemoryKey_("glow.device.used_memory.device" + |
94 | std::to_string(config_.deviceID)), |
95 | maxMemoryBytes_(config_.getDeviceMemory(2000000000)), |
96 | statsExporterRegistry_(StatsExporterRegistry::Stats()) {} |
97 | |
98 | virtual ~DeviceManager() { |
99 | // Free all allocated buffers. |
100 | for (auto &buffer : allocations_) { |
101 | alignedFree(buffer); |
102 | } |
103 | } |
104 | |
105 | /// Create a device manager based on the device config \p config. |
106 | static DeviceManager *createDeviceManager(const DeviceConfig &config); |
107 | |
108 | /// Query the system for the number of devices of a specified kind. |
109 | static unsigned numDevices(llvm::StringRef backendName); |
110 | |
111 | /// Device discovery for a given backend kind. Returns a vector of configs for |
112 | /// all found devices. |
113 | static std::vector<std::unique_ptr<runtime::DeviceConfig>> |
114 | generateDeviceConfigs(llvm::StringRef backendName, bool scanDevices = false); |
115 | |
116 | /// Initialize the device. |
117 | virtual Error init() { return Error::success(); } |
118 | |
119 | /// \returns a pointer to a buffer of size \p size allocated on the host, that |
120 | /// satistfies any requirements for pinning/alignment for transferring to/from |
121 | /// the device. The lifetime of this buffer is managed by the device manager. |
122 | virtual void *allocateDeviceIOBuffer(dim_t size) { |
123 | std::lock_guard<std::mutex> lock(bufferLock_); |
124 | void *buffer = alignedAlloc(size, TensorAlignment); |
125 | allocations_.insert(buffer); |
126 | return buffer; |
127 | }; |
128 | |
129 | /// Free all allocated buffers associated with /p PH. |
130 | virtual void freeAllocatedDeviceIOBuffer(void *buffer) { |
131 | std::lock_guard<std::mutex> lock(bufferLock_); |
132 | auto it = allocations_.find(buffer); |
133 | if (it != allocations_.end()) { |
134 | alignedFree(buffer); |
135 | allocations_.erase(it); |
136 | } |
137 | } |
138 | |
139 | /// Load the provided module into the device, readyCB will be called when |
140 | /// ready to use. |
141 | /// \p functions contains the list of functions to load, keyed by their name |
142 | /// (as used in runFunction). |
143 | virtual void addNetwork(const Module *module, FunctionMapTy functions, |
144 | ReadyCBTy readyCB) = 0; |
145 | |
146 | /// Remove (and delete) the provided function, freeing |
147 | /// up space on the device. \p evictCB will be called when the operation |
148 | /// is completed or attempted and failed. |
149 | virtual void evictNetwork( |
150 | std::string functionName, |
151 | EvictFunctionCBTy evictCB = [](std::string, Error) {}) = 0; |
152 | |
153 | /// Execute the named Function in an already provided network on the device. |
154 | /// functionName must match the name of a function already added. |
155 | /// The ExecutionContext's PlaceholderBindings should have all Placeholders |
156 | /// allocated. resultCB will be called with the ExecutionContext containing |
157 | /// output tensors filled, and any generated TraceEvents. |
158 | virtual runtime::RunIdentifierTy |
159 | runFunction(std::string functionName, |
160 | std::unique_ptr<ExecutionContext> context, |
161 | runtime::ResultCBTy resultCB) = 0; |
162 | |
163 | /// Copies the contents of Tensor \p T to the device resource allocated to |
164 | /// Placeholder \p PH. once finished calls \p resultCB with the result of the |
165 | /// operation. |
166 | virtual void |
167 | transferStaticPlaceholderToDevice(Placeholder *PH, Tensor *T, |
168 | std::function<void(Error)> resultCB) { |
169 | resultCB(MAKE_ERR(ErrorValue::ErrorCode::RUNTIME_ERROR, |
170 | "Unsupported feature, cannot copy Placeholder." )); |
171 | }; |
172 | |
173 | /// Stops execution and shuts down the Device. |
174 | virtual Error stop(bool block = true) { return Error::success(); }; |
175 | |
176 | /// \returns the name of backend that powers this Device. |
177 | llvm::StringRef getBackendName() { return config_.backendName; } |
178 | |
179 | /// \returns a string with \p name in parameters. |
180 | llvm::StringRef getParamByName(llvm::StringRef name) const { |
181 | auto it = config_.parameters.find(name); |
182 | if (it != config_.parameters.end()) { |
183 | return it->second; |
184 | } |
185 | return "" ; |
186 | } |
187 | |
188 | /// \returns the maximum memory (in bytes) available on the device. |
189 | virtual uint64_t getMaximumMemory() const = 0; |
190 | |
191 | /// \returns the currently available memory (in bytes) available on the |
192 | /// device, for provisioning new networks. |
193 | virtual uint64_t getAvailableMemory() const = 0; |
194 | |
195 | /// \returns true if we expect a Module with the estimated constant size will |
196 | /// fit on the device. |
197 | virtual bool isMemoryAvailable(uint64_t estimate) const = 0; |
198 | |
199 | /// \returns the DeviceConfig which initialized this device. |
200 | const DeviceConfig &getDeviceConfig() { return config_; } |
201 | |
202 | /// \returns the DeviceInfo for this device containing peak limits for |
203 | /// compute and bandwidths (used in partitioning). |
204 | virtual DeviceInfo getDeviceInfo() const { return DeviceInfo(); } |
205 | |
206 | /// Copies the contents of \p tensor from the host to the \p location |
207 | /// address on this device. Updates the tensor residency info. |
208 | virtual void |
209 | transferToDevice(Tensor &tensor, void *locationContext, |
210 | std::function<void(Error)> resultCB = GLOW_DRT_DEFAULT_CB) { |
211 | DCHECK("Not Implemented" ); |
212 | resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED, |
213 | "Direct transfer not supported on this device" )); |
214 | } |
215 | |
216 | /// Copies the device buffer associated with \p tensor to the host. |
217 | /// The tensor must be resident on this device. If \p release is true, |
218 | /// frees the device memory. Updates the tensor residency info. |
219 | virtual void transferFromDevice( |
220 | Tensor &tensor, bool release = true, |
221 | std::function<void(Error)> resultCB = GLOW_DRT_DEFAULT_CB) { |
222 | DCHECK("Not Implemented" ); |
223 | resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED, |
224 | "Direct transfer not supported on this device" )); |
225 | } |
226 | |
227 | /// Releases the device buffer associated with \p tensor. |
228 | virtual bool releaseDeviceTensor(void *locationContext) { |
229 | DCHECK("Not Implemented" ); |
230 | return false; |
231 | } |
232 | |
233 | /// Starts device tracing \returns Error if fails. |
234 | virtual Error startDeviceTrace(TraceContext *traceContext) { |
235 | return Error::success(); |
236 | } |
237 | /// Stops device tracing \returns Error if fails. |
238 | virtual Error stopDeviceTrace(TraceContext *traceContext) { |
239 | return Error::success(); |
240 | } |
241 | }; |
242 | |
243 | } // namespace runtime |
244 | } // namespace glow |
245 | |
246 | #endif // GLOW_BACKENDS_DEVICEMANAGER_H |
247 | |