1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef GLOW_BACKENDS_DEVICEMANAGER_H
17#define GLOW_BACKENDS_DEVICEMANAGER_H
18
#include "glow/Backend/CompiledFunction.h"
#include "glow/Base/DeviceTensorTransferManager.h"
#include "glow/ExecutionContext/ExecutionContext.h"
#include "glow/Graph/Graph.h"
#include "glow/Runtime/RuntimeTypes.h"
#include "glow/Runtime/StatsExporter.h"
#include "glow/Support/Error.h"

#include <atomic>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <vector>
32
33namespace glow {
34namespace runtime {
35
36/// Callback signalling success/failure of evicting a function from a Device.
37using EvictFunctionCBTy = std::function<void(std::string functionName, Error)>;
38
39/// Callback signalling success/failure of loading a Module onto a device.
40using ReadyCBTy = std::function<void(const Module *, Error)>;
41
42/// Map of Function name -> CompiledFunction, used when loading a network onto a
43/// device.
44using FunctionMapTy = std::map<std::string, CompiledFunction *>;
45
46/// Interface managing a specific instance of a device.
47class DeviceManager : public DeviceTensorTransferManager {
48protected:
49 /// Configuration object for the device.
50 DeviceConfig config_;
51
52 /// Lock to protect allocations_ from being accessed concurrently. This can
53 /// occur when multiple networks are added concurrently.
54 std::mutex bufferLock_;
55
56 /// String for logging available memory for the device.
57 const std::string availableMemoryKey_{"glow.device.available_memory.device"};
58
59 /// String for logging used memory for the device.
60 const std::string usedMemoryKey_{"glow.device.used_memory.device"};
61
62 /// Maximum available memory on the device.
63 std::atomic<uint64_t> maxMemoryBytes_{0};
64
65 /// Amount of memory used by all models.
66 std::atomic<uint64_t> usedMemoryBytes_{0};
67
68 /// Keeps the stats exporter registry object alive till destructor.
69 std::shared_ptr<StatsExporterRegistry> statsExporterRegistry_;
70
71 /// Set of all buffer allocations, these should all be freed when the device
72 /// manager is destroyed.
73 std::set<void *> allocations_;
74
75 /// Helper method to export memory usage counters.
76 void exportMemoryCounters() {
77 statsExporterRegistry_->setCounter(availableMemoryKey_,
78 maxMemoryBytes_ - usedMemoryBytes_);
79 statsExporterRegistry_->setCounter(usedMemoryKey_, usedMemoryBytes_);
80 }
81
82 /// Helper method to zero out memory counters, used when a device is freed.
83 void zeroMemoryCounters() {
84 statsExporterRegistry_->setCounter(availableMemoryKey_, 0);
85 statsExporterRegistry_->setCounter(usedMemoryKey_, 0);
86 }
87
88public:
89 DeviceManager(const DeviceConfig &config)
90 : config_(config),
91 availableMemoryKey_("glow.device.available_memory.device" +
92 std::to_string(config_.deviceID)),
93 usedMemoryKey_("glow.device.used_memory.device" +
94 std::to_string(config_.deviceID)),
95 maxMemoryBytes_(config_.getDeviceMemory(2000000000)),
96 statsExporterRegistry_(StatsExporterRegistry::Stats()) {}
97
98 virtual ~DeviceManager() {
99 // Free all allocated buffers.
100 for (auto &buffer : allocations_) {
101 alignedFree(buffer);
102 }
103 }
104
105 /// Create a device manager based on the device config \p config.
106 static DeviceManager *createDeviceManager(const DeviceConfig &config);
107
108 /// Query the system for the number of devices of a specified kind.
109 static unsigned numDevices(llvm::StringRef backendName);
110
111 /// Device discovery for a given backend kind. Returns a vector of configs for
112 /// all found devices.
113 static std::vector<std::unique_ptr<runtime::DeviceConfig>>
114 generateDeviceConfigs(llvm::StringRef backendName, bool scanDevices = false);
115
116 /// Initialize the device.
117 virtual Error init() { return Error::success(); }
118
119 /// \returns a pointer to a buffer of size \p size allocated on the host, that
120 /// satistfies any requirements for pinning/alignment for transferring to/from
121 /// the device. The lifetime of this buffer is managed by the device manager.
122 virtual void *allocateDeviceIOBuffer(dim_t size) {
123 std::lock_guard<std::mutex> lock(bufferLock_);
124 void *buffer = alignedAlloc(size, TensorAlignment);
125 allocations_.insert(buffer);
126 return buffer;
127 };
128
129 /// Free all allocated buffers associated with /p PH.
130 virtual void freeAllocatedDeviceIOBuffer(void *buffer) {
131 std::lock_guard<std::mutex> lock(bufferLock_);
132 auto it = allocations_.find(buffer);
133 if (it != allocations_.end()) {
134 alignedFree(buffer);
135 allocations_.erase(it);
136 }
137 }
138
139 /// Load the provided module into the device, readyCB will be called when
140 /// ready to use.
141 /// \p functions contains the list of functions to load, keyed by their name
142 /// (as used in runFunction).
143 virtual void addNetwork(const Module *module, FunctionMapTy functions,
144 ReadyCBTy readyCB) = 0;
145
146 /// Remove (and delete) the provided function, freeing
147 /// up space on the device. \p evictCB will be called when the operation
148 /// is completed or attempted and failed.
149 virtual void evictNetwork(
150 std::string functionName,
151 EvictFunctionCBTy evictCB = [](std::string, Error) {}) = 0;
152
153 /// Execute the named Function in an already provided network on the device.
154 /// functionName must match the name of a function already added.
155 /// The ExecutionContext's PlaceholderBindings should have all Placeholders
156 /// allocated. resultCB will be called with the ExecutionContext containing
157 /// output tensors filled, and any generated TraceEvents.
158 virtual runtime::RunIdentifierTy
159 runFunction(std::string functionName,
160 std::unique_ptr<ExecutionContext> context,
161 runtime::ResultCBTy resultCB) = 0;
162
163 /// Copies the contents of Tensor \p T to the device resource allocated to
164 /// Placeholder \p PH. once finished calls \p resultCB with the result of the
165 /// operation.
166 virtual void
167 transferStaticPlaceholderToDevice(Placeholder *PH, Tensor *T,
168 std::function<void(Error)> resultCB) {
169 resultCB(MAKE_ERR(ErrorValue::ErrorCode::RUNTIME_ERROR,
170 "Unsupported feature, cannot copy Placeholder."));
171 };
172
173 /// Stops execution and shuts down the Device.
174 virtual Error stop(bool block = true) { return Error::success(); };
175
176 /// \returns the name of backend that powers this Device.
177 llvm::StringRef getBackendName() { return config_.backendName; }
178
179 /// \returns a string with \p name in parameters.
180 llvm::StringRef getParamByName(llvm::StringRef name) const {
181 auto it = config_.parameters.find(name);
182 if (it != config_.parameters.end()) {
183 return it->second;
184 }
185 return "";
186 }
187
188 /// \returns the maximum memory (in bytes) available on the device.
189 virtual uint64_t getMaximumMemory() const = 0;
190
191 /// \returns the currently available memory (in bytes) available on the
192 /// device, for provisioning new networks.
193 virtual uint64_t getAvailableMemory() const = 0;
194
195 /// \returns true if we expect a Module with the estimated constant size will
196 /// fit on the device.
197 virtual bool isMemoryAvailable(uint64_t estimate) const = 0;
198
199 /// \returns the DeviceConfig which initialized this device.
200 const DeviceConfig &getDeviceConfig() { return config_; }
201
202 /// \returns the DeviceInfo for this device containing peak limits for
203 /// compute and bandwidths (used in partitioning).
204 virtual DeviceInfo getDeviceInfo() const { return DeviceInfo(); }
205
206 /// Copies the contents of \p tensor from the host to the \p location
207 /// address on this device. Updates the tensor residency info.
208 virtual void
209 transferToDevice(Tensor &tensor, void *locationContext,
210 std::function<void(Error)> resultCB = GLOW_DRT_DEFAULT_CB) {
211 DCHECK("Not Implemented");
212 resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED,
213 "Direct transfer not supported on this device"));
214 }
215
216 /// Copies the device buffer associated with \p tensor to the host.
217 /// The tensor must be resident on this device. If \p release is true,
218 /// frees the device memory. Updates the tensor residency info.
219 virtual void transferFromDevice(
220 Tensor &tensor, bool release = true,
221 std::function<void(Error)> resultCB = GLOW_DRT_DEFAULT_CB) {
222 DCHECK("Not Implemented");
223 resultCB(MAKE_ERR(ErrorValue::ErrorCode::DEVICE_FEATURE_NOT_SUPPORTED,
224 "Direct transfer not supported on this device"));
225 }
226
227 /// Releases the device buffer associated with \p tensor.
228 virtual bool releaseDeviceTensor(void *locationContext) {
229 DCHECK("Not Implemented");
230 return false;
231 }
232
233 /// Starts device tracing \returns Error if fails.
234 virtual Error startDeviceTrace(TraceContext *traceContext) {
235 return Error::success();
236 }
237 /// Stops device tracing \returns Error if fails.
238 virtual Error stopDeviceTrace(TraceContext *traceContext) {
239 return Error::success();
240 }
241};
242
243} // namespace runtime
244} // namespace glow
245
246#endif // GLOW_BACKENDS_DEVICEMANAGER_H
247