1/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#include <atomic>
17#include <iterator>
18#include <memory>
19#include <vector>
20
21#include "tensorflow/core/common_runtime/device_mgr.h"
22#include "tensorflow/core/common_runtime/local_device.h"
23#include "tensorflow/core/framework/device_attributes.pb.h"
24#include "tensorflow/core/lib/core/errors.h"
25#include "tensorflow/core/platform/logging.h"
26#include "tensorflow/core/util/device_name_utils.h"
27
28namespace tensorflow {
29
30DynamicDeviceMgr::DynamicDeviceMgr() : cpu_device_(nullptr) {}
31
32DynamicDeviceMgr::DynamicDeviceMgr(
33 std::vector<std::unique_ptr<Device>> devices) {
34 Status status = AddDevices(std::move(devices));
35 CHECK(status.ok()); // Crash OK
36 mutex_lock l(devices_mu_);
37 // Initialize cpu_device_.
38 for (int i = 0; i < dynamic_devices_.size(); ++i) {
39 auto* d = dynamic_devices_[i].get();
40 if (d->device_type() == DEVICE_CPU && d->parsed_name().id == 0) {
41 cpu_device_ = d;
42 break;
43 }
44 }
45}
46
47DynamicDeviceMgr::~DynamicDeviceMgr() {
48 // Release resources ahead of destroying the device manager as the resource
49 // destructors (e.g. ~IteratorResource) assume devices still exist.
50 mutex_lock l(devices_mu_);
51 for (const auto& d : dynamic_devices_) {
52 // TODO(tf-runtime-team): clear devices' resource mgr in devices'
53 // destructor.
54 d->ClearResourceMgr();
55 }
56}
57
58void DynamicDeviceMgr::ListDeviceAttributes(
59 std::vector<DeviceAttributes>* devices) const {
60 tf_shared_lock l(devices_mu_);
61 devices->reserve(dynamic_devices_.size());
62 for (const auto& d : dynamic_devices_) {
63 devices->emplace_back(d->attributes());
64 }
65}
66
67std::vector<Device*> DynamicDeviceMgr::ListDevices() const {
68 tf_shared_lock l(devices_mu_);
69 std::vector<Device*> devices;
70 devices.reserve(dynamic_devices_.size());
71 for (const auto& d : dynamic_devices_) {
72 devices.emplace_back(d.get());
73 }
74 return devices;
75}
76
77string DynamicDeviceMgr::DebugString() const {
78 string out;
79 tf_shared_lock l(devices_mu_);
80 for (const auto& d : dynamic_devices_) {
81 strings::StrAppend(&out, d->name(), "\n");
82 }
83 return out;
84}
85
86string DynamicDeviceMgr::DeviceMappingString() const {
87 string out;
88 tf_shared_lock l(devices_mu_);
89 for (const auto& d : dynamic_devices_) {
90 if (!d->attributes().physical_device_desc().empty()) {
91 strings::StrAppend(&out, d->name(), " -> ",
92 d->attributes().physical_device_desc(), "\n");
93 }
94 }
95 return out;
96}
97
98Status DynamicDeviceMgr::LookupDevice(StringPiece name, Device** device) const {
99 tf_shared_lock l(devices_mu_);
100 auto iter = device_map_.find(string(name));
101 if (iter == device_map_.end()) {
102 std::vector<StringPiece> device_names;
103 for (auto&& itr : device_map_) {
104 device_names.push_back(itr.first);
105 }
106 VLOG(1) << "Unknown device: " << name
107 << " all devices: " << absl::StrJoin(device_names, ", ");
108 return errors::InvalidArgument(name, " unknown device.");
109 }
110 *device = iter->second;
111 return OkStatus();
112}
113
114bool DynamicDeviceMgr::ContainsDevice(int64_t device_incarnation) const {
115 tf_shared_lock l(devices_mu_);
116 return device_incarnation_set_.contains(device_incarnation);
117}
118
119void DynamicDeviceMgr::ClearContainers(
120 gtl::ArraySlice<string> containers) const {
121 Status s;
122 tf_shared_lock l(devices_mu_);
123 for (const auto& d : dynamic_devices_) {
124 if (containers.empty()) {
125 s.Update(d->resource_manager()->Cleanup(
126 d->resource_manager()->default_container()));
127 } else {
128 for (const string& c : containers) {
129 s.Update(d->resource_manager()->Cleanup(c));
130 }
131 }
132 if (!s.ok()) {
133 LOG(WARNING) << s;
134 }
135 }
136}
137
138int DynamicDeviceMgr::NumDeviceType(const string& type) const {
139 tf_shared_lock l(devices_mu_);
140 auto iter = device_type_counts_.find(type);
141 if (iter != device_type_counts_.end()) return iter->second;
142 return 0;
143}
144
145Status DynamicDeviceMgr::AddDevices(
146 std::vector<std::unique_ptr<Device>> devices) {
147 mutex_lock l(devices_mu_);
148 for (auto& d : devices) {
149 if (device_map_.find(d->name()) != device_map_.end()) {
150 return errors::InvalidArgument(
151 "Trying to add device ", d->name(),
152 " to manager but its name conflicts with an existing device.");
153 }
154 // Register under the (1) full name and (2) canonical name.
155 for (const string& name :
156 DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
157 device_map_[name] = d.get();
158 }
159 // Register under the (3) local name and (4) legacy local name.
160 for (const string& name :
161 DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
162 device_map_[name] = d.get();
163 }
164 device_type_counts_[d->device_type()]++;
165 device_incarnation_set_.insert(d->attributes().incarnation());
166 dynamic_devices_.push_back(std::move(d));
167 }
168 return OkStatus();
169}
170
171Status DynamicDeviceMgr::RemoveDevices(const std::vector<Device*>& devices) {
172 mutex_lock l(devices_mu_);
173
174 for (const auto& d : devices) {
175 if (d == cpu_device_) {
176 TF_RETURN_IF_ERROR(
177 errors::InvalidArgument("Can not remove HostCPU device ", d->name()));
178 }
179 int i = 0;
180 for (; i < dynamic_devices_.size(); ++i) {
181 if (d == dynamic_devices_[i].get()) break;
182 }
183 if (i >= dynamic_devices_.size()) {
184 return errors::InvalidArgument("Unknown device ", d->name());
185 }
186 }
187
188 for (const auto& d : devices) {
189 // Clear registration of (1) full name and (2) canonical name
190 for (const string& name :
191 DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
192 device_map_.erase(name);
193 }
194 // Clear registration of (3) local name and (4) legacy local name
195 for (const string& name :
196 DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
197 device_map_.erase(name);
198 }
199 device_type_counts_[d->device_type()]--;
200 device_incarnation_set_.erase(d->attributes().incarnation());
201
202 int i = 0;
203 for (; i < dynamic_devices_.size(); ++i) {
204 if (d == dynamic_devices_[i].get()) break;
205 }
206 // There shouldn't be unknown devices at this point.
207 CHECK(i < dynamic_devices_.size()); // Crash OK
208 stale_devices_.add(std::move(dynamic_devices_[i]));
209 dynamic_devices_.erase(dynamic_devices_.begin() + i);
210 }
211 return OkStatus();
212}
213
214Status DynamicDeviceMgr::RemoveDevicesByName(
215 const std::vector<string>& device_names) {
216 std::vector<Device*> devices_to_remove;
217 for (const string& name : device_names) {
218 Device* device;
219 TF_RETURN_IF_ERROR(LookupDevice(name, &device));
220 devices_to_remove.emplace_back(device);
221 }
222 return RemoveDevices(devices_to_remove);
223}
224
225Device* DynamicDeviceMgr::HostCPU() const {
226 Device* device = cpu_device_.load(std::memory_order_relaxed);
227
228 // Host CPU device can't be removed, so if we found valid device once, we
229 // do not need to check that it is still in the device list.
230 if (device != nullptr) return device;
231
232 mutex_lock l(devices_mu_);
233 for (int i = 0; i < dynamic_devices_.size(); ++i) {
234 Device* d = dynamic_devices_[i].get();
235 if (d->device_type() == DEVICE_CPU && d->parsed_name().id == 0) {
236 cpu_device_ = d;
237 break;
238 }
239 }
240
241 return cpu_device_.load(std::memory_order_relaxed);
242}
243
244} // namespace tensorflow
245