1 | /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include <atomic> |
17 | #include <iterator> |
18 | #include <memory> |
19 | #include <vector> |
20 | |
21 | #include "tensorflow/core/common_runtime/device_mgr.h" |
22 | #include "tensorflow/core/common_runtime/local_device.h" |
23 | #include "tensorflow/core/framework/device_attributes.pb.h" |
24 | #include "tensorflow/core/lib/core/errors.h" |
25 | #include "tensorflow/core/platform/logging.h" |
26 | #include "tensorflow/core/util/device_name_utils.h" |
27 | |
28 | namespace tensorflow { |
29 | |
30 | DynamicDeviceMgr::DynamicDeviceMgr() : cpu_device_(nullptr) {} |
31 | |
32 | DynamicDeviceMgr::DynamicDeviceMgr( |
33 | std::vector<std::unique_ptr<Device>> devices) { |
34 | Status status = AddDevices(std::move(devices)); |
35 | CHECK(status.ok()); // Crash OK |
36 | mutex_lock l(devices_mu_); |
37 | // Initialize cpu_device_. |
38 | for (int i = 0; i < dynamic_devices_.size(); ++i) { |
39 | auto* d = dynamic_devices_[i].get(); |
40 | if (d->device_type() == DEVICE_CPU && d->parsed_name().id == 0) { |
41 | cpu_device_ = d; |
42 | break; |
43 | } |
44 | } |
45 | } |
46 | |
47 | DynamicDeviceMgr::~DynamicDeviceMgr() { |
48 | // Release resources ahead of destroying the device manager as the resource |
49 | // destructors (e.g. ~IteratorResource) assume devices still exist. |
50 | mutex_lock l(devices_mu_); |
51 | for (const auto& d : dynamic_devices_) { |
52 | // TODO(tf-runtime-team): clear devices' resource mgr in devices' |
53 | // destructor. |
54 | d->ClearResourceMgr(); |
55 | } |
56 | } |
57 | |
58 | void DynamicDeviceMgr::ListDeviceAttributes( |
59 | std::vector<DeviceAttributes>* devices) const { |
60 | tf_shared_lock l(devices_mu_); |
61 | devices->reserve(dynamic_devices_.size()); |
62 | for (const auto& d : dynamic_devices_) { |
63 | devices->emplace_back(d->attributes()); |
64 | } |
65 | } |
66 | |
67 | std::vector<Device*> DynamicDeviceMgr::ListDevices() const { |
68 | tf_shared_lock l(devices_mu_); |
69 | std::vector<Device*> devices; |
70 | devices.reserve(dynamic_devices_.size()); |
71 | for (const auto& d : dynamic_devices_) { |
72 | devices.emplace_back(d.get()); |
73 | } |
74 | return devices; |
75 | } |
76 | |
77 | string DynamicDeviceMgr::DebugString() const { |
78 | string out; |
79 | tf_shared_lock l(devices_mu_); |
80 | for (const auto& d : dynamic_devices_) { |
81 | strings::StrAppend(&out, d->name(), "\n" ); |
82 | } |
83 | return out; |
84 | } |
85 | |
86 | string DynamicDeviceMgr::DeviceMappingString() const { |
87 | string out; |
88 | tf_shared_lock l(devices_mu_); |
89 | for (const auto& d : dynamic_devices_) { |
90 | if (!d->attributes().physical_device_desc().empty()) { |
91 | strings::StrAppend(&out, d->name(), " -> " , |
92 | d->attributes().physical_device_desc(), "\n" ); |
93 | } |
94 | } |
95 | return out; |
96 | } |
97 | |
98 | Status DynamicDeviceMgr::LookupDevice(StringPiece name, Device** device) const { |
99 | tf_shared_lock l(devices_mu_); |
100 | auto iter = device_map_.find(string(name)); |
101 | if (iter == device_map_.end()) { |
102 | std::vector<StringPiece> device_names; |
103 | for (auto&& itr : device_map_) { |
104 | device_names.push_back(itr.first); |
105 | } |
106 | VLOG(1) << "Unknown device: " << name |
107 | << " all devices: " << absl::StrJoin(device_names, ", " ); |
108 | return errors::InvalidArgument(name, " unknown device." ); |
109 | } |
110 | *device = iter->second; |
111 | return OkStatus(); |
112 | } |
113 | |
114 | bool DynamicDeviceMgr::ContainsDevice(int64_t device_incarnation) const { |
115 | tf_shared_lock l(devices_mu_); |
116 | return device_incarnation_set_.contains(device_incarnation); |
117 | } |
118 | |
119 | void DynamicDeviceMgr::ClearContainers( |
120 | gtl::ArraySlice<string> containers) const { |
121 | Status s; |
122 | tf_shared_lock l(devices_mu_); |
123 | for (const auto& d : dynamic_devices_) { |
124 | if (containers.empty()) { |
125 | s.Update(d->resource_manager()->Cleanup( |
126 | d->resource_manager()->default_container())); |
127 | } else { |
128 | for (const string& c : containers) { |
129 | s.Update(d->resource_manager()->Cleanup(c)); |
130 | } |
131 | } |
132 | if (!s.ok()) { |
133 | LOG(WARNING) << s; |
134 | } |
135 | } |
136 | } |
137 | |
138 | int DynamicDeviceMgr::NumDeviceType(const string& type) const { |
139 | tf_shared_lock l(devices_mu_); |
140 | auto iter = device_type_counts_.find(type); |
141 | if (iter != device_type_counts_.end()) return iter->second; |
142 | return 0; |
143 | } |
144 | |
145 | Status DynamicDeviceMgr::AddDevices( |
146 | std::vector<std::unique_ptr<Device>> devices) { |
147 | mutex_lock l(devices_mu_); |
148 | for (auto& d : devices) { |
149 | if (device_map_.find(d->name()) != device_map_.end()) { |
150 | return errors::InvalidArgument( |
151 | "Trying to add device " , d->name(), |
152 | " to manager but its name conflicts with an existing device." ); |
153 | } |
154 | // Register under the (1) full name and (2) canonical name. |
155 | for (const string& name : |
156 | DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) { |
157 | device_map_[name] = d.get(); |
158 | } |
159 | // Register under the (3) local name and (4) legacy local name. |
160 | for (const string& name : |
161 | DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) { |
162 | device_map_[name] = d.get(); |
163 | } |
164 | device_type_counts_[d->device_type()]++; |
165 | device_incarnation_set_.insert(d->attributes().incarnation()); |
166 | dynamic_devices_.push_back(std::move(d)); |
167 | } |
168 | return OkStatus(); |
169 | } |
170 | |
171 | Status DynamicDeviceMgr::RemoveDevices(const std::vector<Device*>& devices) { |
172 | mutex_lock l(devices_mu_); |
173 | |
174 | for (const auto& d : devices) { |
175 | if (d == cpu_device_) { |
176 | TF_RETURN_IF_ERROR( |
177 | errors::InvalidArgument("Can not remove HostCPU device " , d->name())); |
178 | } |
179 | int i = 0; |
180 | for (; i < dynamic_devices_.size(); ++i) { |
181 | if (d == dynamic_devices_[i].get()) break; |
182 | } |
183 | if (i >= dynamic_devices_.size()) { |
184 | return errors::InvalidArgument("Unknown device " , d->name()); |
185 | } |
186 | } |
187 | |
188 | for (const auto& d : devices) { |
189 | // Clear registration of (1) full name and (2) canonical name |
190 | for (const string& name : |
191 | DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) { |
192 | device_map_.erase(name); |
193 | } |
194 | // Clear registration of (3) local name and (4) legacy local name |
195 | for (const string& name : |
196 | DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) { |
197 | device_map_.erase(name); |
198 | } |
199 | device_type_counts_[d->device_type()]--; |
200 | device_incarnation_set_.erase(d->attributes().incarnation()); |
201 | |
202 | int i = 0; |
203 | for (; i < dynamic_devices_.size(); ++i) { |
204 | if (d == dynamic_devices_[i].get()) break; |
205 | } |
206 | // There shouldn't be unknown devices at this point. |
207 | CHECK(i < dynamic_devices_.size()); // Crash OK |
208 | stale_devices_.add(std::move(dynamic_devices_[i])); |
209 | dynamic_devices_.erase(dynamic_devices_.begin() + i); |
210 | } |
211 | return OkStatus(); |
212 | } |
213 | |
214 | Status DynamicDeviceMgr::RemoveDevicesByName( |
215 | const std::vector<string>& device_names) { |
216 | std::vector<Device*> devices_to_remove; |
217 | for (const string& name : device_names) { |
218 | Device* device; |
219 | TF_RETURN_IF_ERROR(LookupDevice(name, &device)); |
220 | devices_to_remove.emplace_back(device); |
221 | } |
222 | return RemoveDevices(devices_to_remove); |
223 | } |
224 | |
225 | Device* DynamicDeviceMgr::HostCPU() const { |
226 | Device* device = cpu_device_.load(std::memory_order_relaxed); |
227 | |
228 | // Host CPU device can't be removed, so if we found valid device once, we |
229 | // do not need to check that it is still in the device list. |
230 | if (device != nullptr) return device; |
231 | |
232 | mutex_lock l(devices_mu_); |
233 | for (int i = 0; i < dynamic_devices_.size(); ++i) { |
234 | Device* d = dynamic_devices_[i].get(); |
235 | if (d->device_type() == DEVICE_CPU && d->parsed_name().id == 0) { |
236 | cpu_device_ = d; |
237 | break; |
238 | } |
239 | } |
240 | |
241 | return cpu_device_.load(std::memory_order_relaxed); |
242 | } |
243 | |
244 | } // namespace tensorflow |
245 | |