1 | /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #include "tensorflow/core/distributed_runtime/device_resolver_distributed.h" |
16 | |
17 | #include "tensorflow/core/common_runtime/device_mgr.h" |
18 | #include "tensorflow/core/framework/device_attributes.pb.h" |
19 | #include "tensorflow/core/platform/errors.h" |
20 | |
21 | namespace tensorflow { |
22 | |
23 | DeviceResolverDistributed::DeviceResolverDistributed(const DeviceMgr* dev_mgr) { |
24 | mutex_lock l(mu_); |
25 | for (Device* device : dev_mgr->ListDevices()) { |
26 | attr_table_[device->name()] = device->attributes(); |
27 | } |
28 | } |
29 | |
30 | Status DeviceResolverDistributed::GetDeviceAttributes( |
31 | const string& device, DeviceAttributes* attributes) { |
32 | mutex_lock l(mu_); |
33 | auto it = attr_table_.find(device); |
34 | if (it == attr_table_.end()) { |
35 | return errors::NotFound(device, " not found" ); |
36 | } |
37 | *attributes = it->second; |
38 | return OkStatus(); |
39 | } |
40 | |
41 | Status DeviceResolverDistributed::GetAllDeviceAttributes( |
42 | const string& task, std::vector<DeviceAttributes>* attributes) { |
43 | mutex_lock l(mu_); |
44 | attributes->clear(); |
45 | for (const auto& it : attr_table_) { |
46 | const string& device_name = it.first; |
47 | if (DeviceNameUtils::IsSameAddressSpace(task, device_name)) { |
48 | attributes->push_back(it.second); |
49 | } |
50 | } |
51 | if (attributes->empty()) { |
52 | return errors::NotFound(task, " not found in the cache" ); |
53 | } |
54 | return OkStatus(); |
55 | } |
56 | |
57 | Status DeviceResolverDistributed::UpdateDeviceAttributes( |
58 | const std::vector<DeviceAttributes>& attributes) { |
59 | mutex_lock l(mu_); |
60 | for (const DeviceAttributes& attr : attributes) { |
61 | auto item = attr_table_.insert({attr.name(), attr}); |
62 | auto it = item.first; |
63 | bool success = item.second; |
64 | // Returns error if the device already exists in the cache and has a |
65 | // different incarnation. |
66 | if (!success && it->second.incarnation() != attr.incarnation()) { |
67 | return errors::FailedPrecondition( |
68 | attr.name(), |
69 | "exists in cache with a different incarnation. " |
70 | "This usually means the remote worker has restarted" ); |
71 | } |
72 | } |
73 | return OkStatus(); |
74 | } |
75 | |
76 | } // namespace tensorflow |
77 | |