1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_ |
17 | #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_ |
18 | |
#include <memory>
#include <vector>

#include "tensorflow/core/platform/types.h"
22 | |
// Forward declarations (instead of full includes) to keep this header
// lightweight: WorkerEnv only stores pointers to these types, so their
// complete definitions are not needed here.
namespace tsl {
class Env;
namespace thread {
class ThreadPool;
}  // namespace thread
}  // namespace tsl
namespace tensorflow {
// Alias the tsl types into the tensorflow namespace so existing callers can
// keep referring to tensorflow::Env and tensorflow::thread::ThreadPool.
using Env = tsl::Env;

namespace thread {
using tsl::thread::ThreadPool;
}  // namespace thread

class CollectiveExecutorMgrInterface;
class Device;
class DeviceMgr;
class RendezvousMgrInterface;
class SessionMgr;
41 | |
// The worker environment class, which holds a bag of pointers to
// per-worker singletons.
//
// WorkerEnv does not own its raw member pointers; the one exception is
// collective_executor_mgr, which is held (and owned) via std::unique_ptr.
struct WorkerEnv {
  // Platform environment (filesystem, clock, threads). Not owned.
  Env* env = nullptr;

  // session_mgr encapsulates state for each session. Not owned.
  SessionMgr* session_mgr = nullptr;

  // The local devices of this worker. Devices are owned by the device_mgr.
  //
  // REQUIRES: !local_devices.empty().
  std::vector<Device*> local_devices;

  // In large scaled distributed training, many singleton components (e.g.
  // Rendezvous) can become the bottleneck of the system. This field allows
  // us to shard the singleton components. This number is expected to scale
  // up with the number of tasks in this cluster. Defaults to 1 (unsharded);
  // it is never expected to be less than 1.
  int experimental_num_shards = 1;

  // device_mgr manages local devices (cpu and gpu). The WorkerService
  // is the network interface for managed devices. Not owned.
  //
  // Note: Please use the device_mgr associated with your session if appropriate
  // instead of this one. Using this device_mgr does not support ClusterSpec
  // propagated sessions.
  DeviceMgr* device_mgr = nullptr;

  // A set of rendezvous keyed by step ids. Not owned.
  RendezvousMgrInterface* rendezvous_mgr = nullptr;

  // Generates per-step CollectiveExecutors and has access to utilities
  // supporting collective operations. Owned by this struct.
  std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr;

  // A pool of threads for scheduling compute work. Not owned.
  thread::ThreadPool* compute_pool = nullptr;
};
81 | |
82 | } // end namespace tensorflow |
83 | |
84 | #endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_ |
85 | |