/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_
#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_

#include <memory>
#include <vector>

#include "tensorflow/core/platform/types.h"

namespace tsl {
class Env;
namespace thread {
class ThreadPool;
}  // namespace thread
}  // namespace tsl

namespace tensorflow {
using Env = tsl::Env;

namespace thread {
using tsl::thread::ThreadPool;
}  // namespace thread

class CollectiveExecutorMgrInterface;
class Device;
class DeviceMgr;
class RendezvousMgrInterface;
class SessionMgr;

// The worker environment class, which holds a bag of pointers to
// per-worker singletons.
//
// WorkerEnv does not own its raw member pointers; the one exception is
// collective_executor_mgr, which it owns via std::unique_ptr.
struct WorkerEnv {
  Env* env = nullptr;

  // session_mgr encapsulates state for each session.
  SessionMgr* session_mgr = nullptr;

  // The local devices of this worker. Devices are owned by the device_mgr.
  //
  // REQUIRES: !local_devices.empty().
  std::vector<Device*> local_devices;

  // In large-scale distributed training, singleton components (e.g.
  // Rendezvous) can become system bottlenecks. This field allows such
  // components to be sharded; see the routing sketch below. The value
  // scales with the number of tasks in the cluster and is always at
  // least 1.
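  //
  // For example, a sharded component might route per-step work as follows
  // (an illustrative sketch only; no component is required to use this
  // exact scheme):
  //   int shard = step_id % worker_env->experimental_num_shards;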
  int experimental_num_shards = 1;

  // device_mgr manages local devices (cpu and gpu). The WorkerService
  // is the network interface for managed devices.
  //
  // Note: where possible, use the device_mgr associated with your session
  // rather than this one. This device_mgr does not support
  // ClusterSpec-propagated sessions.
  DeviceMgr* device_mgr = nullptr;

  // A set of rendezvous keyed by step ids.
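  //
  // For example, a caller would typically look up the rendezvous for a
  // running step roughly like this (an illustrative sketch; error handling
  // and ref-counting omitted):
  //   RemoteRendezvous* rendezvous = worker_env->rendezvous_mgr->Find(step_id);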
  RendezvousMgrInterface* rendezvous_mgr = nullptr;

  // Generates per-step CollectiveExecutors and has access to utilities
  // supporting collective operations.
  std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr;

  // A pool of threads for scheduling compute work.
  thread::ThreadPool* compute_pool = nullptr;
};
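
// Example: a minimal sketch of how a server might populate a WorkerEnv.
// `MyRendezvousMgr` is a hypothetical RendezvousMgrInterface implementation,
// and in real code the raw pointers below are owned by the surrounding
// server object:
//
//   WorkerEnv worker_env;
//   worker_env.env = Env::Default();
//   worker_env.device_mgr = device_mgr;  // DeviceMgr* owned by the server.
//   worker_env.local_devices = device_mgr->ListDevices();
//   worker_env.rendezvous_mgr = new MyRendezvousMgr(&worker_env);
//   worker_env.compute_pool = new thread::ThreadPool(
//       worker_env.env, "Compute", /*num_threads=*/8);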

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_