1/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_
17#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_
18
19#include <string>
20
21#include "tensorflow/core/distributed_runtime/worker_env.h"
22#include "tensorflow/core/framework/rendezvous.h"
23#include "tensorflow/core/lib/core/status.h"
24#include "tensorflow/core/platform/types.h"
25
26namespace tensorflow {
27
28class WorkerSession;
29
// RemoteRendezvous instances follow a two-part initialization: first the
// objects are constructed, and at some later point they are initialized.
// Clients of the RendezvousMgrInterface must guarantee that Initialize is
// eventually called on every RemoteRendezvous returned to them.
34//
// A partially initialized RemoteRendezvous must respect the Rendezvous
// interface (i.e. Send() must never block); however, implementations are not
// expected to actually perform the underlying operations until after
// Initialize has been called on the RemoteRendezvous.
39class RemoteRendezvous : public Rendezvous {
40 public:
41 // Fully construct the RemoteRendezvous.
42 virtual Status Initialize(WorkerSession* session) = 0;
43
44 // In remote eager, set current instance as context default rendezvous which
45 // will be used for eager op-by-op execution.
46 virtual void SetRemoteEagerContextDefault() = 0;
47 // In remote eager, get if current instance is context default rendezvous.
48 virtual bool IsRemoteEagerContextDefault() = 0;
49
50 protected:
51 bool is_cross_process() override { return true; }
52};
53
54// RendezvousMgr keeps track of a set of local rendezvous instances.
55// All tensors sent by this worker are buffered in a RendezvousMgr
// until the tensor is received. Each globally unique "step_id"
57// corresponds to one local rendezvous instance managed by a
58// RendezvousMgr.
59//
// E.g.,
//   Rendezvous* rendez = worker_env->rendezvous_mgr->Find(0x8935);
//   fork execution of a graph executor using "rendez" on thread 1;
//   fork execution of another graph executor using "rendez" on thread 2;
//   ...
//   join threads 1 and 2;
//
// In the example above, the executions on threads 1 and 2 communicate with
// each other by send/recv operations through "rendez".
69//
// Tensors sent and received through a rendezvous managed by this
// RendezvousMgr must have keys generated by Rendezvous::CreateKey.
72class RendezvousMgrInterface {
73 public:
74 RendezvousMgrInterface() {}
75 virtual ~RendezvousMgrInterface() {}
76
77 // Returns Rendezvous supporting send and recv among workers in the
78 // "step_id". The caller takes ownership of one reference on the
79 // returned Rendezvous instance.
80 //
81 // Note: the caller must guarantee to eventually call Initialize on the
82 // returned RemoteRendezvous
83 virtual RemoteRendezvous* Find(int64_t step_id) = 0;
84
85 // Finds the local rendezvous instance for the "step_id". Runs
86 // "done" when the tensor for "key" is produced or an error occurs.
87 //
88 // This method is used by the rpc handler of RecvTensor.
89 virtual void RecvLocalAsync(int64_t step_id,
90 const Rendezvous::ParsedKey& parsed,
91 Rendezvous::DoneCallback done) = 0;
92
93 // Synchronous wrapper for RecvLocalAsync.
94 virtual Status RecvLocal(int64_t step_id, const Rendezvous::ParsedKey& parsed,
95 Tensor* val, bool* is_dead) = 0;
96
97 // Removes rendezvous for "step_id".
98 //
99 // TODO(zhifengc): Have a background thread in worker that
100 // periodically calls CleanupAll().
101 virtual void Cleanup(int64_t step_id) = 0;
102
103 // Remove all rendezvous instances owned by the rendezvous_mgr.
104 virtual void CleanupAll() = 0;
105};
106
107} // end namespace tensorflow
108
109#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_
110