1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_ |
17 | #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_ |
18 | |
19 | #include <string> |
20 | |
21 | #include "tensorflow/core/distributed_runtime/worker_env.h" |
22 | #include "tensorflow/core/framework/rendezvous.h" |
23 | #include "tensorflow/core/lib/core/status.h" |
24 | #include "tensorflow/core/platform/types.h" |
25 | |
26 | namespace tensorflow { |
27 | |
28 | class WorkerSession; |
29 | |
30 | // RemoteRendezvous follow a 2-part initialization. First the objects are |
31 | // constructed. Eventually, they will be initialized. Clients of the |
32 | // RendezvousMgrInterface must guarantee to call Initialize on the returned |
33 | // RemoteRendezvous eventually. |
34 | // |
35 | // Partially initialized RemoteRendezvous must respect the Rendezvous interface |
36 | // (i.e. Send() must never block), however implementations are not expected to |
37 | // actually perform the underlying operations until after the RemoteRendezvous |
38 | // has been Initialize'd. |
39 | class RemoteRendezvous : public Rendezvous { |
40 | public: |
41 | // Fully construct the RemoteRendezvous. |
42 | virtual Status Initialize(WorkerSession* session) = 0; |
43 | |
44 | // In remote eager, set current instance as context default rendezvous which |
45 | // will be used for eager op-by-op execution. |
46 | virtual void SetRemoteEagerContextDefault() = 0; |
47 | // In remote eager, get if current instance is context default rendezvous. |
48 | virtual bool IsRemoteEagerContextDefault() = 0; |
49 | |
50 | protected: |
51 | bool is_cross_process() override { return true; } |
52 | }; |
53 | |
54 | // RendezvousMgr keeps track of a set of local rendezvous instances. |
55 | // All tensors sent by this worker are buffered in a RendezvousMgr |
56 | // until the tensor is received. Each global unique "step_id" |
57 | // corresponds to one local rendezvous instance managed by a |
58 | // RendezvousMgr. |
59 | // |
60 | // E.g., |
61 | // Rendezvous* rendez = worker_env->rendezvous_mgr->Find(0x8935); |
62 | // fork execution of an graph executor using "rendez" on thread 1; |
63 | // fork execution of another graph executor using "rendez" on thread 2; |
64 | // ... |
65 | // join threads 1 and 2; |
66 | // |
67 | // In the example above, execution in thread 1 and 2 communicates with |
68 | // each other by send/recv operations through the "rend". |
69 | // |
70 | // Tensors sent and recved through rendezvous managed by this |
71 | // RendezvousMgr must have keys generated by Rendezvous::CreateKey. |
72 | class RendezvousMgrInterface { |
73 | public: |
74 | RendezvousMgrInterface() {} |
75 | virtual ~RendezvousMgrInterface() {} |
76 | |
77 | // Returns Rendezvous supporting send and recv among workers in the |
78 | // "step_id". The caller takes ownership of one reference on the |
79 | // returned Rendezvous instance. |
80 | // |
81 | // Note: the caller must guarantee to eventually call Initialize on the |
82 | // returned RemoteRendezvous |
83 | virtual RemoteRendezvous* Find(int64_t step_id) = 0; |
84 | |
85 | // Finds the local rendezvous instance for the "step_id". Runs |
86 | // "done" when the tensor for "key" is produced or an error occurs. |
87 | // |
88 | // This method is used by the rpc handler of RecvTensor. |
89 | virtual void RecvLocalAsync(int64_t step_id, |
90 | const Rendezvous::ParsedKey& parsed, |
91 | Rendezvous::DoneCallback done) = 0; |
92 | |
93 | // Synchronous wrapper for RecvLocalAsync. |
94 | virtual Status RecvLocal(int64_t step_id, const Rendezvous::ParsedKey& parsed, |
95 | Tensor* val, bool* is_dead) = 0; |
96 | |
97 | // Removes rendezvous for "step_id". |
98 | // |
99 | // TODO(zhifengc): Have a background thread in worker that |
100 | // periodically calls CleanupAll(). |
101 | virtual void Cleanup(int64_t step_id) = 0; |
102 | |
103 | // Remove all rendezvous instances owned by the rendezvous_mgr. |
104 | virtual void CleanupAll() = 0; |
105 | }; |
106 | |
107 | } // end namespace tensorflow |
108 | |
109 | #endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RENDEZVOUS_MGR_INTERFACE_H_ |
110 | |