1 | #pragma once |
2 | |
3 | #include <torch/csrc/distributed/c10d/PrefixStore.hpp> |
4 | #include <torch/csrc/distributed/rpc/utils.h> |
5 | |
6 | namespace torch { |
7 | namespace distributed { |
8 | namespace rpc { |
9 | |
10 | // All RPC peers should call into this function at the same time. Each peer |
11 | // provides its own id and name, and this function uses the given Store to |
12 | // gather global name-to-id mapping on all peers. |
13 | std::unordered_map<std::string, worker_id_t> collectNames( |
14 | ::c10d::PrefixStore store, |
15 | const worker_id_t selfId, |
16 | const std::string& selfName, |
17 | const int worldSize); |
18 | |
19 | // Ranks in dynamic RPC groups will initially call into this to establish the |
20 | // name-to-id mapping for the current peers in the group. The current rank will |
21 | // put its own worker info in the store and discover all the ranks that came |
22 | // before it. NOTE: This needs to be called with the Dynamic RPC group |
23 | // membership management token held. |
24 | std::unordered_map<std::string, worker_id_t> collectCurrentNames( |
25 | ::c10d::PrefixStore store, |
26 | const worker_id_t selfId, |
27 | const std::string& selfName); |
28 | |
29 | // Remove name frmo Store, used in dynamic RPC groups. |
30 | // NOTE: This needs to be called with the Dynamic RPC group |
31 | // membership management token held. |
32 | void removeCurrentName( |
33 | ::c10d::PrefixStore store, |
34 | const worker_id_t selfId, |
35 | const std::string& selfName); |
36 | |
37 | // This performs a synchronization of all call counts by using store. |
38 | // All RPC peers wait for others to join to exit at the same time. |
39 | int syncCallCount( |
40 | ::c10d::PrefixStore store, |
41 | const int worldSize, |
42 | int activeCalls = 0); |
43 | |
44 | } // namespace rpc |
45 | } // namespace distributed |
46 | } // namespace torch |
47 | |