1 | /* Copyright 2021 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0(the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #ifndef TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_ |
16 | #define TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_ |
17 | |
18 | #include <functional> |
19 | #include <string> |
20 | #include <utility> |
21 | |
22 | #include "absl/base/attributes.h" |
23 | #include "absl/container/flat_hash_map.h" |
24 | #include "absl/strings/string_view.h" |
25 | #include "tensorflow/core/framework/resource_base.h" |
26 | #include "tensorflow/core/platform/mutex.h" |
27 | #include "tensorflow/core/platform/status.h" |
28 | #include "tensorflow/core/platform/statusor.h" |
29 | #include "tensorflow/core/platform/types.h" |
30 | |
31 | namespace tensorflow { |
32 | namespace checkpoint { |
33 | |
// Name constant associated with the CheckpointCallbackManager resource.
// Only declared here (`extern`); the definition lives in another translation
// unit. ABSL_CONST_INIT asks the compiler to verify that the variable is
// constant-initialized.
// NOTE(review): presumably the name under which the manager is registered in
// a resource manager -- confirm against the callers.
ABSL_CONST_INIT extern const absl::string_view
    kCheckpointCallbackManagerResourceName;
36 | |
37 | // StatusOr<std::string> save_callback(absl::string_view checkpoint_id); |
38 | using SaveCallback = std::function<StatusOr<std::string>(absl::string_view)>; |
39 | |
40 | // Status restore_callback(absl::string_view checkpoint_id, |
41 | // absl::string_view content_from_checkpoint); |
42 | using RestoreCallback = |
43 | std::function<Status(absl::string_view, absl::string_view)>; |
44 | |
45 | // A class to save and restore additional information for checkpointing. |
46 | class CheckpointCallbackManager : public ResourceBase { |
47 | public: |
48 | CheckpointCallbackManager() = default; |
49 | |
50 | // Not copyable or movable |
51 | CheckpointCallbackManager(const CheckpointCallbackManager&) = delete; |
52 | CheckpointCallbackManager& operator=(const CheckpointCallbackManager&) = |
53 | delete; |
54 | |
55 | std::string DebugString() const override { |
56 | return "CheckpointCallbackManager" ; |
57 | } |
58 | |
59 | // Infers a checkpoint id and directory from a prefix |
60 | // passed to SaveV2 / RestoreV2 Ops |
61 | static StatusOr<std::pair<std::string, std::string>> |
62 | GetCheckpointIdAndPathFromPrefix(absl::string_view prefix); |
63 | |
64 | // Register a save callback. |
65 | // The passed callback will be triggered with an identified checkpoint id. |
66 | // The callback should return a string content needs to be stored |
67 | // as a part of a checkpoint, and then the content is stored as a file |
68 | // with the registered the file_extension. |
69 | Status RegisterSaveCallback(absl::string_view file_extension, |
70 | SaveCallback callback); |
71 | |
72 | // Checks if a registered save callback exists for an extension. |
73 | bool DoesSaveCallbackExist(absl::string_view file_extension); |
74 | |
75 | // Register a restore callback. |
76 | // The passed file_extension is used to generate a file name together with |
77 | // an identified checkpoint_id. If the file exists, the registered callback |
78 | // is triggered with the content of the file. |
79 | Status RegisterRestoreCallback(absl::string_view file_extension, |
80 | RestoreCallback callback); |
81 | |
82 | // Checks if a registered restore callback exists for an extension. |
83 | bool DoesRestoreCallbackExist(absl::string_view file_extension); |
84 | |
85 | // Should be triggered from SaveV2()::Compute(). |
86 | void Save(absl::string_view prefix); |
87 | |
88 | // Should be triggered from RestoreV2()::Compute(). |
89 | void Restore(absl::string_view prefix); |
90 | |
91 | private: |
92 | mutable mutex mu_; |
93 | |
94 | absl::flat_hash_map<std::string, SaveCallback> save_callbacks_ |
95 | TF_GUARDED_BY(mu_); |
96 | absl::flat_hash_map<std::string, RestoreCallback> restore_callbacks_ |
97 | TF_GUARDED_BY(mu_); |
98 | |
99 | // Checkpoint save and restore could happen before save / restore callbacks |
100 | // are registered. The last checkpoint information is kept in these variables |
101 | // to trigger the registered callback lazily. |
102 | std::pair<std::string, std::string> last_restored_checkpoint_id_and_dir_ |
103 | TF_GUARDED_BY(mu_); |
104 | |
105 | std::pair<std::string, std::string> last_saved_checkpoint_id_and_dir_ |
106 | TF_GUARDED_BY(mu_); |
107 | }; |
108 | |
109 | } // namespace checkpoint |
110 | } // namespace tensorflow |
111 | |
112 | #endif // TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_ |
113 | |