1 | /* Copyright 2021 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/cc/saved_model/metrics.h" |
17 | |
18 | #include <string> |
19 | |
20 | #include "tensorflow/core/lib/monitoring/counter.h" |
21 | #include "tensorflow/core/lib/monitoring/sampler.h" |
22 | |
23 | namespace tensorflow { |
24 | namespace metrics { |
25 | |
26 | namespace { |
27 | |
28 | // Counter that tracks total number and `write_version` of SavedModels written. |
29 | auto* saved_model_write_counter = monitoring::Counter<1>::New( |
30 | "/tensorflow/core/saved_model/write/count" , |
31 | "The number of SavedModels successfully written." , "write_version" ); |
32 | |
33 | // Counter that tracks total number and `write_version` of SavedModels read. |
34 | auto* saved_model_read_counter = monitoring::Counter<1>::New( |
35 | "/tensorflow/core/saved_model/read/count" , |
36 | "The number of SavedModels successfully loaded." , "write_version" ); |
37 | |
38 | // Counter that tracks number of calls for each SavedModel write API. Summing |
39 | // across "api_label" is not expected to equal the ".../write/count" cell value |
40 | // because programs can invoke more than one API to save a single SM and |
41 | // because the API may error out before successfully writing a SM. |
42 | auto* saved_model_write_api = monitoring::Counter<1>::New( |
43 | "/tensorflow/core/saved_model/write/api" , |
44 | "The API used to write the SavedModel." , "api_label" ); |
45 | |
46 | // Counter that tracks number of calls for each SavedModel read API. Summing |
47 | // across "api_label" is not expected to equal the ".../read/count" cell value |
48 | // because programs can invoke more than one API to load a single SM and |
49 | // because the API may error out before successfully reading a SM. |
50 | auto* saved_model_read_api = monitoring::Counter<1>::New( |
51 | "/tensorflow/core/saved_model/read/api" , |
52 | "The API used to load the SavedModel." , "api_label" ); |
53 | |
54 | // Distribution of checkpoint write durations. |
55 | auto* checkpoint_write_durations = monitoring::Sampler<1>::New( |
56 | { |
57 | "/tensorflow/core/checkpoint/write/write_durations" , // Metric name. |
58 | "Distribution of the wall time duration in microseconds of the " |
59 | "checkpoint write operation." , // Metric description. |
60 | "api_label" // Cell label. |
61 | }, |
62 | // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes. |
63 | monitoring::Buckets::Exponential(1000, 1.5, 41)); |
64 | |
65 | // Distribution of checkpoint read durations. |
66 | auto* checkpoint_read_durations = monitoring::Sampler<1>::New( |
67 | { |
68 | "/tensorflow/core/checkpoint/read/read_durations" , // Metric name. |
69 | "Distribution of the wall time duration in microseconds of the " |
70 | "checkpoint read operation." , // Metric description. |
71 | "api_label" // Cell label. |
72 | }, |
73 | // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes. |
74 | monitoring::Buckets::Exponential(1000, 1.5, 41)); |
75 | |
76 | // Distribution of async checkpoint write durations. |
77 | auto* async_checkpoint_write_durations = monitoring::Sampler<1>::New( |
78 | { |
79 | "/tensorflow/core/checkpoint/write/async_write_durations" , // Metric |
80 | // name. |
81 | "Distribution of the wall time duration in microseconds of the async " |
82 | "checkpoint write operation" , // Metric description. |
83 | "api_label" // Cell label. |
84 | }, |
85 | // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes. |
86 | monitoring::Buckets::Exponential(1000, 1.5, 41)); |
87 | |
88 | // Counter that accumulates total time elapsed between module import time and |
89 | // the last successful Checkpoint write prior to job pre-emption or completion. |
90 | auto* checkpoint_training_time_saved = monitoring::Counter<1>::New( |
91 | "/tensorflow/core/checkpoint/write/training_time_saved" , |
92 | "Total time in microseconds elapsed between two consecutive write " |
93 | "operations in a single job or between Checkpoint construction and the " |
94 | "first write operation." , |
95 | "api_label" ); |
96 | |
97 | // Counter that records filesize (MB) of written checkpoint. Contains two cells: |
98 | // (api_label, filesize). Cardinality should not be an issue as the filesize |
99 | // should be equal among all checkpoints written per job. |
100 | auto* checkpoint_size = monitoring::Counter<2>::New( |
101 | "/tensorflow/core/checkpoint/write/checkpoint_size" , |
102 | "Size of checkpoint (.index and sharded data files), rounded to the " |
103 | "nearest 100 MB." , |
104 | "api_label" , "filesize" ); |
105 | |
106 | } // namespace |
107 | |
108 | monitoring::CounterCell& SavedModelWrite(absl::string_view write_version) { |
109 | return *saved_model_write_counter->GetCell(std::string(write_version)); |
110 | } |
111 | |
112 | monitoring::CounterCell& SavedModelRead(absl::string_view write_version) { |
113 | return *saved_model_read_counter->GetCell(std::string(write_version)); |
114 | } |
115 | |
116 | monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label) { |
117 | return *saved_model_write_api->GetCell(std::string(api_label)); |
118 | } |
119 | |
120 | monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label) { |
121 | return *saved_model_read_api->GetCell(std::string(api_label)); |
122 | } |
123 | |
124 | monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) { |
125 | return *checkpoint_read_durations->GetCell(std::string(api_label)); |
126 | } |
127 | |
128 | monitoring::SamplerCell& CheckpointWriteDuration(absl::string_view api_label) { |
129 | return *checkpoint_write_durations->GetCell(std::string(api_label)); |
130 | } |
131 | |
132 | monitoring::SamplerCell& AsyncCheckpointWriteDuration( |
133 | absl::string_view api_label) { |
134 | return *async_checkpoint_write_durations->GetCell(std::string(api_label)); |
135 | } |
136 | |
137 | monitoring::CounterCell& TrainingTimeSaved(absl::string_view api_label) { |
138 | return *checkpoint_training_time_saved->GetCell(std::string(api_label)); |
139 | } |
140 | |
141 | monitoring::CounterCell& CheckpointSize(absl::string_view api_label, |
142 | int64_t filesize) { |
143 | return *checkpoint_size->GetCell(std::string(api_label), |
144 | std::to_string(filesize)); |
145 | } |
146 | |
147 | } // namespace metrics |
148 | } // namespace tensorflow |
149 | |