1/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#include "tensorflow/cc/saved_model/metrics.h"
17
18#include <string>
19
20#include "tensorflow/core/lib/monitoring/counter.h"
21#include "tensorflow/core/lib/monitoring/sampler.h"
22
23namespace tensorflow {
24namespace metrics {
25
26namespace {
27
28// Counter that tracks total number and `write_version` of SavedModels written.
29auto* saved_model_write_counter = monitoring::Counter<1>::New(
30 "/tensorflow/core/saved_model/write/count",
31 "The number of SavedModels successfully written.", "write_version");
32
33// Counter that tracks total number and `write_version` of SavedModels read.
34auto* saved_model_read_counter = monitoring::Counter<1>::New(
35 "/tensorflow/core/saved_model/read/count",
36 "The number of SavedModels successfully loaded.", "write_version");
37
38// Counter that tracks number of calls for each SavedModel write API. Summing
39// across "api_label" is not expected to equal the ".../write/count" cell value
40// because programs can invoke more than one API to save a single SM and
41// because the API may error out before successfully writing a SM.
42auto* saved_model_write_api = monitoring::Counter<1>::New(
43 "/tensorflow/core/saved_model/write/api",
44 "The API used to write the SavedModel.", "api_label");
45
46// Counter that tracks number of calls for each SavedModel read API. Summing
47// across "api_label" is not expected to equal the ".../read/count" cell value
48// because programs can invoke more than one API to load a single SM and
49// because the API may error out before successfully reading a SM.
50auto* saved_model_read_api = monitoring::Counter<1>::New(
51 "/tensorflow/core/saved_model/read/api",
52 "The API used to load the SavedModel.", "api_label");
53
54// Distribution of checkpoint write durations.
55auto* checkpoint_write_durations = monitoring::Sampler<1>::New(
56 {
57 "/tensorflow/core/checkpoint/write/write_durations", // Metric name.
58 "Distribution of the wall time duration in microseconds of the "
59 "checkpoint write operation.", // Metric description.
60 "api_label" // Cell label.
61 },
62 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
63 monitoring::Buckets::Exponential(1000, 1.5, 41));
64
65// Distribution of checkpoint read durations.
66auto* checkpoint_read_durations = monitoring::Sampler<1>::New(
67 {
68 "/tensorflow/core/checkpoint/read/read_durations", // Metric name.
69 "Distribution of the wall time duration in microseconds of the "
70 "checkpoint read operation.", // Metric description.
71 "api_label" // Cell label.
72 },
73 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
74 monitoring::Buckets::Exponential(1000, 1.5, 41));
75
76// Distribution of async checkpoint write durations.
77auto* async_checkpoint_write_durations = monitoring::Sampler<1>::New(
78 {
79 "/tensorflow/core/checkpoint/write/async_write_durations", // Metric
80 // name.
81 "Distribution of the wall time duration in microseconds of the async "
82 "checkpoint write operation", // Metric description.
83 "api_label" // Cell label.
84 },
85 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
86 monitoring::Buckets::Exponential(1000, 1.5, 41));
87
88// Counter that accumulates total time elapsed between module import time and
89// the last successful Checkpoint write prior to job pre-emption or completion.
90auto* checkpoint_training_time_saved = monitoring::Counter<1>::New(
91 "/tensorflow/core/checkpoint/write/training_time_saved",
92 "Total time in microseconds elapsed between two consecutive write "
93 "operations in a single job or between Checkpoint construction and the "
94 "first write operation.",
95 "api_label");
96
97// Counter that records filesize (MB) of written checkpoint. Contains two cells:
98// (api_label, filesize). Cardinality should not be an issue as the filesize
99// should be equal among all checkpoints written per job.
100auto* checkpoint_size = monitoring::Counter<2>::New(
101 "/tensorflow/core/checkpoint/write/checkpoint_size",
102 "Size of checkpoint (.index and sharded data files), rounded to the "
103 "nearest 100 MB.",
104 "api_label", "filesize");
105
106} // namespace
107
108monitoring::CounterCell& SavedModelWrite(absl::string_view write_version) {
109 return *saved_model_write_counter->GetCell(std::string(write_version));
110}
111
112monitoring::CounterCell& SavedModelRead(absl::string_view write_version) {
113 return *saved_model_read_counter->GetCell(std::string(write_version));
114}
115
116monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label) {
117 return *saved_model_write_api->GetCell(std::string(api_label));
118}
119
120monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label) {
121 return *saved_model_read_api->GetCell(std::string(api_label));
122}
123
124monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) {
125 return *checkpoint_read_durations->GetCell(std::string(api_label));
126}
127
128monitoring::SamplerCell& CheckpointWriteDuration(absl::string_view api_label) {
129 return *checkpoint_write_durations->GetCell(std::string(api_label));
130}
131
132monitoring::SamplerCell& AsyncCheckpointWriteDuration(
133 absl::string_view api_label) {
134 return *async_checkpoint_write_durations->GetCell(std::string(api_label));
135}
136
137monitoring::CounterCell& TrainingTimeSaved(absl::string_view api_label) {
138 return *checkpoint_training_time_saved->GetCell(std::string(api_label));
139}
140
141monitoring::CounterCell& CheckpointSize(absl::string_view api_label,
142 int64_t filesize) {
143 return *checkpoint_size->GetCell(std::string(api_label),
144 std::to_string(filesize));
145}
146
147} // namespace metrics
148} // namespace tensorflow
149