// metrics.cc source code [tensorflow/tensorflow/cc/saved_model/metrics.cc]

/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
16	#include "tensorflow/cc/saved_model/metrics.h"
17
18	#include <string>
19
20	#include "tensorflow/core/lib/monitoring/counter.h"
21	#include "tensorflow/core/lib/monitoring/sampler.h"
22
23	namespace tensorflow {
24	namespace metrics {
25
26	namespace {
27
28	// Counter that tracks total number and `write_version` of SavedModels written.
29	auto* saved_model_write_counter = monitoring::Counter<`1`>::New(
30	"/tensorflow/core/saved_model/write/count",
31	"The number of SavedModels successfully written.", "write_version");
32
33	// Counter that tracks total number and `write_version` of SavedModels read.
34	auto* saved_model_read_counter = monitoring::Counter<`1`>::New(
35	"/tensorflow/core/saved_model/read/count",
36	"The number of SavedModels successfully loaded.", "write_version");
37
38	// Counter that tracks number of calls for each SavedModel write API. Summing
39	// across "api_label" is not expected to equal the ".../write/count" cell value
40	// because programs can invoke more than one API to save a single SM and
41	// because the API may error out before successfully writing a SM.
42	auto* saved_model_write_api = monitoring::Counter<`1`>::New(
43	"/tensorflow/core/saved_model/write/api",
44	"The API used to write the SavedModel.", "api_label");
45
46	// Counter that tracks number of calls for each SavedModel read API. Summing
47	// across "api_label" is not expected to equal the ".../read/count" cell value
48	// because programs can invoke more than one API to load a single SM and
49	// because the API may error out before successfully reading a SM.
50	auto* saved_model_read_api = monitoring::Counter<`1`>::New(
51	"/tensorflow/core/saved_model/read/api",
52	"The API used to load the SavedModel.", "api_label");
53
54	// Distribution of checkpoint write durations.
55	auto* checkpoint_write_durations = monitoring::Sampler<`1`>::New(
56	{
57	"/tensorflow/core/checkpoint/write/write_durations", // Metric name.
58	"Distribution of the wall time duration in microseconds of the "
59	"checkpoint write operation.", // Metric description.
60	"api_label" // Cell label.
61	},
62	// Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
63	monitoring::Buckets::Exponential(`1000`, `1.5`, `41`));
64
65	// Distribution of checkpoint read durations.
66	auto* checkpoint_read_durations = monitoring::Sampler<`1`>::New(
67	{
68	"/tensorflow/core/checkpoint/read/read_durations", // Metric name.
69	"Distribution of the wall time duration in microseconds of the "
70	"checkpoint read operation.", // Metric description.
71	"api_label" // Cell label.
72	},
73	// Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
74	monitoring::Buckets::Exponential(`1000`, `1.5`, `41`));
75
76	// Distribution of async checkpoint write durations.
77	auto* async_checkpoint_write_durations = monitoring::Sampler<`1`>::New(
78	{
79	"/tensorflow/core/checkpoint/write/async_write_durations", // Metric
80	// name.
81	"Distribution of the wall time duration in microseconds of the async "
82	"checkpoint write operation", // Metric description.
83	"api_label" // Cell label.
84	},
85	// Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
86	monitoring::Buckets::Exponential(`1000`, `1.5`, `41`));
87
88	// Counter that accumulates total time elapsed between module import time and
89	// the last successful Checkpoint write prior to job pre-emption or completion.
90	auto* checkpoint_training_time_saved = monitoring::Counter<`1`>::New(
91	"/tensorflow/core/checkpoint/write/training_time_saved",
92	"Total time in microseconds elapsed between two consecutive write "
93	"operations in a single job or between Checkpoint construction and the "
94	"first write operation.",
95	"api_label");
96
97	// Counter that records filesize (MB) of written checkpoint. Contains two cells:
98	// (api_label, filesize). Cardinality should not be an issue as the filesize
99	// should be equal among all checkpoints written per job.
100	auto* checkpoint_size = monitoring::Counter<`2`>::New(
101	"/tensorflow/core/checkpoint/write/checkpoint_size",
102	"Size of checkpoint (.index and sharded data files), rounded to the "
103	"nearest 100 MB.",
104	"api_label", "filesize");
105
106	} // namespace
107
108	monitoring::CounterCell& SavedModelWrite(absl::string_view write_version) {
109	return *saved_model_write_counter->GetCell(std::string (write_version));
110	}
111
112	monitoring::CounterCell& SavedModelRead(absl::string_view write_version) {
113	return *saved_model_read_counter->GetCell(std::string (write_version));
114	}
115
116	monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label) {
117	return *saved_model_write_api->GetCell(std::string (api_label));
118	}
119
120	monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label) {
121	return *saved_model_read_api->GetCell(std::string (api_label));
122	}
123
124	monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) {
125	return *checkpoint_read_durations->GetCell(std::string (api_label));
126	}
127
128	monitoring::SamplerCell& CheckpointWriteDuration(absl::string_view api_label) {
129	return *checkpoint_write_durations->GetCell(std::string (api_label));
130	}
131
132	monitoring::SamplerCell& AsyncCheckpointWriteDuration(
133	absl::string_view api_label) {
134	return *async_checkpoint_write_durations->GetCell(std::string (api_label));
135	}
136
137	monitoring::CounterCell& TrainingTimeSaved(absl::string_view api_label) {
138	return *checkpoint_training_time_saved->GetCell(std::string (api_label));
139	}
140
141	monitoring::CounterCell& CheckpointSize(absl::string_view api_label,
142	int64_t filesize) {
143	return *checkpoint_size->GetCell(std::string (api_label),
144	std::to_string(filesize));
145	}
146
147	} // namespace metrics
148	} // namespace tensorflow
149

// Browse the source code of tensorflow/tensorflow/cc/saved_model/metrics.cc