metrics.h source code [tensorflow/tensorflow/core/framework/metrics.h]

1	/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3	Licensed under the Apache License, Version 2.0 (the "License");
4	you may not use this file except in compliance with the License.
5	You may obtain a copy of the License at
6
7	http://www.apache.org/licenses/LICENSE-2.0
8
9	Unless required by applicable law or agreed to in writing, software
10	distributed under the License is distributed on an "AS IS" BASIS,
11	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	See the License for the specific language governing permissions and
13	limitations under the License.
14	==============================================================================*/
15
16	#ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
17	#define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
18
19	#include "absl/container/flat_hash_map.h"
20	#include "tensorflow/core/framework/dataset_options.pb.h"
21	#include "tensorflow/core/lib/monitoring/counter.h"
22	#include "tensorflow/core/lib/monitoring/gauge.h"
23	#include "tensorflow/core/platform/env.h"
24	#include "tensorflow/core/platform/statusor.h"
25	#include "tensorflow/core/platform/types.h"
26	#include "tensorflow/core/protobuf/data_service.pb.h"
27
28	namespace tensorflow {
29	namespace metrics {
30
31	// Records that a tf.data.Dataset executed by the program used autotuning.
32	//
33	// The `name` argument identifies the Dataset type (e.g. "ParallelMap").
34	void RecordTFDataAutotune(const string& name);
35
36	// Returns a counter that can be used to record the number of bytes consumed by
37	// a tf.data.Dataset.
38	//
39	// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
40	monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name);
41
42	// Returns a counter that can be used to record the number of bytes produced by
43	// a tf.data.Dataset.
44	//
45	// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
46	monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name);
47
48	// Returns a counter that can be used to record the number of bytes read from
49	// the filesystem by a tf.data.Dataset source.
50	//
51	// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
52	//
53	// TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter?
54	monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name);
55
56	// Returns a counter that can be used to record the number of elements produced
57	// by a tf.data.Dataset.
58	//
59	// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
60	monitoring::CounterCell* GetTFDataElementsCounter(const string& name);
61
62	// Returns a gauge that can be used to record the performance model information.
63	//
64	// The `id` argument represents the (unique) model ID.
65	monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
66	const string& id);
67
68	// Records the number of bytes fetched from tf.data.Dataset iterator.
69	void RecordTFDataBytesFetched(int64_t num_bytes);
70
71	// Records the number of times tf.data experiment is applied to input pipelines.
72	void RecordTFDataExperiment(const string& name);
73
74	// Records the time (in microseconds) spent in a single invocation of
75	// `IteratorResource::GetNext()`.
76	void RecordTFDataGetNextDuration(uint64 duration_us);
77
78	// Records the histogram of ratios of tf.data autotune algorithm used RAM over
79	// the ram budget.
80	void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio);
81
82	// Records the histogram of ratios of tf.data autotune algorithm max buffer
83	// bytes over the ram budget.
84	void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio);
85
86	// Records the number of times each tf.data fingerprint is used
87	// to measure duplicate pre-processing.
88	//
89	// The `name` argument identifies the Dataset graph fingerprint,
90	// created using GraphHash().
91	void RecordTFDataFingerprint(const string& name);
92
93	// Records the time (in microseconds) during which `IteratorResource` was busy
94	// processing at least one `GetNext()` request.
95	void RecordTFDataIteratorBusy(uint64 duration_us);
96
97	// Records the time (in microseconds) between `IteratorResource` receiving the
98	// first `GetNext()` request and responding to the last `GetNext()` request.
99	void RecordTFDataIteratorLifetime(uint64 duration_us);
100
101	// Records the time histogram (in microseconds) between `IteratorResource`
102	// responding to a `GetNext()` request and receiving the next `GetNext()`
103	// request.
104	void RecordTFDataIteratorGap(uint64 duration_us);
105
106	// Records the number of independent graph changes resulting from the
107	// application of a tf.data optimization.
108	//
109	// The `name` argument identifies the optimization (e.g. "noop_elimination").
110	void RecordTFDataOptimization(const string& name, int64_t num_changes);
111
112	// Records that a tf.data service worker has been created.
113	void RecordTFDataServiceWorkerCreated();
114
115	// Records that a tf.data service job has been created.
116	void RecordTFDataServiceJobsCreated(
117	const tensorflow::data::ProcessingModeDef& processing_mode,
118	bool is_coordinated_read);
119
120	// Records tf.data service iterators created by clients.
121	void RecordTFDataServiceClientIterators(
122	int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode,
123	const tensorflow::data::ProcessingModeDef& processing_mode,
124	bool is_coordinated_read);
125
126	// Records tf.data service cross-trainer cache queries.
127	void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit);
128
129	// Records tf.data service cross-trainer cache memory usage in bytes.
130	void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes);
131
132	// Records the file name read by a tf.data Dataset.
133	//
134	// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
135	void RecordTFDataFilename(const string& name, const string& filename);
136
137	// Records statistics of tf.data auto sharding.
138	//
139	// The `id` is a unique identifier of the input pipeline. The `policy`
140	// identifies the auto-sharding policy used, the `num_workers` identifies the
141	// number of workers, and `num_replicas` identifies the number of replicas.
142	void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
143	int64 num_workers, int64 num_replicas);
144
145	// Records statistics of whether we can rewrite batch size in tf.data auto
146	// sharding.
147	//
148	// The `eligible` argument indicates whether the input pipeline is eligible for
149	// the rewrite. The `ineligible_reason` argument lists the reasons the input
150	// pipeline is ineligible, if any.
151	void RecordTFDataAutoShardRewriteBatchSize(
152	bool eligible, const std::vector<string>& ineligible_reason);
153
154	// Records the number of times each tf.data autotuning algorithm stopping
155	// criterion is met.
156	void RecordTFDataAutotuneStoppingCriteria(const string& name);
157
158	// Records parsing of dense tensor features.
159	void RecordParseDenseFeature(int64_t num_features);
160
161	// Records parsing of sparse tensor features.
162	void RecordParseSparseFeature(int64_t num_features);
163
164	// Records parsing of ragged tensor features.
165	void RecordParseRaggedFeature(int64_t num_features);
166
167	// Records the size of input/output tensors in bytes.
168	void RecordGraphInputTensors(const size_t size);
169	void RecordGraphOutputTensors(const size_t size);
170
171	// Records the number of cores requested by graphs with XLA SPMD enabled.
172	void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica);
173
174	void UpdateGraphExecTime(const uint64 running_time_usecs);
175	void UpdateGraphPendingQueueLength(uint64 len);
176
177	// Records that one output of an op of type `op_name` was unused.
178	void RecordUnusedOutput(const string& op_name);
179
180	// Updates the metrics stored about time spent building graphs.
181	//
182	// By "GraphBuild", we refer to building a client graph, which is a sub-graph of
183	// the full graph, induced by a set of options. In particular, these options
184	// include the feeds and fetches requested.
185	//
186	// This includes time spent:
187	//   * optimizing the graphs with Grappler
188	//   * pruning the sub-graph (unless the place_pruned_graph option is set)
189	//
190	// When executing eagerly, this will not record any activity.
191	//
192	// TODO(jtkeeling): Should we record building/optimizing tf.functions?
193	void UpdateGraphBuildTime(const uint64 running_time_usecs);
194
195	// Updates the metric stored for time spent optimizing function graphs.
196	void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs);
197
198	// Records the activity of the first phase of the mlir bridge using the
199	// tf_metadata.tf_mlir_bridge_first_phase_count metric.
200	// device_type: tpu, cpu, gpu, etc.
201	// bridge_version: v1 compat, v2, etc.
202	// fallback_enabled: true if fallback will happen, false if not
203	// result: outcome of bridge (success, failure, disabled, invalid_graph, etc.)
204	void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
205	const std::string& bridge_version,
206	bool fallback_enabled,
207	const std::string& result);
208
209	// Records the activity per op using the
210	// tf_metadata.tf_mlir_bridge_graph_analysis_per_op.
211	// op_name: the name of op.
212	// construction_context: eager, session, Not tracked.
213	// is_single_core_inference_mode: true, false.
214	// unsupported_reason: the reason why the graph is not supported in MLIR-based
215	// bridge, like invalid graph, has unsupported ops, etc.
216	// has_unsupported_features: true indicates MLIR-based bridge is disabled,
217	// false indicates MLIR-based bridge is enabled.
218
219	void UpdateTfMlirBridgeGraphAnalysisPerOp(
220	const std::string& op_name, const std::string& construction_context,
221	bool is_single_core_inference_mode, const std::string& num_replicas,
222	const std::string& num_cores_per_replica, const std::string& use_tpu,
223	const std::string& allow_soft_placement,
224	const std::string& use_spmd_for_xla_partitioning,
225	const std::string& unsupported_reason, bool has_unsupported_features);
226
227	// Convenience class allowing RAII style of reporting for a monitoring::Counter.
228	template <int NumLabels>
229	class ScopedCounter final {
230	public:
231	ScopedCounter(monitoring::Counter<NumLabels>* const counter,
232	const std::array<std::string, NumLabels>& labels)
233	: counter_(counter), labels_(labels) {
234	Init();
235	}
236
237	// Report counter and stop it. Counter needs to be reset to perform
238	// next measurement.
239	void ReportAndStop() {
240	if (started_) {
241	started_ = false;
242	ReportInternal(std::make_index_sequence<NumLabels>());
243	}
244	}
245
246	// Start the measurement with the new set of labels.
247	void Reset(const std::array<std::string, NumLabels>& labels) {
248	labels_ = labels;
249	Init();
250	}
251
252	// Start the measurement with the existing set of labels.
253	void Reset() { Init(); }
254
255	// Returns duration of the current interval in case the timer has started.
256	// Returns nullopt otherwise.
257	absl::optional<uint64> DurationMicroSec() const {
258	return started_ ? absl::optional<uint64>(
259	accumulated_time_ +
260	tensorflow::Env::Default()->NowMicros() - start_time_)
261	: absl::nullopt;
262	}
263
264	// Temporarily stop the timer, but keep accumulated time.
265	void AccumulateAndStop() {
266	if (started_) {
267	accumulated_time_ = tensorflow::Env::Default()->NowMicros() - start_time_;
268	started_ = false;
269	}
270	}
271
272	// Start previously stopped timer.
273	void Start() {
274	if (started_) return;
275
276	// Keep previously accumulated time if any.
277	start_time_ = tensorflow::Env::Default()->NowMicros();
278	started_ = true;
279	}
280
281	~ScopedCounter() { ReportAndStop(); }
282
283	private:
284	template <std::size_t... S>
285	void ReportInternal(std::index_sequence<S...>) {
286	uint64 time_interval =
287	tensorflow::Env::Default()->NowMicros() - start_time_;
288	time_interval += accumulated_time_;
289	if (time_interval > `0`) {
290	counter_->GetCell(labels_[S]...)->IncrementBy(time_interval);
291	}
292	}
293
294	void Init() {
295	start_time_ = tensorflow::Env::Default()->NowMicros();
296	started_ = true;
297	accumulated_time_ = `0`;
298	}
299
300	monitoring::Counter<NumLabels>* counter_;
301	std::array<std::string, NumLabels> labels_;
302	bool started_{false};
303	uint64 start_time_;
304	uint64 accumulated_time_;
305	};
306
307	// Returns a counter used to capture timing metrics for graph optimization
308	// passes.
309	monitoring::Counter<`2`>* GetGraphOptimizationCounter();
310
311	// Updates metrics for time to distribute variables to all TPU hosts.
312	void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs);
313
314	// Updates the metrics stored about time XLA spends compiling graphs.
315	void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
316
317	// Updates the metrics stored about time BFC allocator spends during delay.
318	void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs);
319
320	// Increments (by 1) a simple integer counter that is exposed for testing.
321	void IncrementTestCounter(const string& name, const string& label);
322
323	// Read-only access to a counter for testing.
324	const monitoring::CounterCell* TestCounter(const string& name,
325	const string& label);
326
327	// Read-only wrapper for a TestCounter to track increments between calls.
	//
	// Only the declarations are visible in this header; the member functions are
	// defined elsewhere.
328	class TestDelta {
329	public:
	// Takes the same `name` and `label` arguments as `TestCounter()`.
	// NOTE(review): presumably binds `cell_` to that test counter cell --
	// confirm in the definition.
330	TestDelta(const string& name, const string& label);
	// NOTE(review): presumably re-baselines `last_value_` to the cell's current
	// value -- confirm in the definition.
331	void Reset();
	// NOTE(review): presumably returns the increment observed since the last
	// baseline -- confirm in the definition.
332	int64 Get();
333

334	private:
	// Observed counter cell; held as pointer-to-const, so it is not modified
	// through this wrapper.
335	const monitoring::CounterCell* cell_;
	// Has no in-class initializer; its initial value must come from the
	// constructor.
336	int64 last_value_;
337	};
338	void UpdateTpuErrorCounter(const string& op, const string& error_type);
339	void UpdateEagerClientErrorCounter(const string& error_source,
340	const string& error_type);
341
342	} // namespace metrics
343	} // namespace tensorflow
344
345	#endif // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
346

Browse the source code of tensorflow/tensorflow/core/framework/metrics.h