1/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
17#define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
18
19#include "absl/container/flat_hash_map.h"
20#include "tensorflow/core/framework/dataset_options.pb.h"
21#include "tensorflow/core/lib/monitoring/counter.h"
22#include "tensorflow/core/lib/monitoring/gauge.h"
23#include "tensorflow/core/platform/env.h"
24#include "tensorflow/core/platform/statusor.h"
25#include "tensorflow/core/platform/types.h"
26#include "tensorflow/core/protobuf/data_service.pb.h"
27
28namespace tensorflow {
29namespace metrics {
30
31// Records that a tf.data.Dataset executed by the program used autotuning.
32//
33// The `name` argument identifies the Dataset type (e.g. "ParallelMap").
34void RecordTFDataAutotune(const string& name);
35
// Returns a counter that can be used to record the number of bytes consumed by
// a tf.data.Dataset.
38//
39// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
40monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name);
41
42// Returns a counter that can be used to record the number of bytes produced by
43// a tf.data.Dataset.
44//
45// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
46monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name);
47
// Returns a counter that can be used to record the number of bytes read from
49// the filesystem by a tf.data.Dataset source.
50//
51// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
52//
53// TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter?
54monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name);
55
// Returns a counter that can be used to record the number of elements produced
57// by a tf.data.Dataset.
58//
59// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
60monitoring::CounterCell* GetTFDataElementsCounter(const string& name);
61
// Returns a gauge that can be used to record the performance model information.
63//
64// The `id` argument represents the (unique) model ID.
65monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
66 const string& id);
67
68// Records the number of bytes fetched from tf.data.Dataset iterator.
69void RecordTFDataBytesFetched(int64_t num_bytes);
70
71// Records the number of times tf.data experiment is applied to input pipelines.
72void RecordTFDataExperiment(const string& name);
73
// Records the time (in microseconds) spent in a single invocation of
// `IteratorResource::GetNext()`.
76void RecordTFDataGetNextDuration(uint64 duration_us);
77
// Records the histogram of ratios of the RAM used by the tf.data autotune
// algorithm to the RAM budget.
80void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio);
81
82// Records the histogram of ratios of tf.data autotune algorithm max buffer
83// bytes over the ram budget.
84void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio);
85
86// Records the number of times each tf.data fingerprint is used
87// to measure duplicate pre-processing.
88//
89// The `name` argument identifies the Dataset graph fingerprint,
90// created using GraphHash().
91void RecordTFDataFingerprint(const string& name);
92
93// Records the time (in microseconds) during which `IteratorResource` was busy
94// processing at least one `GetNext()` request.
95void RecordTFDataIteratorBusy(uint64 duration_us);
96
97// Records the time (in microseconds) between `IteratorResource` receiving the
98// first `GetNext()` request and responding to the last `GetNext()` request.
99void RecordTFDataIteratorLifetime(uint64 duration_us);
100
101// Records the time histogram (in microseconds) between `IteratorResource`
102// responding to a `GetNext()` request and receiving the next `GetNext()`
103// request.
104void RecordTFDataIteratorGap(uint64 duration_us);
105
106// Records the number of independent graph changes resulting from the
107// application of a tf.data optimization.
108//
109// The `name` argument identifies the optimization (e.g. "noop_elimination").
110void RecordTFDataOptimization(const string& name, int64_t num_changes);
111
112// Records that a tf.data service worker has been created.
113void RecordTFDataServiceWorkerCreated();
114
115// Records that a tf.data service job has been created.
116void RecordTFDataServiceJobsCreated(
117 const tensorflow::data::ProcessingModeDef& processing_mode,
118 bool is_coordinated_read);
119
120// Records tf.data service iterators created by clients.
121void RecordTFDataServiceClientIterators(
122 int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode,
123 const tensorflow::data::ProcessingModeDef& processing_mode,
124 bool is_coordinated_read);
125
126// Records tf.data service cross-trainer cache queries.
127void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit);
128
129// Records tf.data service cross-trainer cache memory usage in bytes.
130void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes);
131
132// Records the file name read by a tf.data Dataset.
133//
134// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
135void RecordTFDataFilename(const string& name, const string& filename);
136
137// Records statistics of tf.data auto sharding.
138//
139// The `id` is a unique identifier of the input pipeline. The `policy`
140// identifies the auto-sharding policy used, the `num_workers` identifies the
141// number of workers, and `num_replicas` identifies the number of replicas.
142void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
143 int64 num_workers, int64 num_replicas);
144
145// Records statistics of whether we can rewrite batch size in tf.data auto
146// sharding.
147//
// The `eligible` argument indicates whether the input pipeline is eligible for
// the rewrite. The `ineligible_reason` argument lists the reasons if the input
// pipeline is ineligible.
151void RecordTFDataAutoShardRewriteBatchSize(
152 bool eligible, const std::vector<string>& ineligible_reason);
153
154// Records the number of times each tf.data autotuning algorithm stopping
155// criterion is met.
156void RecordTFDataAutotuneStoppingCriteria(const string& name);
157
158// Records parsing of dense tensor features.
159void RecordParseDenseFeature(int64_t num_features);
160
161// Records parsing of sparse tensor features.
162void RecordParseSparseFeature(int64_t num_features);
163
164// Records parsing of ragged tensor features.
165void RecordParseRaggedFeature(int64_t num_features);
166
167// Records the size of input/output tensors in bytes.
168void RecordGraphInputTensors(const size_t size);
169void RecordGraphOutputTensors(const size_t size);
170
171// Records the number of cores requested by graphs with XLA SPMD enabled.
172void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica);
173
// Updates the metric that tracks graph execution time.
//
// NOTE(review): `running_time_usecs` is presumably the duration of one graph
// execution in microseconds (inferred from the parameter name) -- confirm
// against the definition.
void UpdateGraphExecTime(const uint64 running_time_usecs);
// Updates the metric that tracks the graph pending queue length.
//
// NOTE(review): `len` is presumably the current number of pending items
// (inferred from the function name) -- confirm against the definition.
void UpdateGraphPendingQueueLength(uint64 len);
176
177// Records that one output of an op of type `op_name` was unused.
178void RecordUnusedOutput(const string& op_name);
179
180// Updates the metrics stored about time spent building graphs.
181//
182// By "GraphBuild", we refer to building a client graph, which is a sub-graph of
183// the full graph, induced by a set of options. In particular, these options
184// include the feeds and fetches requested.
185//
186// This includes time spent:
187// * optimizing the graphs with Grappler
188// * pruning the sub-graph (unless the place_pruned_graph option is set)
189//
190// When executing eagerly, this will not record any activity.
191//
192// TODO(jtkeeling): Should we record building/optimizing tf.functions?
193void UpdateGraphBuildTime(const uint64 running_time_usecs);
194
195// Updates the metric stored for time spent optimizing function graphs.
196void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs);
197
198// Records the activity of the first phase of the mlir bridge using the
199// tf_metadata.tf_mlir_bridge_first_phase_count metric.
200// device_type: tpu, cpu, gpu, etc.
201// bridge_version: v1 compat, v2, etc.
202// fallback_enabled: true if fallback will happen, false if not
203// result: outcome of bridge (success, failure, disabled, invalid_graph, etc.)
204void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
205 const std::string& bridge_version,
206 bool fallback_enabled,
207 const std::string& result);
208
// Records the activity per op using the
// tf_metadata.tf_mlir_bridge_graph_analysis_per_op metric.
// op_name: the name of op.
// construction_context: eager, session, Not tracked.
// is_single_core_inference_mode: true, false.
// num_replicas, num_cores_per_replica, use_tpu, allow_soft_placement,
// use_spmd_for_xla_partitioning: passed as strings; presumably metric labels
//   mirroring the graph settings of the same names (TODO: confirm and document
//   the accepted values).
// unsupported_reason: the reason why the graph is not supported in MLIR-based
//   bridge, like invalid graph, has unsupported ops, etc.
// has_unsupported_features: true indicates MLIR-based bridge is disabled,
//   false indicates MLIR-based bridge is enabled.
219void UpdateTfMlirBridgeGraphAnalysisPerOp(
220 const std::string& op_name, const std::string& construction_context,
221 bool is_single_core_inference_mode, const std::string& num_replicas,
222 const std::string& num_cores_per_replica, const std::string& use_tpu,
223 const std::string& allow_soft_placement,
224 const std::string& use_spmd_for_xla_partitioning,
225 const std::string& unsupported_reason, bool has_unsupported_features);
226
227// Convenience class allowing RAII style of reporting for a monitoring::Counter.
228template <int NumLabels>
229class ScopedCounter final {
230 public:
231 ScopedCounter(monitoring::Counter<NumLabels>* const counter,
232 const std::array<std::string, NumLabels>& labels)
233 : counter_(counter), labels_(labels) {
234 Init();
235 }
236
237 // Report counter and stop it. Counter needs to be reset to perform
238 // next measurement.
239 void ReportAndStop() {
240 if (started_) {
241 started_ = false;
242 ReportInternal(std::make_index_sequence<NumLabels>());
243 }
244 }
245
246 // Start the measurement with the new set of labels.
247 void Reset(const std::array<std::string, NumLabels>& labels) {
248 labels_ = labels;
249 Init();
250 }
251
252 // Start the measurement with the existing set of labels.
253 void Reset() { Init(); }
254
255 // Returns duration of the current interval in case the timer has started.
256 // Returns nullopt otherwise.
257 absl::optional<uint64> DurationMicroSec() const {
258 return started_ ? absl::optional<uint64>(
259 accumulated_time_ +
260 tensorflow::Env::Default()->NowMicros() - start_time_)
261 : absl::nullopt;
262 }
263
264 // Temporarily stop the timer, but keep accumulated time.
265 void AccumulateAndStop() {
266 if (started_) {
267 accumulated_time_ = tensorflow::Env::Default()->NowMicros() - start_time_;
268 started_ = false;
269 }
270 }
271
272 // Start previously stopped timer.
273 void Start() {
274 if (started_) return;
275
276 // Keep previously accumulated time if any.
277 start_time_ = tensorflow::Env::Default()->NowMicros();
278 started_ = true;
279 }
280
281 ~ScopedCounter() { ReportAndStop(); }
282
283 private:
284 template <std::size_t... S>
285 void ReportInternal(std::index_sequence<S...>) {
286 uint64 time_interval =
287 tensorflow::Env::Default()->NowMicros() - start_time_;
288 time_interval += accumulated_time_;
289 if (time_interval > 0) {
290 counter_->GetCell(labels_[S]...)->IncrementBy(time_interval);
291 }
292 }
293
294 void Init() {
295 start_time_ = tensorflow::Env::Default()->NowMicros();
296 started_ = true;
297 accumulated_time_ = 0;
298 }
299
300 monitoring::Counter<NumLabels>* counter_;
301 std::array<std::string, NumLabels> labels_;
302 bool started_{false};
303 uint64 start_time_;
304 uint64 accumulated_time_;
305};
306
307// Returns a counter used to capture timing metrics for graph optimization
308// passes.
309monitoring::Counter<2>* GetGraphOptimizationCounter();
310
311// Updates metrics for time to distribute variables to all TPU hosts.
312void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs);
313
// Updates the metrics stored about time XLA spends compiling graphs.
315void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
316
// Updates the metrics stored about time the BFC allocator spends during delay.
318void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs);
319
320// Increments (by 1) a simple integer counter that is exposed for testing.
321void IncrementTestCounter(const string& name, const string& label);
322
323// Read-only access to a counter for testing.
324const monitoring::CounterCell* TestCounter(const string& name,
325 const string& label);
326
// Read-only wrapper for a TestCounter to track increments between calls.
//
// NOTE(review): the member definitions are not visible in this header; the
// per-member notes below are inferred from names and should be confirmed
// against the implementation.
class TestDelta {
 public:
  // `name` and `label` identify the test counter to observe (presumably the
  // same pair accepted by TestCounter() -- confirm).
  TestDelta(const string& name, const string& label);
  // Presumably re-baselines the wrapper at the counter's current value --
  // confirm.
  void Reset();
  // Presumably returns the increments observed since the previous baseline --
  // confirm. Not declared const, so it may update internal state.
  int64 Get();

 private:
  // Observed counter cell; held as pointer-to-const.
  const monitoring::CounterCell* cell_;
  // Presumably the counter value captured at the last baseline -- confirm.
  int64 last_value_;
};
// Updates the TPU error counter.
//
// NOTE(review): `op` and `error_type` are presumably used as the counter's
// labels (the failing op and the kind of error) -- confirm against the
// definition.
void UpdateTpuErrorCounter(const string& op, const string& error_type);
// Updates the eager client error counter.
//
// NOTE(review): `error_source` and `error_type` are presumably used as the
// counter's labels -- confirm against the definition.
void UpdateEagerClientErrorCounter(const string& error_source,
                                   const string& error_type);
341
342} // namespace metrics
343} // namespace tensorflow
344
345#endif // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
346