1 | /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ |
17 | #define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ |
18 | |
19 | #include "absl/container/flat_hash_map.h" |
20 | #include "tensorflow/core/framework/dataset_options.pb.h" |
21 | #include "tensorflow/core/lib/monitoring/counter.h" |
22 | #include "tensorflow/core/lib/monitoring/gauge.h" |
23 | #include "tensorflow/core/platform/env.h" |
24 | #include "tensorflow/core/platform/statusor.h" |
25 | #include "tensorflow/core/platform/types.h" |
26 | #include "tensorflow/core/protobuf/data_service.pb.h" |
27 | |
28 | namespace tensorflow { |
29 | namespace metrics { |
30 | |
31 | // Records that a tf.data.Dataset executed by the program used autotuning. |
32 | // |
33 | // The `name` argument identifies the Dataset type (e.g. "ParallelMap"). |
34 | void RecordTFDataAutotune(const string& name); |
35 | |
// Returns a counter that can be used to record the number of bytes consumed by
// a tf.data.Dataset.
38 | // |
39 | // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). |
40 | monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name); |
41 | |
42 | // Returns a counter that can be used to record the number of bytes produced by |
43 | // a tf.data.Dataset. |
44 | // |
45 | // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). |
46 | monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name); |
47 | |
// Returns a counter that can be used to record the number of bytes read from
// the filesystem by a tf.data.Dataset source.
50 | // |
51 | // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). |
52 | // |
53 | // TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter? |
54 | monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name); |
55 | |
// Returns a counter that can be used to record the number of elements produced
// by a tf.data.Dataset.
58 | // |
59 | // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). |
60 | monitoring::CounterCell* GetTFDataElementsCounter(const string& name); |
61 | |
// Returns a gauge that can be used to record the performance model information.
63 | // |
64 | // The `id` argument represents the (unique) model ID. |
65 | monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge( |
66 | const string& id); |
67 | |
68 | // Records the number of bytes fetched from tf.data.Dataset iterator. |
69 | void RecordTFDataBytesFetched(int64_t num_bytes); |
70 | |
// Records the number of times a tf.data experiment is applied to input
// pipelines.
72 | void RecordTFDataExperiment(const string& name); |
73 | |
// Records the time (in microseconds) spent in a single invocation of
// `IteratorResource::GetNext()`.
76 | void RecordTFDataGetNextDuration(uint64 duration_us); |
77 | |
// Records, as a histogram, the ratio of the RAM used by the tf.data autotune
// algorithm to the RAM budget.
80 | void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio); |
81 | |
// Records, as a histogram, the ratio of the tf.data autotune algorithm's max
// buffer bytes to the RAM budget.
84 | void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio); |
85 | |
86 | // Records the number of times each tf.data fingerprint is used |
87 | // to measure duplicate pre-processing. |
88 | // |
89 | // The `name` argument identifies the Dataset graph fingerprint, |
90 | // created using GraphHash(). |
91 | void RecordTFDataFingerprint(const string& name); |
92 | |
93 | // Records the time (in microseconds) during which `IteratorResource` was busy |
94 | // processing at least one `GetNext()` request. |
95 | void RecordTFDataIteratorBusy(uint64 duration_us); |
96 | |
97 | // Records the time (in microseconds) between `IteratorResource` receiving the |
98 | // first `GetNext()` request and responding to the last `GetNext()` request. |
99 | void RecordTFDataIteratorLifetime(uint64 duration_us); |
100 | |
101 | // Records the time histogram (in microseconds) between `IteratorResource` |
102 | // responding to a `GetNext()` request and receiving the next `GetNext()` |
103 | // request. |
104 | void RecordTFDataIteratorGap(uint64 duration_us); |
105 | |
106 | // Records the number of independent graph changes resulting from the |
107 | // application of a tf.data optimization. |
108 | // |
109 | // The `name` argument identifies the optimization (e.g. "noop_elimination"). |
110 | void RecordTFDataOptimization(const string& name, int64_t num_changes); |
111 | |
112 | // Records that a tf.data service worker has been created. |
113 | void RecordTFDataServiceWorkerCreated(); |
114 | |
115 | // Records that a tf.data service job has been created. |
116 | void RecordTFDataServiceJobsCreated( |
117 | const tensorflow::data::ProcessingModeDef& processing_mode, |
118 | bool is_coordinated_read); |
119 | |
120 | // Records tf.data service iterators created by clients. |
121 | void RecordTFDataServiceClientIterators( |
122 | int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode, |
123 | const tensorflow::data::ProcessingModeDef& processing_mode, |
124 | bool is_coordinated_read); |
125 | |
126 | // Records tf.data service cross-trainer cache queries. |
127 | void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit); |
128 | |
129 | // Records tf.data service cross-trainer cache memory usage in bytes. |
130 | void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes); |
131 | |
132 | // Records the file name read by a tf.data Dataset. |
133 | // |
134 | // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). |
135 | void RecordTFDataFilename(const string& name, const string& filename); |
136 | |
137 | // Records statistics of tf.data auto sharding. |
138 | // |
139 | // The `id` is a unique identifier of the input pipeline. The `policy` |
140 | // identifies the auto-sharding policy used, the `num_workers` identifies the |
141 | // number of workers, and `num_replicas` identifies the number of replicas. |
142 | void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy, |
143 | int64 num_workers, int64 num_replicas); |
144 | |
145 | // Records statistics of whether we can rewrite batch size in tf.data auto |
146 | // sharding. |
147 | // |
// The `eligible` argument indicates whether the input pipeline is eligible for
// the rewrite. The `ineligible_reason` argument lists the reasons when the
// input pipeline is ineligible.
151 | void RecordTFDataAutoShardRewriteBatchSize( |
152 | bool eligible, const std::vector<string>& ineligible_reason); |
153 | |
154 | // Records the number of times each tf.data autotuning algorithm stopping |
155 | // criterion is met. |
156 | void RecordTFDataAutotuneStoppingCriteria(const string& name); |
157 | |
158 | // Records parsing of dense tensor features. |
159 | void RecordParseDenseFeature(int64_t num_features); |
160 | |
161 | // Records parsing of sparse tensor features. |
162 | void RecordParseSparseFeature(int64_t num_features); |
163 | |
164 | // Records parsing of ragged tensor features. |
165 | void RecordParseRaggedFeature(int64_t num_features); |
166 | |
167 | // Records the size of input/output tensors in bytes. |
168 | void RecordGraphInputTensors(const size_t size); |
169 | void RecordGraphOutputTensors(const size_t size); |
170 | |
171 | // Records the number of cores requested by graphs with XLA SPMD enabled. |
172 | void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica); |
173 | |
// NOTE(review): judging by the names, this updates the metric that tracks
// graph execution time, with `running_time_usecs` given in microseconds. The
// implementation is not in this header -- confirm.
void UpdateGraphExecTime(const uint64 running_time_usecs);
// NOTE(review): judging by the names, this updates the metric that tracks the
// length of the pending graph queue, with `len` being that length. The
// implementation is not in this header -- confirm.
void UpdateGraphPendingQueueLength(uint64 len);
176 | |
177 | // Records that one output of an op of type `op_name` was unused. |
178 | void RecordUnusedOutput(const string& op_name); |
179 | |
180 | // Updates the metrics stored about time spent building graphs. |
181 | // |
182 | // By "GraphBuild", we refer to building a client graph, which is a sub-graph of |
183 | // the full graph, induced by a set of options. In particular, these options |
184 | // include the feeds and fetches requested. |
185 | // |
186 | // This includes time spent: |
187 | // * optimizing the graphs with Grappler |
188 | // * pruning the sub-graph (unless the place_pruned_graph option is set) |
189 | // |
190 | // When executing eagerly, this will not record any activity. |
191 | // |
192 | // TODO(jtkeeling): Should we record building/optimizing tf.functions? |
193 | void UpdateGraphBuildTime(const uint64 running_time_usecs); |
194 | |
195 | // Updates the metric stored for time spent optimizing function graphs. |
196 | void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs); |
197 | |
198 | // Records the activity of the first phase of the mlir bridge using the |
199 | // tf_metadata.tf_mlir_bridge_first_phase_count metric. |
200 | // device_type: tpu, cpu, gpu, etc. |
201 | // bridge_version: v1 compat, v2, etc. |
202 | // fallback_enabled: true if fallback will happen, false if not |
203 | // result: outcome of bridge (success, failure, disabled, invalid_graph, etc.) |
204 | void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type, |
205 | const std::string& bridge_version, |
206 | bool fallback_enabled, |
207 | const std::string& result); |
208 | |
// Records the activity per op using the
// tf_metadata.tf_mlir_bridge_graph_analysis_per_op metric.
// op_name: the name of op.
// construction_context: eager, session, Not tracked.
// is_single_core_inference_mode: true, false.
// num_replicas, num_cores_per_replica, use_tpu, allow_soft_placement,
//   use_spmd_for_xla_partitioning: passed as strings.
//   NOTE(review): presumably stringified graph/TPU configuration values that
//   are recorded alongside the op -- confirm against the implementation.
// unsupported_reason: the reason why the graph is not supported in MLIR-based
//   bridge, like invalid graph, has unsupported ops, etc.
// has_unsupported_features: true indicates MLIR-based bridge is disabled,
//   false indicates MLIR-based bridge is enabled.
void UpdateTfMlirBridgeGraphAnalysisPerOp(
    const std::string& op_name, const std::string& construction_context,
    bool is_single_core_inference_mode, const std::string& num_replicas,
    const std::string& num_cores_per_replica, const std::string& use_tpu,
    const std::string& allow_soft_placement,
    const std::string& use_spmd_for_xla_partitioning,
    const std::string& unsupported_reason, bool has_unsupported_features);
226 | |
227 | // Convenience class allowing RAII style of reporting for a monitoring::Counter. |
228 | template <int NumLabels> |
229 | class ScopedCounter final { |
230 | public: |
231 | ScopedCounter(monitoring::Counter<NumLabels>* const counter, |
232 | const std::array<std::string, NumLabels>& labels) |
233 | : counter_(counter), labels_(labels) { |
234 | Init(); |
235 | } |
236 | |
237 | // Report counter and stop it. Counter needs to be reset to perform |
238 | // next measurement. |
239 | void ReportAndStop() { |
240 | if (started_) { |
241 | started_ = false; |
242 | ReportInternal(std::make_index_sequence<NumLabels>()); |
243 | } |
244 | } |
245 | |
246 | // Start the measurement with the new set of labels. |
247 | void Reset(const std::array<std::string, NumLabels>& labels) { |
248 | labels_ = labels; |
249 | Init(); |
250 | } |
251 | |
252 | // Start the measurement with the existing set of labels. |
253 | void Reset() { Init(); } |
254 | |
255 | // Returns duration of the current interval in case the timer has started. |
256 | // Returns nullopt otherwise. |
257 | absl::optional<uint64> DurationMicroSec() const { |
258 | return started_ ? absl::optional<uint64>( |
259 | accumulated_time_ + |
260 | tensorflow::Env::Default()->NowMicros() - start_time_) |
261 | : absl::nullopt; |
262 | } |
263 | |
264 | // Temporarily stop the timer, but keep accumulated time. |
265 | void AccumulateAndStop() { |
266 | if (started_) { |
267 | accumulated_time_ = tensorflow::Env::Default()->NowMicros() - start_time_; |
268 | started_ = false; |
269 | } |
270 | } |
271 | |
272 | // Start previously stopped timer. |
273 | void Start() { |
274 | if (started_) return; |
275 | |
276 | // Keep previously accumulated time if any. |
277 | start_time_ = tensorflow::Env::Default()->NowMicros(); |
278 | started_ = true; |
279 | } |
280 | |
281 | ~ScopedCounter() { ReportAndStop(); } |
282 | |
283 | private: |
284 | template <std::size_t... S> |
285 | void ReportInternal(std::index_sequence<S...>) { |
286 | uint64 time_interval = |
287 | tensorflow::Env::Default()->NowMicros() - start_time_; |
288 | time_interval += accumulated_time_; |
289 | if (time_interval > 0) { |
290 | counter_->GetCell(labels_[S]...)->IncrementBy(time_interval); |
291 | } |
292 | } |
293 | |
294 | void Init() { |
295 | start_time_ = tensorflow::Env::Default()->NowMicros(); |
296 | started_ = true; |
297 | accumulated_time_ = 0; |
298 | } |
299 | |
300 | monitoring::Counter<NumLabels>* counter_; |
301 | std::array<std::string, NumLabels> labels_; |
302 | bool started_{false}; |
303 | uint64 start_time_; |
304 | uint64 accumulated_time_; |
305 | }; |
306 | |
307 | // Returns a counter used to capture timing metrics for graph optimization |
308 | // passes. |
309 | monitoring::Counter<2>* GetGraphOptimizationCounter(); |
310 | |
311 | // Updates metrics for time to distribute variables to all TPU hosts. |
312 | void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs); |
313 | |
// Updates the metrics stored about time XLA spends compiling graphs.
315 | void UpdateXlaCompilationTime(const uint64 compilation_time_usecs); |
316 | |
// Updates the metrics stored about time BFC allocator spends during delay.
318 | void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs); |
319 | |
320 | // Increments (by 1) a simple integer counter that is exposed for testing. |
321 | void IncrementTestCounter(const string& name, const string& label); |
322 | |
323 | // Read-only access to a counter for testing. |
324 | const monitoring::CounterCell* TestCounter(const string& name, |
325 | const string& label); |
326 | |
// Read-only wrapper for a TestCounter to track increments between calls.
//
// Member functions are defined outside this header; the notes below are
// inferred from the declarations and should be confirmed there.
class TestDelta {
 public:
  // NOTE(review): presumably binds to the test counter cell identified by
  // `name` and `label` (the same key pair TestCounter() takes) -- confirm.
  TestDelta(const string& name, const string& label);
  // NOTE(review): presumably re-baselines `last_value_` to the cell's current
  // value so that later Get() calls measure from this point -- confirm.
  void Reset();
  // NOTE(review): presumably returns how much the cell has grown since the
  // last baseline -- confirm.
  int64 Get();

 private:
  // Observed counter cell; pointer-to-const, so it is not modified through
  // this wrapper.
  const monitoring::CounterCell* cell_;
  // Baseline value used to compute deltas (see notes above).
  int64 last_value_;
};

// NOTE(review): judging by the names, this updates a counter of TPU errors
// keyed by the op (`op`) and the kind of error (`error_type`). The
// implementation is not in this header -- confirm.
void UpdateTpuErrorCounter(const string& op, const string& error_type);
// NOTE(review): judging by the names, this updates a counter of eager client
// errors keyed by where the error came from (`error_source`) and the kind of
// error (`error_type`). The implementation is not in this header -- confirm.
void UpdateEagerClientErrorCounter(const string& error_source,
                                   const string& error_type);
341 | |
342 | } // namespace metrics |
343 | } // namespace tensorflow |
344 | |
345 | #endif // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ |
346 | |