/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 | |
#include "tensorflow/core/framework/metrics.h"

#include <cstdint>
#include <string>

#include "absl/strings/str_cat.h"
#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/gauge.h"
#include "tensorflow/core/lib/monitoring/sampler.h"
#include "tensorflow/core/protobuf/data_service.pb.h"
26 | |
27 | namespace tensorflow { |
28 | namespace metrics { |
29 | namespace { |
30 | |
31 | auto* graph_runs = monitoring::Counter<0>::New( |
32 | "/tensorflow/core/graph_runs" , |
33 | "The number of graph executions used to collect " |
34 | "/tensorflow/core/graph_run_time_usecs" ); |
35 | |
36 | auto* graph_run_time_usecs = monitoring::Counter<0>::New( |
37 | "/tensorflow/core/graph_run_time_usecs" , |
38 | "The total time spent on executing graphs in microseconds." ); |
39 | |
40 | auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New( |
41 | {"/tensorflow/core/graph_run_time_usecs_histogram" , |
42 | "The wall-clock time spent on executing graphs in microseconds." }, |
43 | // Power of 2 with bucket count 20 (> 17 minutes) |
44 | {monitoring::Buckets::Exponential(1000, 2, 20)}); |
45 | |
46 | auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New( |
47 | {"/tensorflow/core/graph_pending_queue_length_histogram" , |
48 | "The number of pending (ready but not running) tasks in graph executor." }, |
49 | // Power of 1.5 with bucket count 30 (> 191k) |
50 | {monitoring::Buckets::Exponential(1, 1.5, 30)}); |
51 | |
52 | auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New( |
53 | {"/tensorflow/core/graph_run_input_tensor_bytes" , |
54 | "The size of input tensors in bytes." }, |
55 | // Power of 2 with bucket count 14 (256MB) |
56 | {monitoring::Buckets::Exponential(1, 4, 14)}); |
57 | |
58 | auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New( |
59 | {"/tensorflow/core/graph_run_output_tensor_bytes" , |
60 | "The size of output tensors in bytes." }, |
61 | // Power of 2 with bucket count 14 (256MB) |
62 | {monitoring::Buckets::Exponential(1, 4, 14)}); |
63 | |
64 | auto* graph_unused_outputs = monitoring::Counter<1>::New( |
65 | "/tensorflow/core/graph_unused_outputs" , |
66 | "The number of unused outputs for ops of a given type." , "name" ); |
67 | |
68 | auto* tf_data_autotune_counter = monitoring::Counter<1>::New( |
69 | "/tensorflow/data/autotune" , "tf.data autotuning" , "name" ); |
70 | |
71 | auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New( |
72 | "/tensorflow/data/bytes_consumed" , |
73 | "The number of bytes consumed by a tf.data Dataset." , "name" ); |
74 | |
75 | auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New( |
76 | "/tensorflow/data/bytes_produced" , |
77 | "The number of bytes produced by a tf.data Dataset." , "name" ); |
78 | |
79 | auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New( |
80 | "/tensorflow/data/bytes_read" , |
81 | "The number of bytes read by tf.data Dataset sources." , "name" ); |
82 | |
83 | auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New( |
84 | "/tensorflow/data/bytes_fetched" , |
85 | "The number of bytes fetched from tf.data Dataset iterator." ); |
86 | |
87 | auto* tf_data_elements_counter = monitoring::Counter<1>::New( |
88 | "/tensorflow/data/elements" , "tf.data elements" , "name" ); |
89 | |
90 | auto* tf_data_experiment_counter = monitoring::Counter<1>::New( |
91 | "/tensorflow/data/experiment" , |
92 | "The number of times tf.data experiment is applied to input pipelines." , |
93 | "name" ); |
94 | |
95 | auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New( |
96 | "/tensorflow/data/fingerprint" , "tf.data fingerprint" , "name" ); |
97 | |
98 | auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New( |
99 | {"/tensorflow/data/getnext_duration" , |
100 | "Microseconds spent fetching an element from tf.data iterator." }, |
101 | // Power of 2 with bucket count 10 (1024 microseconds) and 1 second. |
102 | {monitoring::Buckets::Explicit( |
103 | {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})}); |
104 | |
105 | auto* tf_data_used_vs_budget_ratio_histogram = monitoring::Sampler<0>::New( |
106 | {"/tensorflow/data/used_vs_budget_ratio" , |
107 | "Ratio of tf.data used ram over ram budget when running optimization." }, |
108 | // Uniform linear buckets with count 10 from 0 to 2 |
109 | {monitoring::Buckets::Explicit( |
110 | {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})}); |
111 | |
112 | auto* tf_data_buffered_vs_budget_ratio_histogram = monitoring::Sampler<0>::New( |
113 | {"/tensorflow/data/buffered_vs_budget_ratio" , |
114 | "Ratio of tf.data max buffer bytes over ram budget when running " |
115 | "optimization." }, |
116 | // Uniform linear buckets with count 10 from 0 to 2 |
117 | {monitoring::Buckets::Explicit( |
118 | {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})}); |
119 | |
120 | auto* tf_data_iterator_busy_counter = |
121 | monitoring::Counter<0>::New("/tensorflow/data/iterator_busy" , |
122 | "The time (in microseconds) during which a " |
123 | "tf.data iterator was busy processing at " |
124 | "least one `GetNext()` request." ); |
125 | |
126 | auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New( |
127 | "/tensorflow/data/iterator_lifetime" , |
128 | "The time (in microseconds) between a tf.data iterator receiving the first " |
129 | "`GetNext()` request and responding to the last `GetNext()` request." ); |
130 | |
131 | auto* tf_data_iterator_gap_msec_histogram = monitoring::Sampler<0>::New( |
132 | {"/tensorflow/data/iterator_gap" , |
133 | "The time (in milliseconds) between a tf.data iterator responding to a " |
134 | "`GetNext()` request and receiving the next `GetNext()` request." }, |
135 | // Power of 1.5 with bucket count of 20 (from 1 msec to about 2.2 secs). |
136 | {monitoring::Buckets::Exponential(1, 1.5, 20)}); |
137 | |
138 | auto* tf_data_optimization_counter = monitoring::Counter<1>::New( |
139 | "/tensorflow/data/optimization" , "tf.data optimization" , "name" ); |
140 | |
141 | auto* tf_data_service_workers_created_counter = |
142 | monitoring::Counter<0>::New("/tensorflow/data/service/workers_created" , |
143 | "Number of tf.data service workers created" ); |
144 | |
145 | auto* tf_data_service_jobs_created_counter = monitoring::Counter<2>::New( |
146 | "/tensorflow/data/service/jobs_created" , "Number of tf.data service jobs." , |
147 | "processing_mode" , "coordinated_read" ); |
148 | |
149 | auto* tf_data_service_client_iterators_counter = monitoring::Counter<4>::New( |
150 | "/tensorflow/data/service/client_iterators" , |
151 | "Number of tf.data service client iterators created." , "worker_uid" , |
152 | "deployment_mode" , "processing_mode" , "is_coordinated_read" ); |
153 | |
154 | auto* tf_data_service_cross_trainer_cache_queries_counter = |
155 | monitoring::Counter<1>::New( |
156 | "/tensorflow/data/service/cross_trainer_cache_queries" , |
157 | "tf.data service cross-trainer cache queries counter. The result can " |
158 | "be hit or miss." , |
159 | "cache_hit" ); |
160 | |
161 | auto* tf_data_service_cross_trainer_cache_size_bytes = |
162 | monitoring::Gauge<int64_t, 0>::New( |
163 | "/tensorflow/data/service/cross_trainer_cache_size_bytes" , |
164 | "tf.data service cross-trainer cache memory usage in bytes." ); |
165 | |
166 | auto* tf_data_filename_counter = monitoring::Counter<2>::New( |
167 | "/tensorflow/data/filename" , "The file name read by a tf.data Dataset." , |
168 | "name" , "filename" ); |
169 | |
170 | auto* tf_data_model_gauge = |
171 | monitoring::Gauge<std::function<std::string()>, 1>::New( |
172 | "/tensorflow/data/model" , "tf.data autotuning model proto." , "id" ); |
173 | |
174 | auto* tf_data_auto_shard = monitoring::Gauge<int64, 2>::New( |
175 | "/tensorflow/data/autoshard" , "tf.data autoshard statistics." , "id" , |
176 | "name" ); |
177 | |
178 | auto* tf_data_auto_shard_rewrite_batch_size_eligible = |
179 | monitoring::Counter<1>::New( |
180 | "/tensorflow/data/autoshard_rewrite_batch_size/eligible" , |
181 | "Whether tf.data pipelines that are eligible for autoshard " |
182 | "to rewrite the batch size." , |
183 | "eligible" ); |
184 | |
185 | auto* tf_data_auto_shard_rewrite_batch_size_reason = |
186 | monitoring::Counter<1>::New( |
187 | "/tensorflow/data/autoshard_rewrite_batch_size/reason" , |
188 | "The reasons that tf.data pipelines are ineligible for autoshard " |
189 | "to rewrite the batch size." , |
190 | "reason" ); |
191 | |
192 | auto* tf_data_autotune_stopping_criteria_counter = |
193 | monitoring::Counter<1>::New("/tensorflow/data/autotune_stopping_criteria" , |
194 | "The number of times each tf.data autotune " |
195 | "algorithm stopping criterion is met." , |
196 | "name" ); |
197 | |
198 | auto* parse_dense_feature_counter = monitoring::Counter<0>::New( |
199 | "/tensorflow/data/dense_feature" , |
200 | "The number of dense features parsed by ops for parsing tf.Example." ); |
201 | |
202 | auto* parse_sparse_feature_counter = monitoring::Counter<0>::New( |
203 | "/tensorflow/data/sparse_feature" , |
204 | "The number of sparse features parsed by ops for parsing tf.Example." ); |
205 | |
206 | auto* parse_ragged_feature_counter = monitoring::Counter<0>::New( |
207 | "/tensorflow/data/ragged_feature" , |
208 | "The number of ragged features parsed by ops for parsing tf.Example." ); |
209 | |
210 | auto* build_graph_calls = monitoring::Counter<0>::New( |
211 | "/tensorflow/core/graph_build_calls" , |
212 | "The number of times TensorFlow has created a new client graph. " |
213 | "A client graph is a sub-graph of the full graph, induced by a set of " |
214 | "options, including the requested feeds and fetches. It includes time " |
215 | "spent optimizing the graph with Grappler, and time spent pruning the " |
216 | "sub-graph." ); |
217 | |
218 | auto* build_graph_time_usecs = monitoring::Counter<0>::New( |
219 | "/tensorflow/core/graph_build_time_usecs" , |
220 | "The amount of time TensorFlow has spent creating new client graphs in " |
221 | "microseconds. " |
222 | "A client graph is a sub-graph of the full graph, induced by a set of " |
223 | "options, including the requested feeds and fetches. It includes time " |
224 | "spent optimizing the graph with Grappler, and time spent pruning the " |
225 | "sub-graph." ); |
226 | |
227 | auto* function_graph_optimization_time_usecs = monitoring::Counter<0>::New( |
228 | "/tensorflow/core/function_graph_optimization_time_usecs" , |
229 | "The amount of time TensorFlow has spent optimizing function graphs, in " |
230 | "microseconds. " ); |
231 | |
232 | auto* xla_compilations = monitoring::Counter<0>::New( |
233 | "/tensorflow/core/xla_compilations" , |
234 | "The number of XLA compilations used to collect " |
235 | "/tensorflow/core/xla_compilation_time_usecs" ); |
236 | |
237 | auto* xla_compilation_time_usecs = monitoring::Counter<0>::New( |
238 | "/tensorflow/core/xla_compilation_time_usecs" , |
239 | "The total time spent on compiling XLA graphs in microseconds." ); |
240 | |
241 | auto* xla_tpu_spmd_cores_per_replica = monitoring::Counter<1>::New( |
242 | "/tensorflow/tpu/xla_spmd_cores_per_replica" , |
243 | "The number of cores used by XLA SPMD-replicated models." , "cores" ); |
244 | |
245 | auto* bfc_allocator_delay = |
246 | monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay" , |
247 | "The total time spent running each graph " |
248 | "optimization pass in microseconds." ); |
249 | |
250 | auto* tpu_variable_distribution_time_usecs = monitoring::Counter<0>::New( |
251 | "/tensorflow/tpu/variable_distribution_time" , |
252 | "Time spent sending variables from primary task to other worker tasks " |
253 | "at the start of a call to TPUExecute. Timer starts at RunGraph " |
254 | "invocation and ends when TPUExecute args are ready on the current task." ); |
255 | |
256 | auto* test_counters = |
257 | monitoring::Counter<2>::New("/tensorflow/core/test_counters" , |
258 | "Counters used for testing." , "name" , "label" ); |
259 | |
260 | } // namespace |
261 | |
262 | auto* tpu_op_error_counter = monitoring::Counter<2>::New( |
263 | "/tensorflow/tpu/op_error_count" , |
264 | "Count the tpu related errors by op and error_type." , "op" , "error_type" ); |
265 | |
266 | auto* eager_client_error_counter = monitoring::Counter<2>::New( |
267 | "/tensorflow/core/eager_client_error_count" , |
268 | "Count the errors in eager client as a central place." , "error_source" , |
269 | "error_type" ); |
270 | |
271 | monitoring::Counter<2>* GetGraphOptimizationCounter() { |
272 | static auto* graph_optimization_counter = |
273 | monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs" , |
274 | "The total time spent running each graph " |
275 | "optimization pass in microseconds." , |
276 | "kind" , "name" ); |
277 | return graph_optimization_counter; |
278 | } |
279 | |
280 | void RecordTFDataAutotune(const string& name) { |
281 | tf_data_autotune_counter->GetCell(name)->IncrementBy(1); |
282 | } |
283 | |
284 | monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) { |
285 | return tf_data_bytes_consumed_counter->GetCell(name); |
286 | } |
287 | |
288 | monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) { |
289 | return tf_data_bytes_produced_counter->GetCell(name); |
290 | } |
291 | |
292 | monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) { |
293 | return tf_data_bytes_read_counter->GetCell(name); |
294 | } |
295 | |
296 | monitoring::CounterCell* GetTFDataElementsCounter(const string& name) { |
297 | return tf_data_elements_counter->GetCell(name); |
298 | } |
299 | |
300 | monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge( |
301 | const string& id) { |
302 | return tf_data_model_gauge->GetCell(id); |
303 | } |
304 | |
305 | void RecordTFDataBytesFetched(int64_t num_bytes) { |
306 | tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes); |
307 | } |
308 | |
309 | void RecordTFDataExperiment(const string& name) { |
310 | tf_data_experiment_counter->GetCell(name)->IncrementBy(1); |
311 | } |
312 | |
313 | void RecordTFDataFingerprint(const string& name) { |
314 | tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1); |
315 | } |
316 | |
317 | void RecordTFDataGetNextDuration(uint64 duration_us) { |
318 | static auto* tf_data_get_next_duration_cell = |
319 | tf_data_get_next_duration_usecs_histogram->GetCell(); |
320 | tf_data_get_next_duration_cell->Add(duration_us); |
321 | } |
322 | |
323 | void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio) { |
324 | static auto* tf_data_used_vs_budget_ratio_histogram_cell = |
325 | tf_data_used_vs_budget_ratio_histogram->GetCell(); |
326 | tf_data_used_vs_budget_ratio_histogram_cell->Add(ratio); |
327 | } |
328 | |
329 | void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio) { |
330 | static auto* tf_data_buffered_vs_budget_ratio_histogram_cell = |
331 | tf_data_buffered_vs_budget_ratio_histogram->GetCell(); |
332 | tf_data_buffered_vs_budget_ratio_histogram_cell->Add(ratio); |
333 | } |
334 | |
335 | void RecordTFDataIteratorBusy(uint64 duration_us) { |
336 | static auto* tf_data_iterator_busy_cell = |
337 | tf_data_iterator_busy_counter->GetCell(); |
338 | tf_data_iterator_busy_cell->IncrementBy(duration_us); |
339 | } |
340 | |
341 | void RecordTFDataIteratorLifetime(uint64 duration_us) { |
342 | static auto* tf_data_iterator_lifetime_cell = |
343 | tf_data_iterator_lifetime_counter->GetCell(); |
344 | tf_data_iterator_lifetime_cell->IncrementBy(duration_us); |
345 | } |
346 | |
347 | void RecordTFDataIteratorGap(uint64 duration_us) { |
348 | static auto* tf_data_iterator_gap_msec_histogram_cell = |
349 | tf_data_iterator_gap_msec_histogram->GetCell(); |
350 | tf_data_iterator_gap_msec_histogram_cell->Add(duration_us * 0.001); |
351 | } |
352 | |
353 | void RecordTFDataOptimization(const string& name, int64_t num_changes) { |
354 | tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes); |
355 | } |
356 | |
357 | void RecordTFDataServiceWorkerCreated() { |
358 | tf_data_service_workers_created_counter->GetCell()->IncrementBy(1); |
359 | } |
360 | |
361 | void RecordTFDataServiceJobsCreated( |
362 | const tensorflow::data::ProcessingModeDef& processing_mode, |
363 | bool is_coordinated_read) { |
364 | const std::string sharding_policy_str = |
365 | data::ProcessingModeDef::ShardingPolicy_Name( |
366 | processing_mode.sharding_policy()); |
367 | const std::string coordinated_read_str = |
368 | is_coordinated_read ? "true" : "false" ; |
369 | tf_data_service_jobs_created_counter |
370 | ->GetCell(sharding_policy_str, coordinated_read_str) |
371 | ->IncrementBy(1); |
372 | } |
373 | |
374 | void RecordTFDataServiceClientIterators( |
375 | int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode, |
376 | const tensorflow::data::ProcessingModeDef& processing_mode, |
377 | bool is_coordinated_read) { |
378 | const std::string deployment_mode_str = |
379 | tensorflow::data::DeploymentMode_Name(deployment_mode); |
380 | const std::string sharding_policy_str = |
381 | data::ProcessingModeDef::ShardingPolicy_Name( |
382 | processing_mode.sharding_policy()); |
383 | const std::string coordinated_read_str = |
384 | is_coordinated_read ? "true" : "false" ; |
385 | tf_data_service_client_iterators_counter |
386 | ->GetCell(absl::StrCat(worker_uid), deployment_mode_str, |
387 | sharding_policy_str, coordinated_read_str) |
388 | ->IncrementBy(1); |
389 | } |
390 | |
391 | void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit) { |
392 | std::string cache_hit_str = cache_hit ? "true" : "false" ; |
393 | tf_data_service_cross_trainer_cache_queries_counter->GetCell(cache_hit_str) |
394 | ->IncrementBy(1); |
395 | } |
396 | |
397 | void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes) { |
398 | tf_data_service_cross_trainer_cache_size_bytes->GetCell()->Set( |
399 | static_cast<int64_t>(bytes)); |
400 | } |
401 | |
402 | void RecordTFDataFilename(const string& name, const string& filename) { |
403 | tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1); |
404 | } |
405 | |
406 | void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy, |
407 | int64 num_workers, int64 num_replicas) { |
408 | tf_data_auto_shard->GetCell(id, "policy" )->Set(static_cast<int64_t>(policy)); |
409 | tf_data_auto_shard->GetCell(id, "num_workers" )->Set(num_workers); |
410 | tf_data_auto_shard->GetCell(id, "num_replicas" )->Set(num_replicas); |
411 | } |
412 | |
413 | void RecordTFDataAutoShardRewriteBatchSize( |
414 | bool eligible, const std::vector<string>& ineligible_reason) { |
415 | tf_data_auto_shard_rewrite_batch_size_eligible |
416 | ->GetCell(eligible ? "true" : "false" ) |
417 | ->IncrementBy(1); |
418 | for (const string& reason : ineligible_reason) { |
419 | tf_data_auto_shard_rewrite_batch_size_reason->GetCell(reason)->IncrementBy( |
420 | 1); |
421 | } |
422 | } |
423 | |
424 | void RecordTFDataAutotuneStoppingCriteria(const string& name) { |
425 | tf_data_autotune_stopping_criteria_counter->GetCell(name)->IncrementBy(1); |
426 | } |
427 | |
428 | void RecordParseDenseFeature(int64 num_features) { |
429 | static auto* parse_dense_feature_counter_cell = |
430 | parse_dense_feature_counter->GetCell(); |
431 | parse_dense_feature_counter_cell->IncrementBy(num_features); |
432 | } |
433 | |
434 | void RecordParseSparseFeature(int64_t num_features) { |
435 | static auto* parse_sparse_feature_counter_cell = |
436 | parse_sparse_feature_counter->GetCell(); |
437 | parse_sparse_feature_counter_cell->IncrementBy(num_features); |
438 | } |
439 | |
440 | void RecordParseRaggedFeature(int64_t num_features) { |
441 | static auto* parse_ragged_feature_counter_cell = |
442 | parse_ragged_feature_counter->GetCell(); |
443 | parse_ragged_feature_counter_cell->IncrementBy(num_features); |
444 | } |
445 | |
446 | void RecordGraphInputTensors(const size_t size) { |
447 | static auto* graph_run_input_tensor_bytes_cell = |
448 | graph_run_input_tensor_bytes->GetCell(); |
449 | graph_run_input_tensor_bytes_cell->Add(size); |
450 | } |
451 | |
452 | void RecordGraphOutputTensors(const size_t size) { |
453 | static auto* graph_run_output_tensor_bytes_cell = |
454 | graph_run_output_tensor_bytes->GetCell(); |
455 | graph_run_output_tensor_bytes_cell->Add(size); |
456 | } |
457 | |
458 | void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica) { |
459 | xla_tpu_spmd_cores_per_replica->GetCell(absl::StrCat(cores_per_replica)) |
460 | ->IncrementBy(1); |
461 | } |
462 | |
463 | void UpdateGraphExecTime(const uint64 running_time_usecs) { |
464 | if (running_time_usecs > 0) { |
465 | static auto* graph_runs_cell = graph_runs->GetCell(); |
466 | static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell(); |
467 | static auto* graph_run_time_usecs_histogram_cell = |
468 | graph_run_time_usecs_histogram->GetCell(); |
469 | graph_runs_cell->IncrementBy(1); |
470 | graph_run_time_usecs_cell->IncrementBy(running_time_usecs); |
471 | graph_run_time_usecs_histogram_cell->Add(running_time_usecs); |
472 | } |
473 | } |
474 | |
475 | void UpdateGraphPendingQueueLength(uint64 len) { |
476 | static auto* graph_pending_queue_length_cell = |
477 | graph_pending_queue_length_histogram->GetCell(); |
478 | graph_pending_queue_length_cell->Add(len); |
479 | } |
480 | |
481 | void UpdateGraphBuildTime(const uint64 running_time_usecs) { |
482 | if (running_time_usecs > 0) { |
483 | static auto* build_graph_calls_cell = build_graph_calls->GetCell(); |
484 | static auto* build_graph_time_usecs_cell = |
485 | build_graph_time_usecs->GetCell(); |
486 | build_graph_calls_cell->IncrementBy(1); |
487 | build_graph_time_usecs_cell->IncrementBy(running_time_usecs); |
488 | } |
489 | } |
490 | |
491 | void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs) { |
492 | if (running_time_usecs > 0) { |
493 | static auto* function_graph_optimization_time_usecs_cell = |
494 | function_graph_optimization_time_usecs->GetCell(); |
495 | function_graph_optimization_time_usecs_cell->IncrementBy( |
496 | running_time_usecs); |
497 | } |
498 | } |
499 | |
500 | void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) { |
501 | if (distribution_time_usecs > 0) { |
502 | tpu_variable_distribution_time_usecs->GetCell()->IncrementBy( |
503 | distribution_time_usecs); |
504 | } |
505 | } |
506 | |
507 | void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) { |
508 | if (compilation_time_usecs > 0) { |
509 | static auto* xla_compilations_cell = xla_compilations->GetCell(); |
510 | static auto* xla_compilation_time_usecs_cell = |
511 | xla_compilation_time_usecs->GetCell(); |
512 | xla_compilations_cell->IncrementBy(1); |
513 | xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs); |
514 | } |
515 | } |
516 | |
517 | void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) { |
518 | static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell(); |
519 | if (delay_usecs > 0) { |
520 | bfc_allocator_delay_cell->IncrementBy(delay_usecs); |
521 | } |
522 | } |
523 | |
524 | void RecordUnusedOutput(const string& op_name) { |
525 | graph_unused_outputs->GetCell(op_name)->IncrementBy(1); |
526 | } |
527 | |
528 | void IncrementTestCounter(const string& name, const string& label) { |
529 | test_counters->GetCell(name, label)->IncrementBy(1); |
530 | } |
531 | |
532 | const monitoring::CounterCell* TestCounter(const string& name, |
533 | const string& label) { |
534 | return test_counters->GetCell(name, label); |
535 | } |
536 | |
537 | TestDelta::TestDelta(const string& name, const string& label) |
538 | : cell_(TestCounter(name, label)) { |
539 | Reset(); |
540 | } |
541 | |
542 | void TestDelta::Reset() { last_value_ = cell_->value(); } |
543 | |
544 | int64 TestDelta::Get() { return cell_->value() - last_value_; } |
545 | |
546 | void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type, |
547 | const std::string& bridge_version, |
548 | bool fallback_enabled, |
549 | const std::string& result) { |
550 | static auto* metric = monitoring::Counter<4>::New( |
551 | "/tensorflow/core/tf_mlir_bridge_first_phase_count" , |
552 | "Tracks processing state in first phase of mlir bridge" , "device" , |
553 | "version" , "fallback" , "result" ); |
554 | std::string fallback_status = |
555 | fallback_enabled ? "fallback_enabled" : "fallback_disabled" ; |
556 | metric->GetCell(device_type, bridge_version, fallback_status, result) |
557 | ->IncrementBy(1); |
558 | } |
559 | |
560 | void UpdateTpuErrorCounter(const string& op, const string& error_type) { |
561 | tpu_op_error_counter->GetCell(op, error_type)->IncrementBy(1); |
562 | } |
563 | |
564 | void UpdateEagerClientErrorCounter(const string& error_source, |
565 | const string& error_type) { |
566 | eager_client_error_counter->GetCell(error_source, error_type)->IncrementBy(1); |
567 | } |
568 | |
569 | void UpdateTfMlirBridgeGraphAnalysisPerOp( |
570 | const std::string& op_name, const std::string& construction_context, |
571 | bool is_single_core_inference_mode, const std::string& num_replicas, |
572 | const std::string& num_cores_per_replica, const std::string& use_tpu, |
573 | const std::string& allow_soft_placement, |
574 | const std::string& use_spmd_for_xla_partitioning, |
575 | const std::string& unsupported_reason, bool has_unsupported_features) { |
576 | static auto* metric = monitoring::Counter<10>::New( |
577 | "/tensorflow/core/tf_mlir_bridge_graph_analysis_per_op" , |
578 | "Tracks processing state per op in first phase of mlir bridge" , "op_name" , |
579 | "construction_context" , "is_single_core_inference_mode" , "num_replicas" , |
580 | "num_cores_per_replica" , "use_tpu" , "allow_soft_placement" , |
581 | "use_spmd_for_xla_partitioning" , "unsupported_reason" , |
582 | "has_unsupported_features" ); |
583 | |
584 | metric |
585 | ->GetCell(op_name, construction_context, |
586 | is_single_core_inference_mode ? "Yes" : "No" , num_replicas, |
587 | num_cores_per_replica, use_tpu, allow_soft_placement, |
588 | use_spmd_for_xla_partitioning, unsupported_reason, |
589 | has_unsupported_features ? "Yes" : "No" ) |
590 | ->IncrementBy(1); |
591 | } |
592 | |
593 | } // namespace metrics |
594 | } // namespace tensorflow |
595 | |