/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/metrics.h"

#include <cstdint>
#include <string>

#include "absl/strings/str_cat.h"
#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/gauge.h"
#include "tensorflow/core/lib/monitoring/sampler.h"
#include "tensorflow/core/protobuf/data_service.pb.h"

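// Defines the process-wide metric objects behind the recording functions
// declared in metrics.h.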
namespace tensorflow {
namespace metrics {
namespace {
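// The template argument of Counter/Gauge/Sampler is the number of string
// labels the metric carries; label values are supplied later via GetCell().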

auto* graph_runs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_runs",
    "The number of graph executions used to collect "
    "/tensorflow/core/graph_run_time_usecs");

auto* graph_run_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_run_time_usecs",
    "The total time spent on executing graphs in microseconds.");

auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_time_usecs_histogram",
     "The wall-clock time spent on executing graphs in microseconds."},
    // Power of 2 with bucket count 20 (> 17 minutes)
    {monitoring::Buckets::Exponential(1000, 2, 20)});

auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_pending_queue_length_histogram",
     "The number of pending (ready but not running) tasks in the graph "
     "executor."},
    // Power of 1.5 with bucket count 30 (> 191k)
    {monitoring::Buckets::Exponential(1, 1.5, 30)});

auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_input_tensor_bytes",
     "The size of input tensors in bytes."},
    // Power of 4 with bucket count 14 (256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});

auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
    {"/tensorflow/core/graph_run_output_tensor_bytes",
     "The size of output tensors in bytes."},
    // Power of 4 with bucket count 14 (256MB)
    {monitoring::Buckets::Exponential(1, 4, 14)});

auto* graph_unused_outputs = monitoring::Counter<1>::New(
    "/tensorflow/core/graph_unused_outputs",
    "The number of unused outputs for ops of a given type.", "name");

auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/autotune", "tf.data autotuning", "name");

auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_consumed",
    "The number of bytes consumed by a tf.data Dataset.", "name");

auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_produced",
    "The number of bytes produced by a tf.data Dataset.", "name");

auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_read",
    "The number of bytes read by tf.data Dataset sources.", "name");

auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/bytes_fetched",
    "The number of bytes fetched from a tf.data Dataset iterator.");

auto* tf_data_elements_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/elements", "tf.data elements", "name");

auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/experiment",
    "The number of times a tf.data experiment is applied to input pipelines.",
    "name");

auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");

auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/getnext_duration",
     "Microseconds spent fetching an element from a tf.data iterator."},
    // Power of 2 with bucket count 10 (1024 microseconds) and 1 second.
    {monitoring::Buckets::Explicit(
        {2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});

auto* tf_data_used_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/used_vs_budget_ratio",
     "Ratio of tf.data used RAM over the RAM budget when running "
     "optimization."},
    // Uniform linear buckets with count 10 from 0 to 2
    {monitoring::Buckets::Explicit(
        {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});

auto* tf_data_buffered_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/buffered_vs_budget_ratio",
     "Ratio of tf.data max buffer bytes over the RAM budget when running "
     "optimization."},
    // Uniform linear buckets with count 10 from 0 to 2
    {monitoring::Buckets::Explicit(
        {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});

auto* tf_data_iterator_busy_counter =
    monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
                                "The time (in microseconds) during which a "
                                "tf.data iterator was busy processing at "
                                "least one `GetNext()` request.");

auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/iterator_lifetime",
    "The time (in microseconds) between a tf.data iterator receiving the "
    "first `GetNext()` request and responding to the last `GetNext()` "
    "request.");

auto* tf_data_iterator_gap_msec_histogram = monitoring::Sampler<0>::New(
    {"/tensorflow/data/iterator_gap",
     "The time (in milliseconds) between a tf.data iterator responding to a "
     "`GetNext()` request and receiving the next `GetNext()` request."},
    // Power of 1.5 with bucket count of 20 (from 1 msec to about 2.2 secs).
    {monitoring::Buckets::Exponential(1, 1.5, 20)});

auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/optimization", "tf.data optimization", "name");

auto* tf_data_service_workers_created_counter =
    monitoring::Counter<0>::New("/tensorflow/data/service/workers_created",
                                "Number of tf.data service workers created.");

auto* tf_data_service_jobs_created_counter = monitoring::Counter<2>::New(
    "/tensorflow/data/service/jobs_created", "Number of tf.data service jobs.",
    "processing_mode", "coordinated_read");

auto* tf_data_service_client_iterators_counter = monitoring::Counter<4>::New(
    "/tensorflow/data/service/client_iterators",
    "Number of tf.data service client iterators created.", "worker_uid",
    "deployment_mode", "processing_mode", "is_coordinated_read");

auto* tf_data_service_cross_trainer_cache_queries_counter =
    monitoring::Counter<1>::New(
        "/tensorflow/data/service/cross_trainer_cache_queries",
        "Counter of tf.data service cross-trainer cache queries. The result "
        "can be a hit or a miss.",
        "cache_hit");

auto* tf_data_service_cross_trainer_cache_size_bytes =
    monitoring::Gauge<int64_t, 0>::New(
        "/tensorflow/data/service/cross_trainer_cache_size_bytes",
        "tf.data service cross-trainer cache memory usage in bytes.");

auto* tf_data_filename_counter = monitoring::Counter<2>::New(
    "/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
    "name", "filename");

auto* tf_data_model_gauge =
    monitoring::Gauge<std::function<std::string()>, 1>::New(
        "/tensorflow/data/model", "tf.data autotuning model proto.", "id");

auto* tf_data_auto_shard = monitoring::Gauge<int64, 2>::New(
    "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id",
    "name");

auto* tf_data_auto_shard_rewrite_batch_size_eligible =
    monitoring::Counter<1>::New(
        "/tensorflow/data/autoshard_rewrite_batch_size/eligible",
        "Whether tf.data pipelines are eligible for autoshard "
        "to rewrite the batch size.",
        "eligible");

auto* tf_data_auto_shard_rewrite_batch_size_reason =
    monitoring::Counter<1>::New(
        "/tensorflow/data/autoshard_rewrite_batch_size/reason",
        "The reasons that tf.data pipelines are ineligible for autoshard "
        "to rewrite the batch size.",
        "reason");

auto* tf_data_autotune_stopping_criteria_counter =
    monitoring::Counter<1>::New("/tensorflow/data/autotune_stopping_criteria",
                                "The number of times each tf.data autotune "
                                "algorithm stopping criterion is met.",
                                "name");

auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/dense_feature",
    "The number of dense features parsed by ops for parsing tf.Example.");

auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/sparse_feature",
    "The number of sparse features parsed by ops for parsing tf.Example.");

auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/ragged_feature",
    "The number of ragged features parsed by ops for parsing tf.Example.");

auto* build_graph_calls = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_calls",
    "The number of times TensorFlow has created a new client graph. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. Building a client "
    "graph includes time spent optimizing the graph with Grappler, and time "
    "spent pruning the sub-graph.");

auto* build_graph_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/graph_build_time_usecs",
    "The amount of time TensorFlow has spent creating new client graphs in "
    "microseconds. "
    "A client graph is a sub-graph of the full graph, induced by a set of "
    "options, including the requested feeds and fetches. It includes time "
    "spent optimizing the graph with Grappler, and time spent pruning the "
    "sub-graph.");

auto* function_graph_optimization_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/function_graph_optimization_time_usecs",
    "The amount of time TensorFlow has spent optimizing function graphs, in "
    "microseconds.");

auto* xla_compilations = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilations",
    "The number of XLA compilations used to collect "
    "/tensorflow/core/xla_compilation_time_usecs");

auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/core/xla_compilation_time_usecs",
    "The total time spent on compiling XLA graphs in microseconds.");

auto* xla_tpu_spmd_cores_per_replica = monitoring::Counter<1>::New(
    "/tensorflow/tpu/xla_spmd_cores_per_replica",
    "The number of cores used by XLA SPMD-replicated models.", "cores");

auto* bfc_allocator_delay =
    monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
                                "The total time memory allocation requests "
                                "have been delayed inside the BFC allocator, "
                                "in microseconds.");

auto* tpu_variable_distribution_time_usecs = monitoring::Counter<0>::New(
    "/tensorflow/tpu/variable_distribution_time",
    "Time spent sending variables from the primary task to other worker tasks "
    "at the start of a call to TPUExecute. The timer starts at RunGraph "
    "invocation and ends when the TPUExecute args are ready on the current "
    "task.");

auto* test_counters =
    monitoring::Counter<2>::New("/tensorflow/core/test_counters",
                                "Counters used for testing.", "name", "label");

}  // namespace

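// Unlike the metrics above, these are defined outside the anonymous namespace
// and therefore have external linkage.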
auto* tpu_op_error_counter = monitoring::Counter<2>::New(
    "/tensorflow/tpu/op_error_count",
    "Count of TPU-related errors, by op and error_type.", "op", "error_type");

auto* eager_client_error_counter = monitoring::Counter<2>::New(
    "/tensorflow/core/eager_client_error_count",
    "Central count of eager client errors, by error source and type.",
    "error_source", "error_type");

monitoring::Counter<2>* GetGraphOptimizationCounter() {
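  // Function-local static so the counter is created on first use rather than
  // during static initialization.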
  static auto* graph_optimization_counter =
      monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
                                  "The total time spent running each graph "
                                  "optimization pass in microseconds.",
                                  "kind", "name");
  return graph_optimization_counter;
}

void RecordTFDataAutotune(const string& name) {
  tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
}

monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
  return tf_data_bytes_consumed_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
  return tf_data_bytes_produced_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
  return tf_data_bytes_read_counter->GetCell(name);
}

monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
  return tf_data_elements_counter->GetCell(name);
}

monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
    const string& id) {
  return tf_data_model_gauge->GetCell(id);
}

void RecordTFDataBytesFetched(int64_t num_bytes) {
  tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
}

void RecordTFDataExperiment(const string& name) {
  tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
}

void RecordTFDataFingerprint(const string& name) {
  tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
}

void RecordTFDataGetNextDuration(uint64 duration_us) {
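  // Cache the cell in a function-local static: a Sampler<0> has exactly one
  // cell, so the lookup only needs to happen once.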
  static auto* tf_data_get_next_duration_cell =
      tf_data_get_next_duration_usecs_histogram->GetCell();
  tf_data_get_next_duration_cell->Add(duration_us);
}

void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio) {
  static auto* tf_data_used_vs_budget_ratio_histogram_cell =
      tf_data_used_vs_budget_ratio_histogram->GetCell();
  tf_data_used_vs_budget_ratio_histogram_cell->Add(ratio);
}

void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio) {
  static auto* tf_data_buffered_vs_budget_ratio_histogram_cell =
      tf_data_buffered_vs_budget_ratio_histogram->GetCell();
  tf_data_buffered_vs_budget_ratio_histogram_cell->Add(ratio);
}

void RecordTFDataIteratorBusy(uint64 duration_us) {
  static auto* tf_data_iterator_busy_cell =
      tf_data_iterator_busy_counter->GetCell();
  tf_data_iterator_busy_cell->IncrementBy(duration_us);
}

void RecordTFDataIteratorLifetime(uint64 duration_us) {
  static auto* tf_data_iterator_lifetime_cell =
      tf_data_iterator_lifetime_counter->GetCell();
  tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
}

void RecordTFDataIteratorGap(uint64 duration_us) {
  static auto* tf_data_iterator_gap_msec_histogram_cell =
      tf_data_iterator_gap_msec_histogram->GetCell();
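  // The histogram is bucketed in milliseconds; convert from microseconds.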
  tf_data_iterator_gap_msec_histogram_cell->Add(duration_us * 0.001);
}

void RecordTFDataOptimization(const string& name, int64_t num_changes) {
  tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
}

void RecordTFDataServiceWorkerCreated() {
  tf_data_service_workers_created_counter->GetCell()->IncrementBy(1);
}

void RecordTFDataServiceJobsCreated(
    const tensorflow::data::ProcessingModeDef& processing_mode,
    bool is_coordinated_read) {
  const std::string sharding_policy_str =
      data::ProcessingModeDef::ShardingPolicy_Name(
          processing_mode.sharding_policy());
  const std::string coordinated_read_str =
      is_coordinated_read ? "true" : "false";
  tf_data_service_jobs_created_counter
      ->GetCell(sharding_policy_str, coordinated_read_str)
      ->IncrementBy(1);
}

void RecordTFDataServiceClientIterators(
    int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode,
    const tensorflow::data::ProcessingModeDef& processing_mode,
    bool is_coordinated_read) {
  const std::string deployment_mode_str =
      tensorflow::data::DeploymentMode_Name(deployment_mode);
  const std::string sharding_policy_str =
      data::ProcessingModeDef::ShardingPolicy_Name(
          processing_mode.sharding_policy());
  const std::string coordinated_read_str =
      is_coordinated_read ? "true" : "false";
  tf_data_service_client_iterators_counter
      ->GetCell(absl::StrCat(worker_uid), deployment_mode_str,
                sharding_policy_str, coordinated_read_str)
      ->IncrementBy(1);
}

void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit) {
  std::string cache_hit_str = cache_hit ? "true" : "false";
  tf_data_service_cross_trainer_cache_queries_counter->GetCell(cache_hit_str)
      ->IncrementBy(1);
}

void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes) {
  tf_data_service_cross_trainer_cache_size_bytes->GetCell()->Set(
      static_cast<int64_t>(bytes));
}

void RecordTFDataFilename(const string& name, const string& filename) {
  tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
}

void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
                           int64 num_workers, int64 num_replicas) {
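  // Each statistic gets its own gauge cell, keyed by the pipeline id and the
  // statistic name.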
  tf_data_auto_shard->GetCell(id, "policy")->Set(static_cast<int64_t>(policy));
  tf_data_auto_shard->GetCell(id, "num_workers")->Set(num_workers);
  tf_data_auto_shard->GetCell(id, "num_replicas")->Set(num_replicas);
}

void RecordTFDataAutoShardRewriteBatchSize(
    bool eligible, const std::vector<string>& ineligible_reason) {
  tf_data_auto_shard_rewrite_batch_size_eligible
      ->GetCell(eligible ? "true" : "false")
      ->IncrementBy(1);
  for (const string& reason : ineligible_reason) {
    tf_data_auto_shard_rewrite_batch_size_reason->GetCell(reason)->IncrementBy(
        1);
  }
}

void RecordTFDataAutotuneStoppingCriteria(const string& name) {
  tf_data_autotune_stopping_criteria_counter->GetCell(name)->IncrementBy(1);
}

void RecordParseDenseFeature(int64 num_features) {
  static auto* parse_dense_feature_counter_cell =
      parse_dense_feature_counter->GetCell();
  parse_dense_feature_counter_cell->IncrementBy(num_features);
}

void RecordParseSparseFeature(int64_t num_features) {
  static auto* parse_sparse_feature_counter_cell =
      parse_sparse_feature_counter->GetCell();
  parse_sparse_feature_counter_cell->IncrementBy(num_features);
}

void RecordParseRaggedFeature(int64_t num_features) {
  static auto* parse_ragged_feature_counter_cell =
      parse_ragged_feature_counter->GetCell();
  parse_ragged_feature_counter_cell->IncrementBy(num_features);
}

void RecordGraphInputTensors(const size_t size) {
  static auto* graph_run_input_tensor_bytes_cell =
      graph_run_input_tensor_bytes->GetCell();
  graph_run_input_tensor_bytes_cell->Add(size);
}

void RecordGraphOutputTensors(const size_t size) {
  static auto* graph_run_output_tensor_bytes_cell =
      graph_run_output_tensor_bytes->GetCell();
  graph_run_output_tensor_bytes_cell->Add(size);
}

void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica) {
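  // The stringified core count is the label value, so each distinct replica
  // size gets its own cell.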
  xla_tpu_spmd_cores_per_replica->GetCell(absl::StrCat(cores_per_replica))
      ->IncrementBy(1);
}

void UpdateGraphExecTime(const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    static auto* graph_runs_cell = graph_runs->GetCell();
    static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
    static auto* graph_run_time_usecs_histogram_cell =
        graph_run_time_usecs_histogram->GetCell();
    graph_runs_cell->IncrementBy(1);
    graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
    graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
  }
}

void UpdateGraphPendingQueueLength(uint64 len) {
  static auto* graph_pending_queue_length_cell =
      graph_pending_queue_length_histogram->GetCell();
  graph_pending_queue_length_cell->Add(len);
}

void UpdateGraphBuildTime(const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    static auto* build_graph_calls_cell = build_graph_calls->GetCell();
    static auto* build_graph_time_usecs_cell =
        build_graph_time_usecs->GetCell();
    build_graph_calls_cell->IncrementBy(1);
    build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
  }
}

void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs) {
  if (running_time_usecs > 0) {
    static auto* function_graph_optimization_time_usecs_cell =
        function_graph_optimization_time_usecs->GetCell();
    function_graph_optimization_time_usecs_cell->IncrementBy(
        running_time_usecs);
  }
}

void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) {
  if (distribution_time_usecs > 0) {
    tpu_variable_distribution_time_usecs->GetCell()->IncrementBy(
        distribution_time_usecs);
  }
}

void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
  if (compilation_time_usecs > 0) {
    static auto* xla_compilations_cell = xla_compilations->GetCell();
    static auto* xla_compilation_time_usecs_cell =
        xla_compilation_time_usecs->GetCell();
    xla_compilations_cell->IncrementBy(1);
    xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
  }
}

void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
  static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
  if (delay_usecs > 0) {
    bfc_allocator_delay_cell->IncrementBy(delay_usecs);
  }
}

void RecordUnusedOutput(const string& op_name) {
  graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
}

void IncrementTestCounter(const string& name, const string& label) {
  test_counters->GetCell(name, label)->IncrementBy(1);
}

const monitoring::CounterCell* TestCounter(const string& name,
                                           const string& label) {
  return test_counters->GetCell(name, label);
}

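// TestDelta snapshots the counter value at construction (and on Reset()) so
// that Get() reports only the change since the last snapshot.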
TestDelta::TestDelta(const string& name, const string& label)
    : cell_(TestCounter(name, label)) {
  Reset();
}

void TestDelta::Reset() { last_value_ = cell_->value(); }

int64 TestDelta::Get() { return cell_->value() - last_value_; }

void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
                                         const std::string& bridge_version,
                                         bool fallback_enabled,
                                         const std::string& result) {
  static auto* metric = monitoring::Counter<4>::New(
      "/tensorflow/core/tf_mlir_bridge_first_phase_count",
      "Tracks processing state in the first phase of the MLIR bridge.",
      "device", "version", "fallback", "result");
  std::string fallback_status =
      fallback_enabled ? "fallback_enabled" : "fallback_disabled";
  metric->GetCell(device_type, bridge_version, fallback_status, result)
      ->IncrementBy(1);
}

void UpdateTpuErrorCounter(const string& op, const string& error_type) {
  tpu_op_error_counter->GetCell(op, error_type)->IncrementBy(1);
}

void UpdateEagerClientErrorCounter(const string& error_source,
                                   const string& error_type) {
  eager_client_error_counter->GetCell(error_source, error_type)->IncrementBy(1);
}

void UpdateTfMlirBridgeGraphAnalysisPerOp(
    const std::string& op_name, const std::string& construction_context,
    bool is_single_core_inference_mode, const std::string& num_replicas,
    const std::string& num_cores_per_replica, const std::string& use_tpu,
    const std::string& allow_soft_placement,
    const std::string& use_spmd_for_xla_partitioning,
    const std::string& unsupported_reason, bool has_unsupported_features) {
  static auto* metric = monitoring::Counter<10>::New(
      "/tensorflow/core/tf_mlir_bridge_graph_analysis_per_op",
      "Tracks processing state per op in the first phase of the MLIR bridge.",
      "op_name", "construction_context", "is_single_core_inference_mode",
      "num_replicas", "num_cores_per_replica", "use_tpu",
      "allow_soft_placement", "use_spmd_for_xla_partitioning",
      "unsupported_reason", "has_unsupported_features");

  metric
      ->GetCell(op_name, construction_context,
                is_single_core_inference_mode ? "Yes" : "No", num_replicas,
                num_cores_per_replica, use_tpu, allow_soft_placement,
                use_spmd_for_xla_partitioning, unsupported_reason,
                has_unsupported_features ? "Yes" : "No")
      ->IncrementBy(1);
}

}  // namespace metrics
}  // namespace tensorflow