1/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#include "tensorflow/core/util/debug_events_writer.h"
17
18#include "tensorflow/core/lib/io/path.h"
19#include "tensorflow/core/lib/strings/strcat.h"
20#include "tensorflow/core/lib/strings/stringprintf.h"
21#include "tensorflow/core/platform/host_info.h"
22#include "tensorflow/core/public/version.h"
23
24namespace tensorflow {
25namespace tfdbg {
26
27namespace {
28void MaybeSetDebugEventTimestamp(DebugEvent* debug_event, Env* env) {
29 if (debug_event->wall_time() == 0) {
30 debug_event->set_wall_time(env->NowMicros() / 1e6);
31 }
32}
33} // namespace
34
35SingleDebugEventFileWriter::SingleDebugEventFileWriter(const string& file_path)
36 : env_(Env::Default()),
37 file_path_(file_path),
38 num_outstanding_events_(0),
39 writer_mu_() {}
40
41Status SingleDebugEventFileWriter::Init() {
42 if (record_writer_ != nullptr) {
43 // TODO(cais): We currently don't check for file deletion. When the need
44 // arises, check and fix it.
45 return OkStatus();
46 }
47
48 // Reset recordio_writer (which has a reference to writable_file_) so final
49 // Flush() and Close() call have access to writable_file_.
50 record_writer_.reset();
51
52 TF_RETURN_WITH_CONTEXT_IF_ERROR(
53 env_->NewWritableFile(file_path_, &writable_file_),
54 "Creating writable file ", file_path_);
55 record_writer_.reset(new io::RecordWriter(writable_file_.get()));
56 if (record_writer_ == nullptr) {
57 return errors::Unknown("Could not create record writer at path: ",
58 file_path_);
59 }
60 num_outstanding_events_.store(0);
61 VLOG(1) << "Successfully opened debug events file: " << file_path_;
62 return OkStatus();
63}
64
65void SingleDebugEventFileWriter::WriteSerializedDebugEvent(
66 StringPiece debug_event_str) {
67 if (record_writer_ == nullptr) {
68 if (!Init().ok()) {
69 LOG(ERROR) << "Write failed because file could not be opened.";
70 return;
71 }
72 }
73 num_outstanding_events_.fetch_add(1);
74 {
75 mutex_lock l(writer_mu_);
76 record_writer_->WriteRecord(debug_event_str).IgnoreError();
77 }
78}
79
80Status SingleDebugEventFileWriter::Flush() {
81 const int num_outstanding = num_outstanding_events_.load();
82 if (num_outstanding == 0) {
83 return OkStatus();
84 }
85 if (writable_file_ == nullptr) {
86 return errors::Unknown("Unexpected NULL file for path: ", file_path_);
87 }
88
89 {
90 mutex_lock l(writer_mu_);
91 TF_RETURN_WITH_CONTEXT_IF_ERROR(record_writer_->Flush(), "Failed to flush ",
92 num_outstanding, " debug events to ",
93 file_path_);
94 }
95
96 TF_RETURN_WITH_CONTEXT_IF_ERROR(writable_file_->Sync(), "Failed to sync ",
97 num_outstanding, " debug events to ",
98 file_path_);
99 num_outstanding_events_.store(0);
100 return OkStatus();
101}
102
103Status SingleDebugEventFileWriter::Close() {
104 Status status = Flush();
105 if (writable_file_ != nullptr) {
106 Status close_status = writable_file_->Close();
107 if (!close_status.ok()) {
108 status = close_status;
109 }
110 record_writer_.reset(nullptr);
111 writable_file_.reset(nullptr);
112 }
113 num_outstanding_events_ = 0;
114 return status;
115}
116
117const string SingleDebugEventFileWriter::FileName() { return file_path_; }
118
119mutex DebugEventsWriter::factory_mu_(LINKER_INITIALIZED);
120
121DebugEventsWriter::~DebugEventsWriter() { Close().IgnoreError(); }
122
123// static
124DebugEventsWriter* DebugEventsWriter::GetDebugEventsWriter(
125 const string& dump_root, const string& tfdbg_run_id,
126 int64_t circular_buffer_size) {
127 mutex_lock l(DebugEventsWriter::factory_mu_);
128 std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* writer_pool =
129 DebugEventsWriter::GetDebugEventsWriterMap();
130 if (writer_pool->find(dump_root) == writer_pool->end()) {
131 std::unique_ptr<DebugEventsWriter> writer(
132 new DebugEventsWriter(dump_root, tfdbg_run_id, circular_buffer_size));
133 writer_pool->insert(std::make_pair(dump_root, std::move(writer)));
134 }
135 return (*writer_pool)[dump_root].get();
136}
137
138// static
139Status DebugEventsWriter::LookUpDebugEventsWriter(
140 const string& dump_root, DebugEventsWriter** debug_events_writer) {
141 mutex_lock l(DebugEventsWriter::factory_mu_);
142 std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>* writer_pool =
143 DebugEventsWriter::GetDebugEventsWriterMap();
144 if (writer_pool->find(dump_root) == writer_pool->end()) {
145 return errors::FailedPrecondition(
146 "No DebugEventsWriter has been created at dump root ", dump_root);
147 }
148 *debug_events_writer = (*writer_pool)[dump_root].get();
149 return OkStatus();
150}
151
152Status DebugEventsWriter::Init() {
153 mutex_lock l(initialization_mu_);
154
155 // TODO(cais): We currently don't check for file deletion. When the need
156 // arises, check and fix file deletion.
157 if (is_initialized_) {
158 return OkStatus();
159 }
160
161 if (!env_->IsDirectory(dump_root_).ok()) {
162 TF_RETURN_WITH_CONTEXT_IF_ERROR(env_->RecursivelyCreateDir(dump_root_),
163 "Failed to create directory ", dump_root_);
164 }
165
166 int64_t time_in_seconds = env_->NowMicros() / 1e6;
167 file_prefix_ = io::JoinPath(
168 dump_root_, strings::Printf("%s.%010lld.%s", kFileNamePrefix,
169 static_cast<long long>(time_in_seconds),
170 port::Hostname().c_str()));
171 TF_RETURN_IF_ERROR(InitNonMetadataFile(SOURCE_FILES));
172 TF_RETURN_IF_ERROR(InitNonMetadataFile(STACK_FRAMES));
173 TF_RETURN_IF_ERROR(InitNonMetadataFile(GRAPHS));
174
175 // In case there is one left over from before.
176 metadata_writer_.reset();
177
178 // The metadata file should be created.
179 string metadata_filename = GetFileNameInternal(METADATA);
180 metadata_writer_.reset(new SingleDebugEventFileWriter(metadata_filename));
181 if (metadata_writer_ == nullptr) {
182 return errors::Unknown("Could not create debug event metadata file writer");
183 }
184
185 DebugEvent debug_event;
186 DebugMetadata* metadata = debug_event.mutable_debug_metadata();
187 metadata->set_tensorflow_version(TF_VERSION_STRING);
188 metadata->set_file_version(
189 strings::Printf("%s%d", kVersionPrefix, kCurrentFormatVersion));
190 metadata->set_tfdbg_run_id(tfdbg_run_id_);
191 TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA));
192 TF_RETURN_WITH_CONTEXT_IF_ERROR(
193 metadata_writer_->Flush(), "Failed to flush debug event metadata writer");
194
195 TF_RETURN_IF_ERROR(InitNonMetadataFile(EXECUTION));
196 TF_RETURN_IF_ERROR(InitNonMetadataFile(GRAPH_EXECUTION_TRACES));
197 is_initialized_ = true;
198 return OkStatus();
199}
200
201Status DebugEventsWriter::WriteSourceFile(SourceFile* source_file) {
202 DebugEvent debug_event;
203 debug_event.set_allocated_source_file(source_file);
204 return SerializeAndWriteDebugEvent(&debug_event, SOURCE_FILES);
205}
206
207Status DebugEventsWriter::WriteStackFrameWithId(
208 StackFrameWithId* stack_frame_with_id) {
209 DebugEvent debug_event;
210 debug_event.set_allocated_stack_frame_with_id(stack_frame_with_id);
211 return SerializeAndWriteDebugEvent(&debug_event, STACK_FRAMES);
212}
213
214Status DebugEventsWriter::WriteGraphOpCreation(
215 GraphOpCreation* graph_op_creation) {
216 DebugEvent debug_event;
217 debug_event.set_allocated_graph_op_creation(graph_op_creation);
218 return SerializeAndWriteDebugEvent(&debug_event, GRAPHS);
219}
220
221Status DebugEventsWriter::WriteDebuggedGraph(DebuggedGraph* debugged_graph) {
222 DebugEvent debug_event;
223 debug_event.set_allocated_debugged_graph(debugged_graph);
224 return SerializeAndWriteDebugEvent(&debug_event, GRAPHS);
225}
226
227Status DebugEventsWriter::WriteExecution(Execution* execution) {
228 if (circular_buffer_size_ <= 0) {
229 // No cyclic-buffer behavior.
230 DebugEvent debug_event;
231 debug_event.set_allocated_execution(execution);
232 return SerializeAndWriteDebugEvent(&debug_event, EXECUTION);
233 } else {
234 // Circular buffer behavior.
235 DebugEvent debug_event;
236 MaybeSetDebugEventTimestamp(&debug_event, env_);
237 debug_event.set_allocated_execution(execution);
238 string serialized;
239 debug_event.SerializeToString(&serialized);
240
241 mutex_lock l(execution_buffer_mu_);
242 execution_buffer_.emplace_back(std::move(serialized));
243 if (execution_buffer_.size() > circular_buffer_size_) {
244 execution_buffer_.pop_front();
245 }
246 return OkStatus();
247 }
248}
249
250Status DebugEventsWriter::WriteGraphExecutionTrace(
251 GraphExecutionTrace* graph_execution_trace) {
252 TF_RETURN_IF_ERROR(Init());
253 if (circular_buffer_size_ <= 0) {
254 // No cyclic-buffer behavior.
255 DebugEvent debug_event;
256 debug_event.set_allocated_graph_execution_trace(graph_execution_trace);
257 return SerializeAndWriteDebugEvent(&debug_event, GRAPH_EXECUTION_TRACES);
258 } else {
259 // Circular buffer behavior.
260 DebugEvent debug_event;
261 MaybeSetDebugEventTimestamp(&debug_event, env_);
262 debug_event.set_allocated_graph_execution_trace(graph_execution_trace);
263 string serialized;
264 debug_event.SerializeToString(&serialized);
265
266 mutex_lock l(graph_execution_trace_buffer_mu_);
267 graph_execution_trace_buffer_.emplace_back(std::move(serialized));
268 if (graph_execution_trace_buffer_.size() > circular_buffer_size_) {
269 graph_execution_trace_buffer_.pop_front();
270 }
271 return OkStatus();
272 }
273}
274
275Status DebugEventsWriter::WriteGraphExecutionTrace(
276 const string& tfdbg_context_id, const string& device_name,
277 const string& op_name, int32_t output_slot, int32_t tensor_debug_mode,
278 const Tensor& tensor_value) {
279 std::unique_ptr<GraphExecutionTrace> trace(new GraphExecutionTrace());
280 trace->set_tfdbg_context_id(tfdbg_context_id);
281 if (!op_name.empty()) {
282 trace->set_op_name(op_name);
283 }
284 if (output_slot > 0) {
285 trace->set_output_slot(output_slot);
286 }
287 if (tensor_debug_mode > 0) {
288 trace->set_tensor_debug_mode(TensorDebugMode(tensor_debug_mode));
289 }
290 trace->set_device_name(device_name);
291 tensor_value.AsProtoTensorContent(trace->mutable_tensor_proto());
292 return WriteGraphExecutionTrace(trace.release());
293}
294
295void DebugEventsWriter::WriteSerializedNonExecutionDebugEvent(
296 const string& debug_event_str, DebugEventFileType type) {
297 std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
298 SelectWriter(type, &writer);
299 (*writer)->WriteSerializedDebugEvent(debug_event_str);
300}
301
302void DebugEventsWriter::WriteSerializedExecutionDebugEvent(
303 const string& debug_event_str, DebugEventFileType type) {
304 const std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
305 std::deque<string>* buffer = nullptr;
306 mutex* mu = nullptr;
307 switch (type) {
308 case EXECUTION:
309 writer = &execution_writer_;
310 buffer = &execution_buffer_;
311 mu = &execution_buffer_mu_;
312 break;
313 case GRAPH_EXECUTION_TRACES:
314 writer = &graph_execution_traces_writer_;
315 buffer = &graph_execution_trace_buffer_;
316 mu = &graph_execution_trace_buffer_mu_;
317 break;
318 default:
319 return;
320 }
321
322 if (circular_buffer_size_ <= 0) {
323 // No cyclic-buffer behavior.
324 (*writer)->WriteSerializedDebugEvent(debug_event_str);
325 } else {
326 // Circular buffer behavior.
327 mutex_lock l(*mu);
328 buffer->push_back(debug_event_str);
329 if (buffer->size() > circular_buffer_size_) {
330 buffer->pop_front();
331 }
332 }
333}
334
335int DebugEventsWriter::RegisterDeviceAndGetId(const string& device_name) {
336 mutex_lock l(device_mu_);
337 int& device_id = device_name_to_id_[device_name];
338 if (device_id == 0) {
339 device_id = device_name_to_id_.size();
340 DebugEvent debug_event;
341 MaybeSetDebugEventTimestamp(&debug_event, env_);
342 DebuggedDevice* debugged_device = debug_event.mutable_debugged_device();
343 debugged_device->set_device_name(device_name);
344 debugged_device->set_device_id(device_id);
345 string serialized;
346 debug_event.SerializeToString(&serialized);
347 graphs_writer_->WriteSerializedDebugEvent(serialized);
348 }
349 return device_id;
350}
351
352Status DebugEventsWriter::FlushNonExecutionFiles() {
353 TF_RETURN_IF_ERROR(Init());
354 if (source_files_writer_ != nullptr) {
355 TF_RETURN_IF_ERROR(source_files_writer_->Flush());
356 }
357 if (stack_frames_writer_ != nullptr) {
358 TF_RETURN_IF_ERROR(stack_frames_writer_->Flush());
359 }
360 if (graphs_writer_ != nullptr) {
361 TF_RETURN_IF_ERROR(graphs_writer_->Flush());
362 }
363 return OkStatus();
364}
365
366Status DebugEventsWriter::FlushExecutionFiles() {
367 TF_RETURN_IF_ERROR(Init());
368
369 if (execution_writer_ != nullptr) {
370 if (circular_buffer_size_ > 0) {
371 // Write out all the content in the circular buffers.
372 mutex_lock l(execution_buffer_mu_);
373 while (!execution_buffer_.empty()) {
374 execution_writer_->WriteSerializedDebugEvent(execution_buffer_.front());
375 // SerializeAndWriteDebugEvent(&execution_buffer_.front());
376 execution_buffer_.pop_front();
377 }
378 }
379 TF_RETURN_IF_ERROR(execution_writer_->Flush());
380 }
381
382 if (graph_execution_traces_writer_ != nullptr) {
383 if (circular_buffer_size_ > 0) {
384 // Write out all the content in the circular buffers.
385 mutex_lock l(graph_execution_trace_buffer_mu_);
386 while (!graph_execution_trace_buffer_.empty()) {
387 graph_execution_traces_writer_->WriteSerializedDebugEvent(
388 graph_execution_trace_buffer_.front());
389 graph_execution_trace_buffer_.pop_front();
390 }
391 }
392 TF_RETURN_IF_ERROR(graph_execution_traces_writer_->Flush());
393 }
394
395 return OkStatus();
396}
397
398string DebugEventsWriter::FileName(DebugEventFileType type) {
399 if (file_prefix_.empty()) {
400 Init().IgnoreError();
401 }
402 return GetFileNameInternal(type);
403}
404
405Status DebugEventsWriter::Close() {
406 {
407 mutex_lock l(initialization_mu_);
408 if (!is_initialized_) {
409 return OkStatus();
410 }
411 }
412
413 std::vector<string> failed_to_close_files;
414
415 if (metadata_writer_ != nullptr) {
416 if (!metadata_writer_->Close().ok()) {
417 failed_to_close_files.push_back(metadata_writer_->FileName());
418 }
419 metadata_writer_.reset(nullptr);
420 }
421
422 TF_RETURN_IF_ERROR(FlushNonExecutionFiles());
423 if (source_files_writer_ != nullptr) {
424 if (!source_files_writer_->Close().ok()) {
425 failed_to_close_files.push_back(source_files_writer_->FileName());
426 }
427 source_files_writer_.reset(nullptr);
428 }
429 if (stack_frames_writer_ != nullptr) {
430 if (!stack_frames_writer_->Close().ok()) {
431 failed_to_close_files.push_back(stack_frames_writer_->FileName());
432 }
433 stack_frames_writer_.reset(nullptr);
434 }
435 if (graphs_writer_ != nullptr) {
436 if (!graphs_writer_->Close().ok()) {
437 failed_to_close_files.push_back(graphs_writer_->FileName());
438 }
439 graphs_writer_.reset(nullptr);
440 }
441
442 TF_RETURN_IF_ERROR(FlushExecutionFiles());
443 if (execution_writer_ != nullptr) {
444 if (!execution_writer_->Close().ok()) {
445 failed_to_close_files.push_back(execution_writer_->FileName());
446 }
447 execution_writer_.reset(nullptr);
448 }
449 if (graph_execution_traces_writer_ != nullptr) {
450 if (!graph_execution_traces_writer_->Close().ok()) {
451 failed_to_close_files.push_back(
452 graph_execution_traces_writer_->FileName());
453 }
454 graph_execution_traces_writer_.reset(nullptr);
455 }
456
457 if (failed_to_close_files.empty()) {
458 return OkStatus();
459 } else {
460 return errors::FailedPrecondition(
461 "Failed to close %d debug-events files associated with tfdbg",
462 failed_to_close_files.size());
463 }
464}
465
466// static
467std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>*
468DebugEventsWriter::GetDebugEventsWriterMap() {
469 static std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>*
470 writer_pool =
471 new std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>();
472 return writer_pool;
473}
474
475DebugEventsWriter::DebugEventsWriter(const string& dump_root,
476 const string& tfdbg_run_id,
477 int64_t circular_buffer_size)
478 : env_(Env::Default()),
479 dump_root_(dump_root),
480 tfdbg_run_id_(tfdbg_run_id),
481 is_initialized_(false),
482 initialization_mu_(),
483 circular_buffer_size_(circular_buffer_size),
484 execution_buffer_(),
485 execution_buffer_mu_(),
486 graph_execution_trace_buffer_(),
487 graph_execution_trace_buffer_mu_(),
488 device_name_to_id_(),
489 device_mu_() {}
490
491Status DebugEventsWriter::InitNonMetadataFile(DebugEventFileType type) {
492 std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
493 SelectWriter(type, &writer);
494 const string filename = GetFileNameInternal(type);
495 writer->reset();
496
497 writer->reset(new SingleDebugEventFileWriter(filename));
498 if (*writer == nullptr) {
499 return errors::Unknown("Could not create debug event file writer for ",
500 filename);
501 }
502 TF_RETURN_WITH_CONTEXT_IF_ERROR(
503 (*writer)->Init(), "Initializing debug event writer at path ", filename);
504 VLOG(1) << "Successfully opened debug event file: " << filename;
505
506 return OkStatus();
507}
508
509Status DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event,
510 DebugEventFileType type) {
511 std::unique_ptr<SingleDebugEventFileWriter>* writer = nullptr;
512 SelectWriter(type, &writer);
513 if (writer != nullptr) {
514 // Timestamp is in seconds, with double precision.
515 MaybeSetDebugEventTimestamp(debug_event, env_);
516 string str;
517 debug_event->AppendToString(&str);
518 (*writer)->WriteSerializedDebugEvent(str);
519 return OkStatus();
520 } else {
521 return errors::Internal(
522 "Unable to find debug events file writer for DebugEventsFileType ",
523 type);
524 }
525}
526
527void DebugEventsWriter::SelectWriter(
528 DebugEventFileType type,
529 std::unique_ptr<SingleDebugEventFileWriter>** writer) {
530 switch (type) {
531 case METADATA:
532 *writer = &metadata_writer_;
533 break;
534 case SOURCE_FILES:
535 *writer = &source_files_writer_;
536 break;
537 case STACK_FRAMES:
538 *writer = &stack_frames_writer_;
539 break;
540 case GRAPHS:
541 *writer = &graphs_writer_;
542 break;
543 case EXECUTION:
544 *writer = &execution_writer_;
545 break;
546 case GRAPH_EXECUTION_TRACES:
547 *writer = &graph_execution_traces_writer_;
548 break;
549 }
550}
551
552const string DebugEventsWriter::GetSuffix(DebugEventFileType type) {
553 switch (type) {
554 case METADATA:
555 return kMetadataSuffix;
556 case SOURCE_FILES:
557 return kSourceFilesSuffix;
558 case STACK_FRAMES:
559 return kStackFramesSuffix;
560 case GRAPHS:
561 return kGraphsSuffix;
562 case EXECUTION:
563 return kExecutionSuffix;
564 case GRAPH_EXECUTION_TRACES:
565 return kGraphExecutionTracesSuffix;
566 default:
567 string suffix;
568 return suffix;
569 }
570}
571
572string DebugEventsWriter::GetFileNameInternal(DebugEventFileType type) {
573 const string suffix = GetSuffix(type);
574 return strings::StrCat(file_prefix_, ".", suffix);
575}
576
577} // namespace tfdbg
578} // namespace tensorflow
579