/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
16#ifndef TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_
17#define TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_
18
#include <atomic>
#include <deque>
#include <memory>
#include <unordered_map>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/io/record_writer.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/debug_event.pb.h"
29
30namespace tensorflow {
31namespace tfdbg {
32
// The set of files generated by a debugged TensorFlow program.
//
// The numeric values are pinned explicitly (they match the previous implicit
// numbering) so that inserting or reordering enumerators cannot silently
// change the values observed by callers that pass a file type across a
// language boundary, e.g. the Python binding that calls
// DebugEventsWriter::WriteSerialized*DebugEvent().
enum DebugEventFileType {
  // Metadata file (see DebugEventsWriter::kMetadataSuffix).
  METADATA = 0,
  // Source file contents (the *.source_files file).
  SOURCE_FILES = 1,
  // Stack frames (see DebugEventsWriter::kStackFramesSuffix).
  STACK_FRAMES = 2,
  // Graph op creation events, debugged graphs and debugged devices (the
  // *.graphs file).
  GRAPHS = 3,
  // Execution events (the *.execution file).
  EXECUTION = 4,
  // Graph execution traces (the *.graph_execution_traces file).
  GRAPH_EXECUTION_TRACES = 5,
};
42
43// Helper class for DebugEventsWriter.
44// This class manages the writing of data to a single TFRecord file.
45// Each object of the DebugEventsWriter class below involves multiple
46// TFRecord files, and hence utilizes multiple objects of this helper class.
47class SingleDebugEventFileWriter {
48 public:
49 explicit SingleDebugEventFileWriter(const string& file_path);
50
51 Status Init();
52
53 void WriteSerializedDebugEvent(tensorflow::StringPiece debug_event_str);
54
55 Status Flush();
56 Status Close();
57
58 const string FileName();
59
60 private:
61 Env* env_;
62 const string file_path_;
63 std::atomic_int_fast32_t num_outstanding_events_;
64
65 std::unique_ptr<WritableFile> writable_file_;
66 std::unique_ptr<io::RecordWriter> record_writer_ TF_PT_GUARDED_BY(writer_mu_);
67 mutex writer_mu_;
68};
69
70// The DebugEvents writer class.
71class DebugEventsWriter {
72 public:
73#ifndef SWIG
74 // Prefix of version string present in the first entry of every event file.
75 // Default size of each circular buffer (unit: number of DebugEvent protos).
76 static constexpr const int64_t kDefaultCyclicBufferSize = 1000;
77
78 static constexpr const char* kFileNamePrefix = "tfdbg_events";
79 static constexpr const char* kMetadataSuffix = "metadata";
80 static constexpr const char* kSourceFilesSuffix = "source_files";
81 static constexpr const char* kStackFramesSuffix = "stack_frames";
82 static constexpr const char* kGraphsSuffix = "graphs";
83 static constexpr const char* kExecutionSuffix = "execution";
84 static constexpr const char* kGraphExecutionTracesSuffix =
85 "graph_execution_traces";
86
87 static constexpr const char* kVersionPrefix = "debug.Event:";
88 static constexpr const int kCurrentFormatVersion = 1;
89#endif
90
91 // Get the DebugEventsWriter for the given dump_root.
92 // For a given dump_root value, it is a singleton. tfdbg event files come in
93 // sets of six. The singleton pattern avoids storing multiple sets in a single
94 // folder, which might cause confusion.
95 //
96 // If an instance of DebugEventsWriter has already been created at a
97 // `dump_root`, calling this method with the same `dump_root` will return
98 // the existing instance.
99 //
100 // Args:
101 // dump_root: Dump root directory. If it doesn't exist, will be created.
102 // tfdbg_run_id: Debugging run ID of the writer.
103 // circular_buffer_size: Circular buffer size (in number of DebugEvent
104 // protos). If set to a value <=0, will abolish the circular-buffer
105 // behavior.
106 // Returns:
107 // A pointer to a DebugEventsWriter object: a per-dump_root singleton.
108 static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root,
109 const string& tfdbg_run_id,
110 int64_t circular_buffer_size);
111 // Look up existing events writer by dump_root.
112 // If no DebugEventsWriter has been created at the dump_root, a non-OK
113 // Status will be returned. Else an OK status will be returned, with
114 // the pointer to the existing instance provided by reference.
115 static Status LookUpDebugEventsWriter(
116 const string& dump_root, DebugEventsWriter** debug_events_writer);
117 ~DebugEventsWriter();
118
119 // Sets the debug event filenames and opens file for writing.
120 // All files (see the DebugEventFileType enum) share the same prefix and
121 // differ only in their suffixes. If not called by user, will be invoked
122 // automatically by a call to FileName() or any of the Write*() methods().
123 // Idempotent: if the metadata file exists and is open, this is a no-op.
124 // If on the other hand the file was opened, but has since disappeared (e.g.
125 // deleted by another process), this will open a new file.
126 Status Init();
127
128 // The four DebugEvent fields below are written _without_ the circular
129 // buffer. Source file contents are written to the *.source_files file.
130 // Takes ownership of source_file.
131 Status WriteSourceFile(SourceFile* source_file);
132 // Stack frames are written to the *.code_locations file.
133 // Takes ownership of stack_frame_with_id.
134 Status WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id);
135 // Graph op creation events are written to the *.graphs file.
136 // Takes ownership of graph_op_creation.
137 Status WriteGraphOpCreation(GraphOpCreation* graph_op_creation);
138 // Debugged graphs are written to the *.graphs file.
139 // Takes ownership of debugged_graph.
140 Status WriteDebuggedGraph(DebuggedGraph* debugged_graph);
141
142 // The two DebugEvent fields below are written to the circular buffer
143 // and saved to disk only at the FlushExecutionFiles() call.
144 // Execution events (eager execution of an op or a tf.function) are written
145 // to the *.execution file. Takes ownership of execution.
146 Status WriteExecution(Execution* execution);
147 // Graph execution traces (graph-internal tensor values or their summaries)
148 // are written to the *.graph_execution_traces file.
149 // Takes ownership of graph_execution_trace.
150 Status WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace);
151
152 // Write a graph execution trace without using a protocol buffer.
153 // Instead, pass the raw values related to the graph execution trace.
154 // Args:
155 // tfdbg_context_id: A unique ID for the context of interest, e.g., a
156 // concreted compiled tf.function that the op of interest belongs to.
157 // op_name: Name of the op that this graph execution trace is concerned
158 // with. Applicable only to the single-tensor trace case. For cases in
159 // which the trace concerns multiple tensors, this is an empty string.
160 // output_slot: Output slot index of the op that this trace is concerned
161 // with.
162 // tensor_debug_mode: An integer that represents the tensor-debug mode
163 // enum. tensor_value: The value of the tensor that describes the
164 // tensor(s)
165 // that this trace is concerned with. The semantics of this tensor value
166 // depends on the value of `tensor_debug_mode`.
167 Status WriteGraphExecutionTrace(const string& tfdbg_context_id,
168 const string& device_name,
169 const string& op_name, int32_t output_slot,
170 int32_t tensor_debug_mode,
171 const Tensor& tensor_value);
172
173 // Writes a serialized DebugEvent to one of the debug-events files
174 // concerned with the non-execution events: the SOURCE_FILES, STACK_FRAMES
175 // and GRAPHS files.
176 // NOTE: Actually used in the Python binding, to avoid overhead of
177 // serializing and parsing protos at the language interface.
178 void WriteSerializedNonExecutionDebugEvent(const string& debug_event_str,
179 DebugEventFileType type);
180
181 // Writes a serialized DebugEvent to one of the debug-events files
182 // concerned with the execution-related events: the EXECUTION and
183 // GRAPH_EXECUTION_TRACES files. This involves the cyclic-buffer behavior if
184 // circular_buffer_size is configured to be >0.
185 // NOTE: Actually used in the Python binding, to avoid overhead of
186 // serializing and parsing protos at the language interface.
187 void WriteSerializedExecutionDebugEvent(const string& debug_event_str,
188 DebugEventFileType type);
189
190 // Given name of the device, retrieve a unique integer ID. As a side effect,
191 // if this is the first time this object encounters the device name,
192 // writes a DebuggedDevice proto to the .graphs file in the file set.
193 int RegisterDeviceAndGetId(const string& device_name);
194
195 // EventWriter automatically flushes and closes on destruction, but
196 // this method is provided for users who want to write to disk sooner
197 // and/or check for success.
198 // FlushNonExecutionFiles() pushes outstanding DebugEvents not written
199 // events to the circular buffer to their respective files.
200 Status FlushNonExecutionFiles();
201
202 // Writes current contents of the circular buffers to their respective
203 // debug event files and clears the circular buffers.
204 Status FlushExecutionFiles();
205
206 // Close() calls FlushNonExecutionFiles() and FlushExecutionFiles()
207 // and then closes the current debug events files.
208 Status Close();
209
210 private:
211 static std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>*
212
213 // Get a static map from dump-root path to DebugEventsWriter objects.
214 // This helps the per-dump-root singletone pattern.
215 GetDebugEventsWriterMap();
216
217 // Guards calls to the GetDebugEventsWriter() method.
218 static mutex factory_mu_;
219
220 DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id,
221 int64_t circular_buffer_size);
222
223 // Get the path prefix. The same for all files, which differ only in the
224 // suffix.
225 string FileName(DebugEventFileType type);
226
227 // Initialize the TFRecord writer for non-metadata file type.
228 Status InitNonMetadataFile(DebugEventFileType type);
229
230 Status SerializeAndWriteDebugEvent(DebugEvent* debug_event,
231 DebugEventFileType type);
232
233 void SelectWriter(DebugEventFileType type,
234 std::unique_ptr<SingleDebugEventFileWriter>** writer);
235 const string GetSuffix(DebugEventFileType type);
236 string GetFileNameInternal(DebugEventFileType type);
237
238 Env* env_;
239 const string dump_root_;
240 const string tfdbg_run_id_;
241
242 string file_prefix_;
243 bool is_initialized_ TF_GUARDED_BY(initialization_mu_);
244 mutex initialization_mu_;
245
246 const int64_t circular_buffer_size_;
247 std::deque<string> execution_buffer_ TF_GUARDED_BY(execution_buffer_mu_);
248 mutex execution_buffer_mu_;
249 std::deque<string> graph_execution_trace_buffer_
250 TF_GUARDED_BY(graph_execution_trace_buffer_mu_);
251 mutex graph_execution_trace_buffer_mu_;
252
253 absl::flat_hash_map<string, int> device_name_to_id_ TF_GUARDED_BY(device_mu_);
254 mutex device_mu_;
255
256 std::unique_ptr<SingleDebugEventFileWriter> metadata_writer_;
257 std::unique_ptr<SingleDebugEventFileWriter> source_files_writer_;
258 std::unique_ptr<SingleDebugEventFileWriter> stack_frames_writer_;
259 std::unique_ptr<SingleDebugEventFileWriter> graphs_writer_;
260 std::unique_ptr<SingleDebugEventFileWriter> execution_writer_;
261 std::unique_ptr<SingleDebugEventFileWriter> graph_execution_traces_writer_;
262
263 TF_DISALLOW_COPY_AND_ASSIGN(DebugEventsWriter);
264
265 friend class DebugEventsWriterTest;
266};
267
268} // namespace tfdbg
269} // namespace tensorflow
270
271#endif // TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_
272