1 | /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ |
17 | #define TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ |
18 | |
19 | #include <deque> |
20 | |
21 | #include "absl/container/flat_hash_map.h" |
22 | #include "tensorflow/core/framework/tensor.h" |
23 | #include "tensorflow/core/lib/core/status.h" |
24 | #include "tensorflow/core/lib/io/record_writer.h" |
25 | #include "tensorflow/core/platform/env.h" |
26 | #include "tensorflow/core/platform/macros.h" |
27 | #include "tensorflow/core/platform/types.h" |
28 | #include "tensorflow/core/protobuf/debug_event.pb.h" |
29 | |
30 | namespace tensorflow { |
31 | namespace tfdbg { |
32 | |
// Identifies one file within the set of files that a debugged TensorFlow
// program generates. There is one enumerator per file in the set.
//
// The numeric values are written out explicitly; they are exactly the values
// the compiler assigns implicitly (0 through 5, in declaration order), so
// existing uses of the enumerators are unaffected.
enum DebugEventFileType {
  METADATA = 0,
  SOURCE_FILES = 1,
  STACK_FRAMES = 2,
  GRAPHS = 3,
  EXECUTION = 4,
  GRAPH_EXECUTION_TRACES = 5,
};
42 | |
// Helper class for DebugEventsWriter.
// This class manages the writing of data to a single TFRecord file.
// Each object of the DebugEventsWriter class below involves multiple
// TFRecord files, and hence utilizes multiple objects of this helper class.
//
// NOTE(review): only declarations are visible in this header. Remarks below
// that go beyond what the signatures state are assumptions and should be
// confirmed against the implementation file.
class SingleDebugEventFileWriter {
 public:
  // Creates a writer for the file at `file_path`. `explicit` prevents a
  // string from being implicitly converted into a writer.
  explicit SingleDebugEventFileWriter(const string& file_path);

  // Prepares the writer for use; failure is reported through the returned
  // Status. Presumably opens `file_path_` and sets up `writable_file_` and
  // `record_writer_` -- TODO confirm.
  Status Init();

  // Hands one already-serialized DebugEvent to this writer.
  // Returns void, so a failure cannot be observed by the caller through this
  // call.
  void WriteSerializedDebugEvent(tensorflow::StringPiece debug_event_str);

  // Flush and close the underlying file, reporting the outcome as a Status.
  // NOTE(review): whether Close() implies Flush() is not visible here.
  Status Flush();
  Status Close();

  // Returns the file name by value.
  // NOTE(review): this method is not const-qualified, so it cannot be called
  // on a const writer; changing that would also require updating the
  // out-of-line definition.
  const string FileName();

 private:
  // Raw pointer to the environment; presumably non-owning -- TODO confirm.
  Env* env_;
  // Path given at construction; const, so it never changes afterwards.
  const string file_path_;
  // Atomic counter; presumably the number of events written but not yet
  // flushed -- TODO confirm.
  // NOTE(review): <atomic> is not included directly by this header.
  std::atomic_int_fast32_t num_outstanding_events_;

  // Owned file handle and the record writer layered on top of it.
  // The annotation on `record_writer_` declares that the RecordWriter it
  // points to is protected by `writer_mu_`.
  // NOTE(review): <memory> is not included directly by this header.
  std::unique_ptr<WritableFile> writable_file_;
  std::unique_ptr<io::RecordWriter> record_writer_ TF_PT_GUARDED_BY(writer_mu_);
  mutex writer_mu_;
};
69 | |
// The DebugEvents writer class.
//
// Writes tfdbg DebugEvent data to a set of files under a dump-root directory,
// one file per DebugEventFileType. Instances are obtained through
// GetDebugEventsWriter(); the constructor is private.
class DebugEventsWriter {
 public:
#ifndef SWIG
  // Default size of each circular buffer (unit: number of DebugEvent protos).
  static constexpr const int64_t kDefaultCyclicBufferSize = 1000;

  // File-name prefix shared by all files in a set, followed by the suffix
  // that distinguishes each file type.
  static constexpr const char* kFileNamePrefix = "tfdbg_events";
  static constexpr const char* kMetadataSuffix = "metadata";
  static constexpr const char* kSourceFilesSuffix = "source_files";
  static constexpr const char* kStackFramesSuffix = "stack_frames";
  static constexpr const char* kGraphsSuffix = "graphs";
  static constexpr const char* kExecutionSuffix = "execution";
  static constexpr const char* kGraphExecutionTracesSuffix =
      "graph_execution_traces";

  // Prefix of version string present in the first entry of every event file.
  static constexpr const char* kVersionPrefix = "debug.Event:";
  // Current format version number; presumably combined with kVersionPrefix
  // to form the version string -- TODO confirm.
  static constexpr const int kCurrentFormatVersion = 1;
#endif

  // Get the DebugEventsWriter for the given dump_root.
  // For a given dump_root value, it is a singleton. tfdbg event files come in
  // sets of six. The singleton pattern avoids storing multiple sets in a single
  // folder, which might cause confusion.
  //
  // If an instance of DebugEventsWriter has already been created at a
  // `dump_root`, calling this method with the same `dump_root` will return
  // the existing instance.
  // NOTE(review): how `tfdbg_run_id` and `circular_buffer_size` are treated
  // when an instance already exists is not stated here -- confirm in the
  // implementation.
  //
  // Args:
  //   dump_root: Dump root directory. If it doesn't exist, will be created.
  //   tfdbg_run_id: Debugging run ID of the writer.
  //   circular_buffer_size: Circular buffer size (in number of DebugEvent
  //     protos). If set to a value <=0, will abolish the circular-buffer
  //     behavior.
  // Returns:
  //   A pointer to a DebugEventsWriter object: a per-dump_root singleton.
  static DebugEventsWriter* GetDebugEventsWriter(const string& dump_root,
                                                 const string& tfdbg_run_id,
                                                 int64_t circular_buffer_size);
  // Look up existing events writer by dump_root.
  // If no DebugEventsWriter has been created at the dump_root, a non-OK
  // Status will be returned. Else an OK status will be returned, with
  // the pointer to the existing instance written to `*debug_events_writer`.
  static Status LookUpDebugEventsWriter(
      const string& dump_root, DebugEventsWriter** debug_events_writer);
  ~DebugEventsWriter();

  // Sets the debug event filenames and opens file for writing.
  // All files (see the DebugEventFileType enum) share the same prefix and
  // differ only in their suffixes. If not called by user, will be invoked
  // automatically by a call to FileName() or any of the Write*() methods.
  // Idempotent: if the metadata file exists and is open, this is a no-op.
  // If on the other hand the file was opened, but has since disappeared (e.g.
  // deleted by another process), this will open a new file.
  Status Init();

  // The four DebugEvent fields below are written _without_ the circular
  // buffer.
  //
  // Source file contents are written to the *.source_files file.
  // Takes ownership of source_file.
  Status WriteSourceFile(SourceFile* source_file);
  // Stack frames are written to the *.stack_frames file (the suffix given by
  // kStackFramesSuffix).
  // Takes ownership of stack_frame_with_id.
  Status WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id);
  // Graph op creation events are written to the *.graphs file.
  // Takes ownership of graph_op_creation.
  Status WriteGraphOpCreation(GraphOpCreation* graph_op_creation);
  // Debugged graphs are written to the *.graphs file.
  // Takes ownership of debugged_graph.
  Status WriteDebuggedGraph(DebuggedGraph* debugged_graph);

  // The two DebugEvent fields below are written to the circular buffer
  // and saved to disk only at the FlushExecutionFiles() call.
  //
  // Execution events (eager execution of an op or a tf.function) are written
  // to the *.execution file. Takes ownership of execution.
  Status WriteExecution(Execution* execution);
  // Graph execution traces (graph-internal tensor values or their summaries)
  // are written to the *.graph_execution_traces file.
  // Takes ownership of graph_execution_trace.
  Status WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace);

  // Write a graph execution trace without using a protocol buffer.
  // Instead, pass the raw values related to the graph execution trace.
  // Args:
  //   tfdbg_context_id: A unique ID for the context of interest, e.g., a
  //     concrete compiled tf.function that the op of interest belongs to.
  //   device_name: Name of a device; presumably the device of the op that
  //     this trace is concerned with -- TODO confirm.
  //   op_name: Name of the op that this graph execution trace is concerned
  //     with. Applicable only to the single-tensor trace case. For cases in
  //     which the trace concerns multiple tensors, this is an empty string.
  //   output_slot: Output slot index of the op that this trace is concerned
  //     with.
  //   tensor_debug_mode: An integer that represents the tensor-debug mode
  //     enum.
  //   tensor_value: The value of the tensor that describes the tensor(s)
  //     that this trace is concerned with. The semantics of this tensor value
  //     depends on the value of `tensor_debug_mode`.
  Status WriteGraphExecutionTrace(const string& tfdbg_context_id,
                                  const string& device_name,
                                  const string& op_name, int32_t output_slot,
                                  int32_t tensor_debug_mode,
                                  const Tensor& tensor_value);

  // Writes a serialized DebugEvent to one of the debug-events files
  // concerned with the non-execution events: the SOURCE_FILES, STACK_FRAMES
  // and GRAPHS files.
  // NOTE: Actually used in the Python binding, to avoid overhead of
  // serializing and parsing protos at the language interface.
  void WriteSerializedNonExecutionDebugEvent(const string& debug_event_str,
                                             DebugEventFileType type);

  // Writes a serialized DebugEvent to one of the debug-events files
  // concerned with the execution-related events: the EXECUTION and
  // GRAPH_EXECUTION_TRACES files. This involves the cyclic-buffer behavior if
  // circular_buffer_size is configured to be >0.
  // NOTE: Actually used in the Python binding, to avoid overhead of
  // serializing and parsing protos at the language interface.
  void WriteSerializedExecutionDebugEvent(const string& debug_event_str,
                                          DebugEventFileType type);

  // Given name of the device, retrieve a unique integer ID. As a side effect,
  // if this is the first time this object encounters the device name,
  // writes a DebuggedDevice proto to the .graphs file in the file set.
  int RegisterDeviceAndGetId(const string& device_name);

  // The writer automatically flushes and closes on destruction, but the
  // methods below are provided for users who want to write to disk sooner
  // and/or check for success.
  //
  // FlushNonExecutionFiles() pushes outstanding DebugEvents that do not go
  // through the circular buffers to their respective files.
  Status FlushNonExecutionFiles();

  // Writes current contents of the circular buffers to their respective
  // debug event files and clears the circular buffers.
  Status FlushExecutionFiles();

  // Close() calls FlushNonExecutionFiles() and FlushExecutionFiles()
  // and then closes the current debug events files.
  Status Close();

 private:
  // Get a static map from dump-root path to DebugEventsWriter objects.
  // This helps the per-dump-root singleton pattern.
  // NOTE(review): <unordered_map> and <memory> are not included directly by
  // this header.
  static std::unordered_map<string, std::unique_ptr<DebugEventsWriter>>*
  GetDebugEventsWriterMap();

  // Guards calls to the GetDebugEventsWriter() method.
  static mutex factory_mu_;

  // Private: instances are created only through GetDebugEventsWriter().
  DebugEventsWriter(const string& dump_root, const string& tfdbg_run_id,
                    int64_t circular_buffer_size);

  // Returns the file name for the given file type. All files share the same
  // path prefix and differ only in the suffix.
  string FileName(DebugEventFileType type);

  // Initialize the TFRecord writer for non-metadata file type.
  Status InitNonMetadataFile(DebugEventFileType type);

  // Serializes `debug_event` and writes it to the file selected by `type`.
  // NOTE(review): whether this takes ownership of `debug_event` is not
  // visible here -- confirm in the implementation.
  Status SerializeAndWriteDebugEvent(DebugEvent* debug_event,
                                     DebugEventFileType type);

  // Reports, through the out-parameter `writer`, the address of a
  // std::unique_ptr<SingleDebugEventFileWriter>; presumably the one among
  // the *_writer_ members below that corresponds to `type` -- TODO confirm.
  void SelectWriter(DebugEventFileType type,
                    std::unique_ptr<SingleDebugEventFileWriter>** writer);
  // Presumably maps `type` to the matching k*Suffix constant -- TODO confirm.
  const string GetSuffix(DebugEventFileType type);
  // Presumably builds the file name for `type` without the automatic Init()
  // that FileName() is documented to perform -- TODO confirm.
  string GetFileNameInternal(DebugEventFileType type);

  // Raw pointer to the environment; presumably non-owning -- TODO confirm.
  Env* env_;
  // Constructor arguments; const, so fixed for the lifetime of the object.
  const string dump_root_;
  const string tfdbg_run_id_;

  // Path prefix shared by all files of this writer.
  string file_prefix_;
  // Initialization flag, annotated as protected by `initialization_mu_`.
  bool is_initialized_ TF_GUARDED_BY(initialization_mu_);
  mutex initialization_mu_;

  // Circular-buffer size from the constructor, and the two buffers of
  // strings (one for execution events, one for graph execution traces),
  // each annotated as protected by its own mutex.
  const int64_t circular_buffer_size_;
  std::deque<string> execution_buffer_ TF_GUARDED_BY(execution_buffer_mu_);
  mutex execution_buffer_mu_;
  std::deque<string> graph_execution_trace_buffer_
      TF_GUARDED_BY(graph_execution_trace_buffer_mu_);
  mutex graph_execution_trace_buffer_mu_;

  // Map from device name to integer ID (see RegisterDeviceAndGetId()),
  // annotated as protected by `device_mu_`.
  absl::flat_hash_map<string, int> device_name_to_id_ TF_GUARDED_BY(device_mu_);
  mutex device_mu_;

  // One single-file writer per DebugEventFileType.
  std::unique_ptr<SingleDebugEventFileWriter> metadata_writer_;
  std::unique_ptr<SingleDebugEventFileWriter> source_files_writer_;
  std::unique_ptr<SingleDebugEventFileWriter> stack_frames_writer_;
  std::unique_ptr<SingleDebugEventFileWriter> graphs_writer_;
  std::unique_ptr<SingleDebugEventFileWriter> execution_writer_;
  std::unique_ptr<SingleDebugEventFileWriter> graph_execution_traces_writer_;

  // Copying is disallowed, consistent with the per-dump_root singleton.
  TF_DISALLOW_COPY_AND_ASSIGN(DebugEventsWriter);

  // Gives the test class access to private members.
  friend class DebugEventsWriterTest;
};
267 | |
268 | } // namespace tfdbg |
269 | } // namespace tensorflow |
270 | |
271 | #endif // TENSORFLOW_CORE_UTIL_DEBUG_EVENTS_WRITER_H_ |
272 | |