1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/core/debug/debug_graph_utils.h" |
17 | |
18 | #include "tensorflow/core/common_runtime/memory_types.h" |
19 | #include "tensorflow/core/framework/kernel_def.pb.h" |
20 | #include "tensorflow/core/framework/node_def_builder.h" |
21 | #include "tensorflow/core/framework/op_kernel.h" |
22 | #include "tensorflow/core/graph/node_builder.h" |
23 | #include "tensorflow/core/lib/strings/strcat.h" |
24 | #include "tensorflow/core/protobuf/debug.pb.h" |
25 | |
26 | namespace tensorflow { |
27 | |
28 | namespace { |
29 | |
30 | // TODO(cais): Switch to safe_strtob when available. |
31 | Status ParseBoolString(const string& bool_str, bool* bool_val) { |
32 | const string lower_bool_str = absl::AsciiStrToLower(bool_str); |
33 | if (lower_bool_str == "false" || lower_bool_str == "f" || |
34 | lower_bool_str == "0" ) { |
35 | *bool_val = false; |
36 | } else if (lower_bool_str == "true" || lower_bool_str == "t" || |
37 | lower_bool_str == "1" ) { |
38 | *bool_val = true; |
39 | } else { |
40 | return errors::InvalidArgument("Invalid string for bool value: " , bool_str); |
41 | } |
42 | return OkStatus(); |
43 | } |
44 | |
45 | } // namespace |
46 | |
// static
// Inserts debug nodes into `graph` according to `watches`. For each watched
// (non-reference) tensor, the original producer->consumer edges are rerouted
// through a Copy/CopyHost node, and the requested debug ops are attached to
// that Copy node with control edges to the original consumers.
Status DebugNodeInserter::InsertNodes(
    const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
    Device* device) {
  // TODO(cais): This method is getting too large in size.
  // Refactor it with helpers.

  if (watches.empty()) {
    // Nothing to do: Return OK right away.
    return OkStatus();
  }

  // Debug ops and URLs for wildcard node names (if any). A wildcard watch
  // (node_name == "*") applies to every output slot of every node.
  std::vector<string> default_debug_ops;
  std::vector<string> default_debug_urls;

  // A map from tensor name (e.g., "node_a:0") to list of debug op names
  // (e.g., {"DebugIdentity", "DebugNanCount"})
  std::unordered_map<string, std::vector<string>> tensor_watches;
  // A map from tensor name to debug_url.
  std::unordered_map<string, std::vector<string>> tensor_watch_urls;
  // A map from tensor name to whether failures to create the requested debug
  // ops should be tolerated (logged) rather than aborting insertion.
  std::unordered_map<string, bool> tensor_tolerate_failures;

  // Cache the proto content for fast lookup later
  for (const DebugTensorWatch& watch : watches) {
    // Watches with no debug ops or no debug URLs are no-ops: skip them.
    if (watch.debug_ops().empty()) {
      continue;
    }

    if (watch.debug_urls().empty()) {
      continue;
    }

    if (watch.node_name() == "*") {
      // Wildcard watches must use output_slot == -1.
      if (watch.output_slot() == -1) {
        default_debug_ops.insert(default_debug_ops.end(),
                                 watch.debug_ops().begin(),
                                 watch.debug_ops().end());
        default_debug_urls.insert(default_debug_urls.end(),
                                  watch.debug_urls().begin(),
                                  watch.debug_urls().end());
      } else {
        return Status(error::FAILED_PRECONDITION,
                      strings::StrCat(
                          "output_slot is expected to be -1 for wildcard ",
                          "node name (\"*\"), but got ", watch.output_slot()));
      }
      continue;
    } else {
      if (watch.output_slot() < 0) {
        return Status(
            error::FAILED_PRECONDITION,
            strings::StrCat("A negative output_slot in DebugTensorWatch is ",
                            "valid only for the wildcard node name (\"*\"), ",
                            "but got node name ", watch.node_name()));
      }
    }

    string tensor_name =
        strings::StrCat(watch.node_name(), ":", watch.output_slot());

    std::vector<string> debug_ops;
    for (const string& debug_op : watch.debug_ops()) {
      debug_ops.push_back(debug_op);
    }

    tensor_watches[tensor_name] = debug_ops;
    tensor_tolerate_failures[tensor_name] =
        watch.tolerate_debug_op_creation_failures();

    std::vector<string> urls;
    for (const string& url : watch.debug_urls()) {
      urls.push_back(url);
    }
    tensor_watch_urls[tensor_name] = urls;
  }

  // If no explicit watches were cached and no wildcard watch was seen
  // (tensor_watches is also empty in that case only when all watches were
  // skipped above), there is nothing to insert.
  if (tensor_watches.empty()) {
    return OkStatus();
  }

  // Used below to decide between the "Copy" and "CopyHost" kernels.
  DeviceType device_type = DeviceType{device->device_type()};

  // Keep track of all edges to be removed. Removal is deferred until after
  // the traversal so the edge lists being iterated are not mutated.
  std::vector<const Edge*> edges_to_remove;

  for (Node* src_node : graph->nodes()) {
    // Make a map from output slot to outgoing edges from the slot.
    std::unordered_map<int, std::vector<const Edge*>> output_slot_to_edges;
    for (const Edge* edge : src_node->out_edges()) {
      const int src_output = edge->src_output();
      if (output_slot_to_edges.find(src_output) == output_slot_to_edges.end()) {
        output_slot_to_edges[src_output] = {edge};
      } else {
        output_slot_to_edges[src_output].push_back(edge);
      }
    }

    // Iterate through all output slots of the node.
    for (int src_output_slot = 0; src_output_slot < src_node->num_outputs();
         ++src_output_slot) {
      const string tensor_name =
          strings::StrCat(src_node->name(), ":", src_output_slot);
      // A slot is watched either explicitly (by tensor name) or implicitly
      // through a wildcard watch.
      const bool explicit_tensor_match =
          tensor_watches.find(tensor_name) != tensor_watches.end();
      if (!explicit_tensor_match && default_debug_ops.empty()) {
        continue;
      }

      // Now we have encountered a watched tensor. We will:
      // 1) Mark this edge as to be removed, iff this is a non-Reference
      //    tensor
      // 2) Create a Copy node for the tensor
      // 3) Add a new edge, from the source tensor to the Copy node
      // 4) Add a new edge, from the Copy node to the destination node, iff
      //    this is a non-Reference tensor.
      // 5) Create all the requested debug nodes and their edges to the Copy
      //    node.
      // 6) Add control edges from the debug nodes to the destination nodes
      //    to ensure that the tensors values exported by the debug nodes
      //    to the debug URLs reflect the values before the execution of
      //    the destination nodes.

      const DataType src_dt = src_node->output_type(src_output_slot);
      // Outputs in host memory require the "CopyHost" kernel rather than
      // "Copy"; see CreateCopyNode.
      MemoryType memory_type;
      TF_RETURN_IF_ERROR(MemoryTypeForOutput(device_type, graph, src_node,
                                             src_output_slot, &memory_type));

      // Create the copy node for the watched tensor.
      const std::vector<string> debug_ops = explicit_tensor_match
                                                ? tensor_watches[tensor_name]
                                                : default_debug_ops;
      const std::vector<string> debug_urls =
          explicit_tensor_match ? tensor_watch_urls[tensor_name]
                                : default_debug_urls;
      Node* copy_node;
      Status copy_s =
          CreateCopyNode(graph, device_type, memory_type == HOST_MEMORY,
                         src_node->name(), src_output_slot, src_dt, tensor_name,
                         debug_ops, debug_urls, &copy_node);
      if (!copy_s.ok()) {
        return Status(
            error::FAILED_PRECONDITION,
            strings::StrCat("Failed to create Copy/CopyHost node for tensor ",
                            tensor_name, ", due to: ", copy_s.error_message()));
      }

      // Add edge from watched tensor to the copy node.
      graph->AddEdge(src_node, src_output_slot, copy_node, 0);

      // Create all requested debug nodes and their edges to the Copy node.
      std::vector<Node*> debug_nodes;
      for (size_t i = 0; i < debug_ops.size(); ++i) {
        const string& debug_op_name = debug_ops[i];

        Node* debug_node;
        Status debug_s = CreateDebugNode(graph, *device, copy_node->name(),
                                         src_dt, tensor_name, debug_urls, i,
                                         debug_op_name, &debug_node);
        if (debug_s.ok()) {
          graph->AddEdge(copy_node, 0, debug_node, 0);
          debug_nodes.push_back(debug_node);
        } else {
          // A per-tensor flag controls whether a failed debug-node creation
          // is merely logged or aborts the whole insertion.
          if (tensor_tolerate_failures[tensor_name]) {
            LOG(INFO) << "Tolerating failure to create debug node: "
                      << "tensor name = " << tensor_name << "; "
                      << "debug op name = " << debug_op_name;
          } else {
            return Status(
                error::FAILED_PRECONDITION,
                strings::StrCat("Failed to create debug node ", debug_op_name,
                                " for tensor ", tensor_name,
                                ", due to: ", debug_s.error_message()));
          }
        }
      }

      // Is the output a reference?
      const bool is_ref = IsRefType(src_node->output_type(src_output_slot));

      // Iterate through all outgoing edges attached to the slot.
      for (const Edge* edge : output_slot_to_edges[src_output_slot]) {
        // Mark the edge for removal. Reference-typed outputs keep their
        // original edges (only non-reference edges are rerouted through the
        // Copy node), so consumers of the ref still see the original tensor.
        if (!is_ref) {
          edges_to_remove.push_back(edge);
          graph->AddEdge(copy_node, 0, edge->dst(), edge->dst_input());
        }

        // Add control edges from the debug nodes to the destination node
        // to ensure that the debug nodes are executed before the destination
        // node. Skip Enter and NextIteration ops to avoid hanging.
        for (Node* debug_node : debug_nodes) {
          if (!src_node->IsEnter() && !src_node->IsNextIteration()) {
            graph->AddEdge(debug_node, Graph::kControlSlot, edge->dst(),
                           Graph::kControlSlot);
          }
        }
      }
    }
  }

  // Remove all edges marked for removal.
  for (const Edge* edge : edges_to_remove) {
    graph->RemoveEdge(edge);
  }

  return OkStatus();
}
255 | |
256 | void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) { |
257 | bool deparallelized_a_loop = false; |
258 | for (Node* node : graph->nodes()) { |
259 | if (node->IsEnter()) { |
260 | const AttrValue* parallel_iterations = |
261 | node->attrs().Find("parallel_iterations" ); |
262 | if (parallel_iterations && parallel_iterations->i() > 1) { |
263 | deparallelized_a_loop = true; |
264 | VLOG(1) << "Changing the parallel_iterations attribute of the " |
265 | << "Enter/RefEnter node \"" << node->name() << "\" on device \"" |
266 | << device->name() << "\" from " << parallel_iterations->i() |
267 | << " to 1." ; |
268 | node->AddAttr<int64_t>("parallel_iterations" , 1); |
269 | } |
270 | } |
271 | } |
272 | if (deparallelized_a_loop) { |
273 | LOG(INFO) << "For debugging, tfdbg has set the parallel_iterations " |
274 | << "attribute of all scheduled Enter/RefEnter nodes to 1. (This " |
275 | << "does not affect subsequent non-debug runs.)" ; |
276 | } |
277 | } |
278 | |
279 | // static |
280 | const string DebugNodeInserter::GetCopyNodeName(const string& node_name, |
281 | const int output_slot) { |
282 | // For example, if the watched node is named "node1" and the output slot |
283 | // is 0, the debug node will be called: __copy_node1_0 |
284 | return strings::StrCat("__copy_" , node_name, "_" , output_slot); |
285 | } |
286 | |
287 | // static |
288 | const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name, |
289 | const int debug_op_num, |
290 | const string& debug_op_name) { |
291 | // For example, if the watched node is named "node1" and the debug op that |
292 | // watches the output slot of node1 is of the type "DebugNanCount", the |
293 | // debug node will be called: __dbg_node1_0_0_DebugNanCount. |
294 | return strings::StrCat("__dbg_" , tensor_name, "_" , debug_op_num, "_" , |
295 | debug_op_name); |
296 | } |
297 | |
298 | // static |
299 | Status DebugNodeInserter::CreateCopyNode( |
300 | Graph* graph, const DeviceType device_type, const bool is_host_memory, |
301 | const string& src_node_name, const int src_output, const DataType src_dt, |
302 | const string& tensor_name, const std::vector<string>& debug_ops, |
303 | const std::vector<string>& debug_urls, Node** copy_node) { |
304 | const string kGatedGrpcAttributeKey = "gated_grpc" ; |
305 | |
306 | NodeDef node_def; |
307 | const KernelDef* kdef; |
308 | |
309 | const string copy_op_name = is_host_memory ? "CopyHost" : "Copy" ; |
310 | const string copy_node_name = GetCopyNodeName(src_node_name, src_output); |
311 | |
312 | // Cross debug_ops and debug_urls to get the list of debug ops and watches. |
313 | std::vector<string> debug_ops_spec; |
314 | for (const string& debug_op : debug_ops) { |
315 | for (const string& debug_url : debug_urls) { |
316 | string debug_op_name_proper; |
317 | std::unordered_map<string, string> custom_attributes; |
318 | TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper, |
319 | &custom_attributes)); |
320 | |
321 | bool gated_grpc_value = false; |
322 | if (custom_attributes.find(kGatedGrpcAttributeKey) != |
323 | custom_attributes.end()) { |
324 | TF_RETURN_IF_ERROR(ParseBoolString( |
325 | custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value)); |
326 | } |
327 | debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";" , |
328 | debug_url, ";" , |
329 | gated_grpc_value ? "1" : "0" )); |
330 | } |
331 | } |
332 | |
333 | auto builder = NodeDefBuilder(copy_node_name, copy_op_name) |
334 | .Input(src_node_name, src_output, src_dt) |
335 | .Attr("debug_ops_spec" , debug_ops_spec); |
336 | |
337 | if (!builder.Finalize(&node_def).ok()) { |
338 | return Status( |
339 | error::FAILED_PRECONDITION, |
340 | strings::StrCat("Failed to create node definition " , "for copy op " , |
341 | copy_node_name, " on watched tensor " , tensor_name)); |
342 | } |
343 | Status s = FindKernelDef(device_type, node_def, &kdef, nullptr); |
344 | |
345 | if (!s.ok()) { |
346 | return Status( |
347 | error::FAILED_PRECONDITION, |
348 | strings::StrCat("Failed to find kernel definition " , "for copy op " , |
349 | copy_node_name, " on watched tensor " , tensor_name)); |
350 | } |
351 | if (!NodeBuilder(builder).Finalize(graph, copy_node).ok()) { |
352 | return Status(error::FAILED_PRECONDITION, |
353 | strings::StrCat("Failed to create copy node " , copy_node_name, |
354 | " on watched tensor " , tensor_name)); |
355 | } |
356 | |
357 | return OkStatus(); |
358 | } |
359 | |
360 | // static |
361 | Status DebugNodeInserter::ParseDebugOpName( |
362 | const string& debug_op_name, string* debug_op_name_proper, |
363 | std::unordered_map<string, string>* attributes) { |
364 | const size_t l_index = debug_op_name.find('('); |
365 | const size_t r_index = debug_op_name.find(')'); |
366 | if (l_index == string::npos && r_index == string::npos) { |
367 | *debug_op_name_proper = debug_op_name; |
368 | } else { |
369 | if (l_index == string::npos || l_index == 0 || |
370 | r_index != debug_op_name.size() - 1) { |
371 | return errors::InvalidArgument("Malformed debug op name \"" , |
372 | debug_op_name, "\"" ); |
373 | } |
374 | |
375 | *debug_op_name_proper = debug_op_name.substr(0, l_index); |
376 | string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1); |
377 | |
378 | std::vector<string> attribute_segs = str_util::Split(arguments, ";" ); |
379 | for (const string& attribute_seg : attribute_segs) { |
380 | StringPiece seg(attribute_seg); |
381 | str_util::RemoveWhitespaceContext(&seg); |
382 | if (seg.empty()) { |
383 | continue; |
384 | } |
385 | |
386 | const size_t eq_index = seg.find('='); |
387 | if (eq_index == string::npos) { |
388 | return errors::InvalidArgument( |
389 | "Malformed attributes in debug op name \"" , debug_op_name, "\"" ); |
390 | } |
391 | |
392 | const string key(seg.substr(0, eq_index)); |
393 | const string value( |
394 | seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1)); |
395 | if (key.empty() || value.empty()) { |
396 | return errors::InvalidArgument( |
397 | "Malformed attributes in debug op name \"" , debug_op_name, "\"" ); |
398 | } |
399 | |
400 | if (attributes->find(key) == attributes->end()) { |
401 | (*attributes)[key] = value; |
402 | } else { |
403 | return errors::InvalidArgument("Duplicate attribute name \"" , key, |
404 | "\" found in the debug op: \"" , |
405 | debug_op_name, "\"" ); |
406 | } |
407 | } |
408 | } |
409 | return OkStatus(); |
410 | } |
411 | |
412 | // static |
413 | Status DebugNodeInserter::SetDebugNodeAttributes( |
414 | Node* debug_node, const std::unordered_map<string, string>& attributes) { |
415 | std::unordered_set<string> unfulfilled_keys; |
416 | for (const auto& item : attributes) { |
417 | unfulfilled_keys.insert(item.first); |
418 | } |
419 | |
420 | for (const auto& attr : debug_node->op_def().attr()) { |
421 | if (attributes.find(attr.name()) != attributes.end()) { |
422 | const string& attr_value = attributes.at(attr.name()); |
423 | if (attr.type() == "string" ) { |
424 | debug_node->AddAttr<string>(attr.name(), attr_value); |
425 | } else if (attr.type() == "float" ) { |
426 | float float_value = 0.0; |
427 | if (!::tensorflow::strings::safe_strtof(attr_value.c_str(), |
428 | &float_value)) { |
429 | return errors::InvalidArgument( |
430 | "Invalid value string for float-type attribute " , attr.name(), |
431 | "of debug node " , debug_node->name(), ": \"" , attr_value, "\"" ); |
432 | } |
433 | debug_node->AddAttr<float>(attr.name(), float_value); |
434 | } else if (attr.type() == "int" ) { |
435 | int64_t int_value = 0; |
436 | if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) { |
437 | return errors::InvalidArgument( |
438 | "Invalid value string for int-type attribute " , attr.name(), |
439 | "of debug node " , debug_node->name(), ": \"" , attr_value, "\"" ); |
440 | } |
441 | debug_node->AddAttr<int>(attr.name(), int_value); |
442 | } else if (attr.type() == "bool" ) { |
443 | bool bool_value; |
444 | if (!ParseBoolString(attr_value, &bool_value).ok()) { |
445 | return errors::InvalidArgument( |
446 | "Invalid value string for bool-type attribute " , attr.name(), |
447 | "of debug node " , debug_node->name(), ": \"" , attr_value, "\"" ); |
448 | } |
449 | debug_node->AddAttr<bool>(attr.name(), bool_value); |
450 | } else { |
451 | return errors::InvalidArgument( |
452 | "Unsupported type of custom attribute for debug ops: " , |
453 | attr.type()); |
454 | } |
455 | |
456 | unfulfilled_keys.erase(attr.name()); |
457 | } |
458 | } |
459 | |
460 | if (unfulfilled_keys.empty()) { |
461 | return OkStatus(); |
462 | } else { |
463 | return errors::InvalidArgument( |
464 | unfulfilled_keys.size(), |
465 | " attribute key(s) were not valid for debug node " , debug_node->name(), |
466 | ": " , absl::StrJoin(unfulfilled_keys, ", " )); |
467 | } |
468 | } |
469 | |
470 | // static |
471 | Status DebugNodeInserter::CreateDebugNode( |
472 | Graph* graph, const Device& device, const string& src_copy_node_name, |
473 | const DataType src_dt, const string& tensor_name, |
474 | const std::vector<string>& debug_urls, const int debug_op_num, |
475 | const string& debug_op_name, Node** debug_node) { |
476 | NodeDef node_def; |
477 | const KernelDef* kdef; |
478 | |
479 | string debug_op_name_proper; |
480 | std::unordered_map<string, string> custom_attributes; |
481 | TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper, |
482 | &custom_attributes)); |
483 | |
484 | const string debug_node_name = |
485 | GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper); |
486 | auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper) |
487 | .Input(src_copy_node_name, 0, src_dt) |
488 | .Attr("device_name" , device.name()) |
489 | .Attr("tensor_name" , tensor_name) |
490 | .Attr("debug_urls" , debug_urls); |
491 | |
492 | if (!builder.Finalize(&node_def).ok()) { |
493 | return errors::FailedPrecondition( |
494 | "Failed to create node definition for debug op " , debug_op_name_proper, |
495 | " on watched tensor " , tensor_name); |
496 | } |
497 | if (!FindKernelDef(DeviceType(device.device_type()), node_def, &kdef, nullptr) |
498 | .ok()) { |
499 | return errors::FailedPrecondition( |
500 | "Failed to find kernel definition for debug op " , debug_op_name_proper, |
501 | " on watched tensor " , tensor_name); |
502 | } |
503 | if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) { |
504 | return errors::FailedPrecondition("Failed to create debug node " , |
505 | debug_op_name_proper, |
506 | " on watched tensor " , tensor_name); |
507 | } |
508 | |
509 | // Set custom attributes (if any). |
510 | if (!custom_attributes.empty()) { |
511 | TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes)); |
512 | } |
513 | |
514 | return OkStatus(); |
515 | } |
516 | |
517 | } // namespace tensorflow |
518 | |