/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_

#include <numeric>

#include "tensorflow/core/platform/bfloat16.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/util/determinism.h"
#endif

#if GOOGLE_CUDA
#include "tensorflow/core/platform/cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif

#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/util/debug_events_writer.h"

namespace tensorflow {
// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep-copying of a tensor, depending on
// the device on which the tensor is allocated.
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized and is mem-copyable. Make a copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is NOT initialized and/or is not mem-copyable: Forward
      // the Tensor object.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};

// Base class of all debug ops.
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32_t output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if the gated_grpc_ attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // currently disabled (i.e., gated off), in which case the debug op will
  // emit an empty (size {0}) tensor of undefined data type.
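  //
  // Typical usage in a subclass's Compute() (mirroring DebugIdentityOp
  // below):
  //
  //   void Compute(OpKernelContext* context) override {
  //     if (!ApplyGrpcGating(context)) {
  //       return;  // Gated off: an empty output has already been allocated.
  //     }
  //     // ... compute and publish the debug signal ...
  //   }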
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off "
                   << "state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publish a tensor to all debug URLs of the debug op.
  // Log an error if the publishing fails.
  Status PublishTensor(const Tensor& tensor) {
    if (debug_urls_.empty()) {
      return OkStatus();
    } else {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
      return status;
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};

// Identity op for debugging.
// Output slot 0 carries the debug signal and is always allocated on the
// host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
// the debug signal is equal to the input tensor.
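//
// Kernel registration lives in debug_ops.cc; a minimal sketch of the CPU
// registration, assuming the usual REGISTER_KERNEL_BUILDER pattern:
//
//   REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU),
//                           DebugIdentityOp);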
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    OP_REQUIRES_OK(context, PublishTensor(context->input(0)));
    context->set_output(0, context->input(0));
  }
};

// NaN-counter op for debugging.
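// The output is a single-element int64 vector holding the number of NaN
// values in the input; an uninitialized input yields a count of 0.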
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64_t nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64_t i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64_t>()(0) = nan_count;
    OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
  }
};

// Numeric summary op for debugging.
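// The output is a double vector of length 14 + ndims, laid out as:
//   [0] is_initialized, [1] element count, [2] NaN count,
//   [3] -inf count, [4] negative count, [5] zero count,
//   [6] positive count, [7] +inf count, [8] min, [9] max,
//   [10] mean, [11] variance, [12] dtype enum value, [13] ndims,
//   [14:] the dimension sizes.
// When lower_bound or upper_bound is set to a finite value, elements at or
// below lower_bound are counted as -inf and elements at or above upper_bound
// are counted as +inf.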
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64_t is_initialized = 0;
    int64_t element_count = 0;
    int64_t negative_inf_count = 0;
    int64_t negative_count = 0;
    int64_t zero_count = 0;
    int64_t positive_count = 0;
    int64_t positive_inf_count = 0;
    int64_t nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64_t non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

      for (int64_t i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute the (population) variance over the
        // non-inf, non-NaN elements.
        variance = 0.0;
        for (int64_t i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    for (int d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                positive_inf_count == 0;
    if (!mute) {
      OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};

// Identity op for tfdbg v2: Writes debug data using DebugEventsWriter.
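//
// Only file:// debug URLs are supported; each URL's scheme prefix is
// stripped to obtain a dump-root directory. For example, assuming
// DebugIO::kFileURLScheme is "file://", a debug URL of "file:///tmp/tfdbg"
// yields the dump root "/tmp/tfdbg".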
class DebugIdentityV2Op : public OpKernel {
 public:
  explicit DebugIdentityV2Op(OpKernelConstruction* context)
      : OpKernel(context),
        device_name_(context->device()->name()),
        output_slot_(-1),
        tensor_debug_mode_(0),
        tfdbg_run_id_() {
    std::vector<string> debug_urls;
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls));
    for (const string& debug_url : debug_urls) {
      if (absl::StartsWith(debug_url, DebugIO::kFileURLScheme)) {
        dump_roots_.emplace_back(
            debug_url.substr(strlen(DebugIO::kFileURLScheme)));
      } else {
        context->SetStatus(
            errors::Internal("Unsupported debug URL scheme in: ", debug_url));
      }
    }
    OP_REQUIRES_OK(context,
                   context->GetAttr("tfdbg_context_id", &tfdbg_context_id_));
    OP_REQUIRES_OK(context, context->GetAttr("op_name", &op_name_));
    OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    if (context->HasAttr("circular_buffer_size")) {
      OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
                                               &circular_buffer_size_));
    } else {
      circular_buffer_size_ =
          tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
    }
    if (context->HasAttr("tfdbg_run_id")) {
      OP_REQUIRES_OK(context,
                     context->GetAttr("tfdbg_run_id", &tfdbg_run_id_));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    for (const string& dump_root : dump_roots_) {
      tfdbg::DebugEventsWriter* debug_events_writer =
          tfdbg::DebugEventsWriter::GetDebugEventsWriter(
              dump_root, tfdbg_run_id_, circular_buffer_size_);
      OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
                                  tfdbg_context_id_, device_name_, op_name_,
                                  output_slot_, tensor_debug_mode_, tensor));
    }
    context->set_output(0, tensor);
  }

 private:
  std::vector<string> dump_roots_;
  string tfdbg_context_id_;
  string device_name_;
  string op_name_;
  int32 output_slot_;
  int32 tensor_debug_mode_;
  int64_t circular_buffer_size_;
  string tfdbg_run_id_;
};

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
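// Forward declarations of the GPU launch helpers. Each Run() reduces `size`
// elements of `data` into a small fixed-size `output` buffer on the device;
// the definitions live in the corresponding CUDA/ROCm translation unit
// (debug_ops_gpu.cu.cc in the TensorFlow source tree).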
template <typename Tin, typename Tout>
struct CurtHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]);
};

extern template struct CurtHealthLaunch<Eigen::half, float>;
extern template struct CurtHealthLaunch<float, float>;
extern template struct CurtHealthLaunch<double, float>;
extern template struct CurtHealthLaunch<Eigen::half, double>;
extern template struct CurtHealthLaunch<float, double>;
extern template struct CurtHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ConciseHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ConciseHealthLaunch<Eigen::half, float>;
extern template struct ConciseHealthLaunch<float, float>;
extern template struct ConciseHealthLaunch<double, float>;
extern template struct ConciseHealthLaunch<Eigen::half, double>;
extern template struct ConciseHealthLaunch<float, double>;
extern template struct ConciseHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct FullHealthLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]);
};

extern template struct FullHealthLaunch<Eigen::half, float>;
extern template struct FullHealthLaunch<float, float>;
extern template struct FullHealthLaunch<double, float>;
extern template struct FullHealthLaunch<Eigen::half, double>;
extern template struct FullHealthLaunch<float, double>;
extern template struct FullHealthLaunch<double, double>;

template <typename Tin, typename Tout>
struct ReduceInfNanThreeSlotsLaunch {
  void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]);
};

extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, float>;
extern template struct ReduceInfNanThreeSlotsLaunch<Eigen::half, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<float, double>;
extern template struct ReduceInfNanThreeSlotsLaunch<double, double>;

#endif

template <typename Device, typename Tin, typename Tout>
class DebugNumericSummaryV2Op;

// Numeric summary op for tfdbg v2: CPU Kernel.
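//
// The integer tensor_debug_mode attribute selects the summary computed
// (mirroring the TensorDebugMode enum in debug_event.proto):
//   2 = CURT_HEALTH:    [tensor_id, has_inf_or_nan]
//   3 = CONCISE_HEALTH: [tensor_id, num_elem, -inf count, +inf count,
//                        nan count]
//   4 = FULL_HEALTH:    [tensor_id, device_id, dtype, ndims, num_elem,
//                        -inf/+inf/nan/neg/zero/pos counts]
//   5 = SHAPE:          [tensor_id, dtype, ndims, num_elem, 6 shape slots]
//   8 = REDUCE_INF_NAN_THREE_SLOTS: [-inf, +inf, nan] presence slots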
template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<CPUDevice, Tin, Tout> : public OpKernel {
 public:
  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor = context->input(0);
    auto in = tensor.flat<Tin>();
    const Tin* data = in.data();
    const int64_t size = in.size();
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tout num_elem = static_cast<Tout>(context->input(0).NumElements());
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
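    // For example, with Tout = float (std::numeric_limits<float>::digits ==
    // 24) the id must not exceed 2^24; with Tout = double the bound is 2^53.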
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_));
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;  // Slot for tensor id.
      output_tensor->flat<Tout>()(1) = 0.0;        // Slot for has inf or nan.
      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            return Eigen::numext::isfinite(y) ? x : 1;
          });
      if (fp_props) {
        output_tensor->flat<Tout>()(1) = 1.0;
      }
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = num_elem;

      // Accumulator values: [neg_inf_count, pos_inf_count, nan_count].
      Tout fp_props[3] = {0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          // Do nothing: common case.
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(2) = fp_props[0];  // Slot for -inf count.
      output_tensor->flat<Tout>()(3) = fp_props[1];  // Slot for inf count.
      output_tensor->flat<Tout>()(4) = fp_props[2];  // Slot for nan count.
    } else if (tensor_debug_mode_ == 4) {  // FULL_HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = -1.0;  // TODO(144919262): Device ID
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(3) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(4) = num_elem;

      // Accumulator values: [neg_inf_count, pos_inf_count, nan_count,
      // neg_count, zero_count, pos_count].
      Tout fp_props[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
      std::for_each(data, data + size, [&fp_props](const Tin& y) {
        if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[3];
          } else if (y == static_cast<Tin>(0.f)) {
            ++fp_props[4];
          } else {
            ++fp_props[5];
          }
        } else if (Eigen::numext::isinf(y)) {
          if (y < static_cast<Tin>(0.f)) {
            ++fp_props[0];
          } else {
            ++fp_props[1];
          }
        } else if (Eigen::numext::isnan(y)) {
          ++fp_props[2];
        }
      });
      output_tensor->flat<Tout>()(5) = fp_props[0];   // Slot for -inf count.
      output_tensor->flat<Tout>()(6) = fp_props[1];   // Slot for inf count.
      output_tensor->flat<Tout>()(7) = fp_props[2];   // Slot for nan count.
      output_tensor->flat<Tout>()(8) = fp_props[3];   // Slot for neg count.
      output_tensor->flat<Tout>()(9) = fp_props[4];   // Slot for zero count.
      output_tensor->flat<Tout>()(10) = fp_props[5];  // Slot for pos count.
    } else if (tensor_debug_mode_ == 5) {  // SHAPE
      TensorShape shape({10});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      int num_dims = tensor.dims();
      output_tensor->flat<Tout>()(0) = tensor_id;
      output_tensor->flat<Tout>()(1) = static_cast<Tout>(tensor.dtype());
      output_tensor->flat<Tout>()(2) = static_cast<Tout>(num_dims);
      output_tensor->flat<Tout>()(3) = num_elem;

      // The tensor shape is stored in the next six slots.
      // If num_dims is less than 6, the shape is right-padded with zeros.
      // If num_dims is greater than 6, the head (left-most) dimensions are
      // truncated, as they tend to be more predictable than the last few
      // (e.g., batch size as the first dimension).
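      // For example, a tensor of shape [32, 100] fills the slots as
      // [32, 100, 0, 0, 0, 0], while an 8-D tensor keeps only its last six
      // dimension sizes.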
      int dim_idx = 4;
      for (int i = std::max(0, num_dims - kShapeDims);
           i < std::max(kShapeDims, num_dims); ++i) {
        if (i < num_dims) {
          output_tensor->flat<Tout>()(dim_idx++) =
              static_cast<Tout>(tensor.dim_size(i));
        } else {
          output_tensor->flat<Tout>()(dim_idx++) = 0.0;
        }
      }
    } else if (tensor_debug_mode_ == 8) {  // REDUCE_INF_NAN_THREE_SLOTS.
      TensorShape shape({3});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      output_tensor->flat<Tout>()(0) = 0.0;  // Slot for -inf.
      output_tensor->flat<Tout>()(1) = 0.0;  // Slot for inf.
      output_tensor->flat<Tout>()(2) = 0.0;  // Slot for nan.

      int fp_props =
          std::accumulate(data, data + size, 0, [](const int x, const Tin& y) {
            int result = x;
            if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
              // Do nothing: common case.
            } else if (Eigen::numext::isinf(y)) {
              result |= y < static_cast<Tin>(0.f) ? kNegInfBit : kPosInfBit;
            } else if (Eigen::numext::isnan(y)) {
              result |= kNaNBit;
            }
            return result;
          });

      if (fp_props & kNegInfBit) {
        output_tensor->flat<Tout>()(0) =
            -std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kPosInfBit) {
        output_tensor->flat<Tout>()(1) = std::numeric_limits<Tout>::infinity();
      }
      if (fp_props & kNaNBit) {
        output_tensor->flat<Tout>()(2) = std::numeric_limits<Tout>::quiet_NaN();
      }
    } else {
      // TODO(cais): Implement other tensor debug modes in debug_event.proto.
      context->SetStatus(errors::Unimplemented(
          "Unimplemented tensor debug mode: ", tensor_debug_mode_));
    }
  }

 private:
  int tensor_debug_mode_;
  int64_t tensor_id_;
  static constexpr int kShapeDims = 6;
  static constexpr int kNegInfBit = 0x01;
  static constexpr int kPosInfBit = 0x02;
  static constexpr int kNaNBit = 0x04;
  static constexpr int64_t kMaxTensorId = 1LL
                                          << std::numeric_limits<Tout>::digits;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

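// Numeric summary op for tfdbg v2: GPU kernel.
//
// The GPU implementation is asynchronous: it zero-initializes the output
// buffer on the stream, copies the header slots (e.g., tensor_id and element
// count) with ThenMemcpy, launches the appropriate reduction kernel, and
// finally schedules the done callback via the device's EventMgr so that it
// fires only after the enqueued stream work has completed.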
template <typename Tin, typename Tout>
class DebugNumericSummaryV2Op<GPUDevice, Tin, Tout> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit DebugNumericSummaryV2Op(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_id", &tensor_id_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    Tensor* output_tensor;
    Tout tensor_id = static_cast<Tout>(tensor_id_);
    const Tensor& tensor = context->input(0);
    const Tout num_elem = static_cast<Tout>(tensor.NumElements());
    const Device& d = context->eigen_device<Device>();
    auto input = tensor.flat<Tin>();
    auto check_cb = [this, done]() { done(); };
    // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because
    // that mode does not make use of tensor_id.
    if (tensor_debug_mode_ != 8) {
      OP_REQUIRES_ASYNC(
          context, tensor_id_ <= kMaxTensorId,
          errors::InvalidArgument("DebugNumericSummaryV2Op requires "
                                  "tensor_id to be less than or equal to "
                                  "(2^",
                                  std::numeric_limits<Tout>::digits,
                                  "). Given tensor_id:", tensor_id_),
          done);
    }

    if (tensor_debug_mode_ == 2) {  // CURT_HEALTH.
      TensorShape shape({2});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemZero(&output_tensor_ptr, 2 * sizeof(Tout));
      // Copy tensor_id to slot zero.
      stream->ThenMemcpy(&output_tensor_ptr, &tensor_id, sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      CurtHealthLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(),
          output_tensor->flat<Tout>().data() + 1);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 3) {  // CONCISE_HEALTH.
      TensorShape shape({5});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));
      OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(),
                        errors::Unimplemented(
                            "Determinism is not yet supported for "
                            "DebugNumericSummaryV2 when tensor_debug_mode is "
                            "CONCISE_HEALTH."),
                        done);

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 5 * sizeof(Tout));
      const Tout static_output[] = {tensor_id, num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 2 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks.
      ConciseHealthLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(),
          output_tensor->flat<Tout>().data() + 2);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
    } else if (tensor_debug_mode_ == 4) {  // FULL_HEALTH
      TensorShape shape({11});
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, shape, &output_tensor));

      auto* stream = context->op_device_context()->stream();
      OP_REQUIRES_ASYNC(context, stream != nullptr,
                        errors::Internal("No GPU stream available."), done);
      OP_REQUIRES_ASYNC(context, !tensorflow::OpDeterminismRequired(),
                        errors::Unimplemented(
                            "Determinism is not yet supported for "
                            "DebugNumericSummaryV2 when tensor_debug_mode is "
                            "FULL_HEALTH."),
                        done);

      se::DeviceMemoryBase output_tensor_ptr(
          output_tensor->flat<Tout>().data(),
          output_tensor->flat<Tout>().size());
      stream->ThenMemset32(&output_tensor_ptr, 0, 11 * sizeof(Tout));

      int num_dims = tensor.dims();
      const Tout static_output[] = {tensor_id,
                                    -1.0,  // TODO(144919262): Device ID
                                    static_cast<Tout>(tensor.dtype()),
                                    static_cast<Tout>(num_dims), num_elem};
      stream->ThenMemcpy(&output_tensor_ptr, &static_output, 5 * sizeof(Tout));
      if (num_elem == 0) {
        done();
        return;
      }

      // Call the GPU kernels for the numerical (inf/nan) checks and
      // pos/neg/zero counts.
      FullHealthLaunch<Tin, Tout>().Run(
          d, input.data(), input.size(),
          output_tensor->flat<Tout>().data() + 5);

      context->device()
          ->tensorflow_accelerator_device_info()
          ->event_mgr->ThenExecute(stream, std::move(check_cb));
831 | TensorShape shape({10}); |
832 | OP_REQUIRES_OK(context, |
833 | context->allocate_output(0, shape, &output_tensor)); |
834 | |
835 | auto* stream = context->op_device_context()->stream(); |
836 | OP_REQUIRES_ASYNC(context, stream != nullptr, |
837 | errors::Internal("No GPU stream available." ), done); |
838 | |
839 | se::DeviceMemoryBase output_tensor_ptr( |
840 | output_tensor->flat<Tout>().data(), |
841 | output_tensor->flat<Tout>().size()); |
842 | |
843 | int num_dims = tensor.dims(); |
844 | Tout static_output[10] = {tensor_id, |
845 | static_cast<Tout>(tensor.dtype()), |
846 | static_cast<Tout>(num_dims), |
847 | num_elem, |
848 | 0.0, |
849 | 0.0, |
850 | 0.0, |
851 | 0.0, |
852 | 0.0, |
853 | 0.0}; |
854 | // Tensor shape: right pad zeros, truncate head |
855 | int dim_idx = 4; |
856 | for (int i = std::max(0, num_dims - 6); i < num_dims; ++i) { |
857 | static_output[dim_idx++] = static_cast<Tout>(tensor.dim_size(i)); |
858 | } |
859 | // Write to device stream |
860 | stream->ThenMemcpy(&output_tensor_ptr, &static_output, sizeof(Tout) * 10); |
861 | context->device() |
862 | ->tensorflow_accelerator_device_info() |
863 | ->event_mgr->ThenExecute(stream, std::move(check_cb)); |
864 | } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. |
865 | TensorShape shape({3}); |
866 | OP_REQUIRES_OK(context, |
867 | context->allocate_output(0, shape, &output_tensor)); |
868 | |
869 | auto* stream = context->op_device_context()->stream(); |
870 | OP_REQUIRES_ASYNC(context, stream != nullptr, |
871 | errors::Internal("No GPU stream available." ), done); |
872 | |
873 | se::DeviceMemoryBase output_tensor_ptr( |
874 | output_tensor->flat<Tout>().data(), |
875 | output_tensor->flat<Tout>().size()); |
876 | stream->ThenMemset32(&output_tensor_ptr, 0, |
877 | output_tensor->flat<Tout>().size() * sizeof(Tout)); |
878 | if (num_elem == 0) { |
879 | done(); |
880 | return; |
881 | } |
882 | |
883 | // Call the GPU kernels for the numerical (inf/nan) checks. |
884 | auto input = context->input(0).flat<Tin>(); |
885 | ReduceInfNanThreeSlotsLaunch<Tin, Tout>().Run( |
886 | d, input.data(), input.size(), output_tensor->flat<Tout>().data()); |
887 | |
888 | context->device() |
889 | ->tensorflow_accelerator_device_info() |
890 | ->event_mgr->ThenExecute(stream, std::move(check_cb)); |
891 | } else { |
892 | // TODO(cais): Implement other tensor debug modes in debug_event.proto. |
893 | context->SetStatus(errors::Unimplemented( |
894 | "Unimplemented tensor debug mode: " , tensor_debug_mode_)); |
895 | done(); |
896 | } |
897 | } |
898 | |
899 | private: |
900 | int tensor_debug_mode_; |
901 | int64_t tensor_id_; |
902 | static constexpr int64_t kMaxTensorId = 1L |
903 | << std::numeric_limits<Tout>::digits; |
904 | }; |
905 | |
906 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
907 | |
908 | } // namespace tensorflow |
909 | |
910 | #endif // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_ |
911 | |