/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.
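//
// CheckNumerics (v1) and CheckNumericsV2 forward their input tensor to the
// output unchanged and fail the step with an InvalidArgument error if any
// element is NaN or Inf; v2 additionally reports the sign of an infinity.
// The "message" attribute prefixes the error, typically naming the op that
// produced the tensor.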

// clang-format off
#include "tensorflow/core/platform/bfloat16.h"

#include <math.h>     // NOLINT
#include <algorithm>  // NOLINT
#include <numeric>    // NOLINT
// clang-format on

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/types.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#if GOOGLE_CUDA
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_activation.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif
namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
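// Forward declarations of the GPU launchers. Their Run() definitions live in
// the accompanying CUDA/ROCm source file (check_numerics_op_gpu.cu.cc); the
// extern template declarations below keep this translation unit from
// instantiating them.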
template <typename T>
struct CheckNumericsLaunch {
  void Run(const GPUDevice& d, const T* data, int size,
           int abnormal_detected[2]);
};
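// The two flags record, in order, whether any NaN (slot 0) and whether any
// Inf (slot 1) was found; see checkForAnomalies below.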

extern template struct CheckNumericsLaunch<Eigen::half>;
extern template struct CheckNumericsLaunch<float>;
extern template struct CheckNumericsLaunch<double>;

template <typename T>
struct CheckNumericsLaunchV2 {
  void Run(const GPUDevice& d, const T* data, int size,
           int abnormal_detected[3]);
};
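// The three flags record, in order, whether any NaN (slot 0), -Inf (slot 1),
// and +Inf (slot 2) was found; see the v2 checkForAnomalies below.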

extern template struct CheckNumericsLaunchV2<Eigen::half>;
extern template struct CheckNumericsLaunchV2<float>;
extern template struct CheckNumericsLaunchV2<double>;
#endif

namespace {

const int kInfBit = 0x01;
const int kNaNBit = 0x02;
const int kNegativeInfBit = 0x04;
const int kPositiveInfBit = 0x08;
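// For example, accumulating over the elements {1.0f, -inf, NaN} yields
//   fp_props == (kInfBit | kNaNBit)          in the v1 CPU kernel, and
//   fp_props == (kNegativeInfBit | kNaNBit)  in the v2 CPU kernel,
// since v2 records the sign of the infinity separately.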

template <typename Device, typename T>
class CheckNumericsOp;

// Partial specialization for CPU
// TODO(jeff,rmlarsen): We should make this variant be an AsyncOpKernel, as
// was done for the GPU case below.
template <typename T>
class CheckNumericsOp<CPUDevice, T> : public OpKernel {
 public:
  explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
    // message_ is used as the prefix for the assertion error message. For
    // instance, this can be the name of the input op that produced the tensor.
    OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
  }

  void Compute(OpKernelContext* context) override {
    // pass along the input to the output
    context->set_output(0, context->input(0));

    auto in = context->input(0).flat<T>();
    const T* data = in.data();
    const int64_t size = in.size();
    // Check to see if any element of the tensor is NaN or Inf.
    int fp_props = std::accumulate(
        data, data + size, 0,
        [this](const int x, const T& y) { return checkFloatingElement(x, y); });
    if (fp_props != 0) {
      const string& status = getErrorString(fp_props);
      if (!status.empty()) {
        context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
                                                   status, " values"));
      }
    }
  }

 protected:
  virtual int checkFloatingElement(const int x, const T& y) {
    int result = x;
    if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
      // Do nothing: common case.
    } else {
      if (Eigen::numext::isinf(y)) {
        result |= kInfBit;
      } else if (Eigen::numext::isnan(y)) {
        result |= kNaNBit;
      }
    }
    return result;
  }

  virtual const string getErrorString(const int fp_props) {
    string status;
    if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
      status = "Inf and NaN";
    } else {
      if (fp_props & kInfBit) {
        status = "Inf";
      }
      if (fp_props & kNaNBit) {
        status = "NaN";
      }
    }
    return status;
  }

 private:
  string message_;
};

template <typename Device, typename T>
class CheckNumericsV2Op;

// Partial specialization for CPU: v2.
// The v2 op differs from the v1 in that it distinguishes -inf and +inf.
template <typename T>
class CheckNumericsV2Op<CPUDevice, T> : public CheckNumericsOp<CPUDevice, T> {
 public:
  explicit CheckNumericsV2Op(OpKernelConstruction* context)
      : CheckNumericsOp<CPUDevice, T>(context) {}

 protected:
  int checkFloatingElement(const int x, const T& y) override {
    int result = x;
    if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
      // Do nothing: common case.
    } else {
      if (Eigen::numext::isinf(y)) {
        result |= y < static_cast<T>(0.) ? kNegativeInfBit : kPositiveInfBit;
      } else if (Eigen::numext::isnan(y)) {
        result |= kNaNBit;
      }
    }
    return result;
  }

  const string getErrorString(const int fp_props) override {
    std::vector<string> anomalies;
    if (fp_props & kNegativeInfBit) {
      anomalies.push_back("-Inf");
    }
    if (fp_props & kPositiveInfBit) {
      anomalies.push_back("+Inf");
    }
    if (fp_props & kNaNBit) {
      anomalies.push_back("NaN");
    }
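    // Join the anomaly names into a readable list, e.g. "-Inf, +Inf, and NaN".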
    if (anomalies.size() == 3) {
      return strings::StrCat(anomalies[0], ", ", anomalies[1], ", and ",
                             anomalies[2]);
    } else if (anomalies.size() == 2) {
      return strings::StrCat(anomalies[0], " and ", anomalies[1]);
    } else {
      return anomalies[0];
    }
  }
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Partial specialization for GPU
template <typename T>
class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit CheckNumericsOp(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    // message_ is used as the prefix for the assertion error message. For
    // instance, this can be the name of the input op that produced the tensor.
    OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    // pass along the input to the output
    context->set_output(0, context->input(0));
    if (context->input(0).NumElements() == 0) {
      done();
      return;
    }
    auto input = context->input(0).flat<T>();

    // Allocate and initialize the elements to hold the check results
    Tensor abnormal_detected;
    const int abnormal_detected_size = getAnomalyIndicatorSize();
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DT_INT32, TensorShape({abnormal_detected_size}),
                                &abnormal_detected));

    auto* stream = context->op_device_context()->stream();
    OP_REQUIRES_ASYNC(context, stream != nullptr,
                      errors::Internal("No GPU stream available."), done);

    se::DeviceMemoryBase abnormal_detected_ptr(
        abnormal_detected.flat<int>().data(),
        abnormal_detected.flat<int>().size());
    stream->ThenMemset32(&abnormal_detected_ptr, 0,
                         abnormal_detected.flat<int>().size() * sizeof(int));

    // Call the GPU kernels for the numerical checks
    const Device& d = context->eigen_device<Device>();
    RunKernel(d, input.data(), input.size(),
              abnormal_detected.flat<int>().data());

    // Copy the results from device to host
    AllocatorAttributes attr;
    attr.set_on_host(true);
    attr.set_gpu_compatible(true);
    Tensor abnormal_detected_host;
    OP_REQUIRES_OK_ASYNC(
        context,
        context->allocate_temp(DT_INT32, TensorShape({abnormal_detected_size}),
                               &abnormal_detected_host, attr),
        done);
    OP_REQUIRES_ASYNC(
        context,
        stream
            ->ThenMemcpy(abnormal_detected_host.flat<int>().data(),
                         abnormal_detected_ptr,
                         abnormal_detected_size * sizeof(int))
            .ok(),
        errors::Internal("GPU memcpy from device to host failed"), done);

    // We have observed crashes on some network stacks when not holding
    // this tensor reference.
    TensorReference abnormal_detected_ref(abnormal_detected);
    auto check_cb = [this, stream, abnormal_detected_ref,
                     abnormal_detected_host, context, done]() {
#if GOOGLE_CUDA
      se::cuda::ScopedActivateExecutorContext scoped_activation{
          stream->parent()};
#elif TENSORFLOW_USE_ROCM
      se::rocm::ScopedActivateExecutorContext scoped_activation{
          stream->parent()};
#endif
      TTypes<const int>::Vec abnormal_detected_host_flat =
          abnormal_detected_host.flat<int>();
      abnormal_detected_ref.Unref();
      checkForAnomalies(context, abnormal_detected_host_flat);
      done();
    };
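    // The event manager runs check_cb only after all work already enqueued on
    // `stream` (the check kernel and the device-to-host memcpy above) has
    // completed, so the host-side flags are valid to read in the callback.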
    context->device()
        ->tensorflow_accelerator_device_info()
        ->event_mgr->ThenExecute(stream, std::move(check_cb));
  }

 protected:
  virtual int getAnomalyIndicatorSize() { return 2; }

  virtual void RunKernel(const GPUDevice& d, const T* data, int size,
                         int* abnormal_detected) {
    CheckNumericsLaunch<T>().Run(d, data, size, abnormal_detected);
  }

  virtual void checkForAnomalies(
      OpKernelContext* context,
      const TTypes<const int>::Vec& abnormality_indicators) {
    const int is_nan = abnormality_indicators(0);
    const int is_inf = abnormality_indicators(1);
    if (is_nan || is_inf) {
      LOG(ERROR) << "abnormal_detected_host @" << abnormality_indicators.data()
                 << " = {" << is_nan << ", " << is_inf << "} " << message_;

      string anomalies;
      if (is_nan && is_inf) {
        anomalies = "Inf and NaN";
      } else if (is_nan) {
        anomalies = "NaN";
      } else if (is_inf) {
        anomalies = "Inf";
      }
      context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
                                                 anomalies, " values"));
    }
  }

  string message_;
};

template <typename T>
class CheckNumericsV2Op<GPUDevice, T> : public CheckNumericsOp<GPUDevice, T> {
 public:
  explicit CheckNumericsV2Op(OpKernelConstruction* context)
      : CheckNumericsOp<GPUDevice, T>(context) {}

 protected:
  int getAnomalyIndicatorSize() override { return 3; }

  void RunKernel(const GPUDevice& d, const T* data, int size,
                 int* abnormal_detected) override {
    CheckNumericsLaunchV2<T>().Run(d, data, size, abnormal_detected);
  }

  void checkForAnomalies(
      OpKernelContext* context,
      const TTypes<const int>::Vec& abnormality_indicators) override {
    const int is_nan = abnormality_indicators(0);
    const int is_negative_inf = abnormality_indicators(1);
    const int is_positive_inf = abnormality_indicators(2);
    if (is_negative_inf || is_positive_inf || is_nan) {
      std::vector<string> anomalies;
      if (is_negative_inf) {
        anomalies.push_back("-Inf");
      }
      if (is_positive_inf) {
        anomalies.push_back("+Inf");
      }
      if (is_nan) {
        anomalies.push_back("NaN");
      }
      string all_anomalies;
      if (anomalies.size() == 3) {
        all_anomalies = strings::StrCat(anomalies[0], ", ", anomalies[1],
                                        ", and ", anomalies[2]);
      } else if (anomalies.size() == 2) {
        all_anomalies = strings::StrCat(anomalies[0], " and ", anomalies[1]);
      } else {
        all_anomalies = anomalies[0];
      }
      context->SetStatus(errors::InvalidArgument(
          this->message_, " : Tensor had ", all_anomalies, " values"));
    }
  }

  static constexpr int abnormal_detected_size = 3;
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace

#define REGISTER_CPU_KERNEL(T)                                         \
  REGISTER_KERNEL_BUILDER(                                             \
      Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      CheckNumericsOp<CPUDevice, T>);
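// For example, TF_CALL_float(REGISTER_CPU_KERNEL) below expands to:
//   REGISTER_KERNEL_BUILDER(
//       Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//       CheckNumericsOp<CPUDevice, float>);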
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);

#define REGISTER_V2_CPU_KERNEL(T)                                        \
  REGISTER_KERNEL_BUILDER(                                               \
      Name("CheckNumericsV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      CheckNumericsV2Op<CPUDevice, T>);
TF_CALL_half(REGISTER_V2_CPU_KERNEL);
TF_CALL_bfloat16(REGISTER_V2_CPU_KERNEL);
TF_CALL_float(REGISTER_V2_CPU_KERNEL);
TF_CALL_double(REGISTER_V2_CPU_KERNEL);

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    CheckNumericsOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    CheckNumericsOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    CheckNumericsOp<GPUDevice, double>);

REGISTER_KERNEL_BUILDER(
    Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    CheckNumericsV2Op<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    CheckNumericsV2Op<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    CheckNumericsV2Op<GPUDevice, double>);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow