1/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
#include "tensorflow/tsl/platform/status.h"

#include <stdio.h>

#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <utility>

#include "absl/base/call_once.h"
#include "absl/strings/cord.h"
#include "absl/strings/escaping.h"
#include "absl/strings/match.h"
#include "absl/types/optional.h"
#include "tensorflow/core/protobuf/error_codes.pb.h"
#include "tensorflow/core/protobuf/status.pb.h"
#include "tensorflow/tsl/platform/mutex.h"
#include "tensorflow/tsl/platform/stacktrace.h"
#include "tensorflow/tsl/platform/str_util.h"
#include "tensorflow/tsl/platform/strcat.h"
#include "tensorflow/tsl/platform/stringprintf.h"
37
38namespace tsl {
// Re-exports the tensorflow::error code enumerators under tsl::error so the
// rest of this file can spell them as tsl::error::<CODE>.
namespace error {
// TODO(aminim): figure out the protobuf migration story
using tensorflow::error::ABORTED;
using tensorflow::error::ALREADY_EXISTS;
using tensorflow::error::CANCELLED;
using tensorflow::error::DATA_LOSS;
using tensorflow::error::DEADLINE_EXCEEDED;
using tensorflow::error::FAILED_PRECONDITION;
using tensorflow::error::INTERNAL;
using tensorflow::error::INVALID_ARGUMENT;
using tensorflow::error::NOT_FOUND;
using tensorflow::error::OK;
using tensorflow::error::OUT_OF_RANGE;
using tensorflow::error::PERMISSION_DENIED;
using tensorflow::error::RESOURCE_EXHAUSTED;
using tensorflow::error::UNAUTHENTICATED;
using tensorflow::error::UNAVAILABLE;
using tensorflow::error::UNIMPLEMENTED;
using tensorflow::error::UNKNOWN;
}  // namespace error
59namespace {
60
61// Log sink is used to collect recent warning and error log messages to be
62// attached to the error status.
63class StatusLogSink : public TFLogSink {
64 public:
65 static StatusLogSink* GetInstance() {
66 static StatusLogSink* sink = new StatusLogSink();
67 return sink;
68 }
69
70 void enable() {
71 absl::call_once(flag_, [this] {
72 num_messages_ = 5; // default to 5 messages
73
74 if (const char* num_msgs_str =
75 getenv("TF_WORKER_NUM_FORWARDED_LOG_MESSAGES")) {
76 if (!absl::SimpleAtoi(num_msgs_str, &num_messages_)) {
77 LOG(WARNING) << "Failed to parse env variable "
78 "TF_WORKER_NUM_WARNING_ERROR_LOG_IN_STATUS="
79 << num_msgs_str << " as int. Using the default value "
80 << num_messages_ << ".";
81 }
82 }
83
84 if (num_messages_ > 0) {
85 TFAddLogSink(this);
86 }
87 });
88 }
89
90 void GetMessages(std::vector<std::string>* logs) TF_LOCKS_EXCLUDED(mu_) {
91 mutex_lock lock(mu_);
92
93 for (auto& msg : messages_) {
94 logs->push_back(msg);
95 }
96 }
97
98 void Send(const TFLogEntry& entry) override TF_LOCKS_EXCLUDED(mu_) {
99 if (entry.log_severity() < absl::LogSeverity::kWarning) return;
100
101 mutex_lock lock(mu_);
102 messages_.emplace_back(entry.ToString());
103 if (messages_.size() > static_cast<size_t>(num_messages_)) {
104 messages_.pop_front();
105 }
106 }
107
108 private:
109 mutex mu_;
110 // for allowing repeated/concurrent calls to enable()
111 absl::once_flag flag_;
112 int num_messages_ = 0;
113 std::deque<std::string> messages_ TF_GUARDED_BY(mu_);
114};
115
116} // namespace
117
118// TODO(b/197552541) Move this namespace to errors.h after absl migration.
119namespace errors {
120static constexpr const char kStackTraceProtoUrl[] =
121 "type.googleapis.com/tensorflow.StackTracePayload";
122
123void SetStackTrace(::tsl::Status& status, std::vector<StackFrame> stack_trace) {
124 status.SetStackTrace(stack_trace);
125}
126
127std::vector<StackFrame> GetStackTrace(const ::tsl::Status& status) {
128 return status.GetStackTrace();
129}
130
131} // namespace errors
132
133Status::~Status() {}
134
135void Status::SetStackTrace(std::vector<StackFrame> stack_trace) {
136 if (state_ != nullptr) {
137 state_->stack_trace = stack_trace;
138 }
139}
140
141std::vector<StackFrame> Status::GetStackTrace() const {
142 if (state_ != nullptr) {
143 return state_->stack_trace;
144 } else {
145 return std::vector<StackFrame>();
146 }
147}
148
149absl::Span<const SourceLocation> Status::GetSourceLocations() const {
150 return state_ != nullptr ? state_->source_locations
151 : absl::Span<const SourceLocation>();
152}
153
154void Status::MaybeAddSourceLocation(SourceLocation loc) {
155 if (state_ == nullptr) {
156 return;
157 }
158 if (loc.line <= 0) {
159 return;
160 }
161 if (loc.file_name == nullptr) {
162 return;
163 }
164 if (loc.file_name[0] == '\0') {
165 return;
166 }
167 state_->source_locations.push_back(loc);
168}
169
170Status::Status(tsl::error::Code code, absl::string_view msg,
171 SourceLocation loc) {
172 assert(code != tsl::error::OK);
173 state_ = std::make_unique<State>();
174 state_->code = code;
175 state_->msg = std::string(msg);
176 MaybeAddSourceLocation(loc);
177 VLOG(5) << "Generated non-OK status: \"" << *this << "\". "
178 << CurrentStackTrace();
179}
180
181void Status::Update(const Status& new_status) {
182 if (ok()) {
183 *this = new_status;
184 }
185}
186
187void Status::SlowCopyFrom(const State* src) {
188 if (src == nullptr) {
189 state_ = nullptr;
190 } else {
191 state_ = std::make_unique<State>(*src);
192 }
193}
194
195Status::State* Status::NewStateFromNonOKStatus(const Status& s) {
196 return new State(*s.state_);
197}
198
199const std::string& Status::empty_string() {
200 static string* empty = new string;
201 return *empty;
202}
203
204std::string error_name(error::Code code) {
205 switch (code) {
206 case tsl::error::OK:
207 return "OK";
208 break;
209 case tsl::error::CANCELLED:
210 return "CANCELLED";
211 break;
212 case tsl::error::UNKNOWN:
213 return "UNKNOWN";
214 break;
215 case tsl::error::INVALID_ARGUMENT:
216 return "INVALID_ARGUMENT";
217 break;
218 case tsl::error::DEADLINE_EXCEEDED:
219 return "DEADLINE_EXCEEDED";
220 break;
221 case tsl::error::NOT_FOUND:
222 return "NOT_FOUND";
223 break;
224 case tsl::error::ALREADY_EXISTS:
225 return "ALREADY_EXISTS";
226 break;
227 case tsl::error::PERMISSION_DENIED:
228 return "PERMISSION_DENIED";
229 break;
230 case tsl::error::UNAUTHENTICATED:
231 return "UNAUTHENTICATED";
232 break;
233 case tsl::error::RESOURCE_EXHAUSTED:
234 return "RESOURCE_EXHAUSTED";
235 break;
236 case tsl::error::FAILED_PRECONDITION:
237 return "FAILED_PRECONDITION";
238 break;
239 case tsl::error::ABORTED:
240 return "ABORTED";
241 break;
242 case tsl::error::OUT_OF_RANGE:
243 return "OUT_OF_RANGE";
244 break;
245 case tsl::error::UNIMPLEMENTED:
246 return "UNIMPLEMENTED";
247 break;
248 case tsl::error::INTERNAL:
249 return "INTERNAL";
250 break;
251 case tsl::error::UNAVAILABLE:
252 return "UNAVAILABLE";
253 break;
254 case tsl::error::DATA_LOSS:
255 return "DATA_LOSS";
256 break;
257 default:
258 char tmp[30];
259 snprintf(tmp, sizeof(tmp), "UNKNOWN_CODE(%d)", static_cast<int>(code));
260 return tmp;
261 break;
262 }
263}
264
265std::string Status::ToString() const {
266 if (state_ == nullptr) {
267 return "OK";
268 } else {
269 std::string result(error_name(state_->code));
270 result += ": ";
271 result += state_->msg;
272
273 for (const std::pair<const std::string, std::string>& element :
274 state_->payloads) {
275 absl::StrAppend(&result, " [", element.first, "='",
276 absl::CHexEscape(element.second), "']");
277 }
278
279 return result;
280 }
281}
282
// Intentionally does nothing. Calling it has no effect on this status.
void Status::IgnoreError() const {
  // no-op
}
286
287void Status::SetPayload(absl::string_view type_url, absl::string_view payload) {
288 if (ok()) return;
289 state_->payloads[std::string(type_url)] = std::string(payload);
290}
291
292absl::optional<absl::Cord> Status::GetPayload(
293 absl::string_view type_url) const {
294 if (ok()) return absl::nullopt;
295 auto payload_iter = state_->payloads.find(std::string(type_url));
296 if (payload_iter == state_->payloads.end()) return absl::nullopt;
297 return absl::Cord(payload_iter->second);
298}
299
300bool Status::ErasePayload(absl::string_view type_url) {
301 if (ok()) return false;
302 auto payload_iter = state_->payloads.find(std::string(type_url));
303 if (payload_iter == state_->payloads.end()) return false;
304 state_->payloads.erase(payload_iter);
305 return true;
306}
307
308void Status::ForEachPayload(
309 const std::function<void(absl::string_view, absl::string_view)>& visitor)
310 const {
311 if (ok()) return;
312 for (const auto& payload : state_->payloads) {
313 visitor(payload.first, payload.second);
314 }
315}
316
317std::ostream& operator<<(std::ostream& os, const Status& x) {
318 os << x.ToString();
319 return os;
320}
321
// Returns a default-constructed Status.
Status OkStatus() { return Status(); }
323
324Status FromAbslStatus(const absl::Status& s) {
325 if (s.ok()) {
326 return Status();
327 }
328 Status converted(static_cast<tsl::error::Code>(s.code()), s.message());
329 s.ForEachPayload(
330 [&converted](absl::string_view key, const absl::Cord& value) {
331 converted.SetPayload(key, std::string(value));
332 });
333
334 return converted;
335}
336
337absl::Status ToAbslStatus(const ::tsl::Status& s) {
338 if (s.ok()) {
339 return absl::OkStatus();
340 }
341
342 absl::Status converted(static_cast<absl::StatusCode>(s.code()),
343 s.error_message());
344 s.ForEachPayload([&converted](tsl::StringPiece key, tsl::StringPiece value) {
345 converted.SetPayload(key, absl::Cord(value));
346 });
347
348 return converted;
349}
350
351std::string* TfCheckOpHelperOutOfLine(const ::tsl::Status& v, const char* msg) {
352 std::string r("Non-OK-status: ");
353 r += msg;
354 r += " status: ";
355 r += v.ToString();
356 // Leaks string but this is only to be used in a fatal error message
357 return new std::string(r);
358}
359
360StatusGroup::StatusGroup() {}
361
362StatusGroup::StatusGroup(std::initializer_list<Status> statuses) {
363 for (const Status& s : statuses) {
364 Update(s);
365 }
366}
367
// Payload key whose presence marks a Status as "derived"; see
// StatusGroup::MakeDerived and StatusGroup::IsDerived.
static constexpr const char kDerivedStatusProtoUrl[] =
    "type.googleapis.com/tensorflow.DerivedStatus";
370
371Status StatusGroup::MakeDerived(const Status& s) {
372 if (IsDerived(s)) {
373 return s;
374 } else {
375 Status derived(s);
376 // TODO(b/200167936): Serialize an instance of DerivedStatus proto instead
377 // of using the string directly. The string is never used so it is not
378 // causing any issues at the moment.
379 derived.SetPayload(kDerivedStatusProtoUrl, "");
380 return derived;
381 }
382}
383
384bool StatusGroup::IsDerived(const Status& s) {
385 return s.GetPayload(kDerivedStatusProtoUrl).has_value();
386}
387
// Enables the process-wide StatusLogSink so that recent warning/error log
// messages are collected. Only the first call has an effect, because
// StatusLogSink::enable() runs its setup once.
void StatusGroup::ConfigureLogHistory() {
  StatusLogSink::GetInstance()->enable();
}
391
392void StatusGroup::Update(const Status& s) {
393 if (s.ok()) {
394 ++num_ok_;
395 } else {
396 ok_ = false;
397 if (IsDerived(s)) {
398 derived_.insert(s);
399 } else {
400 non_derived_.insert(s);
401 }
402 }
403}
404
// Maximum number of characters kept from an aggregated multi-status message
// (applied via substr in the as_*_status functions).
static constexpr int kMaxAggregatedStatusMessageSize = 8 * 1024;
// Maximum number of characters kept from each attached log message.
static constexpr int kMaxAttachedLogMessageSize = 512;
407
408std::unordered_map<std::string, std::string> StatusGroup::GetPayloads() const {
409 std::unordered_map<std::string, std::string> payloads;
410 auto capture_payload = [&payloads](absl::string_view key,
411 absl::string_view value) {
412 payloads[std::string(key)] = std::string(value);
413 };
414
415 for (const auto& status : derived_) {
416 status.ForEachPayload(capture_payload);
417 }
418
419 // If a key appears in both derived_ and non_derived_ payloads, then the
420 // non_derived_ payload receives priority.
421 for (const auto& status : non_derived_) {
422 status.ForEachPayload(capture_payload);
423 }
424
425 payloads.erase(kDerivedStatusProtoUrl);
426
427 return payloads;
428}
429
430Status MakeStatus(
431 tsl::error::Code code, absl::string_view message,
432 const std::unordered_map<std::string, std::string>& payloads) {
433 Status status(code, message);
434 for (const auto& payload : payloads) {
435 status.SetPayload(payload.first, payload.second);
436 }
437 return status;
438}
439
// Formats `status` as "<CODE_NAME>: <message>". Unlike Status::ToString(),
// payloads are not included.
std::string MakeString(const Status& status) {
  return absl::StrCat(error_name(status.code()), ": ", status.error_message());
}
443
444// Summarize all the status objects in the StatusGroup. This is used when
445// individual Status objects in the StatusGroup are not already summarized.
446Status StatusGroup::as_summary_status() const {
447 if (ok_) {
448 return OkStatus();
449 }
450
451 // Gather recent logs as a string
452 auto get_recent_logs = [this]() -> std::string {
453 if (!recent_logs_.empty()) {
454 std::vector<std::string> fmt;
455 fmt.push_back("\nRecent warning and error logs:");
456 for (auto& log : recent_logs_) {
457 // Add an indentation to make it look nicer.
458 fmt.push_back(" " + log.substr(0, kMaxAttachedLogMessageSize));
459 }
460 return absl::StrJoin(fmt, "\n");
461 } else {
462 return "";
463 }
464 };
465
466 // If only one root status is found, do not add summary header and footer.
467 if (non_derived_.size() == 1) {
468 return MakeStatus(non_derived_.begin()->code(),
469 strings::StrCat(non_derived_.begin()->error_message(),
470 get_recent_logs()),
471 GetPayloads());
472 }
473
474 if (!non_derived_.empty()) {
475 std::vector<std::string> fmt;
476
477 fmt.push_back(
478 strings::Printf("%zu root error(s) found.", non_derived_.size()));
479
480 int index = 0;
481 auto code = tsl::error::CANCELLED;
482 for (const auto& s : non_derived_) {
483 // NOTE: Avoid using CANCELLED as the code of summary status if the group
484 // contains other error code.
485 if (code == tsl::error::CANCELLED && s.code() != tsl::error::CANCELLED) {
486 code = s.code();
487 }
488 fmt.emplace_back(strings::StrCat(" (", index, ") ", MakeString(s)));
489 ++index;
490 }
491
492 fmt.push_back(strings::Printf("%zu successful operations.", num_ok_));
493 fmt.push_back(
494 strings::Printf("%zu derived errors ignored.", derived_.size()));
495
496 std::string error_msg =
497 absl::StrJoin(fmt, "\n").substr(0, kMaxAggregatedStatusMessageSize);
498
499 return MakeStatus(code, strings::StrCat(error_msg, get_recent_logs()),
500 GetPayloads());
501 } else {
502 // All statuses are derived. Pick the first available status to return.
503 return MakeDerived(MakeStatus(derived_.begin()->code(),
504 derived_.begin()->error_message(),
505 GetPayloads()));
506 }
507}
508
509// Concatenate all the status objects in the StatusGroup. This is used when
510// individual Status objects in the StatusGroup are already summarized Status.
511Status StatusGroup::as_concatenated_status() const {
512 if (ok_) {
513 return OkStatus();
514 }
515
516 // If only one root status is found, return it directly.
517 if (non_derived_.size() == 1) {
518 return MakeStatus(non_derived_.begin()->code(),
519 non_derived_.begin()->error_message(), GetPayloads());
520 }
521
522 if (!non_derived_.empty()) {
523 std::vector<string> fmt;
524 fmt.emplace_back("\n=====================");
525 for (const auto& s : non_derived_) {
526 fmt.emplace_back(MakeString(s));
527 }
528 fmt.emplace_back("=====================\n");
529 return MakeStatus(
530 non_derived_.begin()->code(),
531 absl::StrJoin(fmt, "\n").substr(0, kMaxAggregatedStatusMessageSize),
532 GetPayloads());
533 } else {
534 // All statuses are derived. Pick the first available status to return.
535 // This should not happen in normal execution.
536 return MakeDerived(MakeStatus(derived_.begin()->code(),
537 derived_.begin()->error_message(),
538 GetPayloads()));
539 }
540}
541
// Replaces this group's recent-log snapshot with the messages currently held
// by the process-wide StatusLogSink. The snapshot is what
// as_summary_status() appends to its messages.
void StatusGroup::AttachLogMessages() {
  // Clear first: GetMessages() appends to the vector it is given.
  recent_logs_.clear();
  StatusLogSink::GetInstance()->GetMessages(&recent_logs_);
}
546
547} // namespace tsl
548