1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
#include "tensorflow/tsl/platform/status.h"

#include <stdio.h>

#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <utility>

#include "absl/base/call_once.h"
#include "absl/strings/cord.h"
#include "absl/strings/escaping.h"
#include "absl/strings/match.h"
#include "absl/types/optional.h"
#include "tensorflow/core/protobuf/error_codes.pb.h"
#include "tensorflow/core/protobuf/status.pb.h"
#include "tensorflow/tsl/platform/mutex.h"
#include "tensorflow/tsl/platform/stacktrace.h"
#include "tensorflow/tsl/platform/str_util.h"
#include "tensorflow/tsl/platform/strcat.h"
#include "tensorflow/tsl/platform/stringprintf.h"
37 | |
38 | namespace tsl { |
39 | namespace error { |
40 | // TODO(aminim): figure out the protobuf migration story |
41 | using tensorflow::error::ABORTED; |
42 | using tensorflow::error::ALREADY_EXISTS; |
43 | using tensorflow::error::CANCELLED; |
44 | using tensorflow::error::DATA_LOSS; |
45 | using tensorflow::error::DEADLINE_EXCEEDED; |
46 | using tensorflow::error::FAILED_PRECONDITION; |
47 | using tensorflow::error::INTERNAL; |
48 | using tensorflow::error::INVALID_ARGUMENT; |
49 | using tensorflow::error::NOT_FOUND; |
50 | using tensorflow::error::OK; |
51 | using tensorflow::error::OUT_OF_RANGE; |
52 | using tensorflow::error::PERMISSION_DENIED; |
53 | using tensorflow::error::RESOURCE_EXHAUSTED; |
54 | using tensorflow::error::UNAUTHENTICATED; |
55 | using tensorflow::error::UNAVAILABLE; |
56 | using tensorflow::error::UNIMPLEMENTED; |
57 | using tensorflow::error::UNKNOWN; |
58 | } // namespace error |
59 | namespace { |
60 | |
61 | // Log sink is used to collect recent warning and error log messages to be |
62 | // attached to the error status. |
63 | class StatusLogSink : public TFLogSink { |
64 | public: |
65 | static StatusLogSink* GetInstance() { |
66 | static StatusLogSink* sink = new StatusLogSink(); |
67 | return sink; |
68 | } |
69 | |
70 | void enable() { |
71 | absl::call_once(flag_, [this] { |
72 | num_messages_ = 5; // default to 5 messages |
73 | |
74 | if (const char* num_msgs_str = |
75 | getenv("TF_WORKER_NUM_FORWARDED_LOG_MESSAGES" )) { |
76 | if (!absl::SimpleAtoi(num_msgs_str, &num_messages_)) { |
77 | LOG(WARNING) << "Failed to parse env variable " |
78 | "TF_WORKER_NUM_WARNING_ERROR_LOG_IN_STATUS=" |
79 | << num_msgs_str << " as int. Using the default value " |
80 | << num_messages_ << "." ; |
81 | } |
82 | } |
83 | |
84 | if (num_messages_ > 0) { |
85 | TFAddLogSink(this); |
86 | } |
87 | }); |
88 | } |
89 | |
90 | void GetMessages(std::vector<std::string>* logs) TF_LOCKS_EXCLUDED(mu_) { |
91 | mutex_lock lock(mu_); |
92 | |
93 | for (auto& msg : messages_) { |
94 | logs->push_back(msg); |
95 | } |
96 | } |
97 | |
98 | void Send(const TFLogEntry& entry) override TF_LOCKS_EXCLUDED(mu_) { |
99 | if (entry.log_severity() < absl::LogSeverity::kWarning) return; |
100 | |
101 | mutex_lock lock(mu_); |
102 | messages_.emplace_back(entry.ToString()); |
103 | if (messages_.size() > static_cast<size_t>(num_messages_)) { |
104 | messages_.pop_front(); |
105 | } |
106 | } |
107 | |
108 | private: |
109 | mutex mu_; |
110 | // for allowing repeated/concurrent calls to enable() |
111 | absl::once_flag flag_; |
112 | int num_messages_ = 0; |
113 | std::deque<std::string> messages_ TF_GUARDED_BY(mu_); |
114 | }; |
115 | |
116 | } // namespace |
117 | |
118 | // TODO(b/197552541) Move this namespace to errors.h after absl migration. |
119 | namespace errors { |
// Type URL naming the tensorflow.StackTracePayload payload. Not referenced by
// the code visible in this part of the file.
static constexpr char kStackTraceProtoUrl[] =
    "type.googleapis.com/tensorflow.StackTracePayload";
122 | |
123 | void SetStackTrace(::tsl::Status& status, std::vector<StackFrame> stack_trace) { |
124 | status.SetStackTrace(stack_trace); |
125 | } |
126 | |
127 | std::vector<StackFrame> GetStackTrace(const ::tsl::Status& status) { |
128 | return status.GetStackTrace(); |
129 | } |
130 | |
131 | } // namespace errors |
132 | |
133 | Status::~Status() {} |
134 | |
135 | void Status::SetStackTrace(std::vector<StackFrame> stack_trace) { |
136 | if (state_ != nullptr) { |
137 | state_->stack_trace = stack_trace; |
138 | } |
139 | } |
140 | |
141 | std::vector<StackFrame> Status::GetStackTrace() const { |
142 | if (state_ != nullptr) { |
143 | return state_->stack_trace; |
144 | } else { |
145 | return std::vector<StackFrame>(); |
146 | } |
147 | } |
148 | |
149 | absl::Span<const SourceLocation> Status::GetSourceLocations() const { |
150 | return state_ != nullptr ? state_->source_locations |
151 | : absl::Span<const SourceLocation>(); |
152 | } |
153 | |
154 | void Status::MaybeAddSourceLocation(SourceLocation loc) { |
155 | if (state_ == nullptr) { |
156 | return; |
157 | } |
158 | if (loc.line <= 0) { |
159 | return; |
160 | } |
161 | if (loc.file_name == nullptr) { |
162 | return; |
163 | } |
164 | if (loc.file_name[0] == '\0') { |
165 | return; |
166 | } |
167 | state_->source_locations.push_back(loc); |
168 | } |
169 | |
170 | Status::Status(tsl::error::Code code, absl::string_view msg, |
171 | SourceLocation loc) { |
172 | assert(code != tsl::error::OK); |
173 | state_ = std::make_unique<State>(); |
174 | state_->code = code; |
175 | state_->msg = std::string(msg); |
176 | MaybeAddSourceLocation(loc); |
177 | VLOG(5) << "Generated non-OK status: \"" << *this << "\". " |
178 | << CurrentStackTrace(); |
179 | } |
180 | |
181 | void Status::Update(const Status& new_status) { |
182 | if (ok()) { |
183 | *this = new_status; |
184 | } |
185 | } |
186 | |
187 | void Status::SlowCopyFrom(const State* src) { |
188 | if (src == nullptr) { |
189 | state_ = nullptr; |
190 | } else { |
191 | state_ = std::make_unique<State>(*src); |
192 | } |
193 | } |
194 | |
195 | Status::State* Status::NewStateFromNonOKStatus(const Status& s) { |
196 | return new State(*s.state_); |
197 | } |
198 | |
199 | const std::string& Status::empty_string() { |
200 | static string* empty = new string; |
201 | return *empty; |
202 | } |
203 | |
204 | std::string error_name(error::Code code) { |
205 | switch (code) { |
206 | case tsl::error::OK: |
207 | return "OK" ; |
208 | break; |
209 | case tsl::error::CANCELLED: |
210 | return "CANCELLED" ; |
211 | break; |
212 | case tsl::error::UNKNOWN: |
213 | return "UNKNOWN" ; |
214 | break; |
215 | case tsl::error::INVALID_ARGUMENT: |
216 | return "INVALID_ARGUMENT" ; |
217 | break; |
218 | case tsl::error::DEADLINE_EXCEEDED: |
219 | return "DEADLINE_EXCEEDED" ; |
220 | break; |
221 | case tsl::error::NOT_FOUND: |
222 | return "NOT_FOUND" ; |
223 | break; |
224 | case tsl::error::ALREADY_EXISTS: |
225 | return "ALREADY_EXISTS" ; |
226 | break; |
227 | case tsl::error::PERMISSION_DENIED: |
228 | return "PERMISSION_DENIED" ; |
229 | break; |
230 | case tsl::error::UNAUTHENTICATED: |
231 | return "UNAUTHENTICATED" ; |
232 | break; |
233 | case tsl::error::RESOURCE_EXHAUSTED: |
234 | return "RESOURCE_EXHAUSTED" ; |
235 | break; |
236 | case tsl::error::FAILED_PRECONDITION: |
237 | return "FAILED_PRECONDITION" ; |
238 | break; |
239 | case tsl::error::ABORTED: |
240 | return "ABORTED" ; |
241 | break; |
242 | case tsl::error::OUT_OF_RANGE: |
243 | return "OUT_OF_RANGE" ; |
244 | break; |
245 | case tsl::error::UNIMPLEMENTED: |
246 | return "UNIMPLEMENTED" ; |
247 | break; |
248 | case tsl::error::INTERNAL: |
249 | return "INTERNAL" ; |
250 | break; |
251 | case tsl::error::UNAVAILABLE: |
252 | return "UNAVAILABLE" ; |
253 | break; |
254 | case tsl::error::DATA_LOSS: |
255 | return "DATA_LOSS" ; |
256 | break; |
257 | default: |
258 | char tmp[30]; |
259 | snprintf(tmp, sizeof(tmp), "UNKNOWN_CODE(%d)" , static_cast<int>(code)); |
260 | return tmp; |
261 | break; |
262 | } |
263 | } |
264 | |
265 | std::string Status::ToString() const { |
266 | if (state_ == nullptr) { |
267 | return "OK" ; |
268 | } else { |
269 | std::string result(error_name(state_->code)); |
270 | result += ": " ; |
271 | result += state_->msg; |
272 | |
273 | for (const std::pair<const std::string, std::string>& element : |
274 | state_->payloads) { |
275 | absl::StrAppend(&result, " [" , element.first, "='" , |
276 | absl::CHexEscape(element.second), "']" ); |
277 | } |
278 | |
279 | return result; |
280 | } |
281 | } |
282 | |
283 | void Status::IgnoreError() const { |
284 | // no-op |
285 | } |
286 | |
287 | void Status::SetPayload(absl::string_view type_url, absl::string_view payload) { |
288 | if (ok()) return; |
289 | state_->payloads[std::string(type_url)] = std::string(payload); |
290 | } |
291 | |
292 | absl::optional<absl::Cord> Status::GetPayload( |
293 | absl::string_view type_url) const { |
294 | if (ok()) return absl::nullopt; |
295 | auto payload_iter = state_->payloads.find(std::string(type_url)); |
296 | if (payload_iter == state_->payloads.end()) return absl::nullopt; |
297 | return absl::Cord(payload_iter->second); |
298 | } |
299 | |
300 | bool Status::ErasePayload(absl::string_view type_url) { |
301 | if (ok()) return false; |
302 | auto payload_iter = state_->payloads.find(std::string(type_url)); |
303 | if (payload_iter == state_->payloads.end()) return false; |
304 | state_->payloads.erase(payload_iter); |
305 | return true; |
306 | } |
307 | |
308 | void Status::ForEachPayload( |
309 | const std::function<void(absl::string_view, absl::string_view)>& visitor) |
310 | const { |
311 | if (ok()) return; |
312 | for (const auto& payload : state_->payloads) { |
313 | visitor(payload.first, payload.second); |
314 | } |
315 | } |
316 | |
317 | std::ostream& operator<<(std::ostream& os, const Status& x) { |
318 | os << x.ToString(); |
319 | return os; |
320 | } |
321 | |
322 | Status OkStatus() { return Status(); } |
323 | |
324 | Status FromAbslStatus(const absl::Status& s) { |
325 | if (s.ok()) { |
326 | return Status(); |
327 | } |
328 | Status converted(static_cast<tsl::error::Code>(s.code()), s.message()); |
329 | s.ForEachPayload( |
330 | [&converted](absl::string_view key, const absl::Cord& value) { |
331 | converted.SetPayload(key, std::string(value)); |
332 | }); |
333 | |
334 | return converted; |
335 | } |
336 | |
337 | absl::Status ToAbslStatus(const ::tsl::Status& s) { |
338 | if (s.ok()) { |
339 | return absl::OkStatus(); |
340 | } |
341 | |
342 | absl::Status converted(static_cast<absl::StatusCode>(s.code()), |
343 | s.error_message()); |
344 | s.ForEachPayload([&converted](tsl::StringPiece key, tsl::StringPiece value) { |
345 | converted.SetPayload(key, absl::Cord(value)); |
346 | }); |
347 | |
348 | return converted; |
349 | } |
350 | |
351 | std::string* TfCheckOpHelperOutOfLine(const ::tsl::Status& v, const char* msg) { |
352 | std::string r("Non-OK-status: " ); |
353 | r += msg; |
354 | r += " status: " ; |
355 | r += v.ToString(); |
356 | // Leaks string but this is only to be used in a fatal error message |
357 | return new std::string(r); |
358 | } |
359 | |
360 | StatusGroup::StatusGroup() {} |
361 | |
362 | StatusGroup::StatusGroup(std::initializer_list<Status> statuses) { |
363 | for (const Status& s : statuses) { |
364 | Update(s); |
365 | } |
366 | } |
367 | |
// Payload key whose presence marks a Status as "derived" (see
// StatusGroup::IsDerived / MakeDerived).
static constexpr char kDerivedStatusProtoUrl[] =
    "type.googleapis.com/tensorflow.DerivedStatus";
370 | |
371 | Status StatusGroup::MakeDerived(const Status& s) { |
372 | if (IsDerived(s)) { |
373 | return s; |
374 | } else { |
375 | Status derived(s); |
376 | // TODO(b/200167936): Serialize an instance of DerivedStatus proto instead |
377 | // of using the string directly. The string is never used so it is not |
378 | // causing any issues at the moment. |
379 | derived.SetPayload(kDerivedStatusProtoUrl, "" ); |
380 | return derived; |
381 | } |
382 | } |
383 | |
384 | bool StatusGroup::IsDerived(const Status& s) { |
385 | return s.GetPayload(kDerivedStatusProtoUrl).has_value(); |
386 | } |
387 | |
388 | void StatusGroup::ConfigureLogHistory() { |
389 | StatusLogSink::GetInstance()->enable(); |
390 | } |
391 | |
392 | void StatusGroup::Update(const Status& s) { |
393 | if (s.ok()) { |
394 | ++num_ok_; |
395 | } else { |
396 | ok_ = false; |
397 | if (IsDerived(s)) { |
398 | derived_.insert(s); |
399 | } else { |
400 | non_derived_.insert(s); |
401 | } |
402 | } |
403 | } |
404 | |
// Upper bound, in characters, applied to an aggregated status message.
constexpr int kMaxAggregatedStatusMessageSize = 8 * 1024;
// Upper bound, in characters, applied to each attached log message.
constexpr int kMaxAttachedLogMessageSize = 512;
407 | |
408 | std::unordered_map<std::string, std::string> StatusGroup::GetPayloads() const { |
409 | std::unordered_map<std::string, std::string> payloads; |
410 | auto capture_payload = [&payloads](absl::string_view key, |
411 | absl::string_view value) { |
412 | payloads[std::string(key)] = std::string(value); |
413 | }; |
414 | |
415 | for (const auto& status : derived_) { |
416 | status.ForEachPayload(capture_payload); |
417 | } |
418 | |
419 | // If a key appears in both derived_ and non_derived_ payloads, then the |
420 | // non_derived_ payload receives priority. |
421 | for (const auto& status : non_derived_) { |
422 | status.ForEachPayload(capture_payload); |
423 | } |
424 | |
425 | payloads.erase(kDerivedStatusProtoUrl); |
426 | |
427 | return payloads; |
428 | } |
429 | |
430 | Status MakeStatus( |
431 | tsl::error::Code code, absl::string_view message, |
432 | const std::unordered_map<std::string, std::string>& payloads) { |
433 | Status status(code, message); |
434 | for (const auto& payload : payloads) { |
435 | status.SetPayload(payload.first, payload.second); |
436 | } |
437 | return status; |
438 | } |
439 | |
440 | std::string MakeString(const Status& status) { |
441 | return absl::StrCat(error_name(status.code()), ": " , status.error_message()); |
442 | } |
443 | |
444 | // Summarize all the status objects in the StatusGroup. This is used when |
445 | // individual Status objects in the StatusGroup are not already summarized. |
446 | Status StatusGroup::as_summary_status() const { |
447 | if (ok_) { |
448 | return OkStatus(); |
449 | } |
450 | |
451 | // Gather recent logs as a string |
452 | auto get_recent_logs = [this]() -> std::string { |
453 | if (!recent_logs_.empty()) { |
454 | std::vector<std::string> fmt; |
455 | fmt.push_back("\nRecent warning and error logs:" ); |
456 | for (auto& log : recent_logs_) { |
457 | // Add an indentation to make it look nicer. |
458 | fmt.push_back(" " + log.substr(0, kMaxAttachedLogMessageSize)); |
459 | } |
460 | return absl::StrJoin(fmt, "\n" ); |
461 | } else { |
462 | return "" ; |
463 | } |
464 | }; |
465 | |
466 | // If only one root status is found, do not add summary header and footer. |
467 | if (non_derived_.size() == 1) { |
468 | return MakeStatus(non_derived_.begin()->code(), |
469 | strings::StrCat(non_derived_.begin()->error_message(), |
470 | get_recent_logs()), |
471 | GetPayloads()); |
472 | } |
473 | |
474 | if (!non_derived_.empty()) { |
475 | std::vector<std::string> fmt; |
476 | |
477 | fmt.push_back( |
478 | strings::Printf("%zu root error(s) found." , non_derived_.size())); |
479 | |
480 | int index = 0; |
481 | auto code = tsl::error::CANCELLED; |
482 | for (const auto& s : non_derived_) { |
483 | // NOTE: Avoid using CANCELLED as the code of summary status if the group |
484 | // contains other error code. |
485 | if (code == tsl::error::CANCELLED && s.code() != tsl::error::CANCELLED) { |
486 | code = s.code(); |
487 | } |
488 | fmt.emplace_back(strings::StrCat(" (" , index, ") " , MakeString(s))); |
489 | ++index; |
490 | } |
491 | |
492 | fmt.push_back(strings::Printf("%zu successful operations." , num_ok_)); |
493 | fmt.push_back( |
494 | strings::Printf("%zu derived errors ignored." , derived_.size())); |
495 | |
496 | std::string error_msg = |
497 | absl::StrJoin(fmt, "\n" ).substr(0, kMaxAggregatedStatusMessageSize); |
498 | |
499 | return MakeStatus(code, strings::StrCat(error_msg, get_recent_logs()), |
500 | GetPayloads()); |
501 | } else { |
502 | // All statuses are derived. Pick the first available status to return. |
503 | return MakeDerived(MakeStatus(derived_.begin()->code(), |
504 | derived_.begin()->error_message(), |
505 | GetPayloads())); |
506 | } |
507 | } |
508 | |
509 | // Concatenate all the status objects in the StatusGroup. This is used when |
510 | // individual Status objects in the StatusGroup are already summarized Status. |
511 | Status StatusGroup::as_concatenated_status() const { |
512 | if (ok_) { |
513 | return OkStatus(); |
514 | } |
515 | |
516 | // If only one root status is found, return it directly. |
517 | if (non_derived_.size() == 1) { |
518 | return MakeStatus(non_derived_.begin()->code(), |
519 | non_derived_.begin()->error_message(), GetPayloads()); |
520 | } |
521 | |
522 | if (!non_derived_.empty()) { |
523 | std::vector<string> fmt; |
524 | fmt.emplace_back("\n=====================" ); |
525 | for (const auto& s : non_derived_) { |
526 | fmt.emplace_back(MakeString(s)); |
527 | } |
528 | fmt.emplace_back("=====================\n" ); |
529 | return MakeStatus( |
530 | non_derived_.begin()->code(), |
531 | absl::StrJoin(fmt, "\n" ).substr(0, kMaxAggregatedStatusMessageSize), |
532 | GetPayloads()); |
533 | } else { |
534 | // All statuses are derived. Pick the first available status to return. |
535 | // This should not happen in normal execution. |
536 | return MakeDerived(MakeStatus(derived_.begin()->code(), |
537 | derived_.begin()->error_message(), |
538 | GetPayloads())); |
539 | } |
540 | } |
541 | |
542 | void StatusGroup::AttachLogMessages() { |
543 | recent_logs_.clear(); |
544 | StatusLogSink::GetInstance()->GetMessages(&recent_logs_); |
545 | } |
546 | |
547 | } // namespace tsl |
548 | |