1 | /* Copyright 2021 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef RUY_RUY_TRACE_H_ |
17 | #define RUY_RUY_TRACE_H_ |
18 | |
19 | #ifdef RUY_TRACE |
20 | |
21 | #include <algorithm> |
22 | #include <cstdio> |
23 | #include <cstdlib> |
24 | #include <memory> |
25 | #include <mutex> |
26 | #include <string> |
27 | #include <thread> |
28 | #include <vector> |
29 | |
30 | #include "ruy/mat.h" |
31 | #include "ruy/matrix.h" |
32 | #include "ruy/path.h" |
33 | #include "ruy/platform.h" |
34 | #include "ruy/side_pair.h" |
35 | |
36 | namespace ruy { |
37 | |
38 | // Helper for `formatted` so we don't have to put .c_str() on strings. |
39 | template <typename T> |
40 | T value_for_snprintf(T value) { |
41 | return value; |
42 | } |
43 | |
44 | inline const char* value_for_snprintf(const std::string& s) { |
45 | return s.c_str(); |
46 | } |
47 | |
48 | // A sprintf-like function returning a std::string. |
49 | // Remove this once we can rely on std::format (c++20). |
50 | template <typename... Args> |
51 | std::string formatted(const char* format, Args... args) { |
52 | char buf[1024]; |
53 | #pragma GCC diagnostic push |
54 | #pragma GCC diagnostic warning "-Wformat-security" |
55 | int size = snprintf(buf, sizeof buf, format, value_for_snprintf(args)...); |
56 | #pragma GCC diagnostic pop |
57 | if (size <= 0) { |
58 | abort(); |
59 | } |
60 | return std::string(buf); |
61 | } |
62 | |
63 | // An entry in the trace. |
64 | struct ThreadTraceEntry final { |
65 | std::string text; |
66 | int indent = 0; |
67 | const char* source_file = nullptr; |
68 | int source_line = 0; |
69 | }; |
70 | |
71 | // Trace for one thread. |
72 | class ThreadTrace final { |
73 | public: |
74 | ~ThreadTrace() {} |
75 | |
76 | void set_thread_id(int thread_id) { thread_id_ = thread_id; } |
77 | int thread_id() const { return thread_id_; } |
78 | |
79 | bool is_in_run_ahead_packing_loop() const { |
80 | return is_in_run_ahead_packing_loop_; |
81 | } |
82 | void set_is_in_run_ahead_packing_loop(bool value) { |
83 | is_in_run_ahead_packing_loop_ = value; |
84 | } |
85 | |
86 | void set_current_source_file(const char* source_file) { |
87 | current_source_file_ = source_file; |
88 | } |
89 | |
90 | void set_current_source_line(int source_line) { |
91 | current_source_line_ = source_line; |
92 | } |
93 | |
94 | const std::vector<ThreadTraceEntry>& entries() const { return entries_; } |
95 | |
96 | template <typename... Args> |
97 | void Write(const char* format, Args... args) { |
98 | ThreadTraceEntry entry; |
99 | entry.text = formatted(format, args...); |
100 | entry.indent = indent_; |
101 | entry.source_file = current_source_file_; |
102 | entry.source_line = current_source_line_; |
103 | entries_.emplace_back(std::move(entry)); |
104 | } |
105 | |
106 | template <typename... Args> |
107 | void EnterScope(const char* scope_name) { |
108 | Write("%s {" , scope_name); |
109 | indent_++; |
110 | } |
111 | void LeaveScope(const char* scope_name) { |
112 | indent_--; |
113 | Write("} // end of %s" , scope_name); |
114 | } |
115 | |
116 | private: |
117 | // The trace contents |
118 | std::vector<ThreadTraceEntry> entries_; |
119 | |
120 | // Current indentation level. |
121 | int indent_ = 0; |
122 | // Thread's ID as set by Ruy, e.g. [0,N-1]. Not OS TID. |
123 | int thread_id_ = -1; |
124 | // The run-ahead loop in `EnsurePacked` may run many iterations when the |
125 | // thread is waiting for a block to be packed by another thread --- it's |
126 | // a busy wait. We track whether we are in that mode to avoid generating |
127 | // many uninteresting trace entries. |
128 | bool is_in_run_ahead_packing_loop_ = false; |
129 | // Last recorded value of __FILE__ and __LINE__, as a convenience so we don't |
130 | // have to pass these in every call to `Write`. |
131 | const char* current_source_file_ = nullptr; |
132 | int current_source_line_ = 0; |
133 | }; |
134 | |
135 | // Main components of ruy. Used for trace colorization. |
136 | enum class Component { kNone, kFrontEnd, kMiddleEnd, kBackEnd, kThreadPool }; |
137 | |
138 | // Output format for the trace. |
139 | enum class TraceOutputFormat { kNone, kTerminal, kHtml }; |
140 | |
141 | inline std::string IndentString(int indent) { |
142 | std::string s; |
143 | for (int i = 0; i < indent; i++) { |
144 | s += " " ; |
145 | } |
146 | return s; |
147 | } |
148 | |
149 | // Returns the text to write to the trace to open a colored section. |
150 | inline const char* ColorSectionStart(TraceOutputFormat output_format, |
151 | Component component) { |
152 | switch (output_format) { |
153 | case TraceOutputFormat::kTerminal: |
154 | switch (component) { |
155 | case Component::kFrontEnd: |
156 | return "\x1b[36m" ; |
157 | case Component::kMiddleEnd: |
158 | return "\x1b[32m" ; |
159 | case Component::kBackEnd: |
160 | return "\x1b[31m" ; |
161 | case Component::kThreadPool: |
162 | return "\x1b[33m" ; |
163 | default: |
164 | abort(); |
165 | return nullptr; |
166 | } |
167 | case TraceOutputFormat::kHtml: |
168 | switch (component) { |
169 | case Component::kFrontEnd: |
170 | return "<span style=\"background-color:#B2EBF2\">" ; |
171 | case Component::kMiddleEnd: |
172 | return "<span style=\"background-color:#C8E6C9\">" ; |
173 | case Component::kBackEnd: |
174 | return "<span style=\"background-color:#FFCDD2\">" ; |
175 | case Component::kThreadPool: |
176 | return "<span style=\"background-color:#FFF9C4\">" ; |
177 | default: |
178 | abort(); |
179 | return nullptr; |
180 | } |
181 | default: |
182 | abort(); |
183 | return nullptr; |
184 | } |
185 | } |
186 | |
187 | // Returns the text to write to the trace to close a colored section. |
188 | inline const char* ColorSectionEnd(TraceOutputFormat output_format) { |
189 | switch (output_format) { |
190 | case TraceOutputFormat::kTerminal: |
191 | return "\x1b[0m" ; |
192 | case TraceOutputFormat::kHtml: |
193 | return "</span>" ; |
194 | default: |
195 | abort(); |
196 | return nullptr; |
197 | } |
198 | } |
199 | |
200 | // Returns the output format to use for the trace. |
201 | inline TraceOutputFormat GetOutputFormat() { |
202 | const char* html_env = getenv("RUY_TRACE_HTML" ); |
203 | if (html_env && strtol(html_env, nullptr, 10) != 0) { |
204 | return TraceOutputFormat::kHtml; |
205 | } else { |
206 | return TraceOutputFormat::kTerminal; |
207 | } |
208 | } |
209 | |
210 | // A `basename` function that's good enough for ruy __FILE__'s. |
211 | // Note: `basename` is POSIX-only and annoying (takes a char*, may mutate). |
212 | inline const char* GetBaseName(const char* path) { |
213 | std::size_t len = strlen(path); |
214 | if (len == 0) { |
215 | return path; |
216 | } |
217 | const char* ptr = path + len - 1; |
218 | while (ptr != path) { |
219 | if (*ptr == '/' || *ptr == '\\') { |
220 | return ptr + 1; |
221 | } |
222 | --ptr; |
223 | } |
224 | // Path did not contain any path separator. |
225 | return path; |
226 | } |
227 | |
228 | // Determines a Component (used for colorization) by source file. |
229 | inline Component GetComponentBySourceFile(const char* base_name) { |
230 | if (!strcmp(base_name, "pack.h" ) || !strcmp(base_name, "kernel.h" )) { |
231 | return Component::kBackEnd; |
232 | } else if (!strcmp(base_name, "trmul.cc" ) || |
233 | !strcmp(base_name, "block_map.cc" )) { |
234 | return Component::kMiddleEnd; |
235 | } else if (!strcmp(base_name, "thread_pool.cc" )) { |
236 | return Component::kThreadPool; |
237 | } else { |
238 | return Component::kFrontEnd; |
239 | } |
240 | } |
241 | |
242 | inline std::string EscapeText(TraceOutputFormat output_format, |
243 | const std::string& text) { |
244 | if (output_format == TraceOutputFormat::kHtml) { |
245 | std::string escaped_text; |
246 | for (char c : text) { |
247 | if (c == '<') { |
248 | escaped_text += "<" ; |
249 | } else if (c == '>') { |
250 | escaped_text += ">" ; |
251 | } else { |
252 | escaped_text += c; |
253 | } |
254 | } |
255 | return escaped_text; |
256 | } else { |
257 | return text; |
258 | } |
259 | } |
260 | |
261 | // Prints an entry from the trace to the destination trace file. |
262 | inline void Print(const ThreadTraceEntry& entry, |
263 | TraceOutputFormat output_format, FILE* file) { |
264 | const char* base_name = GetBaseName(entry.source_file); |
265 | Component component = GetComponentBySourceFile(base_name); |
266 | const std::string& source_location = |
267 | formatted("%s:%d" , base_name, entry.source_line); |
268 | const std::string& escaped_text = EscapeText(output_format, entry.text); |
269 | fprintf(file, "%s%-32s%s%s%s\n" , ColorSectionStart(output_format, component), |
270 | source_location.c_str(), IndentString(entry.indent).c_str(), |
271 | escaped_text.c_str(), ColorSectionEnd(output_format)); |
272 | } |
273 | |
274 | // Prints a thread's entire trace to the destination trace file. |
275 | inline void Print(const ThreadTrace& trace, TraceOutputFormat output_format, |
276 | FILE* file) { |
277 | if (output_format == TraceOutputFormat::kHtml) { |
278 | fprintf(file, "<html><body><pre>\n<span style=\"font-weight:bold\">\n" ); |
279 | } |
280 | fprintf(file, "Ruy trace for thread %d:\n" , trace.thread_id()); |
281 | if (output_format == TraceOutputFormat::kHtml) { |
282 | fprintf(file, "</span>\n" ); |
283 | } |
284 | for (const ThreadTraceEntry& entry : trace.entries()) { |
285 | Print(entry, output_format, file); |
286 | } |
287 | fprintf(file, "\n" ); |
288 | if (output_format == TraceOutputFormat::kHtml) { |
289 | fprintf(file, "</pre></body></html>\n" ); |
290 | } |
291 | } |
292 | |
293 | // Holds all the threads' traces. This is a global singleton class. |
294 | // On exit, when the singleton is destroyed, the destructor prints out the |
295 | // traces. |
296 | class AllThreadTraces final { |
297 | public: |
298 | // Add a new ThreadTrace for the current thread. Should be called only once |
299 | // on each thread. |
300 | ThreadTrace* AddCurrentThread() { |
301 | std::lock_guard<std::mutex> lock(mutex_); |
302 | ThreadTrace* thread_trace = new ThreadTrace; |
303 | thread_traces_.emplace_back(thread_trace); |
304 | return thread_trace; |
305 | } |
306 | ~AllThreadTraces() { |
307 | std::lock_guard<std::mutex> lock(mutex_); |
308 | // Open the destination file. |
309 | const char* file_env = getenv("RUY_TRACE_FILE" ); |
310 | FILE* file = stdout; |
311 | if (file_env) { |
312 | file = fopen(file_env, "w" ); |
313 | if (!file) { |
314 | fprintf(stderr, "Failed to open %s for write\n" , file_env); |
315 | exit(1); |
316 | } |
317 | } |
318 | // Sort the threads by Ruy Thread ID (not OS TID). |
319 | auto output_format = GetOutputFormat(); |
320 | std::sort(std::begin(thread_traces_), std::end(thread_traces_), |
321 | [](const auto& a, const auto& b) { |
322 | return a->thread_id() < b->thread_id(); |
323 | }); |
324 | // Print all the threads' traces. |
325 | for (const auto& trace : thread_traces_) { |
326 | Print(*trace, output_format, file); |
327 | } |
328 | if (file_env) { |
329 | fclose(file); |
330 | } |
331 | } |
332 | static AllThreadTraces* Singleton() { |
333 | static AllThreadTraces all_thread_traces; |
334 | return &all_thread_traces; |
335 | } |
336 | |
337 | private: |
338 | std::vector<std::unique_ptr<ThreadTrace>> thread_traces_; |
339 | std::mutex mutex_; |
340 | }; |
341 | |
342 | // Returns the thread-local ThreadTrace singleton, constructing it as needed. |
343 | inline ThreadTrace* ThreadLocalTrace() { |
344 | static thread_local ThreadTrace* thread_local_trace = |
345 | AllThreadTraces::Singleton()->AddCurrentThread(); |
346 | return thread_local_trace; |
347 | } |
348 | |
349 | // RAII helper to trace a scope, e.g. a function scope. |
350 | class RuyTraceScope { |
351 | const char* source_file_; |
352 | int source_line_; |
353 | const char* scope_name_; |
354 | |
355 | public: |
356 | RuyTraceScope(const char* source_file, int source_line, |
357 | const char* scope_name) |
358 | : source_file_(source_file), |
359 | source_line_(source_line), |
360 | scope_name_(scope_name) { |
361 | ThreadLocalTrace()->set_current_source_file(source_file_); |
362 | ThreadLocalTrace()->set_current_source_line(source_line_); |
363 | ThreadLocalTrace()->EnterScope(scope_name_); |
364 | } |
365 | ~RuyTraceScope() { |
366 | ThreadLocalTrace()->set_current_source_file(source_file_); |
367 | ThreadLocalTrace()->set_current_source_line(source_line_); |
368 | ThreadLocalTrace()->LeaveScope(scope_name_); |
369 | } |
370 | }; |
371 | |
372 | #define RUY_TRACE_SCOPE_NAME_IMPL(file, line, name) \ |
373 | RuyTraceScope ruy_trace_scope##line(file, line, name) |
374 | #define RUY_TRACE_SCOPE_NAME(name) \ |
375 | RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, name) |
376 | #define RUY_TRACE_SCOPE \ |
377 | RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, __FUNCTION__) |
378 | |
379 | // Helpers to trace Ruy objects. |
380 | |
381 | inline std::string str(Order o) { |
382 | return o == Order::kRowMajor ? "row-major" : "column-major" ; |
383 | } |
384 | |
385 | inline std::string str(Side s) { return s == Side::kLhs ? "LHS" : "RHS" ; } |
386 | |
387 | inline std::string str(const Layout& layout) { |
388 | std::string s = |
389 | formatted("%dx%d, %s" , layout.rows(), layout.cols(), str(layout.order())); |
390 | int inner_size = |
391 | layout.order() == Order::kRowMajor ? layout.cols() : layout.rows(); |
392 | if (inner_size != layout.stride()) { |
393 | s += formatted(", stride=%d" , layout.stride()); |
394 | } else { |
395 | s += formatted(", unstrided" ); |
396 | } |
397 | return s; |
398 | } |
399 | |
400 | inline std::string str(const MatLayout& layout) { |
401 | std::string s = |
402 | formatted("%dx%d, %s" , layout.rows, layout.cols, str(layout.order)); |
403 | int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows; |
404 | if (inner_size != layout.stride) { |
405 | s += formatted(", stride=%d" , layout.stride); |
406 | } else { |
407 | s += formatted(", unstrided" ); |
408 | } |
409 | return s; |
410 | } |
411 | |
412 | inline std::string str(const PMatLayout& layout) { |
413 | std::string s = |
414 | formatted("%dx%d, %s" , layout.rows, layout.cols, str(layout.order)); |
415 | int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows; |
416 | if (inner_size != layout.stride) { |
417 | s += formatted(", stride=%d" , layout.stride); |
418 | } else { |
419 | s += formatted(", unstrided" ); |
420 | } |
421 | s += formatted(", kernel blocks: %dx%d %s" , layout.kernel.rows, |
422 | layout.kernel.cols, str(layout.kernel.order)); |
423 | return s; |
424 | } |
425 | |
426 | template <typename T> |
427 | std::string str() { |
428 | return "<unknown type>" ; |
429 | } |
430 | #define RUY_IMPL_STR_TYPE_STD(T) \ |
431 | template <> \ |
432 | inline std::string str<std::T>() { \ |
433 | return #T; \ |
434 | } |
435 | #define RUY_IMPL_STR_TYPE(T) \ |
436 | template <> \ |
437 | inline std::string str<T>() { \ |
438 | return #T; \ |
439 | } |
440 | |
441 | RUY_IMPL_STR_TYPE(float) |
442 | RUY_IMPL_STR_TYPE(double) |
443 | RUY_IMPL_STR_TYPE_STD(int8_t) |
444 | RUY_IMPL_STR_TYPE_STD(uint8_t) |
445 | RUY_IMPL_STR_TYPE_STD(int16_t) |
446 | RUY_IMPL_STR_TYPE_STD(uint16_t) |
447 | RUY_IMPL_STR_TYPE_STD(int32_t) |
448 | RUY_IMPL_STR_TYPE_STD(uint32_t) |
449 | RUY_IMPL_STR_TYPE_STD(int64_t) |
450 | RUY_IMPL_STR_TYPE_STD(uint64_t) |
451 | |
452 | template <typename T> |
453 | std::string str(const Matrix<T>& matrix) { |
454 | std::string s = formatted("Matrix<%s>, %s" , str<T>(), str(matrix.layout())); |
455 | if (matrix.zero_point()) { |
456 | s += formatted(", zero_point=%d" , static_cast<int>(matrix.zero_point())); |
457 | } |
458 | if (matrix.cache_policy() != CachePolicy::kNeverCache) { |
459 | s += |
460 | formatted(", cache_policy=%d" , static_cast<int>(matrix.cache_policy())); |
461 | } |
462 | return s; |
463 | } |
464 | |
465 | inline std::string str(const Type& type) { |
466 | char c; |
467 | if (type.is_floating_point) { |
468 | c = 'f'; |
469 | } else if (type.is_signed) { |
470 | c = 'i'; |
471 | } else { |
472 | c = 'u'; |
473 | } |
474 | return formatted("%c%d" , c, type.size * 8); |
475 | } |
476 | |
477 | inline std::string str(const EMat& mat) { |
478 | std::string s = |
479 | formatted("EMat, data_type=%s, %s" , str(mat.data_type), str(mat.layout)); |
480 | if (mat.zero_point) { |
481 | s += formatted(", zero_point=%d" , static_cast<int>(mat.zero_point)); |
482 | } |
483 | if (mat.cache_policy != CachePolicy::kNeverCache) { |
484 | s += formatted(", cache_policy=%d" , static_cast<int>(mat.cache_policy)); |
485 | } |
486 | return s; |
487 | } |
488 | |
489 | inline std::string str(const PEMat& mat) { |
490 | std::string s = |
491 | formatted("PEMat, data_type=%s, %s" , str(mat.data_type), str(mat.layout)); |
492 | if (mat.zero_point) { |
493 | s += formatted(", zero_point=%d" , static_cast<int>(mat.zero_point)); |
494 | } |
495 | return s; |
496 | } |
497 | |
498 | inline std::string str(Path paths) { |
499 | bool first = true; |
500 | std::string s; |
501 | for (int bit = 0; bit < 16; bit++) { |
502 | Path cur_path = static_cast<Path>(1 << bit); |
503 | if ((paths & cur_path) != Path::kNone) { |
504 | if (!first) { |
505 | s += " | " ; |
506 | } |
507 | first = false; |
508 | switch (cur_path) { |
509 | case Path::kNone: |
510 | continue; |
511 | #define RUY_HANDLE_PATH(p) \ |
512 | case Path::p: \ |
513 | s += #p; \ |
514 | break; |
515 | RUY_HANDLE_PATH(kStandardCpp) |
516 | RUY_HANDLE_PATH(kInternalStandardCppVariant1) |
517 | RUY_HANDLE_PATH(kInternalStandardCppVariant2) |
518 | RUY_HANDLE_PATH(kInternalStandardCppVariant3) |
519 | #if RUY_PLATFORM_ARM |
520 | RUY_HANDLE_PATH(kNeon) |
521 | RUY_HANDLE_PATH(kNeonDotprod) |
522 | #endif // RUY_PLATFORM_ARM |
523 | #if RUY_PLATFORM_X86 |
524 | RUY_HANDLE_PATH(kAvx) |
525 | RUY_HANDLE_PATH(kAvx2Fma) |
526 | RUY_HANDLE_PATH(kAvx512) |
527 | #endif // RUY_PLATFORM_X86 |
528 | #undef RUY_HANDLE_PATH |
529 | default: |
530 | fprintf(stderr, "Unhandled Path value 0x%x\n" , |
531 | static_cast<int>(cur_path)); |
532 | abort(); |
533 | } |
534 | } |
535 | } |
536 | return s; |
537 | } |
538 | |
539 | // Implementation of RUY_TRACE_INFO(X) macros. |
540 | |
541 | #define RUY_TRACE_INFO_MUL \ |
542 | ThreadLocalTrace()->Write("CompiledPaths: %s", str(CompiledPaths)); \ |
543 | ThreadLocalTrace()->Write("LHS: %s", str(lhs)); \ |
544 | ThreadLocalTrace()->Write("RHS: %s", str(rhs)); \ |
545 | ThreadLocalTrace()->Write("Destination: %s", str(*dst)); |
546 | |
547 | #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_TRANSPOSING \ |
548 | ThreadLocalTrace()->Write("Canonicalizing to column-major destination:"); \ |
549 | ThreadLocalTrace()->Write( \ |
550 | "Swapping LHS<->RHS and flipping all storage orders."); |
551 | |
552 | #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_ASSUMING_COLMAJOR_DST \ |
553 | ThreadLocalTrace()->Write("Runtime-selected path: %s", str(the_path)); \ |
554 | ThreadLocalTrace()->Write("LHS: %s", str(params->src[Side::kLhs])); \ |
555 | ThreadLocalTrace()->Write("RHS: %s", str(params->src[Side::kRhs])); \ |
556 | ThreadLocalTrace()->Write("Destination: %s", str(params->dst)); |
557 | |
558 | #define RUY_TRACE_INFO_POPULATE_TRMUL_PARAMS \ |
559 | ThreadLocalTrace()->Write( \ |
560 | "Here we have this Path as a template parameter: %s", str(ThePath)); \ |
561 | ThreadLocalTrace()->Write("PackedLhsScalar: %s", str<PackedLhsScalar>()); \ |
562 | ThreadLocalTrace()->Write("PackedRhsScalar: %s", str<PackedRhsScalar>()); \ |
563 | ThreadLocalTrace()->Write("Kernel function pointer: %p", \ |
564 | params->run_kernel); \ |
565 | ThreadLocalTrace()->Write("Kernel LHS block layout: %dx%d %s", \ |
566 | LhsKernelLayout::kRows, LhsKernelLayout::kCols, \ |
567 | str(LhsKernelLayout::kOrder)); \ |
568 | ThreadLocalTrace()->Write("Kernel RHS block layout: %dx%d %s", \ |
569 | RhsKernelLayout::kRows, RhsKernelLayout::kCols, \ |
570 | str(RhsKernelLayout::kOrder)); \ |
571 | ThreadLocalTrace()->Write("Created packed matrices:"); \ |
572 | ThreadLocalTrace()->Write("Packed LHS matrix: %s", \ |
573 | str(params->packed_matrix[Side::kLhs])); \ |
574 | ThreadLocalTrace()->Write("Packed RHS matrix: %s", \ |
575 | str(params->packed_matrix[Side::kRhs])); \ |
576 | ThreadLocalTrace()->Write("LHS packing function pointer: %p", \ |
577 | params->run_pack[Side::kLhs]); \ |
578 | ThreadLocalTrace()->Write("RHS packing function pointer: %p", \ |
579 | params->run_pack[Side::kRhs]); |
580 | |
581 | #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE \ |
582 | ThreadLocalTrace()->Write("SetRuntimeEnabledPaths forcing paths: %s", \ |
583 | str(*paths)); |
584 | |
585 | #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR \ |
586 | ThreadLocalTrace()->Write("Environment variable forcing paths: %s", \ |
587 | str(*paths)); |
588 | |
589 | #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_DETECTION \ |
590 | ThreadLocalTrace()->Write( \ |
591 | "Runtime-detected paths: %s", \ |
592 | str(*paths & ~kNonArchPathsIncludingInternalVariants)); |
593 | |
594 | #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_SHOULD_CACHE \ |
595 | ThreadLocalTrace()->Write( \ |
596 | "Caching the packed %s matrix. Already in cache: %s", str(side), \ |
597 | action == PrepackedCache::Action::kInsertedNewEntry ? "no" : "yes"); |
598 | |
599 | #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_NO_CACHE \ |
600 | ThreadLocalTrace()->Write("Not caching the packed %s matrix.", str(side)); |
601 | |
602 | #define RUY_TRACE_INFO_GET_TENTATIVE_THREAD_COUNT \ |
603 | ThreadLocalTrace()->Write( \ |
604 | "tentative_thread_count=%d (determined based on shape %dx%dx%d and " \ |
605 | "max_num_threads=%d)", \ |
606 | tentative_thread_count, rows, depth, cols, ctx->max_num_threads()); |
607 | |
608 | #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_TRUE \ |
609 | ThreadLocalTrace()->Write( \ |
610 | "Choosing to use the simple loop code path in TrMul because of the " \ |
611 | "linear traversal and single thread."); |
612 | |
613 | #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_FALSE \ |
614 | ThreadLocalTrace()->Write( \ |
615 | "Choosing to use the general case code path in TrMul because of: %s", \ |
616 | tentative_thread_count > 1 ? "multi-threading" \ |
617 | : "non-linear traversal order"); |
618 | |
619 | #define RUY_TRACE_INFO_TRMUL_SIMPLE_LOOP \ |
620 | ThreadLocalTrace()->Write("Entering the simple loop code path of TrMul"); |
621 | |
622 | #define RUY_TRACE_INFO_TRMUL_GENERAL_CASE \ |
623 | ThreadLocalTrace()->Write("Entering the general case code path of TrMul"); |
624 | |
625 | #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_START \ |
626 | ThreadLocalTrace()->Write("Kernel block: %dx%d", kernel_rows, kernel_cols); \ |
627 | ThreadLocalTrace()->Write( \ |
628 | "BlockMap shape: %dx%d (destination matrix shape rounded to next " \ |
629 | "kernel blocks)", \ |
630 | rows, cols); \ |
631 | ThreadLocalTrace()->Write( \ |
632 | "Rectangularness log2: %dx%d (powers of two factors bringing the shape " \ |
633 | "closest to square)", \ |
634 | rows_rectangularness_log2, cols_rectangularness_log2); \ |
635 | ThreadLocalTrace()->Write("Accumulation depth: %d", depth); \ |
636 | ThreadLocalTrace()->Write("LHS scalar type size: %d", lhs_scalar_size); \ |
637 | ThreadLocalTrace()->Write("RHS scalar type size: %d", rhs_scalar_size); \ |
638 | ThreadLocalTrace()->Write("Tentative thread count: %d", \ |
639 | tentative_thread_count); \ |
640 | ThreadLocalTrace()->Write( \ |
641 | "CPU cache params: local_cache_size=%d, last_level_cache_size=%d", \ |
642 | cpu_cache_params.local_cache_size, \ |
643 | cpu_cache_params.last_level_cache_size); \ |
644 | ThreadLocalTrace()->Write( \ |
645 | "For the sizes below, when rows!=cols, we always retain the min of the " \ |
646 | "two."); \ |
647 | ThreadLocalTrace()->Write("Kernel block size_log2: %d", kernel_size_log2); \ |
648 | ThreadLocalTrace()->Write( \ |
649 | "BlockMap size_log2: %d (destination matrix shape rounded to next " \ |
650 | "kernel blocks)", \ |
651 | size_log2); \ |
652 | ThreadLocalTrace()->Write( \ |
653 | "Now we will pick the optimal log2 of BlockMap block size"); |
654 | |
655 | #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_EACH_TENTATIVE_BLOCK_SIZE \ |
656 | ThreadLocalTrace()->Write( \ |
657 | "For BlockMap block size_log2 %d, score=%d (" \ |
658 | "multithreading_score=%d + cache_locality_score=%d + " \ |
659 | "kernel_amortization_score=%d)", \ |
660 | block_size_log2, score, multithreading_score, cache_locality_score, \ |
661 | kernel_amortization_score); |
662 | |
663 | #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_END \ |
664 | ThreadLocalTrace()->Write("Selecting BlockMap block size_log2: %d", \ |
665 | best_score_block_size_log2); \ |
666 | ThreadLocalTrace()->Write( \ |
667 | "BlockMap has %dx%d blocks, each of size between %dx%d and %dx%d.", \ |
668 | 1 << num_blocks_of_rows_log2, 1 << num_blocks_of_cols_log2, \ |
669 | block_map->small_block_dims[Side::kLhs], \ |
670 | block_map->small_block_dims[Side::kRhs], \ |
671 | block_map->small_block_dims[Side::kLhs] + \ |
672 | block_map->kernel_dims[Side::kLhs], \ |
673 | block_map->small_block_dims[Side::kRhs] + \ |
674 | block_map->kernel_dims[Side::kRhs]); \ |
675 | ThreadLocalTrace()->Write( \ |
676 | "The first %d rows of blocks have %d rows, the remaining ones have %d " \ |
677 | "rows ", \ |
678 | block_map->large_blocks[Side::kLhs], \ |
679 | block_map->small_block_dims[Side::kLhs] + \ |
680 | block_map->kernel_dims[Side::kLhs], \ |
681 | block_map->small_block_dims[Side::kLhs]); \ |
682 | ThreadLocalTrace()->Write( \ |
683 | "The first %d columns of blocks have %d columns, the remaining ones " \ |
684 | "have %d columns ", \ |
685 | block_map->large_blocks[Side::kRhs], \ |
686 | block_map->small_block_dims[Side::kRhs] + \ |
687 | block_map->kernel_dims[Side::kLhs], \ |
688 | block_map->small_block_dims[Side::kRhs]); \ |
689 | ThreadLocalTrace()->Write( \ |
690 | "Traversal order: %s", \ |
691 | block_map->traversal_order == BlockMapTraversalOrder::kLinear ? "linear" \ |
692 | : block_map->traversal_order == BlockMapTraversalOrder::kFractalZ \ |
693 | ? "fractal Z-curve" \ |
694 | : block_map->traversal_order == BlockMapTraversalOrder::kFractalU \ |
695 | ? "fractal U-curve" \ |
696 | : block_map->traversal_order == BlockMapTraversalOrder::kFractalHilbert \ |
697 | ? "fractal Hilbert curve" \ |
698 | : nullptr); \ |
699 | ThreadLocalTrace()->Write("Finalized thread count: %d", \ |
700 | block_map->thread_count); |
701 | |
702 | #define RUY_TRACE_SET_THEAD_ID(thread_id) \ |
703 | ThreadLocalTrace()->set_thread_id(thread_id); |
704 | |
705 | #define RUY_TRACE_INFO_TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS \ |
706 | ThreadLocalTrace()->Write( \ |
707 | "Block #%d is at position (%d, %d) in the BlockMap.", block_id, \ |
708 | block[Side::kLhs], block[Side::kRhs]); \ |
709 | ThreadLocalTrace()->Write( \ |
710 | "Block #%d has shape %dx%d and starts at position (%d, %d) in the " \ |
711 | "destination matrix.", \ |
712 | block_id, end[Side::kLhs] - start[Side::kLhs], \ |
713 | end[Side::kRhs] - start[Side::kRhs], start[Side::kLhs], \ |
714 | start[Side::kRhs]); \ |
715 | ThreadLocalTrace()->Write( \ |
716 | "Block #%d depends on LHS panel #%d and RHS panel #%d.", block_id, \ |
717 | block[Side::kLhs], block[Side::kRhs]); |
718 | |
719 | #define RUY_TRACE_INFO_TRYPACK_PACKING \ |
720 | ThreadLocalTrace()->Write( \ |
721 | "%s panel #%d is not already packed. Packing it now.", str(side), \ |
722 | block); |
723 | |
724 | #define RUY_TRACE_INFO_TRYPACK_ANOTHER_THREAD_PACKING \ |
725 | if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
726 | ThreadLocalTrace()->Write( \ |
727 | "%s panel #%d is currently being packed by another thread.", \ |
728 | str(side), block); \ |
729 | } |
730 | |
731 | #define RUY_TRACE_INFO_TRYPACK_PREVIOUSLY_PACKED \ |
732 | if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
733 | ThreadLocalTrace()->Write("%s panel #%d had previously been packed.", \ |
734 | str(side), block); \ |
735 | } |
736 | |
737 | #define RUY_TRACE_INFO_TRYPACK_PACKED_BY_ANOTHER_THREAD \ |
738 | ThreadLocalTrace()->Write( \ |
739 | "%s panel #%d has just been packed by another thread.", str(side), \ |
740 | block); |
741 | |
742 | #define RUY_TRACE_INFO_ENSURE_PACKED_ENTER_RUN_AHEAD \ |
743 | if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
744 | ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(true); \ |
745 | ThreadLocalTrace()->Write( \ |
746 | "We're blocked on other threads packing the panels that we need. " \ |
747 | "Packing some other panels while we wait..."); \ |
748 | } |
749 | |
750 | #define RUY_TRACE_INFO_ENSURE_PACKED_END \ |
751 | if (ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \ |
752 | ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(false); \ |
753 | ThreadLocalTrace()->Write( \ |
754 | "Other threads have finished packing what we were waiting for."); \ |
755 | } |
756 | |
757 | #define RUY_TRACE_INFO_RUN_PACK \ |
758 | ThreadLocalTrace()->Write("Path: %s", str(ThePath)); \ |
759 | ThreadLocalTrace()->Write("Packing panel consisting of columns [%d, %d)", \ |
760 | start_col, end_col); \ |
761 | ThreadLocalTrace()->Write("Source: columns [%d, %d) of %s", start_col, \ |
762 | end_col, str(src_matrix)); \ |
763 | ThreadLocalTrace()->Write("Destination: columns [%d, %d) of %s", start_col, \ |
764 | end_col, str(*packed_matrix)); \ |
765 | if (end_col > src_matrix.layout.cols) { \ |
766 | ThreadLocalTrace()->Write( \ |
767 | "This runs past the last column of the source matrix. Padding as " \ |
768 | "needed."); \ |
769 | } \ |
770 | if (packed_matrix->layout.rows > src_matrix.layout.rows) { \ |
771 | ThreadLocalTrace()->Write( \ |
772 | "The packed matrix has more rows than the source matrix due to " \ |
773 | "rounding up to the kernel block size. Padding as needed."); \ |
774 | } |
775 | |
776 | #define RUY_TRACE_INFO_RUN_KERNEL \ |
777 | { \ |
778 | ThreadLocalTrace()->Write("Path: %s", str(KernelArgs<KernelType>::kPath)); \ |
779 | int lhs_cols = end[Side::kLhs] - start[Side::kLhs]; \ |
780 | int rhs_cols = end[Side::kRhs] - start[Side::kRhs]; \ |
781 | int kernel_lhs_cols = src[Side::kLhs].layout.kernel.cols; \ |
782 | int kernel_rhs_cols = src[Side::kRhs].layout.kernel.cols; \ |
783 | ThreadLocalTrace()->Write("LHS: columns [%d, %d) of %s", \ |
784 | start[Side::kLhs], end[Side::kLhs], \ |
785 | str(src[Side::kLhs])); \ |
786 | ThreadLocalTrace()->Write("RHS: columns [%d, %d) of %s", \ |
787 | start[Side::kRhs], end[Side::kRhs], \ |
788 | str(src[Side::kRhs])); \ |
789 | ThreadLocalTrace()->Write("Destination: block [%d, %d)x[%d, %d) of %s", \ |
790 | start[Side::kLhs], end[Side::kLhs], \ |
791 | start[Side::kRhs], end[Side::kRhs], str(*dst)); \ |
792 | if (end[Side::kLhs] > dst->layout.rows || \ |
793 | end[Side::kRhs] > dst->layout.cols) { \ |
794 | ThreadLocalTrace()->Write( \ |
795 | "This runs over the destination matrix boundaries. The kernel will " \ |
796 | "internally clamp stores to avoid overruns."); \ |
797 | } \ |
798 | ThreadLocalTrace()->Write( \ |
799 | "The kernel's inner loop only produces a %dx%d block, so the " \ |
800 | "kernel's outer loops will run %dx%d times.", \ |
801 | kernel_lhs_cols, kernel_rhs_cols, lhs_cols / kernel_lhs_cols, \ |
802 | rhs_cols / kernel_rhs_cols); \ |
803 | } |
804 | |
805 | #define RUY_TRACE_INFO_THREAD_FUNC_IMPL_WAITING \ |
806 | ThreadLocalTrace()->Write("Waiting for a task..."); |
807 | |
808 | #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK \ |
809 | ThreadLocalTrace()->Write("Sending task #%d to a worker thread...", i); |
810 | |
811 | #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK_ZERO_ON_CUR_THREAD \ |
812 | ThreadLocalTrace()->Write("Running task #0 on the current thread..."); |
813 | |
814 | #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_WAITING_FOR_THREADS \ |
815 | ThreadLocalTrace()->Write("Waiting for worker threads to finish.."); |
816 | |
817 | #define RUY_TRACE_INFO(id) \ |
818 | [=]() { \ |
819 | ThreadLocalTrace()->set_current_source_file(__FILE__); \ |
820 | ThreadLocalTrace()->set_current_source_line(__LINE__); \ |
821 | RUY_TRACE_INFO_##id \ |
822 | }() |
823 | |
824 | } // namespace ruy |
825 | |
826 | #else |
827 | |
828 | // Vacuous implementation when RUY_TRACE is not defined. |
829 | #define RUY_TRACE_SCOPE_NAME(name) |
830 | #define RUY_TRACE_SCOPE |
831 | #define RUY_TRACE_SET_THEAD_ID(thread_id) |
832 | #define RUY_TRACE_INFO(id) |
833 | |
834 | #endif |
835 | |
836 | #endif // RUY_RUY_TRACE_H_ |
837 | |