1 | #pragma once |
2 | |
3 | #include <cstddef> |
4 | #include <cstdint> |
5 | #include <list> |
6 | #include <string> |
7 | #include <unordered_map> |
8 | #include <vector> |
9 | |
10 | #include <ATen/record_function.h> |
11 | #include <c10/macros/Macros.h> |
12 | #include <c10/util/Optional.h> |
13 | #include <c10/util/hash.h> |
14 | #include <torch/csrc/Export.h> |
15 | #include <torch/csrc/jit/frontend/source_range.h> |
16 | |
17 | #ifndef _WIN32 |
18 | #include <ctime> |
19 | #endif |
20 | #if defined(C10_IOS) && defined(C10_MOBILE) |
21 | #include <sys/time.h> // for gettimeofday() |
22 | #endif |
23 | |
24 | #if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) |
25 | #define C10_RDTSC |
26 | #if defined(_MSC_VER) |
27 | #include <intrin.h> |
28 | #elif defined(__CUDACC__) || defined(__HIPCC__) |
29 | #undef C10_RDTSC |
30 | #elif defined(__clang__) |
31 | // `__rdtsc` is available by default. |
32 | // NB: This has to be first, because Clang will also define `__GNUC__` |
33 | #elif defined(__GNUC__) |
34 | #include <x86intrin.h> |
35 | #else |
36 | #undef C10_RDTSC |
37 | #endif |
38 | #endif |
39 | |
40 | // TODO: replace with pytorch/rfcs#43 when it is ready. |
41 | #define SOFT_ASSERT(cond, ...) \ |
42 | [&]() -> bool { \ |
43 | if (C10_UNLIKELY(!(cond))) { \ |
44 | torch::profiler::impl::logSoftAssert( \ |
45 | __func__, \ |
46 | __FILE__, \ |
47 | static_cast<uint32_t>(__LINE__), \ |
48 | #cond, \ |
49 | ::c10::str(__VA_ARGS__)); \ |
50 | if (torch::profiler::impl::softAssertRaises()) { \ |
51 | TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__); \ |
52 | } else { \ |
53 | TORCH_WARN(__VA_ARGS__); \ |
54 | } \ |
55 | return false; \ |
56 | } \ |
57 | return true; \ |
58 | }() |
59 | |
60 | namespace torch { |
61 | namespace profiler { |
62 | namespace impl { |
63 | TORCH_API bool softAssertRaises(); |
64 | TORCH_API void setSoftAssertRaises(c10::optional<bool> value); |
65 | TORCH_API void logSoftAssert( |
66 | const char* func, |
67 | const char* file, |
68 | uint32_t line, |
69 | const char* cond, |
70 | const char* args); |
71 | TORCH_API inline void logSoftAssert( |
72 | const char* func, |
73 | const char* file, |
74 | uint32_t line, |
75 | const char* cond, |
76 | ::c10::detail::CompileTimeEmptyString args) { |
77 | logSoftAssert(func, file, line, cond, (const char*)args); |
78 | } |
79 | TORCH_API void logSoftAssert( |
80 | const char* func, |
81 | const char* file, |
82 | uint32_t line, |
83 | const char* cond, |
84 | const std::string& args); |
85 | |
86 | using time_t = int64_t; |
87 | using steady_clock_t = std::conditional< |
88 | std::chrono::high_resolution_clock::is_steady, |
89 | std::chrono::high_resolution_clock, |
90 | std::chrono::steady_clock>::type; |
91 | |
92 | inline time_t getTimeSinceEpoch() { |
93 | auto now = std::chrono::system_clock::now().time_since_epoch(); |
94 | return std::chrono::duration_cast<std::chrono::nanoseconds>(now).count(); |
95 | } |
96 | |
97 | inline time_t getTime(bool allow_monotonic = false) { |
98 | #if defined(C10_IOS) && defined(C10_MOBILE) |
99 | // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS |
100 | // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime |
101 | // is implemented or not |
102 | struct timeval now; |
103 | gettimeofday(&now, NULL); |
104 | return static_cast<time_t>(now.tv_sec) * 1000000000 + |
105 | static_cast<time_t>(now.tv_usec) * 1000; |
106 | #elif defined(_WIN32) || defined(__MACH__) |
107 | return std::chrono::duration_cast<std::chrono::nanoseconds>( |
108 | steady_clock_t::now().time_since_epoch()) |
109 | .count(); |
110 | #else |
111 | // clock_gettime is *much* faster than std::chrono implementation on Linux |
112 | struct timespec t {}; |
113 | auto mode = CLOCK_REALTIME; |
114 | if (allow_monotonic) { |
115 | mode = CLOCK_MONOTONIC; |
116 | } |
117 | clock_gettime(mode, &t); |
118 | return static_cast<time_t>(t.tv_sec) * 1000000000 + |
119 | static_cast<time_t>(t.tv_nsec); |
120 | #endif |
121 | } |
122 | |
123 | // We often do not need to capture true wall times. If a fast mechanism such |
124 | // as TSC is available we can use that instead and convert back to epoch time |
125 | // during post processing. This greatly reduce the clock's contribution to |
126 | // profiling. |
127 | // http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/ |
128 | // https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io |
129 | // TODO: We should use |
130 | // `https://github.com/google/benchmark/blob/main/src/cycleclock.h` |
131 | inline auto getApproximateTime() { |
132 | #if defined(C10_RDTSC) |
133 | return static_cast<uint64_t>(__rdtsc()); |
134 | #else |
135 | return getTime(); |
136 | #endif |
137 | } |
138 | |
139 | using approx_time_t = decltype(getApproximateTime()); |
140 | static_assert( |
141 | std::is_same<approx_time_t, int64_t>::value || |
142 | std::is_same<approx_time_t, uint64_t>::value, |
143 | "Expected either int64_t (`getTime`) or uint64_t (some TSC reads)." ); |
144 | |
145 | // Convert `getCount` results to Nanoseconds since unix epoch. |
146 | class ApproximateClockToUnixTimeConverter final { |
147 | public: |
148 | ApproximateClockToUnixTimeConverter(); |
149 | std::function<time_t(approx_time_t)> makeConverter(); |
150 | |
151 | struct UnixAndApproximateTimePair { |
152 | time_t t_; |
153 | approx_time_t approx_t_; |
154 | }; |
155 | static UnixAndApproximateTimePair measurePair(); |
156 | |
157 | private: |
158 | static constexpr size_t replicates = 1001; |
159 | using time_pairs = std::array<UnixAndApproximateTimePair, replicates>; |
160 | time_pairs measurePairs(); |
161 | |
162 | time_pairs start_times_; |
163 | }; |
164 | |
165 | std::string getNvtxStr( |
166 | const char* name, |
167 | int64_t sequence_nr, |
168 | const std::vector<std::vector<int64_t>>& shapes, |
169 | at::RecordFunctionHandle op_id = 0, |
170 | const std::list<std::pair<at::RecordFunctionHandle, int>>& input_op_ids = |
171 | {}); |
172 | |
173 | struct TORCH_API FileLineFunc { |
174 | std::string filename; |
175 | size_t line; |
176 | std::string funcname; |
177 | }; |
178 | |
179 | TORCH_API std::vector<FileLineFunc> prepareCallstack( |
180 | const std::vector<jit::StackEntry>& cs); |
181 | TORCH_API std::vector<std::string> callstackStr( |
182 | const std::vector<FileLineFunc>& cs); |
183 | TORCH_API std::string stacksToStr( |
184 | const std::vector<std::string>& stacks, |
185 | const char* delim); |
186 | TORCH_API std::vector<std::vector<int64_t>> inputSizes( |
187 | const at::RecordFunction& fn, |
188 | const bool flatten_list_enabled = false); |
189 | TORCH_API std::string shapesToStr( |
190 | const std::vector<std::vector<int64_t>>& shapes); |
191 | TORCH_API std::string dtypesToStr(const std::vector<std::string>& types); |
192 | TORCH_API std::string inputOpIdsToStr( |
193 | const std::list<std::pair<at::RecordFunctionHandle, int>>& input_op_ids); |
194 | TORCH_API std::vector<std::string> inputTypes(const at::RecordFunction& fn); |
195 | |
196 | std::unordered_map<std::string, c10::IValue> TORCH_API |
197 | (const at::RecordFunction& fn); |
198 | |
199 | uint64_t TORCH_API computeFlops( |
200 | const std::string& op_name, |
201 | const std::unordered_map<std::string, c10::IValue>& ); |
202 | |
203 | template <typename T> |
204 | class TORCH_API GlobalStateManager { |
205 | public: |
206 | static GlobalStateManager& singleton() { |
207 | static GlobalStateManager singleton_; |
208 | return singleton_; |
209 | } |
210 | |
211 | static void push(std::shared_ptr<T>&& state) { |
212 | if (singleton().state_) { |
213 | LOG(WARNING) << "GlobalStatePtr already exists!" ; |
214 | } else { |
215 | singleton().state_ = std::move(state); |
216 | } |
217 | } |
218 | |
219 | static auto* get() { |
220 | return singleton().state_.get(); |
221 | } |
222 | |
223 | static std::shared_ptr<T> pop() { |
224 | auto out = singleton().state_; |
225 | singleton().state_.reset(); |
226 | return out; |
227 | } |
228 | |
229 | private: |
230 | GlobalStateManager() = default; |
231 | |
232 | std::shared_ptr<T> state_; |
233 | }; |
234 | |
235 | struct HashCombine { |
236 | template <typename T0, typename T1> |
237 | size_t operator()(const std::pair<T0, T1>& i) { |
238 | return c10::get_hash((*this)(i.first), (*this)(i.second)); |
239 | } |
240 | |
241 | template <typename... Args> |
242 | size_t operator()(const std::tuple<Args...>& i) { |
243 | return c10::get_hash(i); |
244 | } |
245 | |
246 | template <typename T> |
247 | size_t operator()(const T& i) { |
248 | return c10::get_hash(i); |
249 | } |
250 | }; |
251 | |
252 | } // namespace impl |
253 | } // namespace profiler |
254 | } // namespace torch |
255 | |
256 | namespace torch { |
257 | namespace autograd { |
258 | namespace profiler { |
259 | using torch::profiler::impl::computeFlops; |
260 | using torch::profiler::impl::getTime; |
261 | } // namespace profiler |
262 | } // namespace autograd |
263 | } // namespace torch |
264 | |