1 | #pragma once |
2 | |
3 | #include <string> |
4 | #include <vector> |
5 | |
6 | #include <torch/csrc/profiler/api.h> |
7 | #include <torch/csrc/profiler/events.h> |
8 | #include <torch/csrc/profiler/stubs/base.h> |
9 | #include <torch/csrc/profiler/util.h> |
10 | |
11 | namespace torch { |
12 | namespace profiler { |
13 | namespace impl { |
14 | struct Result; |
15 | namespace kineto { |
16 | struct ActivityTraceWrapper; |
17 | } // namespace kineto |
18 | } // namespace impl |
19 | } // namespace profiler |
20 | namespace autograd { |
21 | namespace profiler { |
22 | using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>; |
23 | |
24 | struct TORCH_API KinetoEvent { |
25 | KinetoEvent( |
26 | std::shared_ptr<const torch::profiler::impl::Result>, |
27 | const bool verbose); |
28 | |
29 | uint64_t startThreadId() const; |
30 | uint64_t endThreadId() const; |
31 | uint8_t activityType() const; |
32 | uint64_t fwdThreadId() const; |
33 | bool hasShapes() const; |
34 | const c10::ArrayRef<std::vector<int64_t>> shapes() const; |
35 | bool hasTypes() const; |
36 | const c10::ArrayRef<std::string> dtypes() const; |
37 | uint64_t flops() const; |
38 | int64_t sequenceNr() const; |
39 | bool hasStack() const; |
40 | const c10::ArrayRef<std::string> stack() const; |
41 | uint8_t scope() const; |
42 | bool hasModuleHierarchy() const; |
43 | const c10::ArrayRef<std::string> moduleHierarchy() const; |
44 | int64_t debugHandle() const; |
45 | std::string name() const; |
46 | c10::DeviceType deviceType() const; |
47 | uint8_t deviceIndex() const; |
48 | int64_t nBytes() const; |
49 | uint64_t startUs() const; |
50 | uint64_t durationUs() const; |
51 | bool isAsync() const; |
52 | uint64_t correlationId() const; |
53 | uint64_t linkedCorrelationId() const; |
54 | int64_t deviceResourceId() const; |
55 | std::string backend() const; |
56 | bool isPythonFunction() const; |
57 | int64_t cudaElapsedUs() const; |
58 | void getPerfEventCounters(torch::profiler::perf_counters_t&) const; |
59 | |
60 | private: |
61 | torch::profiler::impl::ProfilerEventStub fallbackStart() const; |
62 | torch::profiler::impl::ProfilerEventStub fallbackEnd() const; |
63 | |
64 | std::shared_ptr<const torch::profiler::impl::Result> result_; |
65 | std::vector<std::string> python_stack_; |
66 | |
67 | // Copy fields from result so we can return ArrayRefs. |
68 | std::vector<std::vector<int64_t>> shapes_; |
69 | std::vector<std::string> dtypes_; |
70 | }; |
71 | |
72 | // Consolidating events returned directly from Kineto |
73 | // with events manually created by us (e.g. start/stop marks, |
74 | // memory allocation events) |
75 | struct TORCH_API ProfilerResult { |
76 | ProfilerResult(); |
77 | ProfilerResult( |
78 | uint64_t start_time, |
79 | std::vector<KinetoEvent> events, |
80 | std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&& |
81 | trace, |
82 | std::vector<experimental_event_t>&& event_tree); |
83 | ~ProfilerResult(); |
84 | |
85 | uint64_t trace_start_us() const { |
86 | return trace_start_us_; |
87 | } |
88 | |
89 | const std::vector<KinetoEvent>& events() const { |
90 | return events_; |
91 | } |
92 | |
93 | const std::vector<experimental_event_t>& event_tree() const { |
94 | return event_tree_; |
95 | } |
96 | |
97 | void save(const std::string& path); |
98 | |
99 | private: |
100 | uint64_t trace_start_us_ = 0; |
101 | std::vector<KinetoEvent> events_; |
102 | std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_; |
103 | std::vector<experimental_event_t> event_tree_; |
104 | }; |
105 | |
106 | /* |
107 | * This API is used by backends to record latency of events that |
108 | * happened in the backend but were not visible to pytorch runtime. |
109 | * For example, if part of the model is lowered to a dsp backend, then |
110 | * the execution of that part of the model is delegated to the backend. |
111 | * When backend finishes execution it has an option to provide profiling |
112 | * information (latency only at th emoment) corresponding to different operators |
113 | * that were executed in the backend. |
114 | * When such events are recorded by backend using this API, the event |
115 | * records will be collected by active kineto profiler. If no kineto profiler |
116 | * is active then the event is ignored. |
117 | * This provides us with a way to generate all the profiling information |
118 | * for a model regardless of where model (or part of it) executed. |
119 | * @param start_time_us: start time in us of the event |
120 | * @param end_time_us: end time in us of the event |
121 | * @param debug_handle: debug handle to correlate this event/op with |
122 | * model level module/source information |
123 | * @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc. |
124 | * @param event_name: name of the event, e.g. op name |
125 | * @param backend_name: name of the backend where the event took place. |
126 | */ |
127 | TORCH_API void reportBackendEventToActiveKinetoProfiler( |
128 | const int64_t start_time_us, |
129 | const int64_t end_time_us, |
130 | const int64_t debug_handle, |
131 | const at::RecordScope scope, |
132 | const std::string& event_name, |
133 | const std::string& backend_name); |
134 | |
135 | TORCH_API void enableProfiler( |
136 | const torch::profiler::impl::ProfilerConfig& config, |
137 | const std::set<torch::profiler::impl::ActivityType>& activities, |
138 | const std::unordered_set<at::RecordScope>& scopes = {}); |
139 | |
140 | /* |
141 | * Same as enableProfiler but with callback to do post-processing of |
142 | * KinetoEvents. |
143 | * enableProfilerWithEventPostProcess enables profiler to capture |
144 | * specified activities, with specified RecordFunction scope, if any. |
145 | * Additionally, it takes a functor that does in-place post processing of |
146 | * events, e.g. populate stack trace or module hierarchy information lazily |
147 | * using debug_handle. |
148 | * Example usage is with lite interpreter that has recording scope of |
149 | * LITE_INTERPRETER. In this case lite interpreter runtime, records debug |
150 | * handles in RecordFunction, along with other information. Debug handles are |
151 | * eventually passed down to KinetoEvent and recorded as part of the event. |
152 | * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables |
153 | * profiler using post-processing callback, via |
154 | * enableProfilerWithEventPostProcess, that takes these debug handles and |
155 | * generates stack trace and module hierarchy information, once profiling is |
156 | * done. |
157 | */ |
158 | using post_process_t = std::function<void( |
159 | /*debug_handle */ int64_t, |
160 | /*jit_stack */ std::vector<std::string>&, |
161 | /*jit_modules */ std::vector<std::string>&)>; |
162 | TORCH_API void enableProfilerWithEventPostProcess( |
163 | const torch::profiler::impl::ProfilerConfig& config, |
164 | const std::set<torch::profiler::impl::ActivityType>& activities, |
165 | post_process_t&& cb, |
166 | const std::unordered_set<at::RecordScope>& scopes = {}); |
167 | |
168 | TORCH_API std::unique_ptr<ProfilerResult> disableProfiler(); |
169 | |
170 | TORCH_API void prepareProfiler( |
171 | const torch::profiler::impl::ProfilerConfig& config, |
172 | const std::set<torch::profiler::impl::ActivityType>& activities); |
173 | |
174 | } // namespace profiler |
175 | } // namespace autograd |
176 | |
177 | namespace profiler { |
178 | namespace impl { |
179 | |
180 | // Experimental. |
181 | TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id); |
182 | |
183 | } // namespace impl |
184 | } // namespace profiler |
185 | |
186 | } // namespace torch |
187 | |