1#pragma once
2
3#include <string>
4#include <vector>
5
6#include <torch/csrc/profiler/api.h>
7#include <torch/csrc/profiler/events.h>
8#include <torch/csrc/profiler/stubs/base.h>
9#include <torch/csrc/profiler/util.h>
10
11namespace torch {
12namespace profiler {
13namespace impl {
14struct Result;
15namespace kineto {
16struct ActivityTraceWrapper;
17} // namespace kineto
18} // namespace impl
19} // namespace profiler
20namespace autograd {
21namespace profiler {
// Shared handle to a raw profiler event record; ProfilerResult::event_tree()
// exposes a collection of these (see below).
using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>;
23
// A single profiling event, backed by a torch::profiler::impl::Result.
// The accessors project fields out of the wrapped result; shapes and dtypes
// are copied into member vectors so they can be handed out as ArrayRefs
// without dangling (see shapes_ / dtypes_ below).
struct TORCH_API KinetoEvent {
  // NOTE(review): `verbose` presumably controls whether python_stack_ is
  // populated — confirm against the definition in the .cpp.
  KinetoEvent(
      std::shared_ptr<const torch::profiler::impl::Result>,
      const bool verbose);

  // Thread / activity identification.
  uint64_t startThreadId() const;
  uint64_t endThreadId() const;
  uint8_t activityType() const;
  uint64_t fwdThreadId() const;
  // Optional payloads: check the matching has*() predicate before use.
  bool hasShapes() const;
  const c10::ArrayRef<std::vector<int64_t>> shapes() const;
  bool hasTypes() const;
  const c10::ArrayRef<std::string> dtypes() const;
  uint64_t flops() const;
  int64_t sequenceNr() const;
  bool hasStack() const;
  const c10::ArrayRef<std::string> stack() const;
  uint8_t scope() const;
  bool hasModuleHierarchy() const;
  const c10::ArrayRef<std::string> moduleHierarchy() const;
  int64_t debugHandle() const;
  std::string name() const;
  // Device placement and timing (times in microseconds, per the Us suffix).
  c10::DeviceType deviceType() const;
  uint8_t deviceIndex() const;
  int64_t nBytes() const;
  uint64_t startUs() const;
  uint64_t durationUs() const;
  bool isAsync() const;
  uint64_t correlationId() const;
  uint64_t linkedCorrelationId() const;
  int64_t deviceResourceId() const;
  std::string backend() const;
  bool isPythonFunction() const;
  int64_t cudaElapsedUs() const;
  // Writes hardware performance-counter values into the out-parameter.
  void getPerfEventCounters(torch::profiler::perf_counters_t&) const;

 private:
  torch::profiler::impl::ProfilerEventStub fallbackStart() const;
  torch::profiler::impl::ProfilerEventStub fallbackEnd() const;

  // The underlying event record (shared with the owning ProfilerResult).
  std::shared_ptr<const torch::profiler::impl::Result> result_;
  std::vector<std::string> python_stack_;

  // Copy fields from result so we can return ArrayRefs.
  std::vector<std::vector<int64_t>> shapes_;
  std::vector<std::string> dtypes_;
};
71
// Consolidates events returned directly from Kineto with events manually
// created by us (e.g. start/stop marks, memory allocation events).
struct TORCH_API ProfilerResult {
  ProfilerResult();
  ProfilerResult(
      uint64_t start_time,
      std::vector<KinetoEvent> events,
      std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
          trace,
      std::vector<experimental_event_t>&& event_tree);
  ~ProfilerResult();

  // Start time of the trace, in microseconds.
  uint64_t trace_start_us() const {
    return trace_start_us_;
  }

  // Flat list of consolidated events.
  const std::vector<KinetoEvent>& events() const {
    return events_;
  }

  // Raw profiler event records (see experimental_event_t).
  const std::vector<experimental_event_t>& event_tree() const {
    return event_tree_;
  }

  // Saves the result (presumably the wrapped Kineto activity trace — see
  // trace_) to `path`.
  void save(const std::string& path);

 private:
  uint64_t trace_start_us_ = 0;
  std::vector<KinetoEvent> events_;
  std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_;
  std::vector<experimental_event_t> event_tree_;
};
105
106/*
107 * This API is used by backends to record latency of events that
108 * happened in the backend but were not visible to pytorch runtime.
109 * For example, if part of the model is lowered to a dsp backend, then
110 * the execution of that part of the model is delegated to the backend.
111 * When backend finishes execution it has an option to provide profiling
112 * information (latency only at th emoment) corresponding to different operators
113 * that were executed in the backend.
114 * When such events are recorded by backend using this API, the event
115 * records will be collected by active kineto profiler. If no kineto profiler
116 * is active then the event is ignored.
117 * This provides us with a way to generate all the profiling information
118 * for a model regardless of where model (or part of it) executed.
119 * @param start_time_us: start time in us of the event
120 * @param end_time_us: end time in us of the event
121 * @param debug_handle: debug handle to correlate this event/op with
122 * model level module/source information
123 * @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
124 * @param event_name: name of the event, e.g. op name
125 * @param backend_name: name of the backend where the event took place.
126 */
127TORCH_API void reportBackendEventToActiveKinetoProfiler(
128 const int64_t start_time_us,
129 const int64_t end_time_us,
130 const int64_t debug_handle,
131 const at::RecordScope scope,
132 const std::string& event_name,
133 const std::string& backend_name);
134
/*
 * Starts profiling with the given configuration, capturing the requested
 * activity types. `scopes` optionally restricts which RecordFunction scopes
 * are observed; it defaults to empty (NOTE(review): the semantics of an
 * empty set are defined in the .cpp — confirm there).
 */
TORCH_API void enableProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    const std::unordered_set<at::RecordScope>& scopes = {});
139
140/*
141 * Same as enableProfiler but with callback to do post-processing of
142 * KinetoEvents.
143 * enableProfilerWithEventPostProcess enables profiler to capture
144 * specified activities, with specified RecordFunction scope, if any.
145 * Additionally, it takes a functor that does in-place post processing of
146 * events, e.g. populate stack trace or module hierarchy information lazily
147 * using debug_handle.
148 * Example usage is with lite interpreter that has recording scope of
149 * LITE_INTERPRETER. In this case lite interpreter runtime, records debug
150 * handles in RecordFunction, along with other information. Debug handles are
151 * eventually passed down to KinetoEvent and recorded as part of the event.
152 * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
153 * profiler using post-processing callback, via
154 * enableProfilerWithEventPostProcess, that takes these debug handles and
155 * generates stack trace and module hierarchy information, once profiling is
156 * done.
157 */
158using post_process_t = std::function<void(
159 /*debug_handle */ int64_t,
160 /*jit_stack */ std::vector<std::string>&,
161 /*jit_modules */ std::vector<std::string>&)>;
162TORCH_API void enableProfilerWithEventPostProcess(
163 const torch::profiler::impl::ProfilerConfig& config,
164 const std::set<torch::profiler::impl::ActivityType>& activities,
165 post_process_t&& cb,
166 const std::unordered_set<at::RecordScope>& scopes = {});
167
// Stops the active profiler and returns ownership of the collected result.
TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
169
// Performs setup ahead of enableProfiler for the given config/activities
// (NOTE(review): inferred from the name — confirm exact warm-up behavior
// against the definition in the .cpp).
TORCH_API void prepareProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities);
173
174} // namespace profiler
175} // namespace autograd
176
177namespace profiler {
178namespace impl {
179
// Experimental. Reports a Vulkan event, identified by `id`, to the profiler
// (NOTE(review): behavior when no profiler is active is defined in the
// .cpp — confirm there).
TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id);
182
183} // namespace impl
184} // namespace profiler
185
186} // namespace torch
187