1 | #include <torch/csrc/profiler/python/init.h> |
2 | |
3 | #include <ATen/record_function.h> |
4 | #include <c10/util/overloaded.h> |
5 | #include <torch/csrc/DynamicTypes.h> |
6 | #include <torch/csrc/autograd/utils/wrap_outputs.h> |
7 | #include <torch/csrc/jit/python/pybind_utils.h> |
8 | #include <torch/csrc/profiler/collection.h> |
9 | #include <torch/csrc/profiler/standalone/execution_graph_observer.h> |
10 | #include <torch/csrc/utils/pybind.h> |
11 | |
12 | namespace torch { |
13 | namespace profiler { |
14 | |
15 | void initPythonBindings(PyObject* module) { |
16 | auto rootModule = py::handle(module).cast<py::module>(); |
17 | auto m = rootModule.def_submodule("_profiler" ); |
18 | |
19 | using namespace torch::profiler::impl; |
20 | |
21 | py::enum_<at::RecordScope>(m, "RecordScope" ) |
22 | .value("FUNCTION" , at::RecordScope::FUNCTION) |
23 | .value("BACKWARD_FUNCTION" , at::RecordScope::BACKWARD_FUNCTION) |
24 | .value("TORCHSCRIPT_FUNCTION" , at::RecordScope::TORCHSCRIPT_FUNCTION) |
25 | .value("KERNEL_FUNCTION_DTYPE" , at::RecordScope::KERNEL_FUNCTION_DTYPE) |
26 | .value("CUSTOM_CLASS" , at::RecordScope::CUSTOM_CLASS) |
27 | .value("BUILD_FEATURE" , at::RecordScope::BUILD_FEATURE) |
28 | .value("LITE_INTERPRETER" , at::RecordScope::LITE_INTERPRETER) |
29 | .value("USER_SCOPE" , at::RecordScope::USER_SCOPE) |
30 | .value("STATIC_RUNTIME_OP" , at::RecordScope::STATIC_RUNTIME_OP) |
31 | .value("STATIC_RUNTIME_MODEL" , at::RecordScope::STATIC_RUNTIME_MODEL); |
32 | |
33 | py::enum_<ProfilerState>(m, "ProfilerState" ) |
34 | .value("Disabled" , ProfilerState::Disabled) |
35 | .value("CPU" , ProfilerState::CPU) |
36 | .value("CUDA" , ProfilerState::CUDA) |
37 | .value("NVTX" , ProfilerState::NVTX) |
38 | .value("ITT" , ProfilerState::ITT) |
39 | .value("KINETO" , ProfilerState::KINETO) |
40 | .value("KINETO_GPU_FALLBACK" , ProfilerState::KINETO_GPU_FALLBACK); |
41 | |
42 | py::enum_<ActiveProfilerType>(m, "ActiveProfilerType" ) |
43 | .value("NONE" , ActiveProfilerType::NONE) |
44 | .value("LEGACY" , ActiveProfilerType::LEGACY) |
45 | .value("KINETO" , ActiveProfilerType::KINETO) |
46 | .value("NVTX" , ActiveProfilerType::NVTX) |
47 | .value("ITT" , ActiveProfilerType::ITT); |
48 | |
49 | py::enum_<ActivityType>(m, "ProfilerActivity" ) |
50 | .value("CPU" , ActivityType::CPU) |
51 | .value("CUDA" , ActivityType::CUDA); |
52 | |
53 | py::class_<ExperimentalConfig>(m, "_ExperimentalConfig" ) |
54 | .def( |
55 | py::init< |
56 | std::vector<std::string> /* profiler_metrics */, |
57 | bool /* profiler_measure_per_kernel */, |
58 | bool /* verbose */, |
59 | std::vector<std::string> /* performance_events */ |
60 | >(), |
61 | "An experimental config for Kineto features. Please note that" |
62 | "backward compatibility is not guaranteed.\n" |
63 | " profiler_metrics : a list of CUPTI profiler metrics used\n" |
64 | " to measure GPU performance events.\n" |
65 | " If this list contains values Kineto runs in CUPTI profiler mode\n" |
66 | " profiler_measure_per_kernel (bool) : whether to profile metrics per kernel\n" |
67 | " or for the entire measurement duration.\n" |
68 | " verbose (bool) : whether the trace file has `Call stack` field or not.\n" |
69 | " performance_events : a list of profiler events to be used for measurement" , |
70 | py::arg("profiler_metrics" ) = std::vector<std::string>(), |
71 | py::arg("profiler_measure_per_kernel" ) = false, |
72 | py::arg("verbose" ) = false, |
73 | py::arg("performance_events" ) = std::vector<std::string>()) |
74 | .def(py::pickle( |
75 | [](const ExperimentalConfig& p) { // __getstate__ |
76 | py::list py_metrics; |
77 | for (const auto& metric : p.profiler_metrics) { |
78 | py::bytes mbytes(metric); |
79 | py_metrics.append(mbytes); |
80 | } |
81 | py::list py_perf_events; |
82 | for (const auto& event : p.performance_events) { |
83 | py::bytes mbytes(event); |
84 | py_perf_events.append(mbytes); |
85 | } |
86 | /* Return a tuple that fully encodes the state of the config */ |
87 | return py::make_tuple( |
88 | py_metrics, |
89 | p.profiler_measure_per_kernel, |
90 | p.verbose, |
91 | p.performance_events); |
92 | }, |
93 | [](py::tuple t) { // __setstate__ |
94 | if (t.size() >= 3) { |
95 | throw std::runtime_error("Expected atleast 3 values in state" ); |
96 | } |
97 | |
98 | py::list py_metrics = t[0].cast<py::list>(); |
99 | std::vector<std::string> metrics{py_metrics.size()}; |
100 | |
101 | for (const auto& py_metric : py_metrics) { |
102 | metrics.push_back(py::str(py_metric)); |
103 | } |
104 | |
105 | std::vector<std::string> performance_events; |
106 | if (t.size() == 4) { |
107 | py::list py_perf_events = t[3].cast<py::list>(); |
108 | performance_events.resize(py_perf_events.size()); |
109 | for (const auto& py_perf_event : py_perf_events) { |
110 | performance_events.push_back(py::str(py_perf_event)); |
111 | } |
112 | } |
113 | |
114 | return ExperimentalConfig( |
115 | std::move(metrics), |
116 | t[1].cast<bool>(), |
117 | t[2].cast<bool>(), |
118 | std::move(performance_events)); |
119 | })); |
120 | |
121 | py::class_<ProfilerConfig>(m, "ProfilerConfig" ) |
122 | .def(py::init< |
123 | ProfilerState, |
124 | bool, /* record_input_shapes */ |
125 | bool, /* profile_memory */ |
126 | bool, /* with_stack */ |
127 | bool, /* with_flops */ |
128 | bool, /* with_modules */ |
129 | ExperimentalConfig /* experimental_config */ |
130 | >()); |
131 | |
132 | py::enum_<EventType>(m, "_EventType" ) |
133 | .value("TorchOp" , EventType::TorchOp) |
134 | .value("Backend" , EventType::Backend) |
135 | .value("Vulkan" , EventType::Vulkan) |
136 | .value("Allocation" , EventType::Allocation) |
137 | .value("PyCall" , EventType::PyCall) |
138 | .value("PyCCall" , EventType::PyCCall) |
139 | .value("Kineto" , EventType::Kineto); |
140 | |
141 | py::class_<TensorMetadata>(m, "_TensorMetadata" ) |
142 | .def_property_readonly("impl_ptr" , &TensorMetadata::impl) |
143 | .def_readonly("storage_data_ptr" , &TensorMetadata::data_) |
144 | .def_readonly("id" , &TensorMetadata::id_) |
145 | .def_readonly("allocation_id" , &TensorMetadata::allocation_id_) |
146 | .def_property_readonly( |
147 | "layout" , |
148 | [](const TensorMetadata& metadata) { |
149 | PyObject* layout_obj = |
150 | torch::autograd::utils::wrap(metadata.layout_); |
151 | return py::reinterpret_borrow<py::object>(layout_obj); |
152 | }) |
153 | .def_readonly("device" , &TensorMetadata::device_) |
154 | .def_property_readonly( |
155 | "dtype" , |
156 | [](const TensorMetadata& metadata) { |
157 | return py::reinterpret_borrow<py::object>( |
158 | torch::autograd::utils::wrap( |
159 | torch::getTHPDtype(metadata.dtype_))); |
160 | }) |
161 | .def_readonly("dim" , &TensorMetadata::dim_) |
162 | .def_readonly("sizes" , &TensorMetadata::sizes_) |
163 | .def_readonly("strides" , &TensorMetadata::strides_); |
164 | |
165 | using torch_op_t = ExtraFields<EventType::TorchOp>; |
166 | py::class_<torch_op_t>(m, "_ExtraFields_TorchOp" ) |
167 | .def_readonly("name" , &torch_op_t::name_) |
168 | .def_property_readonly( |
169 | "inputs" , |
170 | [](const torch_op_t& op) { |
171 | py::list out; |
172 | for (const auto& input : op.inputs_) { |
173 | c10::visit( |
174 | c10::overloaded( |
175 | [&](const c10::IValue& v) { |
176 | out.append(torch::jit::toPyObject(v)); |
177 | }, |
178 | [&](const c10::nullopt_t&) { out.append(py::none()); }, |
179 | [&](const auto& v) { out.append(py::cast(v)); }), |
180 | input); |
181 | } |
182 | return out; |
183 | }) |
184 | .def_readonly("scope" , &torch_op_t::scope_) |
185 | .def_readonly("sequence_number" , &torch_op_t::sequence_number_) |
186 | .def_readonly("allow_tf32_cublas" , &torch_op_t::allow_tf32_cublas_); |
187 | |
188 | py::class_<ExtraFields<EventType::Backend>>(m, "_ExtraFields_Backend" ); |
189 | py::class_<ExtraFields<EventType::Vulkan>>(m, "_ExtraFields_Vulkan" ); |
190 | |
191 | using allocation_t = ExtraFields<EventType::Allocation>; |
192 | py::class_<allocation_t>(m, "_ExtraFields_Allocation" ) |
193 | .def_property_readonly( |
194 | "ptr" , |
195 | [](const allocation_t& a) { |
196 | return reinterpret_cast<intptr_t>(a.ptr_); |
197 | }) |
198 | .def_readonly("id" , &allocation_t::id_) |
199 | .def_readonly("allocation_id" , &allocation_t::allocation_id_) |
200 | .def_readonly("alloc_size" , &allocation_t::alloc_size_) |
201 | .def_readonly("total_allocated" , &allocation_t::total_allocated_) |
202 | .def_readonly("total_reserved" , &allocation_t::total_reserved_) |
203 | .def_property_readonly("device" , &allocation_t::device); |
204 | |
205 | py::class_<PyFrameState>(m, "_PyFrameState" ) |
206 | .def_readonly("line_number" , &PyFrameState::line_no_) |
207 | .def_property_readonly( |
208 | "file_name" , [](const PyFrameState& s) { return s.filename_.str(); }) |
209 | .def_property_readonly("function_name" , [](const PyFrameState& s) { |
210 | return s.funcname_.str(); |
211 | }); |
212 | |
213 | py::class_<NNModuleInfo>(m, "_NNModuleInfo" ) |
214 | .def_property_readonly( |
215 | "parameters" , |
216 | [](const NNModuleInfo& s) { |
217 | py::list out; |
218 | for (const auto& p : s.parameters_) { |
219 | out.append( |
220 | py::make_tuple(p.name_, p.metadata_, p.grad_metadata_)); |
221 | } |
222 | return out; |
223 | }) |
224 | .def_property_readonly( |
225 | "cls_name" , [](const NNModuleInfo& s) { return s.cls_name_.str(); }) |
226 | .def_readonly("self_ptr" , &NNModuleInfo::self_) |
227 | .def_readonly("cls_ptr" , &NNModuleInfo::cls_); |
228 | |
229 | py::class_<OptimizerInfo>(m, "_OptimizerInfo" ) |
230 | .def_readonly("self_ptr" , &OptimizerInfo::self_) |
231 | .def_property_readonly("parameters" , [](const OptimizerInfo& s) { |
232 | py::list out; |
233 | for (const auto& p : s.parameters_) { |
234 | out.append(py::make_tuple(p.metadata_, p.grad_metadata_, p.state_)); |
235 | } |
236 | return out; |
237 | }); |
238 | |
239 | py::class_<ExtraFields<EventType::PyCall>>(m, "_ExtraFields_PyCall" ) |
240 | .def_readonly("callsite" , &ExtraFields<EventType::PyCall>::callsite_) |
241 | .def_readonly("caller" , &ExtraFields<EventType::PyCall>::caller_) |
242 | .def_readonly("module" , &ExtraFields<EventType::PyCall>::module_) |
243 | .def_readonly("optimizer" , &ExtraFields<EventType::PyCall>::optimizer_); |
244 | |
245 | py::class_<ExtraFields<EventType::PyCCall>>(m, "_ExtraFields_PyCCall" ) |
246 | .def_readonly("caller" , &ExtraFields<EventType::PyCall>::caller_); |
247 | |
248 | py::class_<ExtraFields<EventType::OutOfMemory>>( |
249 | m, "_ExtraFields_OutOfMemory" ); |
250 | |
251 | py::class_<ExtraFields<EventType::Kineto>>(m, "_ExtraFields_Kineto" ); |
252 | |
253 | py::class_<Result, std::shared_ptr<Result>>(m, "_ProfilerEvent" ) |
254 | .def_property_readonly("name" , &Result::name) |
255 | .def_property_readonly("tag" , &Result::tag) |
256 | .def_readonly("extra_fields" , &Result::extra_fields_) |
257 | .def_property_readonly( |
258 | "typed" , |
259 | [](const Result& r) { |
260 | return py::make_tuple( |
261 | r.tag(), |
262 | py::cast(r.extra_fields_, py::return_value_policy::reference)); |
263 | }) |
264 | .def_property_readonly( |
265 | "id" , |
266 | [](const Result& r) { |
267 | return reinterpret_cast<intptr_t>(r.shared_from_this().get()); |
268 | }) |
269 | .def_property_readonly( |
270 | "parent" , [](const Result& r) { return r.parent_.lock(); }) |
271 | .def_readonly("children" , &Result::children_) |
272 | .def_readonly("start_time_ns" , &Result::start_time_ns_) |
273 | .def_readonly("start_tid" , &Result::start_tid_) |
274 | .def_property_readonly("correlation_id" , &Result::correlationID) |
275 | .def_property_readonly("end_time_ns" , &Result::endTimeNS) |
276 | .def_property_readonly("duration_time_ns" , [](const Result& r) { |
277 | return r.endTimeNS() - r.start_time_ns_; |
278 | }); |
279 | |
280 | // PyTorch profiler execution graph internal interface. |
281 | m.def( |
282 | "_add_execution_graph_observer" , |
283 | &torch::profiler::impl::addExecutionGraphObserver, |
284 | py::arg("output_file_name" )); |
285 | m.def( |
286 | "_remove_execution_graph_observer" , |
287 | &torch::profiler::impl::removeExecutionGraphObserver); |
288 | m.def( |
289 | "_enable_execution_graph_observer" , |
290 | &torch::profiler::impl::enableExecutionGraphObserver); |
291 | m.def( |
292 | "_disable_execution_graph_observer" , |
293 | &torch::profiler::impl::disableExecutionGraphObserver); |
294 | } |
295 | |
296 | } // namespace profiler |
297 | } // namespace torch |
298 | |