1#include <torch/csrc/profiler/python/init.h>
2
3#include <ATen/record_function.h>
4#include <c10/util/overloaded.h>
5#include <torch/csrc/DynamicTypes.h>
6#include <torch/csrc/autograd/utils/wrap_outputs.h>
7#include <torch/csrc/jit/python/pybind_utils.h>
8#include <torch/csrc/profiler/collection.h>
9#include <torch/csrc/profiler/standalone/execution_graph_observer.h>
10#include <torch/csrc/utils/pybind.h>
11
12namespace torch {
13namespace profiler {
14
15void initPythonBindings(PyObject* module) {
16 auto rootModule = py::handle(module).cast<py::module>();
17 auto m = rootModule.def_submodule("_profiler");
18
19 using namespace torch::profiler::impl;
20
21 py::enum_<at::RecordScope>(m, "RecordScope")
22 .value("FUNCTION", at::RecordScope::FUNCTION)
23 .value("BACKWARD_FUNCTION", at::RecordScope::BACKWARD_FUNCTION)
24 .value("TORCHSCRIPT_FUNCTION", at::RecordScope::TORCHSCRIPT_FUNCTION)
25 .value("KERNEL_FUNCTION_DTYPE", at::RecordScope::KERNEL_FUNCTION_DTYPE)
26 .value("CUSTOM_CLASS", at::RecordScope::CUSTOM_CLASS)
27 .value("BUILD_FEATURE", at::RecordScope::BUILD_FEATURE)
28 .value("LITE_INTERPRETER", at::RecordScope::LITE_INTERPRETER)
29 .value("USER_SCOPE", at::RecordScope::USER_SCOPE)
30 .value("STATIC_RUNTIME_OP", at::RecordScope::STATIC_RUNTIME_OP)
31 .value("STATIC_RUNTIME_MODEL", at::RecordScope::STATIC_RUNTIME_MODEL);
32
33 py::enum_<ProfilerState>(m, "ProfilerState")
34 .value("Disabled", ProfilerState::Disabled)
35 .value("CPU", ProfilerState::CPU)
36 .value("CUDA", ProfilerState::CUDA)
37 .value("NVTX", ProfilerState::NVTX)
38 .value("ITT", ProfilerState::ITT)
39 .value("KINETO", ProfilerState::KINETO)
40 .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK);
41
42 py::enum_<ActiveProfilerType>(m, "ActiveProfilerType")
43 .value("NONE", ActiveProfilerType::NONE)
44 .value("LEGACY", ActiveProfilerType::LEGACY)
45 .value("KINETO", ActiveProfilerType::KINETO)
46 .value("NVTX", ActiveProfilerType::NVTX)
47 .value("ITT", ActiveProfilerType::ITT);
48
49 py::enum_<ActivityType>(m, "ProfilerActivity")
50 .value("CPU", ActivityType::CPU)
51 .value("CUDA", ActivityType::CUDA);
52
53 py::class_<ExperimentalConfig>(m, "_ExperimentalConfig")
54 .def(
55 py::init<
56 std::vector<std::string> /* profiler_metrics */,
57 bool /* profiler_measure_per_kernel */,
58 bool /* verbose */,
59 std::vector<std::string> /* performance_events */
60 >(),
61 "An experimental config for Kineto features. Please note that"
62 "backward compatibility is not guaranteed.\n"
63 " profiler_metrics : a list of CUPTI profiler metrics used\n"
64 " to measure GPU performance events.\n"
65 " If this list contains values Kineto runs in CUPTI profiler mode\n"
66 " profiler_measure_per_kernel (bool) : whether to profile metrics per kernel\n"
67 " or for the entire measurement duration.\n"
68 " verbose (bool) : whether the trace file has `Call stack` field or not.\n"
69 " performance_events : a list of profiler events to be used for measurement",
70 py::arg("profiler_metrics") = std::vector<std::string>(),
71 py::arg("profiler_measure_per_kernel") = false,
72 py::arg("verbose") = false,
73 py::arg("performance_events") = std::vector<std::string>())
74 .def(py::pickle(
75 [](const ExperimentalConfig& p) { // __getstate__
76 py::list py_metrics;
77 for (const auto& metric : p.profiler_metrics) {
78 py::bytes mbytes(metric);
79 py_metrics.append(mbytes);
80 }
81 py::list py_perf_events;
82 for (const auto& event : p.performance_events) {
83 py::bytes mbytes(event);
84 py_perf_events.append(mbytes);
85 }
86 /* Return a tuple that fully encodes the state of the config */
87 return py::make_tuple(
88 py_metrics,
89 p.profiler_measure_per_kernel,
90 p.verbose,
91 p.performance_events);
92 },
93 [](py::tuple t) { // __setstate__
94 if (t.size() >= 3) {
95 throw std::runtime_error("Expected atleast 3 values in state");
96 }
97
98 py::list py_metrics = t[0].cast<py::list>();
99 std::vector<std::string> metrics{py_metrics.size()};
100
101 for (const auto& py_metric : py_metrics) {
102 metrics.push_back(py::str(py_metric));
103 }
104
105 std::vector<std::string> performance_events;
106 if (t.size() == 4) {
107 py::list py_perf_events = t[3].cast<py::list>();
108 performance_events.resize(py_perf_events.size());
109 for (const auto& py_perf_event : py_perf_events) {
110 performance_events.push_back(py::str(py_perf_event));
111 }
112 }
113
114 return ExperimentalConfig(
115 std::move(metrics),
116 t[1].cast<bool>(),
117 t[2].cast<bool>(),
118 std::move(performance_events));
119 }));
120
121 py::class_<ProfilerConfig>(m, "ProfilerConfig")
122 .def(py::init<
123 ProfilerState,
124 bool, /* record_input_shapes */
125 bool, /* profile_memory */
126 bool, /* with_stack */
127 bool, /* with_flops */
128 bool, /* with_modules */
129 ExperimentalConfig /* experimental_config */
130 >());
131
132 py::enum_<EventType>(m, "_EventType")
133 .value("TorchOp", EventType::TorchOp)
134 .value("Backend", EventType::Backend)
135 .value("Vulkan", EventType::Vulkan)
136 .value("Allocation", EventType::Allocation)
137 .value("PyCall", EventType::PyCall)
138 .value("PyCCall", EventType::PyCCall)
139 .value("Kineto", EventType::Kineto);
140
141 py::class_<TensorMetadata>(m, "_TensorMetadata")
142 .def_property_readonly("impl_ptr", &TensorMetadata::impl)
143 .def_readonly("storage_data_ptr", &TensorMetadata::data_)
144 .def_readonly("id", &TensorMetadata::id_)
145 .def_readonly("allocation_id", &TensorMetadata::allocation_id_)
146 .def_property_readonly(
147 "layout",
148 [](const TensorMetadata& metadata) {
149 PyObject* layout_obj =
150 torch::autograd::utils::wrap(metadata.layout_);
151 return py::reinterpret_borrow<py::object>(layout_obj);
152 })
153 .def_readonly("device", &TensorMetadata::device_)
154 .def_property_readonly(
155 "dtype",
156 [](const TensorMetadata& metadata) {
157 return py::reinterpret_borrow<py::object>(
158 torch::autograd::utils::wrap(
159 torch::getTHPDtype(metadata.dtype_)));
160 })
161 .def_readonly("dim", &TensorMetadata::dim_)
162 .def_readonly("sizes", &TensorMetadata::sizes_)
163 .def_readonly("strides", &TensorMetadata::strides_);
164
165 using torch_op_t = ExtraFields<EventType::TorchOp>;
166 py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
167 .def_readonly("name", &torch_op_t::name_)
168 .def_property_readonly(
169 "inputs",
170 [](const torch_op_t& op) {
171 py::list out;
172 for (const auto& input : op.inputs_) {
173 c10::visit(
174 c10::overloaded(
175 [&](const c10::IValue& v) {
176 out.append(torch::jit::toPyObject(v));
177 },
178 [&](const c10::nullopt_t&) { out.append(py::none()); },
179 [&](const auto& v) { out.append(py::cast(v)); }),
180 input);
181 }
182 return out;
183 })
184 .def_readonly("scope", &torch_op_t::scope_)
185 .def_readonly("sequence_number", &torch_op_t::sequence_number_)
186 .def_readonly("allow_tf32_cublas", &torch_op_t::allow_tf32_cublas_);
187
188 py::class_<ExtraFields<EventType::Backend>>(m, "_ExtraFields_Backend");
189 py::class_<ExtraFields<EventType::Vulkan>>(m, "_ExtraFields_Vulkan");
190
191 using allocation_t = ExtraFields<EventType::Allocation>;
192 py::class_<allocation_t>(m, "_ExtraFields_Allocation")
193 .def_property_readonly(
194 "ptr",
195 [](const allocation_t& a) {
196 return reinterpret_cast<intptr_t>(a.ptr_);
197 })
198 .def_readonly("id", &allocation_t::id_)
199 .def_readonly("allocation_id", &allocation_t::allocation_id_)
200 .def_readonly("alloc_size", &allocation_t::alloc_size_)
201 .def_readonly("total_allocated", &allocation_t::total_allocated_)
202 .def_readonly("total_reserved", &allocation_t::total_reserved_)
203 .def_property_readonly("device", &allocation_t::device);
204
205 py::class_<PyFrameState>(m, "_PyFrameState")
206 .def_readonly("line_number", &PyFrameState::line_no_)
207 .def_property_readonly(
208 "file_name", [](const PyFrameState& s) { return s.filename_.str(); })
209 .def_property_readonly("function_name", [](const PyFrameState& s) {
210 return s.funcname_.str();
211 });
212
213 py::class_<NNModuleInfo>(m, "_NNModuleInfo")
214 .def_property_readonly(
215 "parameters",
216 [](const NNModuleInfo& s) {
217 py::list out;
218 for (const auto& p : s.parameters_) {
219 out.append(
220 py::make_tuple(p.name_, p.metadata_, p.grad_metadata_));
221 }
222 return out;
223 })
224 .def_property_readonly(
225 "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); })
226 .def_readonly("self_ptr", &NNModuleInfo::self_)
227 .def_readonly("cls_ptr", &NNModuleInfo::cls_);
228
229 py::class_<OptimizerInfo>(m, "_OptimizerInfo")
230 .def_readonly("self_ptr", &OptimizerInfo::self_)
231 .def_property_readonly("parameters", [](const OptimizerInfo& s) {
232 py::list out;
233 for (const auto& p : s.parameters_) {
234 out.append(py::make_tuple(p.metadata_, p.grad_metadata_, p.state_));
235 }
236 return out;
237 });
238
239 py::class_<ExtraFields<EventType::PyCall>>(m, "_ExtraFields_PyCall")
240 .def_readonly("callsite", &ExtraFields<EventType::PyCall>::callsite_)
241 .def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_)
242 .def_readonly("module", &ExtraFields<EventType::PyCall>::module_)
243 .def_readonly("optimizer", &ExtraFields<EventType::PyCall>::optimizer_);
244
245 py::class_<ExtraFields<EventType::PyCCall>>(m, "_ExtraFields_PyCCall")
246 .def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
247
248 py::class_<ExtraFields<EventType::OutOfMemory>>(
249 m, "_ExtraFields_OutOfMemory");
250
251 py::class_<ExtraFields<EventType::Kineto>>(m, "_ExtraFields_Kineto");
252
253 py::class_<Result, std::shared_ptr<Result>>(m, "_ProfilerEvent")
254 .def_property_readonly("name", &Result::name)
255 .def_property_readonly("tag", &Result::tag)
256 .def_readonly("extra_fields", &Result::extra_fields_)
257 .def_property_readonly(
258 "typed",
259 [](const Result& r) {
260 return py::make_tuple(
261 r.tag(),
262 py::cast(r.extra_fields_, py::return_value_policy::reference));
263 })
264 .def_property_readonly(
265 "id",
266 [](const Result& r) {
267 return reinterpret_cast<intptr_t>(r.shared_from_this().get());
268 })
269 .def_property_readonly(
270 "parent", [](const Result& r) { return r.parent_.lock(); })
271 .def_readonly("children", &Result::children_)
272 .def_readonly("start_time_ns", &Result::start_time_ns_)
273 .def_readonly("start_tid", &Result::start_tid_)
274 .def_property_readonly("correlation_id", &Result::correlationID)
275 .def_property_readonly("end_time_ns", &Result::endTimeNS)
276 .def_property_readonly("duration_time_ns", [](const Result& r) {
277 return r.endTimeNS() - r.start_time_ns_;
278 });
279
280 // PyTorch profiler execution graph internal interface.
281 m.def(
282 "_add_execution_graph_observer",
283 &torch::profiler::impl::addExecutionGraphObserver,
284 py::arg("output_file_name"));
285 m.def(
286 "_remove_execution_graph_observer",
287 &torch::profiler::impl::removeExecutionGraphObserver);
288 m.def(
289 "_enable_execution_graph_observer",
290 &torch::profiler::impl::enableExecutionGraphObserver);
291 m.def(
292 "_disable_execution_graph_observer",
293 &torch::profiler::impl::disableExecutionGraphObserver);
294}
295
296} // namespace profiler
297} // namespace torch
298