#include <cstring>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <torch/csrc/autograd/profiler_kineto.h>

#include <c10/macros/Export.h>
#include <c10/util/C++17.h>
#include <c10/util/Exception.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <c10/util/overloaded.h>
#include <c10/util/variant.h>

#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/observer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/standalone/itt_observer.h>
#include <torch/csrc/profiler/standalone/nvtx_observer.h>
#include <torch/csrc/profiler/util.h>

#include <ATen/Context.h>

#include <deque>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <utility>

#ifdef USE_KINETO
#include <libkineto.h>
#include <time_since_epoch.h>

#ifndef _MSC_VER
// TODO: To be removed once this properly works from libkineto
// Literal copy-n-paste from third_party/kineto/libkineto/src/WeakSymbols.cpp
extern "C" {
// This function is needed to avoid a superfluous dependency on the GNU OpenMP
// library when cuPTI is linked statically. For more details see
// https://github.com/pytorch/pytorch/issues/51026
__attribute__((weak)) int acc_get_device_type() {
  throw std::runtime_error(
      "Dummy implementation of acc_get_device_type is not supposed to be called!");
}
} // extern "C"
#endif // _MSC_VER
#endif // USE_KINETO

namespace torch {
namespace autograd {
namespace profiler {

namespace {
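// Returns the current wall-clock time in microseconds, using libkineto's
// clock when Kineto is available so these timestamps stay consistent with the
// ones recorded in the Kineto trace.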
inline int64_t getTimeUs() {
#ifdef USE_KINETO
  return libkineto::timeSinceEpoch(std::chrono::system_clock::now());
#else
  return torch::profiler::impl::getTime() / 1000;
#endif // USE_KINETO
}

using torch::profiler::impl::ActiveProfilerType;
using torch::profiler::impl::dtypesToStr;
using torch::profiler::impl::EventType;
using torch::profiler::impl::ExtraFields;
using torch::profiler::impl::op_input_t;
using torch::profiler::impl::ProfilerStateBase;
using torch::profiler::impl::PyExtraFieldsBase;
using torch::profiler::impl::Result;
using torch::profiler::impl::shapesToStr;
using torch::profiler::impl::stacksToStr;
using torch::profiler::impl::TensorMetadata;

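// Flattens the recorded op inputs into parallel vectors of shapes and dtype
// names. Tensor lists and scalars get sentinel entries so the two vectors
// stay aligned with the inputs.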
auto shapesAndDtypes(const std::vector<op_input_t>& inputs) {
  std::vector<std::vector<int64_t>> shapes;
  std::vector<std::string> dtypes;
  for (const auto& i : inputs) {
    c10::visit(
        c10::overloaded(
            [&](const TensorMetadata& t) {
              shapes.emplace_back(t.sizes_);
              dtypes.emplace_back(scalarTypeToTypeMeta(t.dtype_).name());
            },
            [&](const std::vector<TensorMetadata>&) {
              shapes.emplace_back();
              dtypes.emplace_back("TensorList");
            },
            [&](const c10::IValue&) {
              shapes.emplace_back();
              dtypes.emplace_back("Scalar");
            },
            [&](const auto&) {
              shapes.emplace_back();
              dtypes.emplace_back();
            }),
        i);
  }
  return std::make_pair(shapes, dtypes);
}

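// Small helper for attaching key/value metadata to the Kineto activity that
// backs a Result (if there is one).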
struct MetadataBase {
  MetadataBase(const std::shared_ptr<Result>& result)
      : kineto_activity_{result->kineto_activity_} {
    if (c10::holds_alternative<ExtraFields<EventType::Kineto>>(
            result->extra_fields_)) {
      // In order to add metadata we have to downcast from
      // `libkineto::ITraceActivity` to `libkineto::GenericTraceActivity`. We
      // know that all activities provided by PyTorch are of the correct type;
      // however, Kineto profilers can (and do) add events that inherit
      // directly from ITraceActivity. As a result, any Result which was
      // constructed from an event that Kineto provided is unsafe to cast.
      if (!(SOFT_ASSERT(!hasKinetoActivity()))) {
        result->kineto_activity_ = nullptr;
      }
      kineto_activity_ = result->kineto_activity_;
    }
  }

  void addMetadata(const std::string& key, const std::string& value) {
    if (kineto_activity_ && !value.empty() && value != "\"\"") {
      torch::profiler::impl::kineto::addMetadata(kineto_activity_, key, value);
    }
  }

  bool hasKinetoActivity() const {
    return kineto_activity_ != nullptr;
  }

 private:
  const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr};
};

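// Visitor that adds the metadata consumed by TensorBoard: module hierarchy,
// call stack, and Python frame ids (including the id of the parent Python
// frame, found by walking up the parent chain).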
struct AddTensorboardFields : public MetadataBase {
  AddTensorboardFields(
      const std::shared_ptr<Result>& result,
      KinetoEvent& kineto_event)
      : MetadataBase(result) {
    result->visit(*this);
    const auto module_hierarchy = kineto_event.moduleHierarchy();
    addMetadata("Module Hierarchy", stacksToStr(module_hierarchy.vec(), "."));
    addMetadata("Call stack", stacksToStr(kineto_event.stack().vec(), ";"));

    result->visit_if_base<PyExtraFieldsBase>([&, this](const auto& i) -> void {
      this->addMetadata("Python id", std::to_string(i.id_));

      c10::optional<std::string> parent_id;
      std::shared_ptr<Result> parent = result->parent_.lock();
      while (parent && !parent_id.has_value()) {
        parent->visit_if_base<PyExtraFieldsBase>(
            [&](const auto& j) { parent_id = std::to_string(j.id_); });
        parent = parent->parent_.lock();
      }
      this->addMetadata("Python parent id", parent_id.value_or("null"));
    });
  }

  void operator()(const ExtraFields<EventType::PyCall>& py_call) {
    if (py_call.module_.has_value()) {
      addMetadata("Python module id", std::to_string(py_call.module_->id_));
    }
  }

  template <typename T>
  void operator()(const T&) {}
};

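// Visitor that attaches general-purpose metadata: input shapes and dtypes,
// performance counter values, and the forward thread id / sequence number
// used for forward-backward correlation.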
struct AddGenericMetadata : public MetadataBase {
  AddGenericMetadata(
      std::shared_ptr<Result>& result,
      const torch::profiler::impl::ProfilerConfig* config)
      : MetadataBase(result), config_(config) {
    result->visit(*this);
    if (config->experimental_config.verbose) {
      result->visit_if_base<PyExtraFieldsBase>(
          [&, this](const auto& i) -> void {
            this->addMetadata("Python thread", std::to_string(i.python_tid_));
          });
    }
  }

  void operator()(ExtraFields<EventType::TorchOp>& op_event) {
    const auto shapes_and_dtypes = shapesAndDtypes(op_event.inputs_);
    if (!shapes_and_dtypes.first.empty()) {
      addMetadata("Input Dims", shapesToStr(shapes_and_dtypes.first));
    }

    if (!shapes_and_dtypes.second.empty()) {
      addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second));
    }

    if (config_ && !config_->experimental_config.performance_events.empty()) {
      auto& event_names = config_->experimental_config.performance_events;
      for (auto i = 0; i < op_event.perf_event_counters_->size(); ++i) {
        addMetadata(
            event_names[i],
            std::to_string((*op_event.perf_event_counters_)[i]));
      }
    }

    // add information about an associated forward op, if a sequence number
    // is available (e.g. during training)
    if (op_event.sequence_number_ >= 0) {
      addMetadata("Fwd thread id", std::to_string(op_event.forward_tid_));
      addMetadata(
          "Sequence number", std::to_string(op_event.sequence_number_));
    }
  }

  void operator()(ExtraFields<EventType::Backend>& backend_event) {
    if (!backend_event.backend_.empty()) {
      addMetadata("Backend", "\"" + backend_event.backend_ + "\"");
    }
  }

  void operator()(const ExtraFields<EventType::Allocation>& alloc) {
    addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_));
    addMetadata("Device Id", std::to_string(alloc.device_index_));
    addMetadata("Addr", std::to_string(reinterpret_cast<intptr_t>(alloc.ptr_)));
    addMetadata("Bytes", std::to_string(alloc.alloc_size_));
    addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
    addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
  }

  void operator()(const ExtraFields<EventType::OutOfMemory>& alloc) {
    addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_));
    addMetadata("Device Id", std::to_string(alloc.device_index_));
    addMetadata("Bytes", std::to_string(alloc.alloc_size_));
    addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
    addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
  }

  template <typename T>
  void operator()(const T&) {}

 private:
  /* To get names of the performance events */
  const torch::profiler::impl::ProfilerConfig* config_;
};

// Assumption: the total number of threads will not exceed 2^16 - 1, and the
// total number of ops will not exceed 2^48 - 1.
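// For example, tid = 3 and seqNr = 5 pack into the key 0x0003000000000005.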
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
  return (((tid) << 48) | ((seqNr) & (((uint64_t)1 << 48) - 1)));
}

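// Profiler state for the Kineto profiler. It owns the RecordQueue that
// collects events and, on finalization, converts them into KinetoEvents plus
// the Kineto activity trace.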
struct KinetoThreadLocalState : public ProfilerStateBase {
  explicit KinetoThreadLocalState(
      const ProfilerConfig& config,
      std::set<torch::profiler::impl::ActivityType> activities)
      : ProfilerStateBase(config),
        start_time_(getTimeUs()),
        record_queue_(config, std::move(activities)) {}
  ~KinetoThreadLocalState() override = default;

  static KinetoThreadLocalState* get(bool global) {
    auto* state = ProfilerStateBase::get(/*global=*/global);
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        state == nullptr ||
        state->profilerType() == ActiveProfilerType::KINETO);
    return static_cast<KinetoThreadLocalState*>(state);
  }

  ActiveProfilerType profilerType() override {
    return ActiveProfilerType::KINETO;
  }

  void reportVulkanEventToProfiler(torch::profiler::impl::vulkan_id_t id) {
    if (!config_.disabled()) {
      record_queue_.getSubqueue()->emplace_vulkan_event(
          torch::profiler::impl::getApproximateTime(), id);
    }
  }

  void reportMemoryUsage(
      void* ptr,
      int64_t alloc_size,
      size_t total_allocated,
      size_t total_reserved,
      c10::Device device) override {
    if (config_.profile_memory && !config_.disabled()) {
      record_queue_.getSubqueue()->emplace_allocation_event(
          torch::profiler::impl::getApproximateTime(),
          ptr,
          alloc_size,
          total_allocated,
          total_reserved,
          device.type(),
          device.index());
    }
  }

  void reportOutOfMemory(
      int64_t alloc_size,
      size_t total_allocated,
      size_t total_reserved,
      c10::Device device) override {
    if (config_.profile_memory && !config_.disabled()) {
      record_queue_.getSubqueue()->emplace_ooms_event(
          torch::profiler::impl::getApproximateTime(),
          alloc_size,
          total_allocated,
          total_reserved,
          device.type(),
          device.index());
    }
  }

  const post_process_t& getEventPostProcessingCallback() const {
    return event_post_process_cb_;
  }

  void setEventPostProcessingCallback(post_process_t&& cb) {
    event_post_process_cb_ = std::move(cb);
  }

  std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>
  finalizeTrace() {
    auto end_time = getTimeUs();
    record_queue_.stop();

    std::lock_guard<std::mutex> guard(state_mutex_);
    auto converter = clock_converter_.makeConverter();
    auto records_and_trace =
        record_queue_.getRecords(std::move(converter), start_time_, end_time);

    materializeOpEvents(records_and_trace.first);

    // finalizeCPUTrace(cpu_trace_.get());

    // `kineto_events_` does not include Python events. Instead it exposes them
    // via the `stacks` property.
    kineto_events_.erase(
        std::remove_if(
            kineto_events_.begin(),
            kineto_events_.end(),
            [](const auto& i) { return i.isPythonFunction(); }),
        kineto_events_.end());

    return std::move(records_and_trace.second);
  }

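  // Applies the user supplied post-processing callback (if any) to an event's
  // debug handle, JIT stack, and JIT module info.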
  template <typename T>
  void invokeCallback(T& t) {
    if (event_post_process_cb_) {
      event_post_process_cb_(t.debug_handle_, t.jit_stack_, t.jit_modules_);
    }
  }

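  // Converts finished Results into KinetoEvents, attaches the TensorBoard and
  // generic metadata, and records parentless events as roots of the event
  // tree.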
  void materializeOpEvents(std::vector<std::shared_ptr<Result>>& events) {
    for (auto& e : events) {
      if (e->parent_.expired()) {
        event_tree_.push_back(e);
      }

      if (e->finished_) {
        e->visit(c10::overloaded(
            [this](ExtraFields<EventType::TorchOp>& i) { invokeCallback(i); },
            [this](ExtraFields<EventType::Backend>& i) { invokeCallback(i); },
            [](auto&) {}));

        kineto_events_.emplace_back(e, config_.experimental_config.verbose);
        AddTensorboardFields add_tb(e, kineto_events_.back());
        AddGenericMetadata add_generic(e, &config_);

        // It is not safe to use the activity after post processing.
        e->kineto_activity_ = nullptr;
      }
    }
  }

  void finalizeCPUTrace(
      std::unique_ptr<torch::profiler::impl::kineto::trace_t>& cpu_trace) {
#ifndef USE_KINETO
  }
#else // USE_KINETO
    TORCH_INTERNAL_ASSERT(
        cpu_trace->activities.size() == kineto_events_.size());
    // startThreadId_seqNum to pointer of activity.
    // The low 16 bits of startThreadId and the low 48 bits of seqNum are
    // concatenated into one uint64_t variable as the key.

    // For the time being, we need to disable the forward/backward correlation
    // feature to work around a crash bug.
    // TODO: by Mike Guo
    // Re-enable the forward/backward correlation once Kineto fixes the
    // following raw pointer:
    // GenericTraceActivity.flow.linkedActivity
    /*
    std::unordered_map<uint64_t, libkineto::GenericTraceActivity*>
        tidSeq2activity;

    for (const auto idx : c10::irange(cpu_trace->activities.size())) {
      auto& kineto_event = kineto_events_[idx];
      auto& activity = cpu_trace->activities[idx];

      // add information about an associated forward op, if a sequence number
      // is available (e.g. during training)
      if (kineto_event.sequenceNr() >= 0) {
        generateForwardBackwardLink(
            kineto_event, fwd_bwd_link_id, activity, tidSeq2activity);
      }
    }
    */
  }

  void generateForwardBackwardLink(
      const KinetoEvent& kineto_event,
      uint64_t& fwd_bwd_link_id,
      libkineto::GenericTraceActivity& activity,
      std::unordered_map<uint64_t, libkineto::GenericTraceActivity*>&
          tidSeq2activity) {
    if (kineto_event.fwdThreadId() > 0) {
      // act is a backward op.
      uint64_t key = getForwardThreadKey(
          kineto_event.fwdThreadId(), kineto_event.sequenceNr());
      auto iter = tidSeq2activity.find(key);
      if (iter != tidSeq2activity.end()) {
        libkineto::GenericTraceActivity* fwd = iter->second;
        fwd->flow.start = true;
        activity.flow.id = fwd->flow.id = fwd_bwd_link_id;
        activity.flow.type = fwd->flow.type = libkineto::kLinkFwdBwd;
        ++fwd_bwd_link_id;
      }
    } else if (kineto_event.startThreadId() != 0) {
      // act is a forward op.
      uint64_t key = getForwardThreadKey(
          kineto_event.startThreadId(), kineto_event.sequenceNr());
      // Assumption: among all ops with the same sequence number, the one with
      // the biggest start time is most likely launching the backward op.
      auto iter = tidSeq2activity.find(key);
      if (iter == tidSeq2activity.end()) {
        tidSeq2activity[key] = &activity;
      } else {
        // Currently the sequence number is only incremented when creating a
        // "Node" object for the backward pass, by calling
        // "at::sequence_number::get_and_increment()". Among all ops with the
        // same sequence number, the one with the biggest startTime is the one
        // launching the backward op.
        if (activity.startTime >= iter->second->startTime) {
          tidSeq2activity[key] = &activity;
        }
      }
    }
  }
#endif // USE_KINETO

  uint64_t start_time_;
  torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_;
  torch::profiler::impl::RecordQueue record_queue_;
  std::vector<KinetoEvent> kineto_events_;
  std::vector<experimental_event_t> event_tree_;
  // Optional, if event post-processing is enabled.
  post_process_t event_post_process_cb_;
};

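// RecordFunction callbacks. Enter pushes a new op event onto the calling
// thread's subqueue; exit stamps the end time and end thread id, stops the
// perf counters if requested, and records the fallback CUDA event when
// running in KINETO_GPU_FALLBACK mode.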
template <bool use_global_state_ptr = false>
std::unique_ptr<at::ObserverContext> onFunctionEnter(
    const at::RecordFunction& fn) {
  auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr);
  if (!state_ptr) {
    return nullptr;
  }
  return state_ptr->record_queue_.getSubqueue()->begin_op(fn);
}

// @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter
template <bool use_global_state_ptr = false>
void onFunctionExit(
    const at::RecordFunction& fn,
    at::ObserverContext* ctx_ptr) {
  auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr);
  if (!state_ptr) {
    return;
  }
  const auto& config = state_ptr->config();
  auto* kineto_ctx_ptr =
      static_cast<torch::profiler::impl::KinetoObserverContext*>(ctx_ptr);
  TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
  kineto_ctx_ptr->event_->end_time_ =
      torch::profiler::impl::getApproximateTime();
  if (!config.experimental_config.performance_events.empty()) {
    state_ptr->record_queue_.getSubqueue()->disable_perf_profiler(
        *kineto_ctx_ptr->event_->counters_);
  }
  kineto_ctx_ptr->event_->basic_fields_.end_tid_ =
      at::RecordFunction::currentThreadId();
  if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
    try {
      auto fallback = kineto_ctx_ptr->fallback_;
      TORCH_INTERNAL_ASSERT(fallback != nullptr);
      torch::profiler::impl::cudaStubs()->record(
          nullptr, &fallback->cuda_event_end_, nullptr);
    } catch (const std::exception& e) {
      LOG(WARNING) << "Failed to record CUDA event. " << e.what();
    }
  }

  if (fn.scope() == at::RecordScope::USER_SCOPE) {
    torch::profiler::impl::kineto::popUserCorrelationId();
  } else {
    torch::profiler::impl::kineto::popCorrelationId();
  }
}

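// Registers the enter/exit callbacks above, either globally (used by
// on-demand profiling) or for the current thread, honoring the requested
// scopes and whether input shapes should be collected.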
template <bool use_global_callback = false>
void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
  auto registration_state_ptr =
      KinetoThreadLocalState::get(use_global_callback);
  TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set");
  auto recordFunctionCallback =
      at::RecordFunctionCallback(
          onFunctionEnter<use_global_callback>,
          onFunctionExit<use_global_callback>)
          .needsInputs(registration_state_ptr->config().report_input_shapes)
          .scopes(scopes);

  auto handle = c10::guts::if_constexpr<use_global_callback>(
      [&] { return at::addGlobalCallback(recordFunctionCallback); },
      [&] { return at::addThreadLocalCallback(recordFunctionCallback); });
  registration_state_ptr->setCallbackHandle(handle);
}

} // namespace

void reportBackendEventToActiveKinetoProfiler(
    const int64_t start_time_us,
    const int64_t end_time_us,
    const int64_t debug_handle,
    const at::RecordScope scope,
    const std::string& event_name,
    const std::string& backend_name) {
  TORCH_INTERNAL_ASSERT(
      KinetoThreadLocalState::get(/*global=*/true) == nullptr,
      "On-demand profiling does not support post processing callback");

  auto state_ptr = KinetoThreadLocalState::get(/*global=*/false);
  if (!state_ptr) {
    return;
  }

  state_ptr->record_queue_.getSubqueue()->emplace_backend_event(
      start_time_us,
      end_time_us,
      debug_handle,
      scope,
      event_name,
      backend_name);

  /* no support for input shapes now?
  if (config.report_input_shapes) {
    ctx_ptr->shapes = inputSizes(fn);
    ctx_ptr->dtypes = inputTypes(fn);
  }
  */
}

void prepareProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities) {
  if (config.state == ProfilerState::NVTX ||
      config.state == ProfilerState::ITT) {
    return;
  }
  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
          config.state == ProfilerState::KINETO_GPU_FALLBACK,
      "Supported only in Kineto profiler");
  torch::profiler::impl::kineto::prepareTrace(
      /*cpuOnly=*/!at::hasCUDA(), activities, config.experimental_config);

  if (!config.experimental_config.performance_events.empty()) {
    /* For now only CPU activity is supported */
    TORCH_CHECK(
        activities.count(torch::autograd::profiler::ActivityType::CPU),
        "Cannot run cpu hardware profiler without CPU activities, please only use CPU activity type");
    /*
     * Send a warning and pass the non-standard event to the backend.
     * The backend can abort if the event is not supported.
     * TODO: Should we gracefully drop the invalid event if we have at least
     * one valid event?
     */
    auto is_standard_event = [](const std::string& event) -> bool {
      for (auto e : torch::profiler::ProfilerPerfEvents) {
        if (!std::strcmp(event.c_str(), e)) {
          return true;
        }
      }
      return false;
    };

    for (const auto& e : config.experimental_config.performance_events) {
      if (!is_standard_event(e)) {
        TORCH_WARN("Forwarding a non-standard CPU performance event : ", e);
      }
    }
  }
}

void enableProfilerWithEventPostProcess(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    post_process_t&& cb,
    const std::unordered_set<at::RecordScope>& scopes) {
  TORCH_CHECK(
      config.state != ProfilerState::NVTX,
      "NVTX does not support post processing callback.");
  TORCH_CHECK(
      config.state != ProfilerState::ITT,
      "ITT does not support post processing callback.");
  TORCH_INTERNAL_ASSERT(
      KinetoThreadLocalState::get(/*global=*/true) == nullptr,
      "On-demand profiling does not support post processing callback");

  enableProfiler(config, activities, scopes);
  auto state_ptr = KinetoThreadLocalState::get(config.global());
  state_ptr->setEventPostProcessingCallback(std::move(cb));
}

void enableProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    const std::unordered_set<at::RecordScope>& scopes) {
  const auto has_cpu = activities.count(ActivityType::CPU);
  TORCH_CHECK(
      KinetoThreadLocalState::get(/*global=*/config.global()) == nullptr,
      "Profiler is already enabled",
      (config.global() ? "." : " on this thread."));

  if (config.state == ProfilerState::NVTX) {
    torch::profiler::impl::pushNVTXCallbacks(config, scopes);
    return;
  } else if (config.state == ProfilerState::ITT) {
    torch::profiler::impl::pushITTCallbacks(config, scopes);
    return;
  }

  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
      config.state == ProfilerState::KINETO_GPU_FALLBACK || config.global());
  TORCH_CHECK(!activities.empty(), "No activities specified.");
  TORCH_INTERNAL_ASSERT(
      has_cpu || !config.global(),
      "Ondemand profiling must enable CPU tracing");

  KinetoThreadLocalState::push(
      std::make_shared<KinetoThreadLocalState>(config, activities));

  if (has_cpu) {
    config.global() ? pushProfilingCallbacks</*global=*/true>(scopes)
                    : pushProfilingCallbacks</*global=*/false>(scopes);
  }

  if (!config.global()) {
    torch::profiler::impl::kineto::startTrace();
  }
}

std::unique_ptr<ProfilerResult> disableProfiler() {
  auto state_ptr = ProfilerStateBase::pop();
  // Check the state pointer before dereferencing it to read the config.
  TORCH_CHECK(
      state_ptr,
      "Can't disable Kineto profiler when it's not running");
  const auto& config = state_ptr->config();
  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
          config.state == ProfilerState::KINETO_GPU_FALLBACK ||
          config.state == ProfilerState::KINETO_ONDEMAND ||
          config.state == ProfilerState::NVTX ||
          config.state == ProfilerState::ITT,
      "Can't disable Kineto profiler when it's not running");

  state_ptr->removeCallback();

  // Traces are converged via libkineto automatically for ondemand flow
  if (state_ptr->config().global()) {
    (void)std::static_pointer_cast<KinetoThreadLocalState>(state_ptr)
        ->finalizeTrace();
    return std::make_unique<ProfilerResult>();
  }

  // Shared among NVTX, KINETO, KINETO_GPU_FALLBACK
  std::unique_ptr<ProfilerResult> result;
  if (state_ptr->config().state == ProfilerState::NVTX) {
    result = std::make_unique<ProfilerResult>();
  }

  if (config.state == ProfilerState::KINETO ||
      config.state == ProfilerState::KINETO_GPU_FALLBACK) {
    auto kineto_state_ptr =
        std::static_pointer_cast<KinetoThreadLocalState>(state_ptr);
    auto trace = kineto_state_ptr->finalizeTrace();
    result = std::make_unique<ProfilerResult>(
        kineto_state_ptr->start_time_,
        std::move(kineto_state_ptr->kineto_events_),
        std::move(trace),
        std::move(kineto_state_ptr->event_tree_));
  }

  return result;
}

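// KinetoEvent wraps a Result. When verbose output is requested it also
// materializes the Python call stack by walking the Result's parent chain.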
KinetoEvent::KinetoEvent(
    std::shared_ptr<const torch::profiler::impl::Result> result,
    const bool verbose)
    : result_{result} {
  TORCH_INTERNAL_ASSERT(result != nullptr);

  if (verbose) {
    // Populate Python stack
    auto parent = result_->parent_.lock();
    while (parent != nullptr) {
      parent->visit_if_base<PyExtraFieldsBase>(
          [&](const auto& i) { python_stack_.push_back(parent->name()); });
      parent = parent->parent_.lock();
    }
  }

  result->visit_if_base<ExtraFields<EventType::TorchOp>>([&](const auto& op) {
    std::tie(shapes_, dtypes_) = shapesAndDtypes(op.inputs_);
  });
}

bool KinetoEvent::isPythonFunction() const {
  bool out{false};
  result_->visit_if_base<PyExtraFieldsBase>([&](const auto&) { out = true; });
  return out;
}

bool KinetoEvent::hasShapes() const {
  return !shapes_.empty();
}

const c10::ArrayRef<std::vector<int64_t>> KinetoEvent::shapes() const {
  return shapes_;
}

bool KinetoEvent::hasTypes() const {
  return !dtypes_.empty();
}

const c10::ArrayRef<std::string> KinetoEvent::dtypes() const {
  return dtypes_;
}

const c10::ArrayRef<std::string> KinetoEvent::stack() const {
  auto get = [&](const auto& i) -> auto& {
    return !i.jit_stack_.empty() ? i.jit_stack_ : python_stack_;
  };

  using out_t = const c10::ArrayRef<std::string>;
  return result_->visit(c10::overloaded(
      [&](const ExtraFields<EventType::TorchOp>& i) -> out_t { return get(i); },
      [&](const ExtraFields<EventType::Backend>& i) -> out_t { return get(i); },
      [&](const auto&) -> out_t { return python_stack_; }));
}

const c10::ArrayRef<std::string> KinetoEvent::moduleHierarchy() const {
  return result_->visit(c10::overloaded(
      [](const ExtraFields<EventType::TorchOp>& e)
          -> const c10::ArrayRef<std::string> { return e.jit_modules_; },
      [](const ExtraFields<EventType::Backend>& e)
          -> const c10::ArrayRef<std::string> { return e.jit_modules_; },
      [](const auto&) -> const c10::ArrayRef<std::string> { return {}; }));
}

uint64_t KinetoEvent::durationUs() const {
  return (result_->endTimeNS() - result_->start_time_ns_) / 1000;
}

int64_t KinetoEvent::debugHandle() const {
  return result_->visit(c10::overloaded(
      [](const ExtraFields<EventType::TorchOp>& i) { return i.debug_handle_; },
      [](const ExtraFields<EventType::Backend>& i) { return i.debug_handle_; },
      [](const auto&) -> int64_t { return -1; }));
}

uint8_t KinetoEvent::deviceIndex() const {
  return result_->visit(c10::overloaded(
      [](const ExtraFields<EventType::Allocation>& i) {
        return static_cast<uint8_t>(i.device_index_);
      },
      [](const ExtraFields<EventType::OutOfMemory>& i) {
        return static_cast<uint8_t>(i.device_index_);
      },
      [&](const auto&) {
        return static_cast<uint8_t>(result_->kineto_info_.device);
      }));
}

bool KinetoEvent::hasStack() const {
  return !stack().empty();
}

int64_t KinetoEvent::cudaElapsedUs() const {
  auto cuda_event_start = fallbackStart();
  auto cuda_event_end = fallbackEnd();
  if (!cuda_event_start || !cuda_event_end) {
    return -1;
  }
  try {
    return (int64_t)torch::profiler::impl::cudaStubs()->elapsed(
        &cuda_event_start, &cuda_event_end);
  } catch (std::exception& e) {
    LOG(WARNING) << "Failed to measure time between two CUDA events. "
                 << e.what();
  }
  return -1;
}

void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
  return result_->visit(c10::overloaded(
      [&in](const ExtraFields<EventType::TorchOp>& e) -> void {
        const size_t n = e.perf_event_counters_->size();
        // should be rare
        if (in.size() < n) {
          in.resize(n, 0);
        }
        for (size_t i = 0; i < n; ++i) {
          in[i] = (*e.perf_event_counters_)[i];
        }
      },
      [](const auto&) -> void { return; }));
}

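// Forwards simple KinetoEvent accessors directly to the underlying Result.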
#define FORWARD_FROM_RESULT(method_name, result_expr)                        \
  decltype(std::declval<KinetoEvent>().method_name())                        \
  KinetoEvent::method_name() const {                                         \
    return static_cast<decltype(std::declval<KinetoEvent>().method_name())>( \
        result_->result_expr);                                               \
  }

FORWARD_FROM_RESULT(startThreadId, start_tid_)
FORWARD_FROM_RESULT(endThreadId, endTID())
FORWARD_FROM_RESULT(activityType, kinetoType())
FORWARD_FROM_RESULT(name, name())
FORWARD_FROM_RESULT(deviceType, deviceType())
FORWARD_FROM_RESULT(startUs, start_time_ns_ / 1000)
FORWARD_FROM_RESULT(correlationId, correlationID())
FORWARD_FROM_RESULT(deviceResourceId, kineto_info_.resource)
#undef FORWARD_FROM_RESULT

// Most of the fields in `KinetoEvent` only make sense for a single event type.
// (Generally TorchOp.) For all other types they simply return the default
// value. This macro provides a succinct way of expressing this behavior.
#define TYPED_ATTR_WITH_DEFAULT(                                        \
    event_type, method_name, expression, default_value)                 \
  decltype(std::declval<KinetoEvent>().method_name())                   \
  KinetoEvent::method_name() const {                                    \
    using out_t = decltype(std::declval<KinetoEvent>().method_name());  \
    return result_->visit(c10::overloaded(                              \
        [](const ExtraFields<EventType::event_type>& e) -> out_t {      \
          return expression;                                            \
        },                                                              \
        [](const auto&) -> out_t { return default_value; }));           \
  }

#define TYPED_ATTR(event_type, method_name, expression) \
  TYPED_ATTR_WITH_DEFAULT(event_type, method_name, expression, {})

TYPED_ATTR_WITH_DEFAULT(TorchOp, sequenceNr, e.sequence_number_, -1)
TYPED_ATTR(TorchOp, fwdThreadId, e.sequence_number_ >= 0 ? e.forward_tid_ : 0)
TYPED_ATTR(TorchOp, scope, static_cast<uint8_t>(e.scope_))
TYPED_ATTR(TorchOp, hasModuleHierarchy, !e.jit_modules_.empty())
TYPED_ATTR(TorchOp, isAsync, e.is_async_)
TYPED_ATTR(TorchOp, fallbackStart, e.gpu_fallback_.cuda_event_start_)
TYPED_ATTR(TorchOp, fallbackEnd, e.gpu_fallback_.cuda_event_end_)
TYPED_ATTR(
    TorchOp,
    flops,
    !e.extra_args_.empty() ? computeFlops(e.name_, e.extra_args_) : 0)
TYPED_ATTR(Backend, backend, e.backend_)
TYPED_ATTR(Allocation, nBytes, e.alloc_size_)
TYPED_ATTR(Kineto, linkedCorrelationId, [&]() {
  const auto linked = e.linked_activity_.lock();
  return linked ? linked->correlationID() : 0;
}())
#undef TYPED_ATTR
#undef TYPED_ATTR_WITH_DEFAULT

ProfilerResult::ProfilerResult(
    uint64_t start_time,
    std::vector<KinetoEvent> events,
    std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
        trace,
    std::vector<experimental_event_t>&& event_tree)
    : trace_start_us_(start_time),
      events_(std::move(events)),
      trace_(std::move(trace)),
      event_tree_(std::move(event_tree)) {}
ProfilerResult::ProfilerResult() = default;
ProfilerResult::~ProfilerResult() = default;

void ProfilerResult::save(const std::string& path) {
  trace_->save(path);
}

} // namespace profiler
} // namespace autograd

namespace profiler {
namespace impl {
void _reportVulkanEventToProfiler(vulkan_id_t id) {
  auto state_ptr = ::torch::autograd::profiler::KinetoThreadLocalState::get(
      /*global=*/false);
  if (state_ptr) {
    state_ptr->reportVulkanEventToProfiler(id);
  }
}
} // namespace impl
} // namespace profiler

} // namespace torch