#pragma once

#include <array>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>

#include <ATen/Context.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/macros/Macros.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/strong_type.h>
#include <c10/util/variant.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/data_flow.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/python_tracer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
#include <torch/csrc/utils/python_stub.h>

namespace torch {
namespace profiler {
namespace impl {

enum class EventType : uint8_t {
  TorchOp = 0,
  Backend,
  Vulkan,
  Allocation,
  OutOfMemory,
  PyCall,
  PyCCall,
  Kineto
};

// ============================================================================
// == Value (Tensor, Scalar) summary ==========================================
// ============================================================================
struct TORCH_API RawTensorMetadataBase {
  RawTensorMetadataBase() = default;
  explicit RawTensorMetadataBase(const at::Tensor& t);

  StorageImplData data_;
  c10::ScalarType dtype_;
  c10::Layout layout_;
  uint32_t dim_;
};

// Collected during profiling.
struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
  RawTensorMetadata() = default;
  RawTensorMetadata(const RawTensorMetadata&) = default;
  explicit RawTensorMetadata(const at::Tensor& t);

  // Wrap `weak_self_` in `c10::optional` and split device into components to
  // keep the struct default constructible (which the std::array initializer
  // needs).
  c10::optional<WeakTensor> weak_self_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// Used during post processing.
struct TORCH_API TensorMetadata : public RawTensorMetadataBase {
  TensorMetadata(
      const RawTensorMetadata& r,
      std::vector<int64_t> sizes,
      std::vector<int64_t> strides);

  TensorImplAddress impl() const {
    return weak_self_.get();
  }

  WeakTensor weak_self_;
  c10::Device device_;
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  // Set during `calculateUniqueTensorIDs`.
  c10::optional<TensorID> id_;
  c10::optional<AllocationID> allocation_id_;
};

using op_input_t = c10::variant<
    TensorMetadata,
    std::vector<TensorMetadata>,
    c10::IValue,
    c10::nullopt_t>;
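
// Illustrative sketch (not part of this header's API): post-processing code
// can dispatch on an `op_input_t` with `c10::visit`, e.g. to recover shape
// information for Tensor arguments while ignoring scalars and unrecorded
// inputs. `c10::overloaded` (from <c10/util/overloaded.h>) and the lambda
// bodies are assumptions made for the sake of the example.
//
//   c10::visit(
//       c10::overloaded(
//           [](const TensorMetadata& t) { /* use t.sizes_ / t.strides_ */ },
//           [](const std::vector<TensorMetadata>& list) { /* tensor list */ },
//           [](const c10::IValue& v) { /* concrete (scalar) value */ },
//           [](const c10::nullopt_t&) { /* input was not recorded */ }),
//       input);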

// ============================================================================
// == ExtraFields =============================================================
// ============================================================================
template <EventType>
struct ExtraFields;

struct Result;

struct TorchOpBasicFields {
  int64_t sequence_number_{0};
  uint64_t forward_tid_{0};
  at::RecordScope scope_{};
  bool is_async_{false};
  int64_t debug_handle_{0};
  std::string name_;

  // Set in the exit callback.
  uint64_t end_tid_{0};
};

using jit_stack_t = std::vector<std::string>;
using jit_modules_t = std::vector<std::string>;
using extra_args_t = std::unordered_map<std::string, c10::IValue>;

struct FallbackPair {
  ProfilerEventStub cuda_event_start_ = nullptr;
  ProfilerEventStub cuda_event_end_ = nullptr;
};

template <>
struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
  ExtraFields(
      TorchOpBasicFields&& f,
      uint64_t correlation_id,
      time_t end_time_ns,
      std::vector<op_input_t>&& inputs,
      jit_stack_t&& jit_stack,
      jit_modules_t&& jit_modules,
      extra_args_t&& extra_args,
      FallbackPair&& gpu_fallback,
      bool allow_tf32_cublas,
      std::unique_ptr<perf_counters_t>&& perf_event_counters)
      : TorchOpBasicFields(std::move(f)),
        correlation_id_{correlation_id},
        end_time_ns_{end_time_ns},
        inputs_{std::move(inputs)},
        jit_stack_{std::move(jit_stack)},
        jit_modules_{std::move(jit_modules)},
        extra_args_{std::move(extra_args)},
        gpu_fallback_{std::move(gpu_fallback)},
        allow_tf32_cublas_{allow_tf32_cublas},
        perf_event_counters_{std::move(perf_event_counters)} {}
  uint64_t correlation_id_;
  time_t end_time_ns_;
  std::vector<op_input_t> inputs_;
  jit_stack_t jit_stack_;
  jit_modules_t jit_modules_;
  extra_args_t extra_args_;
  FallbackPair gpu_fallback_;
  bool allow_tf32_cublas_;
  std::unique_ptr<perf_counters_t> perf_event_counters_;
};

template <>
struct ExtraFields<EventType::Backend> {
  int64_t start_time_us_;
  int64_t end_time_us_;
  int64_t debug_handle_;
  at::RecordScope scope_;
  std::string name_;
  std::string backend_;
  jit_stack_t jit_stack_;
  jit_modules_t jit_modules_;
};

template <>
struct ExtraFields<EventType::Vulkan> {
  using raw_event_t = std::pair<approx_time_t, vulkan_id_t>;
  std::string name_;
  int64_t duration_ns_{0};
  // While building the event tree, we want to report a Vulkan event's duration
  // as 0 so that its end time doesn't exceed that of its parent CPU op.
  bool in_tree_building_{false};
};

struct RawAllocation {
  torch::profiler::impl::approx_time_t start_time_;
  void* ptr_;
  int64_t alloc_size_;
  size_t total_allocated_;
  size_t total_reserved_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// For performance.
static_assert(c10::is_pod_v<RawAllocation>, "Non-POD member of RawAllocation.");

template <>
struct ExtraFields<EventType::Allocation> : RawAllocation {
  ExtraFields(const RawAllocation& allocation) : RawAllocation(allocation) {}

  c10::Device device() const {
    return {device_type_, device_index_};
  }

  c10::optional<TensorID> id_;
  c10::optional<AllocationID> allocation_id_;
};

template <>
struct ExtraFields<EventType::OutOfMemory> {
  torch::profiler::impl::approx_time_t start_time_;
  int64_t alloc_size_;
  size_t total_allocated_;
  size_t total_reserved_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// For performance.
static_assert(
    c10::is_pod_v<ExtraFields<EventType::OutOfMemory>>,
    "Non-POD member of ExtraFields<EventType::OutOfMemory>.");

struct PyFrameState {
  int line_no_;
  at::StringView filename_;
  at::StringView funcname_;
};

template <typename T, typename Tag>
using strong_t = strong::
    type<T, Tag, strong::regular, strong::convertible_to<T>, strong::hashable>;

using PyModuleSelf = strong_t<PyObject*, struct PyModuleSelf_>;
using PyModuleCls = strong_t<PyObject*, struct PyModuleCls_>;
using PyMethod = strong_t</*PyMethodDef*/ void*, struct PyMethod_>;
using PyOptimizerSelf = strong_t<PyObject*, struct PyOptSelf_>;
using PyOptimizerCls = strong_t<PyObject*, struct PyOptimizer_>;

struct NNModuleInfo {
  struct ParameterInfo {
    std::string name_;
    TensorMetadata metadata_;
    c10::optional<TensorMetadata> grad_metadata_;
  };

  PyModuleSelf self_;
  PyModuleCls cls_;
  at::StringView cls_name_;

  std::vector<ParameterInfo> parameters_;
  // Indicates that `self_` is the kth instance of `cls_` observed.
  size_t id_{std::numeric_limits<size_t>::max()};
};

struct OptimizerInfo {
  struct ParameterInfo {
    TensorMetadata metadata_;
    c10::optional<TensorMetadata> grad_metadata_;
    std::vector<std::pair<std::string, TensorMetadata>> state_;
  };

  PyOptimizerSelf self_;
  PyOptimizerCls cls_;
  at::StringView cls_name_;

  std::vector<ParameterInfo> parameters_;
};

struct PyExtraFieldsBase {
  PyExtraFieldsBase(time_t end_time_ns, size_t python_tid, PyFrameState caller)
      : end_time_ns_{end_time_ns},
        python_tid_{python_tid},
        caller_{std::move(caller)} {}

  time_t end_time_ns_;
  size_t python_tid_;
  PyFrameState caller_;

  // kth python event observed. (Used by TensorBoard)
  size_t id_{std::numeric_limits<size_t>::max()};
};

template <>
struct ExtraFields<EventType::PyCall> : public PyExtraFieldsBase {
  struct args_t {
    PyFrameState frame_state_;
    c10::optional<NNModuleInfo> module_info_;
    c10::optional<OptimizerInfo> optimizer_info_;
  };

  ExtraFields(
      time_t end_time_ns,
      size_t python_tid,
      PyFrameState caller,
      args_t args)
      : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
        callsite_{args.frame_state_},
        module_{args.module_info_},
        optimizer_{args.optimizer_info_} {}

  PyFrameState callsite_;
  c10::optional<NNModuleInfo> module_;
  c10::optional<OptimizerInfo> optimizer_;
};

template <>
struct ExtraFields<EventType::PyCCall> : public PyExtraFieldsBase {
  using args_t = at::StringView;

  ExtraFields(
      time_t end_time_ns,
      size_t python_tid,
      PyFrameState caller,
      args_t args)
      : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
        function_name_{std::move(args)} {}

  at::StringView function_name_;
};

template <>
struct ExtraFields<EventType::Kineto> {
  // Mirrors `libkineto::GenericTraceActivity::Flow`. This information is used
  // during post processing to properly embed Kineto events into the broader
  // profiler tree structure. End users are not generally expected to use these
  // fields directly, but they are available for debugging.
  struct Flow {
    uint32_t id{0};
    uint32_t type{0};
    uint32_t start{0};
  };

  std::string name_;
  int64_t duration_us_{0};
  uint64_t correlation_id_{0};
  libkineto::ActivityType activity_type_;
  Flow flow;
  std::weak_ptr<Result> linked_activity_{};
};

struct TORCH_API Result : public std::enable_shared_from_this<Result> {
  template <typename... Args>
  [[nodiscard]] static std::shared_ptr<Result> create(Args... args) {
    return std::shared_ptr<Result>(new Result(std::forward<Args>(args)...));
  }

  template <typename T>
  decltype(auto) visit(T&& visitor) {
    return c10::visit(std::forward<T>(visitor), extra_fields_);
  }

  template <typename T>
  decltype(auto) visit(T&& visitor) const {
    return c10::visit(std::forward<T>(visitor), extra_fields_);
  }

  template <typename T, typename Fn>
  void visit_if_base(Fn&& fn) const {
    visit([&](const auto& extra_fields) {
      using extra_fields_t = typename std::remove_cv<
          typename std::remove_reference<decltype(extra_fields)>::type>::type;

      c10::guts::if_constexpr<std::is_base_of<T, extra_fields_t>::value>(
          [&](auto _) { fn(_(extra_fields)); });
    });
  }

  EventType tag() const {
    return visit([](const auto& i) { return deduceTag(i); });
  }

  std::string name() const;
  libkineto::ActivityType kinetoType() const;
  uint64_t correlationID() const;
  int64_t endTimeNS() const;
  uint64_t endTID() const;
  c10::DeviceType deviceType() const;

  int64_t start_time_ns_;
  uint64_t start_tid_;
  kineto::DeviceAndResource kineto_info_;
  c10::variant<
      ExtraFields<EventType::TorchOp>,
      ExtraFields<EventType::Backend>,
      ExtraFields<EventType::Vulkan>,
      ExtraFields<EventType::Allocation>,
      ExtraFields<EventType::OutOfMemory>,
      ExtraFields<EventType::PyCall>,
      ExtraFields<EventType::PyCCall>,
      ExtraFields<EventType::Kineto>>
      extra_fields_;

  std::weak_ptr<Result> parent_;
  std::vector<std::shared_ptr<Result>> children_;
  bool finished_{false};

  const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr};

 private:
  template <EventType E>
  Result(
      int64_t start_time_ns,
      uint64_t start_tid,
      kineto::DeviceAndResource kineto_info,
      ExtraFields<E>&& extra_fields)
      : start_time_ns_{start_time_ns},
        start_tid_{start_tid},
        kineto_info_{kineto_info},
        extra_fields_{std::move(extra_fields)} {}

  template <EventType E>
  static EventType deduceTag(const ExtraFields<E>&) {
    return E;
  }
};
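
// Illustrative sketch (not part of this header's API): consumers typically
// walk the `Result` tree and use `visit` / `visit_if_base` to act on specific
// event types. `handle` and the lambda bodies below are hypothetical;
// `c10::overloaded` lives in <c10/util/overloaded.h>.
//
//   void handle(const std::shared_ptr<Result>& r) {
//     r->visit_if_base<PyExtraFieldsBase>(
//         [](const auto& py) { /* shared handling for PyCall / PyCCall */ });
//     r->visit(c10::overloaded(
//         [](const ExtraFields<EventType::TorchOp>& op) { /* op event */ },
//         [](const auto&) { /* all other event types */ }));
//     for (const auto& child : r->children_) {
//       handle(child);
//     }
//   }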

struct KinetoObserverContext : public at::ObserverContext {
  struct Event {
    TorchOpBasicFields basic_fields_;
    approx_time_t start_time_;

    // Set in the exit callback.
    approx_time_t end_time_{std::numeric_limits<approx_time_t>::min()};

    bool allow_tf32_cublas_;
    std::unique_ptr<perf_counters_t> counters_;
  };

  explicit KinetoObserverContext(Event* event) : event_{event} {}

  Event* event_;
  FallbackPair* fallback_{nullptr};
};

constexpr int IO_ENCODER_DEFAULT_BLOCK_SIZE = 1024;

// InputOutputEncoder
// Stores each op event's shapes and dtypes in contiguous AppendOnlyLists so
// that we no longer create per-op vectors for shapes and dtypes during
// collection. Those vectors are created during post-processing instead.
class InputOutputEncoder final {
 public:
  void push(c10::ArrayRef<const c10::IValue> values);

  // Used during post-processing to create vectors for shapes and dtypes.
  auto getNextShapesAndDtypes();

  void clear();

 private:
  enum class Tag {
    Tensor = 0,
    UndefinedTensor,
    TensorListBegin, // TODO: generalize to other lists.
    Scalar,
    Other,
    TERMINATOR
  };

  void push(const at::Tensor& t);

  AppendOnlyList<Tag, IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_;
  AppendOnlyList<RawTensorMetadata, IO_ENCODER_DEFAULT_BLOCK_SIZE>
      tensor_metadata_;
  AppendOnlyList<int64_t, IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_sizes_strides_;
  AppendOnlyList<c10::IValue, IO_ENCODER_DEFAULT_BLOCK_SIZE> ivalues_;
};
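
// Illustrative sketch of the flattened encoding (a simplification; the exact
// layout is an implementation detail of the .cpp file): for an op taking
// (Tensor, Scalar), `push` appends roughly
//
//   tags_:                 [Tensor, Scalar, TERMINATOR]
//   tensor_metadata_:      [RawTensorMetadata for the Tensor]
//   tensor_sizes_strides_: [sizes..., strides... of the Tensor]
//   ivalues_:              [the Scalar]
//
// and `getNextShapesAndDtypes()` replays the tag stream during
// post-processing to rebuild per-op shape / dtype vectors.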

using perf_profiler_t = torch::profiler::impl::linux_perf::PerfProfiler;

class TORCH_API ThreadLocalSubqueue {
 public:
  ThreadLocalSubqueue(const uint64_t tid, const ProfilerConfig& config);

  std::unique_ptr<KinetoObserverContext> begin_op(const at::RecordFunction& fn);

  template <class... Args>
  void emplace_backend_event(Args&&... args) {
    backend_events_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_vulkan_event(Args&&... args) {
    vulkan_events_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_allocation_event(Args&&... args) {
    allocations_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_ooms_event(Args&&... args) {
    ooms_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_py_call(Args&&... args) {
    py_calls_.emplace_back(std::forward<Args>(args)...);
  }

  uint64_t tid() const {
    return tid_;
  }

  const kineto::DeviceAndResource& kineto_info() const {
    return kineto_info_;
  }

  inline void disable_perf_profiler(perf_counters_t& counters) const {
    perf_profiler_->Disable(counters);
  }

 private:
  uint64_t tid_;
  ProfilerConfig config_;
  kineto::DeviceAndResource kineto_info_;
  std::unique_ptr<perf_profiler_t> perf_profiler_;

  friend class RecordQueue;
  // See `containers.h` for block size benchmarks.
  static constexpr size_t BlockSize = 512;

  struct TorchOpStorage {
    // NB: This is a destructive operation.
    void materialize(
        std::vector<std::shared_ptr<Result>>& out,
        const std::function<time_t(approx_time_t)> time_converter,
        const uint64_t tid,
        const kineto::DeviceAndResource& kineto_info);

    template <typename T, size_t ChunkSize>
    class EventBlock : public std::array<T, ChunkSize> {
     public:
      EventBlock();
      uint64_t correlation_id(const T* ptr) const;

     private:
      uint64_t id_start_;
    };

    using event_t = KinetoObserverContext::Event;
    class OpList : public AppendOnlyList<event_t, BlockSize, EventBlock> {
     public:
      template <class... Args>
      std::pair<event_t*, uint64_t> emplace_back(Args&&... args);
      static uint64_t correlationID(const OpList::Iterator& e);
    } op_events_;

    // report_input_shapes
    InputOutputEncoder inputs_outputs_;

    // with_stack (JIT)
    AppendOnlyList<jit_stack_t, BlockSize> jit_stack_;

    // with_modules
    AppendOnlyList<jit_modules_t, BlockSize> jit_modules_;

    // with_flops
    AppendOnlyList<extra_args_t, BlockSize> extra_args_;

    // ProfilerState::KINETO_GPU_FALLBACK
    AppendOnlyList<FallbackPair, BlockSize> gpu_fallback_;
  } torch_ops_;

  // reportBackendEventToActiveKinetoProfiler
  AppendOnlyList<ExtraFields<EventType::Backend>, BlockSize> backend_events_;

  // _reportVulkanEventToProfiler
  AppendOnlyList<ExtraFields<EventType::Vulkan>::raw_event_t, BlockSize>
      vulkan_events_;

  // reportMemoryUsage
  AppendOnlyList<RawAllocation, BlockSize> allocations_;

  // reportOOMs
  AppendOnlyList<ExtraFields<EventType::OutOfMemory>, BlockSize> ooms_;

  // with_stack (Python)
  AppendOnlyList<std::pair<python_tracer::TraceKey, approx_time_t>, BlockSize>
      py_calls_;
};
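
// Illustrative sketch (not part of this header's API) of how a profiling
// callback on a given thread records into its subqueue; `queue` is a
// RecordQueue and `fn` an at::RecordFunction, both assumed for the example.
//
//   ThreadLocalSubqueue* subqueue = queue.getSubqueue();
//   auto ctx = subqueue->begin_op(fn);              // enter callback
//   /* ... the op runs ... */
//   ctx->event_->end_time_ = /* approximate timestamp */;  // exit callback
//
// Out-of-band events (backend, Vulkan, allocation, OOM, Python) are recorded
// through the corresponding `emplace_*` methods instead.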

class TORCH_API RecordQueue {
 public:
  RecordQueue(const ProfilerConfig& config, std::set<ActivityType> activities);

  bool tracePython() const;
  ThreadLocalSubqueue* getSubqueue();
  void stop();

  // NB: This is a destructive operation.
  std::pair<
      std::vector<std::shared_ptr<Result>>,
      std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>>
  getRecords(
      std::function<time_t(approx_time_t)> time_converter,
      uint64_t start_time_us,
      uint64_t end_time_us);

 private:
  uint32_t id_;
  ProfilerConfig config_;
  std::set<ActivityType> activities_;
  ska::flat_hash_map<uint64_t, std::unique_ptr<ThreadLocalSubqueue>>
      sub_queues_;
  std::mutex sub_queue_mutex_;
  std::unique_ptr<python_tracer::PythonTracerBase> python_tracer_;
};
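
// Illustrative sketch of the collection lifecycle (simplified; the actual
// driver lives in the profiler implementation, not in this header):
//
//   RecordQueue queue(config, activities);
//   // ... each thread records via queue.getSubqueue() while profiling ...
//   queue.stop();
//   auto records_and_trace = queue.getRecords(
//       /*time_converter=*/converter, start_time_us, end_time_us);
//   // NB: `getRecords` is destructive; the queued events are consumed.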

} // namespace impl
} // namespace profiler
} // namespace torch