1 | #pragma once |
2 | |
#include <array>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>

#include <ATen/Context.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/macros/Macros.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/strong_type.h>
#include <c10/util/variant.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/data_flow.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/python_tracer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
#include <torch/csrc/utils/python_stub.h>
25 | |
26 | namespace torch { |
27 | namespace profiler { |
28 | namespace impl { |
29 | |
// Discriminant for the per-event payload stored in `Result` (see the
// `c10::variant` member there). `uint8_t` keeps the tag compact since it is
// stored with every profiled event.
enum class EventType : uint8_t {
  TorchOp = 0,
  Backend,
  Vulkan,
  Allocation,
  OutOfMemory,
  PyCall,
  PyCCall,
  Kineto
};
40 | |
41 | // ============================================================================ |
42 | // == Value (Tensor, Scalar) summary ========================================== |
43 | // ============================================================================ |
// Cheap, fixed-size description of a Tensor captured at profiling time.
// Sizes/strides are recorded separately (see `TensorMetadata` below), keeping
// this struct trivially copyable for fast append-only storage.
struct TORCH_API RawTensorMetadataBase {
  RawTensorMetadataBase() = default;
  explicit RawTensorMetadataBase(const at::Tensor& t);

  // Strong typedef from `data_flow.h`; presumably identifies the backing
  // storage for identity tracking — see `calculateUniqueTensorIDs` usage.
  StorageImplData data_;
  c10::ScalarType dtype_;
  c10::Layout layout_;
  uint32_t dim_;
};
53 | |
54 | // Collected during profiling. |
// Collected during profiling. Post-processing converts this into the richer
// `TensorMetadata` below.
struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
  RawTensorMetadata() = default;
  RawTensorMetadata(const RawTensorMetadata&) = default;
  explicit RawTensorMetadata(const at::Tensor& t);

  // Wrap `weak_self_` in `c10::optional` and split device into components to
  // keep struct default constructable. (which the std::array initializer needs)
  c10::optional<WeakTensor> weak_self_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};
66 | |
67 | // Used during post processing. |
// Used during post processing. Combines the raw capture with materialized
// sizes/strides and a resolved (non-optional) weak reference and device.
struct TORCH_API TensorMetadata : public RawTensorMetadataBase {
  TensorMetadata(
      const RawTensorMetadata& r,
      std::vector<int64_t> sizes,
      std::vector<int64_t> strides);

  // Address of the underlying TensorImpl, recovered from the weak reference.
  TensorImplAddress impl() const {
    return weak_self_.get();
  }

  WeakTensor weak_self_;
  c10::Device device_;
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  // Set during `calculateUniqueTensorIDs`.
  c10::optional<TensorID> id_;
  c10::optional<AllocationID> allocation_id_;
};
87 | |
// A single recorded op input: a tensor, a list of tensors, a non-tensor
// IValue (e.g. a scalar), or nothing (`c10::nullopt_t` for unrecorded slots).
using op_input_t = c10::variant<
    TensorMetadata,
    std::vector<TensorMetadata>,
    c10::IValue,
    c10::nullopt_t>;
93 | |
94 | // ============================================================================ |
95 | // == ExtraFields ============================================================= |
96 | // ============================================================================ |
97 | template <EventType> |
98 | struct ; |
99 | |
100 | struct Result; |
101 | |
// Fields of a torch op event that are known when the op is entered. The
// remaining fields are gathered when the op exits (see the exit callback).
struct TorchOpBasicFields {
  int64_t sequence_number_{0};
  uint64_t forward_tid_{0};
  at::RecordScope scope_{};
  bool is_async_{false};
  int64_t debug_handle_{0};
  std::string name_;

  // Set in the exit callback.
  uint64_t end_tid_{0};
};
113 | |
114 | using jit_stack_t = std::vector<std::string>; |
115 | using jit_modules_t = std::vector<std::string>; |
116 | using = std::unordered_map<std::string, c10::IValue>; |
117 | |
// Start/end CUDA event stubs used for GPU timing when falling back from
// Kineto (see `ProfilerState::KINETO_GPU_FALLBACK` usage below).
struct FallbackPair {
  ProfilerEventStub cuda_event_start_ = nullptr;
  ProfilerEventStub cuda_event_end_ = nullptr;
};
122 | |
123 | template <> |
124 | struct <EventType::TorchOp> : TorchOpBasicFields { |
125 | ( |
126 | TorchOpBasicFields&& f, |
127 | uint64_t correlation_id, |
128 | time_t end_time_ns, |
129 | std::vector<op_input_t>&& inputs, |
130 | jit_stack_t&& jit_stack, |
131 | jit_modules_t&& jit_modules, |
132 | extra_args_t&& , |
133 | FallbackPair&& gpu_fallback, |
134 | bool allow_tf32_cublas, |
135 | std::unique_ptr<perf_counters_t>&& perf_event_counters) |
136 | : TorchOpBasicFields(std::move(f)), |
137 | correlation_id_{correlation_id}, |
138 | end_time_ns_{end_time_ns}, |
139 | inputs_{std::move(inputs)}, |
140 | jit_stack_{std::move(jit_stack)}, |
141 | jit_modules_{std::move(jit_modules)}, |
142 | extra_args_{std::move(extra_args)}, |
143 | gpu_fallback_{std::move(gpu_fallback)}, |
144 | allow_tf32_cublas_{allow_tf32_cublas}, |
145 | perf_event_counters_{std::move(perf_event_counters)} {} |
146 | uint64_t ; |
147 | time_t ; |
148 | std::vector<op_input_t> ; |
149 | jit_stack_t ; |
150 | jit_modules_t ; |
151 | extra_args_t ; |
152 | FallbackPair ; |
153 | bool ; |
154 | std::unique_ptr<perf_counters_t> ; |
155 | }; |
156 | |
157 | template <> |
158 | struct <EventType::Backend> { |
159 | int64_t ; |
160 | int64_t ; |
161 | int64_t debug_handle_; |
162 | at::RecordScope ; |
163 | std::string ; |
164 | std::string ; |
165 | jit_stack_t ; |
166 | jit_modules_t ; |
167 | }; |
168 | |
169 | template <> |
170 | struct <EventType::Vulkan> { |
171 | using = std::pair<approx_time_t, vulkan_id_t>; |
172 | std::string ; |
173 | int64_t {0}; |
174 | // While building the event tree, we want to report a vulkan event's duration |
175 | // as 0 so that its end time doesn't exceed that of its parent cpu op |
176 | bool {false}; |
177 | }; |
178 | |
// Raw record of a memory (de)allocation as captured by `reportMemoryUsage`
// (see `allocations_` below). Deliberately POD — enforced by the
// static_assert — so it can be appended with minimal overhead.
struct RawAllocation {
  torch::profiler::impl::approx_time_t start_time_;
  void* ptr_;
  // NOTE(review): negative values presumably denote frees — TODO confirm.
  int64_t alloc_size_;
  size_t total_allocated_;
  size_t total_reserved_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// For performance.
static_assert(c10::is_pod_v<RawAllocation>, "Non-POD member of RawAllocation." );
191 | |
192 | template <> |
193 | struct <EventType::Allocation> : RawAllocation { |
194 | (const RawAllocation& allocation) : RawAllocation(allocation) {} |
195 | |
196 | c10::Device () const { |
197 | return {device_type_, device_index_}; |
198 | } |
199 | |
200 | c10::optional<TensorID> ; |
201 | c10::optional<AllocationID> ; |
202 | }; |
203 | |
204 | template <> |
205 | struct <EventType::OutOfMemory> { |
206 | torch::profiler::impl::approx_time_t ; |
207 | int64_t ; |
208 | size_t ; |
209 | size_t ; |
210 | c10::DeviceType ; |
211 | c10::DeviceIndex ; |
212 | }; |
213 | |
214 | // For performance. |
215 | static_assert( |
216 | c10::is_pod_v<ExtraFields<EventType::OutOfMemory>>, |
217 | "Non-POD member of ExtraFields<EventType::OutOfMemory>." ); |
218 | |
// Source location of a Python frame: line number plus interned file and
// function names.
struct PyFrameState {
  int line_no_;
  at::StringView filename_;
  at::StringView funcname_;
};
224 | |
// Strong typedef helper: wraps a raw value (typically a `PyObject*`) in a
// distinct type so the different roles below cannot be accidentally
// interchanged, while remaining hashable and convertible back to `T`.
template <typename T, typename Tag>
using strong_t = strong::
    type<T, Tag, strong::regular, strong::convertible_to<T>, strong::hashable>;

using PyModuleSelf = strong_t<PyObject*, struct PyModuleSelf_>;
using PyModuleCls = strong_t<PyObject*, struct PyModuleCls_>;
using PyMethod = strong_t</*PyMethodDef*/ void*, struct PyMethod_>;
using PyOptimizerSelf = strong_t<PyObject*, struct PyOptSelf_>;
using PyOptimizerCls = strong_t<PyObject*, struct PyOptimizer_>;
234 | |
// Metadata for an `nn.Module` call observed by the Python tracer.
struct NNModuleInfo {
  struct ParameterInfo {
    std::string name_;
    TensorMetadata metadata_;
    // Present only when a gradient tensor was observed — TODO confirm.
    c10::optional<TensorMetadata> grad_metadata_;
  };

  PyModuleSelf self_;
  PyModuleCls cls_;
  at::StringView cls_name_;

  std::vector<ParameterInfo> parameters_;
  // Indicates that `self_` is the kth instance of `cls_` observed.
  size_t id_{std::numeric_limits<size_t>::max()};
};
250 | |
// Metadata for an optimizer step observed by the Python tracer; per-parameter
// state entries are recorded as (name, metadata) pairs.
struct OptimizerInfo {
  struct ParameterInfo {
    TensorMetadata metadata_;
    // Present only when a gradient tensor was observed — TODO confirm.
    c10::optional<TensorMetadata> grad_metadata_;
    std::vector<std::pair<std::string, TensorMetadata>> state_;
  };

  PyOptimizerSelf self_;
  PyOptimizerCls cls_;
  at::StringView cls_name_;

  std::vector<ParameterInfo> parameters_;
};
264 | |
265 | struct { |
266 | (time_t end_time_ns, size_t python_tid, PyFrameState caller) |
267 | : end_time_ns_{end_time_ns}, |
268 | python_tid_{python_tid}, |
269 | caller_{std::move(caller)} {} |
270 | |
271 | time_t ; |
272 | size_t ; |
273 | PyFrameState ; |
274 | |
275 | // kth python event observed. (Used by TensorBoard) |
276 | size_t {std::numeric_limits<size_t>::max()}; |
277 | }; |
278 | |
279 | template <> |
280 | struct <EventType::PyCall> : public PyExtraFieldsBase { |
281 | struct { |
282 | PyFrameState ; |
283 | c10::optional<NNModuleInfo> ; |
284 | c10::optional<OptimizerInfo> ; |
285 | }; |
286 | |
287 | ( |
288 | time_t end_time_ns, |
289 | size_t python_tid, |
290 | PyFrameState caller, |
291 | args_t args) |
292 | : PyExtraFieldsBase(end_time_ns, python_tid, caller), |
293 | callsite_{args.frame_state_}, |
294 | module_{args.module_info_}, |
295 | optimizer_{args.optimizer_info_} {} |
296 | |
297 | PyFrameState ; |
298 | c10::optional<NNModuleInfo> ; |
299 | c10::optional<OptimizerInfo> ; |
300 | }; |
301 | |
302 | template <> |
303 | struct <EventType::PyCCall> : public PyExtraFieldsBase { |
304 | using = at::StringView; |
305 | |
306 | ( |
307 | time_t end_time_ns, |
308 | size_t python_tid, |
309 | PyFrameState caller, |
310 | args_t args) |
311 | : PyExtraFieldsBase(end_time_ns, python_tid, caller), |
312 | function_name_{std::move(args)} {} |
313 | |
314 | at::StringView ; |
315 | }; |
316 | |
317 | template <> |
318 | struct <EventType::Kineto> { |
319 | // Mirrors `libkineto::GenericTraceActivity::Flow`. This information is used |
320 | // during post processing to properly embed Kineto events into the broader |
321 | // profiler tree structure. End users are not generally expected to use these |
322 | // fields directly, but they are available for debugging. |
323 | struct { |
324 | uint32_t {0}; |
325 | uint32_t {0}; |
326 | uint32_t {0}; |
327 | }; |
328 | |
329 | std::string ; |
330 | int64_t {0}; |
331 | uint64_t {0}; |
332 | libkineto::ActivityType ; |
333 | Flow ; |
334 | std::weak_ptr<Result> {}; |
335 | }; |
336 | |
337 | struct TORCH_API Result : public std::enable_shared_from_this<Result> { |
338 | template <typename... Args> |
339 | [[nodiscard]] static std::shared_ptr<Result> create(Args... args) { |
340 | return std::shared_ptr<Result>(new Result(std::forward<Args>(args)...)); |
341 | } |
342 | |
343 | template <typename T> |
344 | decltype(auto) visit(T&& visitor) { |
345 | return c10::visit(std::forward<T>(visitor), extra_fields_); |
346 | } |
347 | |
348 | template <typename T> |
349 | decltype(auto) visit(T&& visitor) const { |
350 | return c10::visit(std::forward<T>(visitor), extra_fields_); |
351 | } |
352 | |
353 | template <typename T, typename Fn> |
354 | void visit_if_base(Fn&& fn) const { |
355 | visit([&](const auto& ) { |
356 | using = typename std::remove_cv< |
357 | typename std::remove_reference<decltype(extra_fields)>::type>::type; |
358 | |
359 | c10::guts::if_constexpr<std::is_base_of<T, extra_fields_t>::value>( |
360 | [&](auto _) { fn(_(extra_fields)); }); |
361 | }); |
362 | } |
363 | |
364 | EventType tag() const { |
365 | return visit([](const auto& i) { return deduceTag(i); }); |
366 | } |
367 | |
368 | std::string name() const; |
369 | libkineto::ActivityType kinetoType() const; |
370 | uint64_t correlationID() const; |
371 | int64_t endTimeNS() const; |
372 | uint64_t endTID() const; |
373 | c10::DeviceType deviceType() const; |
374 | |
375 | int64_t start_time_ns_; |
376 | uint64_t start_tid_; |
377 | kineto::DeviceAndResource kineto_info_; |
378 | c10::variant< |
379 | ExtraFields<EventType::TorchOp>, |
380 | ExtraFields<EventType::Backend>, |
381 | ExtraFields<EventType::Vulkan>, |
382 | ExtraFields<EventType::Allocation>, |
383 | ExtraFields<EventType::OutOfMemory>, |
384 | ExtraFields<EventType::PyCall>, |
385 | ExtraFields<EventType::PyCCall>, |
386 | ExtraFields<EventType::Kineto>> |
387 | ; |
388 | |
389 | std::weak_ptr<Result> parent_; |
390 | std::vector<std::shared_ptr<Result>> children_; |
391 | bool finished_{false}; |
392 | |
393 | const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr}; |
394 | |
395 | private: |
396 | template <EventType E> |
397 | Result( |
398 | int64_t start_time_ns, |
399 | uint64_t start_tid, |
400 | kineto::DeviceAndResource kineto_info, |
401 | ExtraFields<E>&& ) |
402 | : start_time_ns_{start_time_ns}, |
403 | start_tid_{start_tid}, |
404 | kineto_info_{kineto_info}, |
405 | extra_fields_{std::move(extra_fields)} {} |
406 | |
407 | template <EventType E> |
408 | static EventType (const ExtraFields<E>&) { |
409 | return E; |
410 | } |
411 | }; |
412 | |
// Context object threaded from a RecordFunction's enter callback to its exit
// callback; carries the in-flight op event and, optionally, the CUDA-event
// fallback pair.
struct KinetoObserverContext : public at::ObserverContext {
  struct Event {
    TorchOpBasicFields basic_fields_;
    approx_time_t start_time_;

    // Set in the exit callback.
    approx_time_t end_time_{std::numeric_limits<approx_time_t>::min()};

    bool allow_tf32_cublas_;
    std::unique_ptr<perf_counters_t> counters_;
  };

  explicit KinetoObserverContext(Event* event) : event_{event} {}

  // Non-owning; presumably points into ThreadLocalSubqueue storage — verify.
  Event* event_;
  // Non-owning; set only when GPU fallback timing is active.
  FallbackPair* fallback_{nullptr};
};
430 | |
constexpr int IO_ENCODER_DEFAULT_BLOCK_SIZE = 1024;

// InputOutputEncoder
// Stores each op_events' shapes and dtypes into a contiguous AppendOnlyList
// so that we no longer create vectors for shapes and dtypes on every op.
// Those vectors can be created during post-processing.
class InputOutputEncoder final {
 public:
  // Record tagged metadata for each input value of an op.
  void push(c10::ArrayRef<const c10::IValue> values);

  // Used during post-processing to create vectors for shapes and dtype.
  auto getNextShapesAndDtypes();

  void clear();

 private:
  // One tag per recorded value; TERMINATOR presumably ends an op's run of
  // values — confirm against the .cpp.
  enum class Tag {
    Tensor = 0,
    UndefinedTensor,
    TensorListBegin, // TODO: generalize to other lists.
    Scalar,
    Other,
    TERMINATOR
  };

  void push(const at::Tensor& t);

  AppendOnlyList<Tag, IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_;
  AppendOnlyList<RawTensorMetadata, IO_ENCODER_DEFAULT_BLOCK_SIZE>
      tensor_metadata_;
  AppendOnlyList<int64_t, IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_sizes_strides_;
  AppendOnlyList<c10::IValue, IO_ENCODER_DEFAULT_BLOCK_SIZE> ivalues_;
};
464 | |
// Shorthand for the Linux `perf`-based hardware counter profiler.
using perf_profiler_t = torch::profiler::impl::linux_perf::PerfProfiler;
466 | |
467 | class TORCH_API ThreadLocalSubqueue { |
468 | public: |
469 | ThreadLocalSubqueue(const uint64_t tid, const ProfilerConfig& config); |
470 | |
471 | std::unique_ptr<KinetoObserverContext> begin_op(const at::RecordFunction& fn); |
472 | |
473 | template <class... Args> |
474 | void emplace_backend_event(Args&&... args) { |
475 | backend_events_.emplace_back(std::forward<Args>(args)...); |
476 | } |
477 | |
478 | template <class... Args> |
479 | void emplace_vulkan_event(Args&&... args) { |
480 | vulkan_events_.emplace_back(std::forward<Args>(args)...); |
481 | } |
482 | |
483 | template <class... Args> |
484 | void emplace_allocation_event(Args&&... args) { |
485 | allocations_.emplace_back(std::forward<Args>(args)...); |
486 | } |
487 | |
488 | template <class... Args> |
489 | void emplace_ooms_event(Args&&... args) { |
490 | ooms_.emplace_back(std::forward<Args>(args)...); |
491 | } |
492 | |
493 | template <class... Args> |
494 | void emplace_py_call(Args&&... args) { |
495 | py_calls_.emplace_back(std::forward<Args>(args)...); |
496 | } |
497 | |
498 | uint64_t tid() const { |
499 | return tid_; |
500 | } |
501 | |
502 | const kineto::DeviceAndResource& kineto_info() const { |
503 | return kineto_info_; |
504 | } |
505 | |
506 | inline void disable_perf_profiler(perf_counters_t& counters) const { |
507 | perf_profiler_->Disable(counters); |
508 | } |
509 | |
510 | private: |
511 | uint64_t tid_; |
512 | ProfilerConfig config_; |
513 | kineto::DeviceAndResource kineto_info_; |
514 | std::unique_ptr<perf_profiler_t> perf_profiler_; |
515 | |
516 | friend class RecordQueue; |
517 | // See `containers.h` for block size benchmarks. |
518 | static constexpr size_t BlockSize = 512; |
519 | |
520 | struct TorchOpStorage { |
521 | // NB: This is a destructive operation. |
522 | void materialize( |
523 | std::vector<std::shared_ptr<Result>>& out, |
524 | const std::function<time_t(approx_time_t)> time_converter, |
525 | const uint64_t tid, |
526 | const kineto::DeviceAndResource& kineto_info); |
527 | |
528 | template <typename T, size_t ChunkSize> |
529 | class EventBlock : public std::array<T, ChunkSize> { |
530 | public: |
531 | EventBlock(); |
532 | uint64_t correlation_id(const T* ptr) const; |
533 | |
534 | private: |
535 | uint64_t id_start_; |
536 | }; |
537 | |
538 | using event_t = KinetoObserverContext::Event; |
539 | class OpList : public AppendOnlyList<event_t, BlockSize, EventBlock> { |
540 | public: |
541 | template <class... Args> |
542 | std::pair<event_t*, uint64_t> emplace_back(Args&&... args); |
543 | static uint64_t correlationID(const OpList::Iterator& e); |
544 | } op_events_; |
545 | |
546 | // report_input_shapes |
547 | InputOutputEncoder inputs_outputs_; |
548 | |
549 | // with_stack (JIT) |
550 | AppendOnlyList<jit_stack_t, BlockSize> jit_stack_; |
551 | |
552 | // with_modules |
553 | AppendOnlyList<jit_modules_t, BlockSize> jit_modules_; |
554 | |
555 | // with_flops |
556 | AppendOnlyList<extra_args_t, BlockSize> ; |
557 | |
558 | // ProfilerState::KINETO_GPU_FALLBACK |
559 | AppendOnlyList<FallbackPair, BlockSize> gpu_fallback_; |
560 | } torch_ops_; |
561 | |
562 | // reportBackendEventToActiveKinetoProfiler |
563 | AppendOnlyList<ExtraFields<EventType::Backend>, BlockSize> backend_events_; |
564 | |
565 | // _reportVulkanEventToProfiler |
566 | AppendOnlyList<ExtraFields<EventType::Vulkan>::raw_event_t, BlockSize> |
567 | vulkan_events_; |
568 | |
569 | // reportMemoryUsage |
570 | AppendOnlyList<RawAllocation, BlockSize> allocations_; |
571 | |
572 | // reportOOMs |
573 | AppendOnlyList<ExtraFields<EventType::OutOfMemory>, BlockSize> ooms_; |
574 | |
575 | // with_stack (Python) |
576 | AppendOnlyList<std::pair<python_tracer::TraceKey, approx_time_t>, BlockSize> |
577 | py_calls_; |
578 | }; |
579 | |
// Top-level collection point for a profiling session. Hands each thread its
// own `ThreadLocalSubqueue` (creation guarded by `sub_queue_mutex_`) and
// merges everything into a single sorted record set in `getRecords`.
class TORCH_API RecordQueue {
 public:
  RecordQueue(const ProfilerConfig& config, std::set<ActivityType> activities);

  // Whether the Python tracer is active for this session.
  bool tracePython() const;
  // Returns the calling thread's subqueue, creating it on first use.
  ThreadLocalSubqueue* getSubqueue();
  void stop();

  // NB: This is a destructive operation.
  std::pair<
      std::vector<std::shared_ptr<Result>>,
      std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>>
  getRecords(
      std::function<time_t(approx_time_t)> time_converter,
      uint64_t start_time_us,
      uint64_t end_time_us);

 private:
  uint32_t id_;
  ProfilerConfig config_;
  std::set<ActivityType> activities_;
  ska::flat_hash_map<uint64_t, std::unique_ptr<ThreadLocalSubqueue>>
      sub_queues_;
  std::mutex sub_queue_mutex_;  // Guards `sub_queues_` map mutation.
  std::unique_ptr<python_tracer::PythonTracerBase> python_tracer_;
};
606 | |
607 | } // namespace impl |
608 | } // namespace profiler |
609 | } // namespace torch |
610 | |