#pragma once

#include <array>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>

#include <ATen/Context.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/macros/Macros.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/strong_type.h>
#include <c10/util/variant.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/data_flow.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/python_tracer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
#include <torch/csrc/utils/python_stub.h>

namespace torch {
namespace profiler {
namespace impl {

enum class EventType : uint8_t {
  TorchOp = 0,
  Backend,
  Vulkan,
  Allocation,
  OutOfMemory,
  PyCall,
  PyCCall,
  Kineto
};

// ============================================================================
// == Value (Tensor, Scalar) summary ==========================================
// ============================================================================
struct TORCH_API RawTensorMetadataBase {
  RawTensorMetadataBase() = default;
  explicit RawTensorMetadataBase(const at::Tensor& t);

  StorageImplData data_;
  c10::ScalarType dtype_;
  c10::Layout layout_;
  uint32_t dim_;
};

// Collected during profiling.
struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
  RawTensorMetadata() = default;
  RawTensorMetadata(const RawTensorMetadata&) = default;
  explicit RawTensorMetadata(const at::Tensor& t);

  // Wrap `weak_self_` in `c10::optional` and split device into components to
  // keep the struct default constructible (which the std::array initializer
  // needs).
  c10::optional<WeakTensor> weak_self_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// Used during post processing.
struct TORCH_API TensorMetadata : public RawTensorMetadataBase {
  TensorMetadata(
      const RawTensorMetadata& r,
      std::vector<int64_t> sizes,
      std::vector<int64_t> strides);

  TensorImplAddress impl() const {
    return weak_self_.get();
  }

  WeakTensor weak_self_;
  c10::Device device_;
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  // Set during `calculateUniqueTensorIDs`.
  c10::optional<TensorID> id_;
  c10::optional<AllocationID> allocation_id_;
};

using op_input_t = c10::variant<
    TensorMetadata,
    std::vector<TensorMetadata>,
    c10::IValue,
    c10::nullopt_t>;
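
// Illustrative sketch (not part of this header's API): post-processing code
// can dispatch on an `op_input_t` with `c10::visit`, e.g. to recover shape
// information for Tensor arguments while ignoring scalars and unrecorded
// inputs. `c10::overloaded` (from <c10/util/overloaded.h>) and the lambda
// bodies are assumptions made for the sake of the example.
//
//   c10::visit(
//       c10::overloaded(
//           [](const TensorMetadata& t) { /* use t.sizes_ / t.strides_ */ },
//           [](const std::vector<TensorMetadata>& list) { /* tensor list */ },
//           [](const c10::IValue& v) { /* concrete (scalar) value */ },
//           [](const c10::nullopt_t&) { /* input was not recorded */ }),
//       input);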

// ============================================================================
// == ExtraFields =============================================================
// ============================================================================
template <EventType>
struct ExtraFields;

struct Result;

struct TorchOpBasicFields {
  int64_t sequence_number_{0};
  uint64_t forward_tid_{0};
  at::RecordScope scope_{};
  bool is_async_{false};
  int64_t debug_handle_{0};
  std::string name_;

  // Set in the exit callback.
  uint64_t end_tid_{0};
};

using jit_stack_t = std::vector<std::string>;
using jit_modules_t = std::vector<std::string>;
using extra_args_t = std::unordered_map<std::string, c10::IValue>;

struct FallbackPair {
  ProfilerEventStub cuda_event_start_ = nullptr;
  ProfilerEventStub cuda_event_end_ = nullptr;
};

template <>
struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
  ExtraFields(
      TorchOpBasicFields&& f,
      uint64_t correlation_id,
      time_t end_time_ns,
      std::vector<op_input_t>&& inputs,
      jit_stack_t&& jit_stack,
      jit_modules_t&& jit_modules,
      extra_args_t&& extra_args,
      FallbackPair&& gpu_fallback,
      bool allow_tf32_cublas,
      std::unique_ptr<perf_counters_t>&& perf_event_counters)
      : TorchOpBasicFields(std::move(f)),
        correlation_id_{correlation_id},
        end_time_ns_{end_time_ns},
        inputs_{std::move(inputs)},
        jit_stack_{std::move(jit_stack)},
        jit_modules_{std::move(jit_modules)},
        extra_args_{std::move(extra_args)},
        gpu_fallback_{std::move(gpu_fallback)},
        allow_tf32_cublas_{allow_tf32_cublas},
        perf_event_counters_{std::move(perf_event_counters)} {}
  uint64_t correlation_id_;
  time_t end_time_ns_;
  std::vector<op_input_t> inputs_;
  jit_stack_t jit_stack_;
  jit_modules_t jit_modules_;
  extra_args_t extra_args_;
  FallbackPair gpu_fallback_;
  bool allow_tf32_cublas_;
  std::unique_ptr<perf_counters_t> perf_event_counters_;
};

template <>
struct ExtraFields<EventType::Backend> {
  int64_t start_time_us_;
  int64_t end_time_us_;
  int64_t debug_handle_;
  at::RecordScope scope_;
  std::string name_;
  std::string backend_;
  jit_stack_t jit_stack_;
  jit_modules_t jit_modules_;
};

template <>
struct ExtraFields<EventType::Vulkan> {
  using raw_event_t = std::pair<approx_time_t, vulkan_id_t>;
  std::string name_;
  int64_t duration_ns_{0};
  // While building the event tree, we want to report a Vulkan event's duration
  // as 0 so that its end time doesn't exceed that of its parent CPU op.
  bool in_tree_building_{false};
};

struct RawAllocation {
  torch::profiler::impl::approx_time_t start_time_;
  void* ptr_;
  int64_t alloc_size_;
  size_t total_allocated_;
  size_t total_reserved_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// For performance.
static_assert(c10::is_pod_v<RawAllocation>, "Non-POD member of RawAllocation.");

template <>
struct ExtraFields<EventType::Allocation> : RawAllocation {
  ExtraFields(const RawAllocation& allocation) : RawAllocation(allocation) {}

  c10::Device device() const {
    return {device_type_, device_index_};
  }

  c10::optional<TensorID> id_;
  c10::optional<AllocationID> allocation_id_;
};

template <>
struct ExtraFields<EventType::OutOfMemory> {
  torch::profiler::impl::approx_time_t start_time_;
  int64_t alloc_size_;
  size_t total_allocated_;
  size_t total_reserved_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// For performance.
static_assert(
    c10::is_pod_v<ExtraFields<EventType::OutOfMemory>>,
    "Non-POD member of ExtraFields<EventType::OutOfMemory>.");

struct PyFrameState {
  int line_no_;
  at::StringView filename_;
  at::StringView funcname_;
};

template <typename T, typename Tag>
using strong_t = strong::
    type<T, Tag, strong::regular, strong::convertible_to<T>, strong::hashable>;

using PyModuleSelf = strong_t<PyObject*, struct PyModuleSelf_>;
using PyModuleCls = strong_t<PyObject*, struct PyModuleCls_>;
using PyMethod = strong_t</*PyMethodDef*/ void*, struct PyMethod_>;
using PyOptimizerSelf = strong_t<PyObject*, struct PyOptSelf_>;
using PyOptimizerCls = strong_t<PyObject*, struct PyOptimizer_>;

struct NNModuleInfo {
  struct ParameterInfo {
    std::string name_;
    TensorMetadata metadata_;
    c10::optional<TensorMetadata> grad_metadata_;
  };

  PyModuleSelf self_;
  PyModuleCls cls_;
  at::StringView cls_name_;

  std::vector<ParameterInfo> parameters_;
  // Indicates that `self_` is the kth instance of `cls_` observed.
  size_t id_{std::numeric_limits<size_t>::max()};
};

struct OptimizerInfo {
  struct ParameterInfo {
    TensorMetadata metadata_;
    c10::optional<TensorMetadata> grad_metadata_;
    std::vector<std::pair<std::string, TensorMetadata>> state_;
  };

  PyOptimizerSelf self_;
  PyOptimizerCls cls_;
  at::StringView cls_name_;

  std::vector<ParameterInfo> parameters_;
};

struct PyExtraFieldsBase {
  PyExtraFieldsBase(time_t end_time_ns, size_t python_tid, PyFrameState caller)
      : end_time_ns_{end_time_ns},
        python_tid_{python_tid},
        caller_{std::move(caller)} {}

  time_t end_time_ns_;
  size_t python_tid_;
  PyFrameState caller_;

  // kth python event observed. (Used by TensorBoard)
  size_t id_{std::numeric_limits<size_t>::max()};
};

template <>
struct ExtraFields<EventType::PyCall> : public PyExtraFieldsBase {
  struct args_t {
    PyFrameState frame_state_;
    c10::optional<NNModuleInfo> module_info_;
    c10::optional<OptimizerInfo> optimizer_info_;
  };

  ExtraFields(
      time_t end_time_ns,
      size_t python_tid,
      PyFrameState caller,
      args_t args)
      : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
        callsite_{args.frame_state_},
        module_{args.module_info_},
        optimizer_{args.optimizer_info_} {}

  PyFrameState callsite_;
  c10::optional<NNModuleInfo> module_;
  c10::optional<OptimizerInfo> optimizer_;
};

template <>
struct ExtraFields<EventType::PyCCall> : public PyExtraFieldsBase {
  using args_t = at::StringView;

  ExtraFields(
      time_t end_time_ns,
      size_t python_tid,
      PyFrameState caller,
      args_t args)
      : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
        function_name_{std::move(args)} {}

  at::StringView function_name_;
};

template <>
struct ExtraFields<EventType::Kineto> {
  // Mirrors `libkineto::GenericTraceActivity::Flow`. This information is used
  // during post processing to properly embed Kineto events into the broader
  // profiler tree structure. End users are not generally expected to use these
  // fields directly, but they are available for debugging.
  struct Flow {
    uint32_t id{0};
    uint32_t type{0};
    uint32_t start{0};
  };

  std::string name_;
  int64_t duration_us_{0};
  uint64_t correlation_id_{0};
  libkineto::ActivityType activity_type_;
  Flow flow;
  std::weak_ptr<Result> linked_activity_{};
};

struct TORCH_API Result : public std::enable_shared_from_this<Result> {
  template <typename... Args>
  [[nodiscard]] static std::shared_ptr<Result> create(Args... args) {
    return std::shared_ptr<Result>(new Result(std::forward<Args>(args)...));
  }

  template <typename T>
  decltype(auto) visit(T&& visitor) {
    return c10::visit(std::forward<T>(visitor), extra_fields_);
  }

  template <typename T>
  decltype(auto) visit(T&& visitor) const {
    return c10::visit(std::forward<T>(visitor), extra_fields_);
  }

  template <typename T, typename Fn>
  void visit_if_base(Fn&& fn) const {
    visit([&](const auto& extra_fields) {
      using extra_fields_t = typename std::remove_cv<
          typename std::remove_reference<decltype(extra_fields)>::type>::type;

      c10::guts::if_constexpr<std::is_base_of<T, extra_fields_t>::value>(
          [&](auto _) { fn(_(extra_fields)); });
    });
  }

  EventType tag() const {
    return visit([](const auto& i) { return deduceTag(i); });
  }

  std::string name() const;
  libkineto::ActivityType kinetoType() const;
  uint64_t correlationID() const;
  int64_t endTimeNS() const;
  uint64_t endTID() const;
  c10::DeviceType deviceType() const;

  int64_t start_time_ns_;
  uint64_t start_tid_;
  kineto::DeviceAndResource kineto_info_;
  c10::variant<
      ExtraFields<EventType::TorchOp>,
      ExtraFields<EventType::Backend>,
      ExtraFields<EventType::Vulkan>,
      ExtraFields<EventType::Allocation>,
      ExtraFields<EventType::OutOfMemory>,
      ExtraFields<EventType::PyCall>,
      ExtraFields<EventType::PyCCall>,
      ExtraFields<EventType::Kineto>>
      extra_fields_;

  std::weak_ptr<Result> parent_;
  std::vector<std::shared_ptr<Result>> children_;
  bool finished_{false};

  const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr};

 private:
  template <EventType E>
  Result(
      int64_t start_time_ns,
      uint64_t start_tid,
      kineto::DeviceAndResource kineto_info,
      ExtraFields<E>&& extra_fields)
      : start_time_ns_{start_time_ns},
        start_tid_{start_tid},
        kineto_info_{kineto_info},
        extra_fields_{std::move(extra_fields)} {}

  template <EventType E>
  static EventType deduceTag(const ExtraFields<E>&) {
    return E;
  }
};
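
// Illustrative sketch (not part of this header's API): consumers typically
// walk the `Result` tree and use `visit` / `visit_if_base` to act on specific
// event types. `handle` and the lambda bodies below are hypothetical;
// `c10::overloaded` lives in <c10/util/overloaded.h>.
//
//   void handle(const std::shared_ptr<Result>& r) {
//     r->visit_if_base<PyExtraFieldsBase>(
//         [](const auto& py) { /* shared handling for PyCall / PyCCall */ });
//     r->visit(c10::overloaded(
//         [](const ExtraFields<EventType::TorchOp>& op) { /* op event */ },
//         [](const auto&) { /* all other event types */ }));
//     for (const auto& child : r->children_) {
//       handle(child);
//     }
//   }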

struct KinetoObserverContext : public at::ObserverContext {
  struct Event {
    TorchOpBasicFields basic_fields_;
    approx_time_t start_time_;

    // Set in the exit callback.
    approx_time_t end_time_{std::numeric_limits<approx_time_t>::min()};

    bool allow_tf32_cublas_;
    std::unique_ptr<perf_counters_t> counters_;
  };

  explicit KinetoObserverContext(Event* event) : event_{event} {}

  Event* event_;
  FallbackPair* fallback_{nullptr};
};

constexpr int IO_ENCODER_DEFAULT_BLOCK_SIZE = 1024;

// InputOutputEncoder
// Stores each op event's shapes and dtypes in contiguous AppendOnlyLists so
// that we no longer create per-op vectors for shapes and dtypes during
// collection. Those vectors are created during post-processing instead.
class InputOutputEncoder final {
 public:
  void push(c10::ArrayRef<const c10::IValue> values);

  // Used during post-processing to create vectors for shapes and dtypes.
  auto getNextShapesAndDtypes();

  void clear();

 private:
  enum class Tag {
    Tensor = 0,
    UndefinedTensor,
    TensorListBegin, // TODO: generalize to other lists.
    Scalar,
    Other,
    TERMINATOR
  };

  void push(const at::Tensor& t);

  AppendOnlyList<Tag, IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_;
  AppendOnlyList<RawTensorMetadata, IO_ENCODER_DEFAULT_BLOCK_SIZE>
      tensor_metadata_;
  AppendOnlyList<int64_t, IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_sizes_strides_;
  AppendOnlyList<c10::IValue, IO_ENCODER_DEFAULT_BLOCK_SIZE> ivalues_;
};
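
// Illustrative sketch of the flattened encoding (a simplification; the exact
// layout is an implementation detail of the .cpp file): for an op taking
// (Tensor, Scalar), `push` appends roughly
//
//   tags_:                 [Tensor, Scalar, TERMINATOR]
//   tensor_metadata_:      [RawTensorMetadata for the Tensor]
//   tensor_sizes_strides_: [sizes..., strides... of the Tensor]
//   ivalues_:              [the Scalar]
//
// and `getNextShapesAndDtypes()` replays the tag stream during
// post-processing to rebuild per-op shape / dtype vectors.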

using perf_profiler_t = torch::profiler::impl::linux_perf::PerfProfiler;

class TORCH_API ThreadLocalSubqueue {
 public:
  ThreadLocalSubqueue(const uint64_t tid, const ProfilerConfig& config);

  std::unique_ptr<KinetoObserverContext> begin_op(const at::RecordFunction& fn);

  template <class... Args>
  void emplace_backend_event(Args&&... args) {
    backend_events_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_vulkan_event(Args&&... args) {
    vulkan_events_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_allocation_event(Args&&... args) {
    allocations_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_ooms_event(Args&&... args) {
    ooms_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_py_call(Args&&... args) {
    py_calls_.emplace_back(std::forward<Args>(args)...);
  }

  uint64_t tid() const {
    return tid_;
  }

  const kineto::DeviceAndResource& kineto_info() const {
    return kineto_info_;
  }

  inline void disable_perf_profiler(perf_counters_t& counters) const {
    perf_profiler_->Disable(counters);
  }

 private:
  uint64_t tid_;
  ProfilerConfig config_;
  kineto::DeviceAndResource kineto_info_;
  std::unique_ptr<perf_profiler_t> perf_profiler_;

  friend class RecordQueue;
  // See `containers.h` for block size benchmarks.
  static constexpr size_t BlockSize = 512;

  struct TorchOpStorage {
    // NB: This is a destructive operation.
    void materialize(
        std::vector<std::shared_ptr<Result>>& out,
        const std::function<time_t(approx_time_t)> time_converter,
        const uint64_t tid,
        const kineto::DeviceAndResource& kineto_info);

    template <typename T, size_t ChunkSize>
    class EventBlock : public std::array<T, ChunkSize> {
     public:
      EventBlock();
      uint64_t correlation_id(const T* ptr) const;

     private:
      uint64_t id_start_;
    };

    using event_t = KinetoObserverContext::Event;
    class OpList : public AppendOnlyList<event_t, BlockSize, EventBlock> {
     public:
      template <class... Args>
      std::pair<event_t*, uint64_t> emplace_back(Args&&... args);
      static uint64_t correlationID(const OpList::Iterator& e);
    } op_events_;

    // report_input_shapes
    InputOutputEncoder inputs_outputs_;

    // with_stack (JIT)
    AppendOnlyList<jit_stack_t, BlockSize> jit_stack_;

    // with_modules
    AppendOnlyList<jit_modules_t, BlockSize> jit_modules_;

    // with_flops
    AppendOnlyList<extra_args_t, BlockSize> extra_args_;

    // ProfilerState::KINETO_GPU_FALLBACK
    AppendOnlyList<FallbackPair, BlockSize> gpu_fallback_;
  } torch_ops_;

  // reportBackendEventToActiveKinetoProfiler
  AppendOnlyList<ExtraFields<EventType::Backend>, BlockSize> backend_events_;

  // _reportVulkanEventToProfiler
  AppendOnlyList<ExtraFields<EventType::Vulkan>::raw_event_t, BlockSize>
      vulkan_events_;

  // reportMemoryUsage
  AppendOnlyList<RawAllocation, BlockSize> allocations_;

  // reportOOMs
  AppendOnlyList<ExtraFields<EventType::OutOfMemory>, BlockSize> ooms_;

  // with_stack (Python)
  AppendOnlyList<std::pair<python_tracer::TraceKey, approx_time_t>, BlockSize>
      py_calls_;
};
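
// Illustrative sketch (not part of this header's API) of how a profiling
// callback on a given thread records into its subqueue; `queue` is a
// RecordQueue and `fn` an at::RecordFunction, both assumed for the example.
//
//   ThreadLocalSubqueue* subqueue = queue.getSubqueue();
//   auto ctx = subqueue->begin_op(fn);              // enter callback
//   /* ... the op runs ... */
//   ctx->event_->end_time_ = /* approximate timestamp */;  // exit callback
//
// Out-of-band events (backend, Vulkan, allocation, OOM, Python) are recorded
// through the corresponding `emplace_*` methods instead.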

class TORCH_API RecordQueue {
 public:
  RecordQueue(const ProfilerConfig& config, std::set<ActivityType> activities);

  bool tracePython() const;
  ThreadLocalSubqueue* getSubqueue();
  void stop();

  // NB: This is a destructive operation.
  std::pair<
      std::vector<std::shared_ptr<Result>>,
      std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>>
  getRecords(
      std::function<time_t(approx_time_t)> time_converter,
      uint64_t start_time_us,
      uint64_t end_time_us);

 private:
  uint32_t id_;
  ProfilerConfig config_;
  std::set<ActivityType> activities_;
  ska::flat_hash_map<uint64_t, std::unique_ptr<ThreadLocalSubqueue>>
      sub_queues_;
  std::mutex sub_queue_mutex_;
  std::unique_ptr<python_tracer::PythonTracerBase> python_tracer_;
};
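
// Illustrative sketch of the collection lifecycle (simplified; the actual
// driver lives in the profiler implementation, not in this header):
//
//   RecordQueue queue(config, activities);
//   // ... each thread records via queue.getSubqueue() while profiling ...
//   queue.stop();
//   auto records_and_trace = queue.getRecords(
//       /*time_converter=*/converter, start_time_us, end_time_us);
//   // NB: `getRecords` is destructive; the queued events are consumed.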

} // namespace impl
} // namespace profiler
} // namespace torch