#pragma once

#include <cstdint>
#include <forward_list>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

#include <torch/csrc/Export.h>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
17
18namespace torch {
19namespace autograd {
20
21struct Node;
22
23namespace profiler {
24
// Kind of a LegacyEvent; kindStr() maps each value to its string form.
enum class C10_API_ENUM EventKind : uint16_t {
  Mark, // instantaneous marker ("mark")
  PushRange, // start of a profiled range ("push")
  PopRange, // end of a profiled range ("pop")
  MemoryAlloc, // memory allocation/free record ("memory_alloc")
};
31
32// To be deprecated, once we switch to Kineto profiling
33struct TORCH_API LegacyEvent {
34 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
35 LegacyEvent(
36 EventKind kind,
37 at::StringView name,
38 uint16_t thread_id,
39 bool record_cuda,
40 at::RecordFunctionHandle handle = 0,
41 std::vector<std::vector<int64_t>>&& shapes = {},
42 int node_id = -1,
43 bool is_async = false)
44 : name_(std::move(name)),
45 kind_(kind),
46 thread_id_(thread_id),
47 handle_(handle),
48 shapes_(shapes),
49 node_id_(node_id),
50 is_async_(is_async) {
51 record(record_cuda);
52 }
53
54 // Constructor to be used in conjunction with LegacyEvent::fromIValue.
55 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
56 LegacyEvent(
57 EventKind kind,
58 at::StringView name,
59 uint16_t thread_id,
60 at::RecordFunctionHandle handle,
61 std::vector<std::vector<int64_t>>&& shapes,
62 int node_id,
63 bool is_remote,
64 int64_t cpu_memory_usage,
65 int64_t cpu_ns,
66 bool cuda_recorded,
67 int64_t cuda_memory_usage = 0,
68 int device = -1,
69 double cuda_us = -1)
70 : cpu_ns_(cpu_ns),
71 name_(std::move(name)),
72 kind_(kind),
73 thread_id_(thread_id),
74 handle_(handle),
75 shapes_(shapes),
76 cpu_memory_usage_(cpu_memory_usage),
77 cuda_memory_usage_(cuda_memory_usage),
78 device_(device),
79 node_id_(node_id),
80 is_remote_(is_remote),
81 cuda_us_(cuda_us) {
82 // Sanity check values that were deserialized
83 TORCH_INTERNAL_ASSERT(cpu_ns_ > 0);
84 if (cuda_recorded) {
85 TORCH_INTERNAL_ASSERT(device_ >= 0);
86 TORCH_INTERNAL_ASSERT(cuda_us_ >= 0);
87 }
88 }
89
90 // Returns IValues corresponding to event structure, to be used for
91 // serialization.
92 at::IValue toIValue() const;
93
94 // Reconstructs an event from IValues given by toIValue.
95 static LegacyEvent fromIValue(const at::IValue& eventIValue);
96
97 void record(bool record_cuda);
98
99 std::string kindStr() const {
100 switch (kind_) {
101 case EventKind::Mark:
102 return "mark";
103 case EventKind::PushRange:
104 return "push";
105 case EventKind::PopRange:
106 return "pop";
107 case EventKind::MemoryAlloc:
108 return "memory_alloc";
109 }
110 throw std::runtime_error("unknown event kind");
111 }
112
113 EventKind kind() const {
114 return kind_;
115 }
116
117 const char* name() const {
118 return name_.str();
119 }
120
121 uint64_t threadId() const {
122 return thread_id_;
123 }
124
125 std::vector<std::vector<int64_t>> shapes() const {
126 return shapes_;
127 }
128
129 double cpuElapsedUs(const LegacyEvent& e) const {
130 // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers)
131 return static_cast<double>(e.cpu_ns_ - cpu_ns_) / (1000.0);
132 }
133
134 void setCpuUs(int64_t cpu_us) {
135 cpu_ns_ = static_cast<double>(cpu_us) * 1000.0;
136 }
137
138 double cpuUs() const {
139 return static_cast<double>(cpu_ns_) / (1000.0);
140 }
141
142 double cudaElapsedUs(const LegacyEvent& e) const;
143
144 bool hasCuda() const {
145 return cuda_event != nullptr || (isRemote() && device_ != -1);
146 }
147
148 int device() const {
149 return device_;
150 }
151
152 void updateMemoryStats(int64_t alloc_size, c10::Device device) {
153 if (device.is_cuda() || device.type() == c10::DeviceType::HIP) {
154 cuda_memory_usage_ = alloc_size;
155 } else if (
156 device.is_cpu() || device.type() == c10::DeviceType::MKLDNN ||
157 device.type() == c10::DeviceType::IDEEP) {
158 cpu_memory_usage_ = alloc_size;
159 } else {
160 LOG(WARNING) << "Unsupported memory profiling device: " << device;
161 }
162 }
163
164 int64_t cpuMemoryUsage() const {
165 return cpu_memory_usage_;
166 }
167
168 int64_t cudaMemoryUsage() const {
169 return cuda_memory_usage_;
170 }
171
172 at::RecordFunctionHandle handle() const {
173 return handle_;
174 }
175
176 // Node ID corresponding to this event.
177 int nodeId() const {
178 return node_id_;
179 }
180
181 // Set Node ID on this event.
182 void setNodeId(int node_id) {
183 node_id_ = node_id;
184 }
185
186 void setName(at::StringView newName_) {
187 name_ = std::move(newName_);
188 }
189
190 bool isRemote() const {
191 return is_remote_;
192 }
193
194 void setCudaUs(int64_t cuda_us) {
195 cuda_us_ = cuda_us;
196 }
197
198 void setSequenceNr(int64_t sequence_nr) {
199 sequence_nr_ = sequence_nr;
200 }
201
202 int64_t sequenceNr() const {
203 return sequence_nr_;
204 }
205
206 void setCorrelationId(uint64_t correlation_id) {
207 correlation_id_ = correlation_id;
208 }
209
210 uint64_t correlationId() const {
211 return correlation_id_;
212 }
213
214 const std::vector<std::string>& stack() const {
215 return stack_;
216 }
217
218 void setStack(const std::vector<std::string>& stack) {
219 stack_ = stack;
220 }
221
222 uint64_t fwdThreadId() const {
223 return fwd_thread_id_;
224 }
225
226 void setFwdThreadId(uint64_t fwd_thread_id) {
227 fwd_thread_id_ = fwd_thread_id;
228 }
229
230 uint8_t scope() const {
231 return scope_;
232 }
233
234 void setScope(uint8_t scope) {
235 scope_ = scope;
236 }
237
238 const std::unordered_map<std::string, c10::IValue>& extraArgs() const {
239 return extra_args_;
240 }
241
242 void setExtraArgs(std::unordered_map<std::string, c10::IValue>&& save_args) {
243 extra_args_ = std::move(save_args);
244 }
245
246 uint64_t flops() {
247 return flops_;
248 }
249
250 bool isAsync() {
251 return is_async_;
252 }
253
254 void setFlops(uint64_t flops) {
255 flops_ = flops;
256 }
257
258 private:
259 // signed to allow for negative intervals, initialized for safety.
260 int64_t cpu_ns_ = 0;
261 at::StringView name_;
262 EventKind kind_;
263 uint64_t thread_id_;
264 uint64_t fwd_thread_id_;
265 at::RecordFunctionHandle handle_{0};
266 std::vector<std::vector<int64_t>> shapes_;
267 int64_t cpu_memory_usage_ = 0;
268 int64_t cuda_memory_usage_ = 0;
269 int device_ = -1;
270 torch::profiler::impl::ProfilerEventStub cuda_event = nullptr;
271 int node_id_ = 0;
272 bool is_remote_ = false;
273 int64_t cuda_us_ = -1;
274 int64_t sequence_nr_ = -1;
275 bool is_async_ = false;
276
277 std::vector<std::string> stack_;
278 uint8_t scope_;
279 uint64_t correlation_id_;
280 // Extra arguments for computing op flops
281 std::unordered_map<std::string, c10::IValue> extra_args_;
282 uint64_t flops_ = 0;
283};
284
285// a linked-list of fixed sized vectors, to avoid
286// a std::vector resize from taking a large amount of time inside
287// a profiling event
288struct RangeEventList {
289 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,modernize-use-equals-default)
290 RangeEventList() {
291 events_.reserve(kReservedCapacity);
292 }
293
294 template <typename... Args>
295 void record(Args&&... args) {
296 std::lock_guard<std::mutex> guard(mutex_);
297 events_.emplace_back(std::forward<Args>(args)...);
298 }
299
300 std::vector<LegacyEvent> consolidate() {
301 std::lock_guard<std::mutex> lock(mutex_);
302 // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
303 std::vector<LegacyEvent> result;
304 result.insert(
305 result.begin(),
306 std::make_move_iterator(events_.begin()),
307 std::make_move_iterator(events_.end()));
308 events_.erase(events_.begin(), events_.end());
309 return result;
310 }
311
312 size_t size() {
313 std::lock_guard<std::mutex> lock(mutex_);
314 return events_.size();
315 }
316
317 private:
318 // This mutex is used to serialize access when different threads are writing
319 // to the same instance of RangeEventList.
320 std::mutex mutex_;
321 std::vector<LegacyEvent> events_;
322
323 static const size_t kReservedCapacity = 1024;
324};
325
326// A struct to control settings of disableProfiler options.
327struct TORCH_API ProfilerDisableOptions {
328 ProfilerDisableOptions() = default;
329 ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate)
330 : cleanupTLSState(shouldCleanupTLSState),
331 consolidate(shouldConsolidate) {}
332 // Whether we should clean up profiler states that are thread local, such as
333 // ThreadLocalDebugInfo and thread local RecordFunction callbacks.
334 bool cleanupTLSState = true;
335 // Whether we should consolidate all currently recorded profiled events. If
336 // false, will not consolidate and other threads can continue to write to the
337 // event lists.
338 bool consolidate = true;
339};
340
// NOTE: profiler mode is thread local, with automatic propagation
// across thread boundary (e.g. at::launch tasks)
TORCH_API void enableProfilerLegacy(
    const torch::profiler::impl::ProfilerConfig&);
// Events grouped into per-thread lists (presumably one inner vector per
// recording thread — verify against the .cpp).
using thread_event_lists = std::vector<std::vector<LegacyEvent>>;
// Stops the legacy profiler and returns the recorded events; see
// ProfilerDisableOptions for TLS-cleanup and consolidation behavior.
TORCH_API thread_event_lists disableProfilerLegacy(
    c10::optional<ProfilerDisableOptions> profilerDisableOptions =
        c10::nullopt);

// Adds profiledEvents to the current thread-local recorded events.
// NOTE(review): an earlier comment mentioned a `fromNodeId` parameter that
// this declaration does not have; each event carries its own node ID.
TORCH_API void addEventList(std::vector<LegacyEvent>&& profiledEvents);
// Writes profiled events to a stream.
TORCH_API void writeProfilerEventsToStream(
    std::ostream& out,
    const std::vector<LegacyEvent*>& events);
357
// RAII guard that profiles its scope and writes the result out on
// destruction.
//
// Usage:
// {
//   RecordProfile guard("filename.trace");
//   // code you want to profile
// }
// Then open filename.trace in chrome://tracing
struct TORCH_API RecordProfile {
  // Write the trace to an existing stream owned by the caller.
  RecordProfile(std::ostream& out);
  // Write the trace to the named file (presumably opened into file_ by
  // the .cpp — confirm there).
  RecordProfile(const std::string& filename);

  ~RecordProfile();

 private:
  void init();
  // Owned output file when constructed from a filename; otherwise unused
  // (definitions live in the .cpp).
  std::unique_ptr<std::ofstream> file_;
  std::ostream& out_;
  // Serializes the given events to out_ (defined in the .cpp).
  void processEvents(const std::vector<LegacyEvent*>& events);
};
376
377// A guard that enables the legacy profiler, taking in an optional callback to
378// process the results Usage:
379// {
380// TLSLegacyProfilerGuard g([](thread_event_lists profilerResults) {
381// // process profilerResults
382// });
383// Code to profile
384// }
385struct TORCH_API TLSLegacyProfilerGuard {
386 explicit TLSLegacyProfilerGuard(
387 const torch::profiler::impl::ProfilerConfig& cfg,
388 c10::optional<std::function<void(const thread_event_lists&)>>
389 resultCallback = c10::nullopt,
390 c10::optional<ProfilerDisableOptions> profilerDisableOptions =
391 c10::nullopt)
392 : cb_(std::move(resultCallback)),
393 // NOLINTNEXTLINE(performance-move-const-arg)
394 profilerDisableOptions_(std::move(profilerDisableOptions)) {
395 enableProfilerLegacy(cfg);
396 }
397 ~TLSLegacyProfilerGuard() {
398 // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
399 thread_event_lists event_lists =
400 disableProfilerLegacy(profilerDisableOptions_);
401 if (cb_) {
402 try {
403 (*cb_)(event_lists);
404 } catch (const std::exception& e) {
405 LOG(ERROR) << "Got error processing profiler events: " << e.what();
406 }
407 }
408 }
409
410 private:
411 c10::optional<std::function<void(const thread_event_lists&)>> cb_;
412 const c10::optional<ProfilerDisableOptions> profilerDisableOptions_;
413};
414
415} // namespace profiler
416} // namespace autograd
417} // namespace torch
418