1#include <unordered_set>
2
3#include <torch/csrc/profiler/perf-inl.h>
4#include <torch/csrc/profiler/perf.h>
5
6namespace torch {
7namespace profiler {
8namespace impl {
9
10namespace linux_perf {
11
12#if defined(__ANDROID__) || defined(__linux__)
13
14/*
15 * PerfEvent
16 * ---------
17 */
18
/*
 * Thin wrapper for the perf_event_open(2) syscall, which glibc does not
 * expose as a library function. Returns the new event file descriptor on
 * success, or -1 with errno set by the kernel on failure.
 */
inline long perf_event_open(
    struct perf_event_attr* hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags) {
  const long fd_or_err =
      syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
  return fd_or_err;
}
30
31// TODO sync with Kineto level abstract events in profiler/events.h
// Lookup table mapping a user-facing event name to the
// (perf event type, event config) pair used to fill perf_event_attr.
static const std::unordered_map<
    std::string,
    std::pair<perf_type_id, /* perf event type */ uint32_t>>
    EventTable{
        {"cycles",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)},
        {"instructions",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)},

        // Non Standard events for testing
        {"pagefaults",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)},
        {"backend-stall-cycles",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND)},
        {"frontend-stall-cycles",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}};
52
53PerfEvent::~PerfEvent() {
54 if (fd_ > -1) {
55 close(fd_);
56 }
57 fd_ = -1; // poison
58}
59
60void PerfEvent::Init() {
61 TORCH_CHECK(!name_.empty(), "Invalid profiler event name");
62
63 auto const it = EventTable.find(name_);
64 if (it == EventTable.end()) {
65 TORCH_CHECK(false, "Unsupported profiler event name: ", name_);
66 }
67
68 struct perf_event_attr attr {};
69 memset(&attr, 0, sizeof(attr));
70
71 attr.size = sizeof(perf_event_attr);
72 attr.type = it->second.first;
73 attr.config = it->second.second;
74 attr.disabled = 1;
75 attr.inherit = 1;
76 attr.exclude_kernel = 1; // TBD
77 attr.exclude_hv = 1;
78 /*
79 * These can be used to calculate estimated totals if the PMU is overcommitted
80 * and multiplexing is happening
81 */
82 attr.read_format =
83 PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
84
85 pid_t pid = getpid(); // this pid
86 int cpu = -1; // all cpus
87 int group_fd = -1;
88 unsigned long flags = 0;
89
90 fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
91 if (fd_ == -1) {
92 TORCH_CHECK(
93 false, "perf_event_open() failed, error: ", std::strerror(errno));
94 }
95 Reset();
96}
97
98uint64_t PerfEvent::ReadCounter() const {
99 PerfCounter counter{};
100 long n = read(fd_, &counter, sizeof(PerfCounter));
101 TORCH_CHECK(
102 n == sizeof(counter),
103 "Read failed for Perf event fd, event : ",
104 name_,
105 ", error: ",
106 std::strerror(errno));
107 TORCH_CHECK(
108 counter.time_enabled == counter.time_running,
109 "Hardware performance counter time multiplexing is not handled yet",
110 ", name: ",
111 name_,
112 ", enabled: ",
113 counter.time_enabled,
114 ", running: ",
115 counter.time_running);
116 return counter.value;
117}
118
119#else /* __ANDROID__ || __linux__ */
120/*
121 * Shim class for unsupported platforms - this will always return 0 counter
122 * value
123 */
124
125PerfEvent::~PerfEvent(){};
126
127void PerfEvent::Init(){};
128
129uint64_t PerfEvent::ReadCounter() const {
130 return 0;
131};
132
133#endif /* __ANDROID__ || __linux__ */
134
135/*
136 * PerfProfiler
137 * ------------
138 */
139
140void PerfProfiler::Configure(std::vector<std::string>& event_names) {
141 TORCH_CHECK(
142 event_names.size() <= MAX_EVENTS,
143 "Too many events to configure, configured: ",
144 event_names.size(),
145 ", max allowed:",
146 MAX_EVENTS);
147 std::unordered_set<std::string> s(event_names.begin(), event_names.end());
148 TORCH_CHECK(
149 s.size() == event_names.size(), "Duplicate event names are not allowed!")
150 for (auto name : event_names) {
151 events_.emplace_back(name);
152 events_.back().Init();
153 }
154
155 // TODO
156 // Reset pthreadpool here to make sure we can attach to new children
157 // threads
158}
159
160void PerfProfiler::Enable() {
161 if (!start_values_.empty()) {
162 StopCounting();
163 }
164
165 start_values_.emplace(events_.size(), 0);
166
167 auto& sv = start_values_.top();
168 for (int i = 0; i < events_.size(); ++i) {
169 sv[i] = events_[i].ReadCounter();
170 }
171 StartCounting();
172}
173
174void PerfProfiler::Disable(perf_counters_t& vals) {
175 StopCounting();
176 TORCH_CHECK(
177 vals.size() == events_.size(),
178 "Can not fit all perf counters in the supplied container");
179 TORCH_CHECK(
180 !start_values_.empty(), "PerfProfiler must be enabled before disabling");
181
182 /* Always connecting this disable event to the last enable event i.e. using
183 * whatever is on the top of the start counter value stack. */
184 perf_counters_t& sv = start_values_.top();
185 for (int i = 0; i < events_.size(); ++i) {
186 vals[i] = CalcDelta(sv[i], events_[i].ReadCounter());
187 }
188 start_values_.pop();
189
190 // Restore it for a parent
191 if (!start_values_.empty()) {
192 StartCounting();
193 }
194}
195} // namespace linux_perf
196} // namespace impl
197} // namespace profiler
198} // namespace torch
199