1#include <unordered_set>
2
3#include <torch/csrc/profiler/perf-inl.h>
4#include <torch/csrc/profiler/perf.h>
5
6namespace torch {
7namespace profiler {
8namespace impl {
9
10namespace linux_perf {
11
12#if defined(__ANDROID__) || defined(__linux__)
13
14/*
15 * PerfEvent
16 * ---------
17 */
18
/*
 * Thin wrapper for the perf_event_open(2) syscall, which glibc does not
 * expose as a library function. Returns the new event file descriptor on
 * success, or -1 with errno set by the kernel on failure.
 */
inline long perf_event_open(
    struct perf_event_attr* hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags) {
  const long fd_or_err =
      syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
  return fd_or_err;
}
30
31// TODO sync with Kineto level abstract events in profiler/events.h
// Lookup table mapping a user-facing event name to the
// (perf event type, event config) pair used to fill perf_event_attr.
static const std::unordered_map<
    std::string,
    std::pair<perf_type_id, /* perf event type */ uint32_t>>
    EventTable{
        {"cycles",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)},
        {"instructions",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)},

        // Non Standard events for testing
        {"pagefaults",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)},
        {"backend-stall-cycles",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND)},
        {"frontend-stall-cycles",
         std::pair<perf_type_id, uint32_t>(
             PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}};
52
53PerfEvent::~PerfEvent() {
54 if (fd_ > -1) {
55 close(fd_);
56 }
57 fd_ = -1; // poison
58}
59
60void PerfEvent::Init() {
61 TORCH_CHECK(!name_.empty(), "Invalid profiler event name");
62
63 auto const it = EventTable.find(name_);
64 if (it == EventTable.end()) {
65 TORCH_CHECK(false, "Unsupported profiler event name: ", name_);
66 }
67
68 struct perf_event_attr attr {};
69 memset(&attr, 0, sizeof(attr));
70
71 attr.size = sizeof(perf_event_attr);
72 attr.type = it->second.first;
73 attr.config = it->second.second;
74 attr.disabled = 1;
75 attr.inherit = 1;
76 attr.exclude_kernel = 1; // TBD
77 attr.exclude_hv = 1;
78 /*
79 * These can be used to calculate estimated totals if the PMU is overcommitted
80 * and multiplexing is happening
81 */
82 attr.read_format =
83 PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
84
85 pid_t pid = getpid(); // this pid
86 int cpu = -1; // all cpus
87 int group_fd = -1;
88 unsigned long flags = 0;
89
90 fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
91 if (fd_ == -1) {
92 TORCH_CHECK(
93 false, "perf_event_open() failed, error: ", std::strerror(errno));
94 }
95 Reset();
96}
97
98uint64_t PerfEvent::ReadCounter() const {
99 PerfCounter counter{};
100 long n = read(fd_, &counter, sizeof(PerfCounter));
101 TORCH_CHECK(
102 n == sizeof(counter),
103 "Read failed for Perf event fd, event : ",
104 name_,
105 ", error: ",
106 std::strerror(errno));
107 TORCH_CHECK(
108 counter.time_enabled == counter.time_running,
109 "Hardware performance counter time multiplexing is not handled yet",
110 ", name: ",
111 name_,
112 ", enabled: ",
113 counter.time_enabled,
114 ", running: ",
115 counter.time_running);
116 return counter.value;
117}
118
119#else /* __ANDROID__ || __linux__ */
120/*
121 * Shim class for unsupported platforms - this will always return 0 counter
122 * value
123 */
124
125PerfEvent::~PerfEvent(){};
126
127void PerfEvent::Init(){};
128
129uint64_t PerfEvent::ReadCounter() const {
130 return 0;
131};
132
133#endif /* __ANDROID__ || __linux__ */
134
135/*
136 * PerfProfiler
137 * ------------
138 */
139
140void PerfProfiler::Configure(std::vector<std::string>& event_names) {
141 TORCH_CHECK(
142 event_names.size() <= MAX_EVENTS,
143 "Too many events to configure, configured: ",
144 event_names.size(),
145 ", max allowed:",
146 MAX_EVENTS);
147 std::unordered_set<std::string> s(event_names.begin(), event_names.end());
148 TORCH_CHECK(
149 s.size() == event_names.size(), "Duplicate event names are not allowed!")
150 for (auto name : event_names) {
151 events_.emplace_back(name);
152 events_.back().Init();
153 }
154
155 // TODO
156 // Reset pthreadpool here to make sure we can attach to new children
157 // threads
158}
159
160void PerfProfiler::Enable() {
161 if (!start_values_.empty()) {
162 StopCounting();
163 }
164
165 start_values_.emplace(events_.size(), 0);
166
167 auto& sv = start_values_.top();
168 for (int i = 0; i < events_.size(); ++i) {
169 sv[i] = events_[i].ReadCounter();
170 }
171 StartCounting();
172}
173
174void PerfProfiler::Disable(perf_counters_t& vals) {
175 StopCounting();
176 TORCH_CHECK(
177 vals.size() == events_.size(),
178 "Can not fit all perf counters in the supplied container");
179 TORCH_CHECK(
180 !start_values_.empty(), "PerfProfiler must be enabled before disabling");
181
182 /* Always connecting this disable event to the last enable event i.e. using
183 * whatever is on the top of the start counter value stack. */
184 perf_counters_t& sv = start_values_.top();
185 for (int i = 0; i < events_.size(); ++i) {
186 vals[i] = CalcDelta(sv[i], events_[i].ReadCounter());
187 }
188 start_values_.pop();
189
190 // Restore it for a parent
191 if (!start_values_.empty()) {
192 StartCounting();
193 }
194}
195} // namespace linux_perf
196} // namespace impl
197} // namespace profiler
198} // namespace torch
199