#include <cstddef>
#include <unordered_set>

#include <torch/csrc/profiler/perf-inl.h>
#include <torch/csrc/profiler/perf.h>
5 | |
6 | namespace torch { |
7 | namespace profiler { |
8 | namespace impl { |
9 | |
10 | namespace linux_perf { |
11 | |
12 | #if defined(__ANDROID__) || defined(__linux__) |
13 | |
14 | /* |
15 | * PerfEvent |
16 | * --------- |
17 | */ |
18 | |
19 | /* |
20 | * Syscall wrapper for perf_event_open(2) |
21 | */ |
/*
 * Thin wrapper over the perf_event_open(2) syscall, which glibc does not
 * expose as a library function. Returns the new perf event fd, or -1 with
 * errno set on failure.
 */
inline long perf_event_open(
    struct perf_event_attr* hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags) {
  const long fd =
      syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
  return fd;
}
30 | |
31 | // TODO sync with Kineto level abstract events in profiler/events.h |
// Maps a user-facing event name to its (perf event type, config) pair as
// consumed by perf_event_attr.
// TODO sync with Kineto level abstract events in profiler/events.h
static const std::unordered_map<
    std::string,
    std::pair<perf_type_id, /* perf event type */ uint32_t>>
    EventTable{
        {"cycles", {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES}},
        {"instructions", {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS}},

        // Non Standard events for testing
        {"pagefaults", {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS}},
        {"backend-stall-cycles",
         {PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND}},
        {"frontend-stall-cycles",
         {PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND}}};
52 | |
53 | PerfEvent::~PerfEvent() { |
54 | if (fd_ > -1) { |
55 | close(fd_); |
56 | } |
57 | fd_ = -1; // poison |
58 | } |
59 | |
60 | void PerfEvent::Init() { |
61 | TORCH_CHECK(!name_.empty(), "Invalid profiler event name" ); |
62 | |
63 | auto const it = EventTable.find(name_); |
64 | if (it == EventTable.end()) { |
65 | TORCH_CHECK(false, "Unsupported profiler event name: " , name_); |
66 | } |
67 | |
68 | struct perf_event_attr attr {}; |
69 | memset(&attr, 0, sizeof(attr)); |
70 | |
71 | attr.size = sizeof(perf_event_attr); |
72 | attr.type = it->second.first; |
73 | attr.config = it->second.second; |
74 | attr.disabled = 1; |
75 | attr.inherit = 1; |
76 | attr.exclude_kernel = 1; // TBD |
77 | attr.exclude_hv = 1; |
78 | /* |
79 | * These can be used to calculate estimated totals if the PMU is overcommitted |
80 | * and multiplexing is happening |
81 | */ |
82 | attr.read_format = |
83 | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; |
84 | |
85 | pid_t pid = getpid(); // this pid |
86 | int cpu = -1; // all cpus |
87 | int group_fd = -1; |
88 | unsigned long flags = 0; |
89 | |
90 | fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags)); |
91 | if (fd_ == -1) { |
92 | TORCH_CHECK( |
93 | false, "perf_event_open() failed, error: " , std::strerror(errno)); |
94 | } |
95 | Reset(); |
96 | } |
97 | |
98 | uint64_t PerfEvent::ReadCounter() const { |
99 | PerfCounter counter{}; |
100 | long n = read(fd_, &counter, sizeof(PerfCounter)); |
101 | TORCH_CHECK( |
102 | n == sizeof(counter), |
103 | "Read failed for Perf event fd, event : " , |
104 | name_, |
105 | ", error: " , |
106 | std::strerror(errno)); |
107 | TORCH_CHECK( |
108 | counter.time_enabled == counter.time_running, |
109 | "Hardware performance counter time multiplexing is not handled yet" , |
110 | ", name: " , |
111 | name_, |
112 | ", enabled: " , |
113 | counter.time_enabled, |
114 | ", running: " , |
115 | counter.time_running); |
116 | return counter.value; |
117 | } |
118 | |
119 | #else /* __ANDROID__ || __linux__ */ |
120 | /* |
121 | * Shim class for unsupported platforms - this will always return 0 counter |
122 | * value |
123 | */ |
124 | |
125 | PerfEvent::~PerfEvent(){}; |
126 | |
127 | void PerfEvent::Init(){}; |
128 | |
129 | uint64_t PerfEvent::ReadCounter() const { |
130 | return 0; |
131 | }; |
132 | |
133 | #endif /* __ANDROID__ || __linux__ */ |
134 | |
135 | /* |
136 | * PerfProfiler |
137 | * ------------ |
138 | */ |
139 | |
140 | void PerfProfiler::Configure(std::vector<std::string>& event_names) { |
141 | TORCH_CHECK( |
142 | event_names.size() <= MAX_EVENTS, |
143 | "Too many events to configure, configured: " , |
144 | event_names.size(), |
145 | ", max allowed:" , |
146 | MAX_EVENTS); |
147 | std::unordered_set<std::string> s(event_names.begin(), event_names.end()); |
148 | TORCH_CHECK( |
149 | s.size() == event_names.size(), "Duplicate event names are not allowed!" ) |
150 | for (auto name : event_names) { |
151 | events_.emplace_back(name); |
152 | events_.back().Init(); |
153 | } |
154 | |
155 | // TODO |
156 | // Reset pthreadpool here to make sure we can attach to new children |
157 | // threads |
158 | } |
159 | |
160 | void PerfProfiler::Enable() { |
161 | if (!start_values_.empty()) { |
162 | StopCounting(); |
163 | } |
164 | |
165 | start_values_.emplace(events_.size(), 0); |
166 | |
167 | auto& sv = start_values_.top(); |
168 | for (int i = 0; i < events_.size(); ++i) { |
169 | sv[i] = events_[i].ReadCounter(); |
170 | } |
171 | StartCounting(); |
172 | } |
173 | |
174 | void PerfProfiler::Disable(perf_counters_t& vals) { |
175 | StopCounting(); |
176 | TORCH_CHECK( |
177 | vals.size() == events_.size(), |
178 | "Can not fit all perf counters in the supplied container" ); |
179 | TORCH_CHECK( |
180 | !start_values_.empty(), "PerfProfiler must be enabled before disabling" ); |
181 | |
182 | /* Always connecting this disable event to the last enable event i.e. using |
183 | * whatever is on the top of the start counter value stack. */ |
184 | perf_counters_t& sv = start_values_.top(); |
185 | for (int i = 0; i < events_.size(); ++i) { |
186 | vals[i] = CalcDelta(sv[i], events_[i].ReadCounter()); |
187 | } |
188 | start_values_.pop(); |
189 | |
190 | // Restore it for a parent |
191 | if (!start_values_.empty()) { |
192 | StartCounting(); |
193 | } |
194 | } |
195 | } // namespace linux_perf |
196 | } // namespace impl |
197 | } // namespace profiler |
198 | } // namespace torch |
199 | |