// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "benchmark_runner.h"

#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "internal_macros.h"

#ifndef BENCHMARK_OS_WINDOWS
#ifndef BENCHMARK_OS_FUCHSIA
#include <sys/resource.h>
#endif
#include <sys/time.h>
#include <unistd.h>
#endif
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <utility>

#include "check.h"
#include "colorprint.h"
#include "commandlineflags.h"
#include "complexity.h"
#include "counter.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"

namespace benchmark {

namespace internal {

// Set via benchmark::RegisterMemoryManager(). When non-null, each repetition
// additionally performs a short single-threaded pass so the manager can
// collect memory metrics (see BenchmarkRunner::DoOneRepetition below).
MemoryManager* memory_manager = nullptr;

namespace {

// Hard upper bound on the iteration count of a single repetition.
// PredictNumItersNeeded() clamps its prediction to this value, and
// ShouldReportIterationResults() stops growing the run once it is reached.
static constexpr IterationCount kMaxIterations = 1000000000;

BenchmarkReporter::Run CreateRunReport(
    const benchmark::internal::BenchmarkInstance& b,
    const internal::ThreadManager::Result& results,
    IterationCount memory_iterations,
    const MemoryManager::Result& memory_result, double seconds,
    int64_t repetition_index) {
  // Create report about this benchmark run.
  BenchmarkReporter::Run report;

  report.run_name = b.name();
  report.error_occurred = results.has_error_;
  report.error_message = results.error_message_;
  report.report_label = results.report_label_;
  // This is the total iterations across all threads.
  report.iterations = results.iterations;
  report.time_unit = b.time_unit();
  report.threads = b.threads();
  report.repetition_index = repetition_index;
  report.repetitions = b.repetitions();

  if (!report.error_occurred) {
    if (b.use_manual_time()) {
      report.real_accumulated_time = results.manual_time_used;
    } else {
      report.real_accumulated_time = results.real_time_used;
    }
    report.cpu_accumulated_time = results.cpu_time_used;
    report.complexity_n = results.complexity_n;
    report.complexity = b.complexity();
    report.complexity_lambda = b.complexity_lambda();
    report.statistics = &b.statistics();
    report.counters = results.counters;

    if (memory_iterations > 0) {
      report.has_memory_result = true;
      // memory_iterations is known to be positive here, so the division is
      // well-defined.
      report.allocs_per_iter =
          static_cast<double>(memory_result.num_allocs) / memory_iterations;
      report.max_bytes_used = memory_result.max_bytes_used;
    }

    internal::Finish(&report.counters, results.iterations, seconds,
                     b.threads());
  }
  return report;
}

// Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
                 int thread_id, ThreadManager* manager,
                 PerfCountersMeasurement* perf_counters_measurement) {
  internal::ThreadTimer timer(
      b->measure_process_cpu_time()
          ? internal::ThreadTimer::CreateProcessCpuTime()
          : internal::ThreadTimer::Create());
  State st =
      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
  CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
      << "Benchmark returned before State::KeepRunning() returned false!";
  {
    MutexLock l(manager->GetBenchmarkMutex());
    internal::ThreadManager::Result& results = manager->results;
    results.iterations += st.iterations();
    results.cpu_time_used += timer.cpu_time_used();
    results.real_time_used += timer.real_time_used();
    results.manual_time_used += timer.manual_time_used();
    results.complexity_n += st.complexity_length_n();
    internal::Increment(&results.counters, st.counters);
  }
  manager->NotifyThreadComplete();
}

class BenchmarkRunner {
 public:
  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
                  std::vector<BenchmarkReporter::Run>* complexity_reports_)
      : b(b_),
        complexity_reports(*complexity_reports_),
        min_time(!IsZero(b.min_time()) ? b.min_time()
                                       : FLAGS_benchmark_min_time),
        repeats(b.repetitions() != 0 ? b.repetitions()
                                     : FLAGS_benchmark_repetitions),
        has_explicit_iteration_count(b.iterations() != 0),
        pool(b.threads() - 1),
        iters(has_explicit_iteration_count ? b.iterations() : 1),
        perf_counters_measurement(
            PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
        perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
                                          ? &perf_counters_measurement
                                          : nullptr) {
    run_results.display_report_aggregates_only =
        (FLAGS_benchmark_report_aggregates_only ||
         FLAGS_benchmark_display_aggregates_only);
    run_results.file_report_aggregates_only =
        FLAGS_benchmark_report_aggregates_only;
    if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
      run_results.display_report_aggregates_only =
          (b.aggregation_report_mode() &
           internal::ARM_DisplayReportAggregatesOnly);
      run_results.file_report_aggregates_only =
          (b.aggregation_report_mode() &
           internal::ARM_FileReportAggregatesOnly);
    }
    // The perf-counter sanity checks are unrelated to the aggregation report
    // mode, so they must not be gated on it.
    CHECK(b.threads() == 1 || !perf_counters_measurement.IsValid())
        << "Perf counters are not supported in multi-threaded cases.\n";
    CHECK(FLAGS_benchmark_perf_counters.empty() ||
          perf_counters_measurement.IsValid())
        << "Perf counters were requested but could not be set up.";
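    // The requested counters come from the --benchmark_perf_counters flag, a
    // comma-separated list of event names (split above via StrSplit), e.g.
    // (hypothetical invocation, assuming libpfm-style event names):
    //   ./my_benchmark --benchmark_perf_counters=CYCLES,INSTRUCTIONS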

    for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
      DoOneRepetition(repetition_num);
    }

    // Calculate additional statistics
    run_results.aggregates_only = ComputeStats(run_results.non_aggregates);

    // Maybe calculate complexity report
    if ((b.complexity() != oNone) && b.last_benchmark_instance) {
      auto additional_run_stats = ComputeBigO(complexity_reports);
      run_results.aggregates_only.insert(run_results.aggregates_only.end(),
                                         additional_run_stats.begin(),
                                         additional_run_stats.end());
      complexity_reports.clear();
    }
  }

  RunResults&& get_results() { return std::move(run_results); }

 private:
  RunResults run_results;

  const benchmark::internal::BenchmarkInstance& b;
  std::vector<BenchmarkReporter::Run>& complexity_reports;

  const double min_time;
  const int repeats;
  const bool has_explicit_iteration_count;

  std::vector<std::thread> pool;

  IterationCount iters;  // Preserved between repetitions!
  // So only the first repetition has to find/calculate it;
  // the other repetitions just reuse that precomputed iteration count.

  PerfCountersMeasurement perf_counters_measurement;
  PerfCountersMeasurement* const perf_counters_measurement_ptr;

  struct IterationResults {
    internal::ThreadManager::Result results;
    IterationCount iters;
    double seconds;
  };
  IterationResults DoNIterations() {
    VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";

    std::unique_ptr<internal::ThreadManager> manager;
    manager.reset(new internal::ThreadManager(b.threads()));

    // Run all but one thread in separate threads
    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
      pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
                             manager.get(), perf_counters_measurement_ptr);
    }
    // And run one thread here directly.
    // (If we were asked to run just one thread, we don't create new threads.)
    // Yes, we need to do this here *after* we start the separate threads.
    RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);

    // The main thread has finished. Now let's wait for the other threads.
    manager->WaitForAllThreads();
    for (std::thread& thread : pool) thread.join();

    IterationResults i;
    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
    {
      MutexLock l(manager->GetBenchmarkMutex());
      i.results = manager->results;
    }

    // And get rid of the manager.
    manager.reset();

    // Adjust real/manual time stats since they were reported per thread.
    i.results.real_time_used /= b.threads();
    i.results.manual_time_used /= b.threads();
    // If we were measuring whole-process CPU usage, adjust the CPU time too.
    if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
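    // For example (illustrative numbers): with 4 threads that each report 1s
    // of wall-clock time for the same measurement interval, real_time_used
    // accumulates to 4s in RunInThread(), and dividing by b.threads()
    // recovers the 1s that actually elapsed.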

    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
            << i.results.real_time_used << "\n";

    // By using KeepRunningBatch a benchmark can iterate more times than
    // requested, so take the iteration count from i.results.
    i.iters = i.results.iterations / b.threads();
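    // E.g. (illustrative): a benchmark using State::KeepRunningBatch(100)
    // that was asked for iters = 150 actually performs 200 iterations per
    // thread, so the per-thread count must be recomputed from the reported
    // total.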

    // Base decisions off of real time if requested by this benchmark.
    i.seconds = i.results.cpu_time_used;
    if (b.use_manual_time()) {
      i.seconds = i.results.manual_time_used;
    } else if (b.use_real_time()) {
      i.seconds = i.results.real_time_used;
    }

    return i;
  }

  IterationCount PredictNumItersNeeded(const IterationResults& i) const {
    // Decide by how much the iteration count should be increased.
    // Note: Avoid division by zero with max(seconds, 1ns).
    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
    // use the multiplier directly.
    // Otherwise we use at most 10 times expansion.
    // NOTE: When the last run was at least 10% of the min time the max
    // expansion should be 14x.
    bool is_significant = (i.seconds / min_time) > 0.1;
    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
    if (multiplier <= 1.0) multiplier = 2.0;
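    // Worked example (illustrative numbers): with min_time = 0.5s, a last run
    // of 0.2s over 1000 iterations gives multiplier = 0.5 * 1.4 / 0.2 = 3.5;
    // 0.2 / 0.5 = 0.4 > 0.1, so the run was significant and the multiplier is
    // used as-is, i.e. the next run will attempt about 3500 iterations.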

    // So what seems to be the sufficiently-large iteration count? Round up.
    const IterationCount max_next_iters = static_cast<IterationCount>(
        std::lround(std::max(multiplier * static_cast<double>(i.iters),
                             static_cast<double>(i.iters) + 1.0)));
    // But we do have *some* sanity limits.
    const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);

    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
    return next_iters;  // Already rounded up via std::lround() above.
  }

  bool ShouldReportIterationResults(const IterationResults& i) const {
    // Determine if this run should be reported:
    // either because it has run for a sufficient amount of time,
    // or because an error occurred.
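    // Worked example (illustrative numbers): with min_time = 0.5s, a run that
    // used only 0.2s of CPU time but 3.0s of real time is still reported,
    // because 3.0s >= 5 * 0.5s (unless the benchmark uses a manual timer).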
    return i.results.has_error_ ||
           i.iters >= kMaxIterations ||  // Too many iterations already.
           i.seconds >= min_time ||      // The elapsed time is large enough.
           // CPU time is specified but the elapsed real time greatly exceeds
           // the minimum time.
           // Note that user-provided timers are exempt from this sanity check.
           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
  }

  void DoOneRepetition(int64_t repetition_index) {
    const bool is_the_first_repetition = repetition_index == 0;
    IterationResults i;

    // We *may* be gradually increasing the length (iteration count)
    // of the benchmark until we decide the results are significant.
    // And once we do, we report those last results and exit.
    // Note that if there are repetitions, the iteration count is *only*
    // calculated for the *first* repetition; the other repetitions simply
    // reuse that precomputed iteration count.
    for (;;) {
      i = DoNIterations();

      // Do we consider the results to be significant?
      // If we are doing repetitions, and the first repetition was already
      // done, it has calculated the correct iteration count, so we have just
      // run that very iteration count. No need to calculate anything; just
      // report. Otherwise, the normal rules apply.
      const bool results_are_significant = !is_the_first_repetition ||
                                           has_explicit_iteration_count ||
                                           ShouldReportIterationResults(i);

      if (results_are_significant) break;  // Good, let's report them!

      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
      // iteration count, and run the benchmark again...

      iters = PredictNumItersNeeded(i);
      assert(iters > i.iters &&
             "if we did more iterations than we want to do the next time, "
             "then we should have accepted the current iteration run.");
    }

    // One last thing: we also need to produce the 'memory measurements'.
    MemoryManager::Result memory_result;
    IterationCount memory_iterations = 0;
    if (memory_manager != nullptr) {
      // Only run a few iterations to reduce the impact of one-time
      // allocations in benchmarks that are not properly managed.
      memory_iterations = std::min<IterationCount>(16, iters);
      memory_manager->Start();
      std::unique_ptr<internal::ThreadManager> manager;
      manager.reset(new internal::ThreadManager(1));
      RunInThread(&b, memory_iterations, 0, manager.get(),
                  perf_counters_measurement_ptr);
      manager->WaitForAllThreads();
      manager.reset();

      memory_manager->Stop(&memory_result);
    }

    // Ok, now actually report.
    BenchmarkReporter::Run report =
        CreateRunReport(b, i.results, memory_iterations, memory_result,
                        i.seconds, repetition_index);

    if (!report.error_occurred && b.complexity() != oNone)
      complexity_reports.push_back(report);

    run_results.non_aggregates.push_back(report);
  }
};

}  // end namespace

RunResults RunBenchmark(
    const benchmark::internal::BenchmarkInstance& b,
    std::vector<BenchmarkReporter::Run>* complexity_reports) {
  internal::BenchmarkRunner r(b, complexity_reports);
  return r.get_results();
}
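
// A usage sketch (hypothetical caller; the real call site lives in
// benchmark.cc):
//
//   std::vector<BenchmarkReporter::Run> complexity_reports;
//   RunResults results = RunBenchmark(instance, &complexity_reports);
//
// RunResults carries the per-repetition reports plus any aggregates, ready
// to be handed to the reporters.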

}  // end namespace internal

}  // end namespace benchmark
