// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "benchmark_runner.h"

#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "internal_macros.h"

#ifndef BENCHMARK_OS_WINDOWS
#ifndef BENCHMARK_OS_FUCHSIA
#include <sys/resource.h>
#endif
#include <sys/time.h>
#include <unistd.h>
#endif

#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <utility>

#include "check.h"
#include "colorprint.h"
#include "commandlineflags.h"
#include "complexity.h"
#include "counter.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"

namespace benchmark {

namespace internal {

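// Set via benchmark::RegisterMemoryManager(). When non-null,
// DoOneRepetition() performs an extra instrumented pass so the manager can
// report allocation statistics for the run.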
MemoryManager* memory_manager = nullptr;

namespace {

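// Hard upper bound on the iteration count of a single repetition:
// PredictNumItersNeeded() never proposes more than this, and reaching it
// forces the results to be reported (see ShouldReportIterationResults()).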
static constexpr IterationCount kMaxIterations = 1000000000;

BenchmarkReporter::Run CreateRunReport(
    const benchmark::internal::BenchmarkInstance& b,
    const internal::ThreadManager::Result& results,
    IterationCount memory_iterations,
    const MemoryManager::Result* memory_result, double seconds,
    int64_t repetition_index, int64_t repeats) {
  // Create report about this benchmark run.
  BenchmarkReporter::Run report;

  report.run_name = b.name();
  report.family_index = b.family_index();
  report.per_family_instance_index = b.per_family_instance_index();
  report.error_occurred = results.has_error_;
  report.error_message = results.error_message_;
  report.report_label = results.report_label_;
  // This is the total iteration count across all threads.
  report.iterations = results.iterations;
  report.time_unit = b.time_unit();
  report.threads = b.threads();
  report.repetition_index = repetition_index;
  report.repetitions = repeats;

  if (!report.error_occurred) {
    if (b.use_manual_time()) {
      report.real_accumulated_time = results.manual_time_used;
    } else {
      report.real_accumulated_time = results.real_time_used;
    }
    report.cpu_accumulated_time = results.cpu_time_used;
    report.complexity_n = results.complexity_n;
    report.complexity = b.complexity();
    report.complexity_lambda = b.complexity_lambda();
    report.statistics = &b.statistics();
    report.counters = results.counters;

    if (memory_iterations > 0) {
      assert(memory_result != nullptr);
      report.memory_result = memory_result;
      report.allocs_per_iter =
          static_cast<double>(memory_result->num_allocs) / memory_iterations;
    }

    internal::Finish(&report.counters, results.iterations, seconds,
                     b.threads());
  }
  return report;
}

// Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
                 int thread_id, ThreadManager* manager,
                 PerfCountersMeasurement* perf_counters_measurement) {
  internal::ThreadTimer timer(
      b->measure_process_cpu_time()
          ? internal::ThreadTimer::CreateProcessCpuTime()
          : internal::ThreadTimer::Create());
  State st =
      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
  BM_CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
      << "Benchmark returned before State::KeepRunning() returned false!";
  {
    MutexLock l(manager->GetBenchmarkMutex());
    internal::ThreadManager::Result& results = manager->results;
    results.iterations += st.iterations();
    results.cpu_time_used += timer.cpu_time_used();
    results.real_time_used += timer.real_time_used();
    results.manual_time_used += timer.manual_time_used();
    results.complexity_n += st.complexity_length_n();
    internal::Increment(&results.counters, st.counters);
  }
  manager->NotifyThreadComplete();
}

}  // end namespace

BenchmarkRunner::BenchmarkRunner(
    const benchmark::internal::BenchmarkInstance& b_,
    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
    : b(b_),
      reports_for_family(reports_for_family_),
      min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
      repeats(b.repetitions() != 0 ? b.repetitions()
                                   : FLAGS_benchmark_repetitions),
      has_explicit_iteration_count(b.iterations() != 0),
      pool(b.threads() - 1),
      iters(has_explicit_iteration_count ? b.iterations() : 1),
      perf_counters_measurement(
          PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
                                        ? &perf_counters_measurement
                                        : nullptr) {
  run_results.display_report_aggregates_only =
      (FLAGS_benchmark_report_aggregates_only ||
       FLAGS_benchmark_display_aggregates_only);
  run_results.file_report_aggregates_only =
      FLAGS_benchmark_report_aggregates_only;
  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
    run_results.display_report_aggregates_only =
        (b.aggregation_report_mode() &
         internal::ARM_DisplayReportAggregatesOnly);
    run_results.file_report_aggregates_only =
        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
  }
  // This check must hold regardless of the aggregation report mode: if perf
  // counters were requested on the command line, they must have been set up
  // successfully.
  BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
           perf_counters_measurement.IsValid())
      << "Perf counters were requested but could not be set up.";
}
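
// A sketch of how this runner is typically driven (the real call site lives
// in benchmark.cc; the variable names below are illustrative):
//
//   BenchmarkRunner runner(instance, reports_for_family);
//   while (runner.HasRepeatsRemaining()) runner.DoOneRepetition();
//   RunResults results = runner.GetResults();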

BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
  BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";

  std::unique_ptr<internal::ThreadManager> manager;
  manager.reset(new internal::ThreadManager(b.threads()));

  // Run all but one thread in separate threads
  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
                           manager.get(), perf_counters_measurement_ptr);
  }
  // And run one thread here directly.
  // (If we were asked to run just one thread, we don't create new threads.)
  // Yes, we need to do this here *after* we start the separate threads.
  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);

  // The main thread has finished. Now let's wait for the other threads.
  manager->WaitForAllThreads();
  for (std::thread& thread : pool) thread.join();

  IterationResults i;
  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
  {
    MutexLock l(manager->GetBenchmarkMutex());
    i.results = manager->results;
  }

  // And get rid of the manager.
  manager.reset();

  // Adjust real/manual time stats since they were reported per thread.
  i.results.real_time_used /= b.threads();
  i.results.manual_time_used /= b.threads();
  // If we were measuring whole-process CPU usage, adjust the CPU time too.
  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
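  // (Each thread's wall-clock interval overlaps the others', so the summed
  // per-thread real time for a ~1s run with 4 threads is ~4s; dividing by
  // b.threads() recovers the ~1s of actual elapsed time.)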

  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
             << i.results.real_time_used << "\n";

  // By using KeepRunningBatch a benchmark can iterate more times than
  // requested, so take the iteration count from i.results.
  i.iters = i.results.iterations / b.threads();
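  // e.g. a benchmark using State::KeepRunningBatch(100) that was asked for
  // 150 iterations actually performs 200 per thread, and 200 is what each
  // thread contributes to i.results.iterations.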

  // Decisions are based on CPU time by default; switch to manual or real
  // time below if this benchmark requests it.
  i.seconds = i.results.cpu_time_used;
  if (b.use_manual_time()) {
    i.seconds = i.results.manual_time_used;
  } else if (b.use_real_time()) {
    i.seconds = i.results.real_time_used;
  }

  return i;
}

IterationCount BenchmarkRunner::PredictNumItersNeeded(
    const IterationResults& i) const {
  // Estimate how much the iteration count should be increased.
  // Note: Avoid division by zero with max(seconds, 1ns).
  double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
  // If our last run was at least 10% of min_time, use the computed
  // multiplier directly; otherwise the measurement is too noisy to trust,
  // so expand by at most 10x.
  // NOTE: when the last run is exactly 10% of the min time, the computed
  // multiplier peaks at 14x (1.4 / 0.1).
  bool is_significant = (i.seconds / min_time) > 0.1;
  multiplier = is_significant ? multiplier : 10.0;
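  // Worked example (illustrative numbers): with min_time = 0.5s, a last run
  // of i.seconds = 0.2s over i.iters = 1000 iterations yields
  // multiplier = 0.5 * 1.4 / 0.2 = 3.5; the run was significant
  // (0.2 / 0.5 = 0.4 > 0.1), so we aim for ~3500 iterations next time.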

  // So what seems to be the sufficiently-large iteration count? Round up.
  const IterationCount max_next_iters = static_cast<IterationCount>(
      std::lround(std::max(multiplier * static_cast<double>(i.iters),
                           static_cast<double>(i.iters) + 1.0)));
  // But we do have *some* limits, though.
  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);

  BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
  return next_iters;
}

bool BenchmarkRunner::ShouldReportIterationResults(
    const IterationResults& i) const {
  // Determine if this run should be reported:
  // either it has run for a sufficient amount of time,
  // or an error occurred.
  return i.results.has_error_ ||
         i.iters >= kMaxIterations ||  // Too many iterations already.
         i.seconds >= min_time ||      // The elapsed time is large enough.
         // CPU time is specified but the elapsed real time greatly exceeds
         // the minimum time.
         // Note that user-provided timers are exempt from this check.
         ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
}

void BenchmarkRunner::DoOneRepetition() {
  assert(HasRepeatsRemaining() && "Already done all repetitions?");

  const bool is_the_first_repetition = num_repetitions_done == 0;
  IterationResults i;

  // We *may* be gradually increasing the length (iteration count)
  // of the benchmark until we decide the results are significant.
  // And once we do, we report those last results and exit.
  // Please do note that if there are repetitions, the iteration count is
  // *only* calculated for the *first* repetition, and the other repetitions
  // simply reuse that precomputed iteration count.
  for (;;) {
    b.Setup();
    i = DoNIterations();
    b.Teardown();

    // Do we consider the results to be significant?
    // If we are doing repetitions, and the first repetition was already done,
    // it has calculated the correct iteration time, so we have run that very
    // iteration count just now. No need to calculate anything. Just report.
    // Else, the normal rules apply.
    const bool results_are_significant = !is_the_first_repetition ||
                                         has_explicit_iteration_count ||
                                         ShouldReportIterationResults(i);

    if (results_are_significant) break;  // Good, let's report them!

    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
    // iteration count, and run the benchmark again...

    iters = PredictNumItersNeeded(i);
    assert(iters > i.iters &&
           "if we did more iterations than we want to do the next time, "
           "then we should have accepted the current iteration run.");
  }

  // One last thing: we also need to produce the memory measurements, if a
  // MemoryManager was registered.
  MemoryManager::Result* memory_result = nullptr;
  IterationCount memory_iterations = 0;
  if (memory_manager != nullptr) {
    // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
    // optional so we don't have to own the Result here.
    // Can't do it now due to cxx03.
    memory_results.push_back(MemoryManager::Result());
    memory_result = &memory_results.back();
    // Only run a few iterations to reduce the impact of one-time
    // allocations in benchmarks that are not properly managed.
    memory_iterations = std::min<IterationCount>(16, iters);
    memory_manager->Start();
    std::unique_ptr<internal::ThreadManager> manager;
    manager.reset(new internal::ThreadManager(1));
    b.Setup();
    RunInThread(&b, memory_iterations, 0, manager.get(),
                perf_counters_measurement_ptr);
    manager->WaitForAllThreads();
    manager.reset();
    b.Teardown();

    BENCHMARK_DISABLE_DEPRECATED_WARNING
    memory_manager->Stop(memory_result);
    BENCHMARK_RESTORE_DEPRECATED_WARNING
  }

  // Ok, now actually report.
  BenchmarkReporter::Run report =
      CreateRunReport(b, i.results, memory_iterations, memory_result,
                      i.seconds, num_repetitions_done, repeats);

  if (reports_for_family) {
    ++reports_for_family->num_runs_done;
    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
  }

  run_results.non_aggregates.push_back(report);

  ++num_repetitions_done;
}

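// Note: returns an rvalue reference into this runner; the caller is expected
// to move from the result, after which the runner must not be used again.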
RunResults&& BenchmarkRunner::GetResults() {
  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");

  // Calculate additional statistics over the repetitions of this instance.
  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);

  return std::move(run_results);
}

}  // end namespace internal

}  // end namespace benchmark