1/*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <folly/Portability.h>
20#include <folly/Preprocessor.h> // for FB_ANONYMOUS_VARIABLE
21#include <folly/Range.h>
22#include <folly/ScopeGuard.h>
23#include <folly/Traits.h>
24#include <folly/functional/Invoke.h>
25#include <folly/portability/GFlags.h>
26
27#include <cassert>
28#include <chrono>
29#include <functional>
30#include <limits>
31#include <type_traits>
32#include <unordered_map>
33
34#include <boost/function_types/function_arity.hpp>
35#include <glog/logging.h>
36
37DECLARE_bool(benchmark);
38
39namespace folly {
40
41/**
42 * Runs all benchmarks defined. Usually put in main().
43 */
44void runBenchmarks();
45
46/**
47 * Runs all benchmarks defined if and only if the --benchmark flag has
48 * been passed to the program. Usually put in main().
49 */
50inline bool runBenchmarksOnFlag() {
51 if (FLAGS_benchmark) {
52 runBenchmarks();
53 }
54 return FLAGS_benchmark;
55}
56
57class UserMetric {
58 public:
59 enum class Type { CUSTOM, TIME, METRIC };
60
61 int64_t value{};
62 Type type{Type::CUSTOM};
63
64 UserMetric() = default;
65 /* implicit */ UserMetric(int64_t val, Type typ = Type::CUSTOM)
66 : value(val), type(typ) {}
67};
68
69using UserCounters = std::unordered_map<std::string, UserMetric>;
70
71namespace detail {
72struct TimeIterData {
73 std::chrono::high_resolution_clock::duration duration;
74 unsigned int niter;
75 UserCounters userCounters;
76};
77
78using BenchmarkFun = std::function<TimeIterData(unsigned int)>;
79
80struct BenchmarkRegistration {
81 std::string file;
82 std::string name;
83 BenchmarkFun func;
84 bool useCounter = false;
85};
86
87struct BenchmarkResult {
88 std::string file;
89 std::string name;
90 double timeInNs;
91 UserCounters counters;
92};
93
94/**
95 * Adds a benchmark wrapped in a std::function. Only used
96 * internally. Pass by value is intentional.
97 */
98void addBenchmarkImpl(
99 const char* file,
100 StringPiece name,
101 BenchmarkFun,
102 bool useCounter);
103
104} // namespace detail
105
106/**
107 * Supporting type for BENCHMARK_SUSPEND defined below.
108 */
109struct BenchmarkSuspender {
110 using Clock = std::chrono::high_resolution_clock;
111 using TimePoint = Clock::time_point;
112 using Duration = Clock::duration;
113
114 BenchmarkSuspender() {
115 start = Clock::now();
116 }
117
118 BenchmarkSuspender(const BenchmarkSuspender&) = delete;
119 BenchmarkSuspender(BenchmarkSuspender&& rhs) noexcept {
120 start = rhs.start;
121 rhs.start = {};
122 }
123
124 BenchmarkSuspender& operator=(const BenchmarkSuspender&) = delete;
125 BenchmarkSuspender& operator=(BenchmarkSuspender&& rhs) {
126 if (start != TimePoint{}) {
127 tally();
128 }
129 start = rhs.start;
130 rhs.start = {};
131 return *this;
132 }
133
134 ~BenchmarkSuspender() {
135 if (start != TimePoint{}) {
136 tally();
137 }
138 }
139
140 void dismiss() {
141 assert(start != TimePoint{});
142 tally();
143 start = {};
144 }
145
146 void rehire() {
147 assert(start == TimePoint{});
148 start = Clock::now();
149 }
150
151 template <class F>
152 auto dismissing(F f) -> invoke_result_t<F> {
153 SCOPE_EXIT {
154 rehire();
155 };
156 dismiss();
157 return f();
158 }
159
160 /**
161 * This is for use inside of if-conditions, used in BENCHMARK macros.
162 * If-conditions bypass the explicit on operator bool.
163 */
164 explicit operator bool() const {
165 return false;
166 }
167
168 /**
169 * Accumulates time spent outside benchmark.
170 */
171 static Duration timeSpent;
172
173 private:
174 void tally() {
175 auto end = Clock::now();
176 timeSpent += end - start;
177 start = end;
178 }
179
180 TimePoint start;
181};
182
183/**
184 * Adds a benchmark. Usually not called directly but instead through
185 * the macro BENCHMARK defined below. The lambda function involved
186 * must take exactly one parameter of type unsigned, and the benchmark
187 * uses it with counter semantics (iteration occurs inside the
188 * function).
189 */
190template <typename Lambda>
191typename std::enable_if<folly::is_invocable<Lambda, unsigned>::value>::type
192addBenchmark(const char* file, StringPiece name, Lambda&& lambda) {
193 auto execute = [=](unsigned int times) {
194 BenchmarkSuspender::timeSpent = {};
195 unsigned int niter;
196
197 // CORE MEASUREMENT STARTS
198 auto start = std::chrono::high_resolution_clock::now();
199 niter = lambda(times);
200 auto end = std::chrono::high_resolution_clock::now();
201 // CORE MEASUREMENT ENDS
202 return detail::TimeIterData{
203 (end - start) - BenchmarkSuspender::timeSpent, niter, UserCounters{}};
204 };
205
206 detail::addBenchmarkImpl(file, name, detail::BenchmarkFun(execute), false);
207}
208
209/**
210 * Adds a benchmark. Usually not called directly but instead through
211 * the macro BENCHMARK defined below. The lambda function involved
212 * must take zero parameters, and the benchmark calls it repeatedly
213 * (iteration occurs outside the function).
214 */
215template <typename Lambda>
216typename std::enable_if<folly::is_invocable<Lambda>::value>::type
217addBenchmark(const char* file, StringPiece name, Lambda&& lambda) {
218 addBenchmark(file, name, [=](unsigned int times) {
219 unsigned int niter = 0;
220 while (times-- > 0) {
221 niter += lambda();
222 }
223 return niter;
224 });
225}
226
227/**
228 * similar as previous two template specialization, but lambda will also take
229 * customized counters in the following two cases
230 */
231template <typename Lambda>
232typename std::enable_if<
233 folly::is_invocable<Lambda, UserCounters&, unsigned>::value>::type
234addBenchmark(const char* file, StringPiece name, Lambda&& lambda) {
235 auto execute = [=](unsigned int times) {
236 BenchmarkSuspender::timeSpent = {};
237 unsigned int niter;
238
239 // CORE MEASUREMENT STARTS
240 auto start = std::chrono::high_resolution_clock::now();
241 UserCounters counters;
242 niter = lambda(counters, times);
243 auto end = std::chrono::high_resolution_clock::now();
244 // CORE MEASUREMENT ENDS
245 return detail::TimeIterData{
246 (end - start) - BenchmarkSuspender::timeSpent, niter, counters};
247 };
248
249 detail::addBenchmarkImpl(
250 file,
251 name,
252 std::function<detail::TimeIterData(unsigned int)>(execute),
253 true);
254}
255
256template <typename Lambda>
257typename std::enable_if<folly::is_invocable<Lambda, UserCounters&>::value>::type
258addBenchmark(const char* file, StringPiece name, Lambda&& lambda) {
259 addBenchmark(file, name, [=](UserCounters& counters, unsigned int times) {
260 unsigned int niter = 0;
261 while (times-- > 0) {
262 niter += lambda(counters);
263 }
264 return niter;
265 });
266}
267
268/**
269 * Call doNotOptimizeAway(var) to ensure that var will be computed even
270 * post-optimization. Use it for variables that are computed during
271 * benchmarking but otherwise are useless. The compiler tends to do a
272 * good job at eliminating unused variables, and this function fools it
273 * into thinking var is in fact needed.
274 *
275 * Call makeUnpredictable(var) when you don't want the optimizer to use
276 * its knowledge of var to shape the following code. This is useful
277 * when constant propagation or power reduction is possible during your
278 * benchmark but not in real use cases.
279 */
280
281#ifdef _MSC_VER
282
283#pragma optimize("", off)
284
285inline void doNotOptimizeDependencySink(const void*) {}
286
287#pragma optimize("", on)
288
289template <class T>
290void doNotOptimizeAway(const T& datum) {
291 doNotOptimizeDependencySink(&datum);
292}
293
294template <typename T>
295void makeUnpredictable(T& datum) {
296 doNotOptimizeDependencySink(&datum);
297}
298
299#else
300
301namespace detail {
302template <typename T>
303struct DoNotOptimizeAwayNeedsIndirect {
304 using Decayed = typename std::decay<T>::type;
305
306 // First two constraints ensure it can be an "r" operand.
307 // std::is_pointer check is because callers seem to expect that
308 // doNotOptimizeAway(&x) is equivalent to doNotOptimizeAway(x).
309 constexpr static bool value = !folly::is_trivially_copyable<Decayed>::value ||
310 sizeof(Decayed) > sizeof(long) || std::is_pointer<Decayed>::value;
311};
312} // namespace detail
313
314template <typename T>
315auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
316 !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
317 // The "r" constraint forces the compiler to make datum available
318 // in a register to the asm block, which means that it must have
319 // computed/loaded it. We use this path for things that are <=
320 // sizeof(long) (they have to fit), trivial (otherwise the compiler
321 // doesn't want to put them in a register), and not a pointer (because
322 // doNotOptimizeAway(&foo) would otherwise be a foot gun that didn't
323 // necessarily compute foo).
324 //
325 // An earlier version of this method had a more permissive input operand
326 // constraint, but that caused unnecessary variation between clang and
327 // gcc benchmarks.
328 asm volatile("" ::"r"(datum));
329}
330
331template <typename T>
332auto doNotOptimizeAway(const T& datum) -> typename std::enable_if<
333 detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
334 // This version of doNotOptimizeAway tells the compiler that the asm
335 // block will read datum from memory, and that in addition it might read
336 // or write from any memory location. If the memory clobber could be
337 // separated into input and output that would be preferrable.
338 asm volatile("" ::"m"(datum) : "memory");
339}
340
341template <typename T>
342auto makeUnpredictable(T& datum) -> typename std::enable_if<
343 !detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
344 asm volatile("" : "+r"(datum));
345}
346
347template <typename T>
348auto makeUnpredictable(T& datum) -> typename std::enable_if<
349 detail::DoNotOptimizeAwayNeedsIndirect<T>::value>::type {
350 asm volatile("" ::"m"(datum) : "memory");
351}
352
353#endif
354
355struct dynamic;
356
357void benchmarkResultsToDynamic(
358 const std::vector<detail::BenchmarkResult>& data,
359 dynamic&);
360
361void benchmarkResultsFromDynamic(
362 const dynamic&,
363 std::vector<detail::BenchmarkResult>&);
364
365void printResultComparison(
366 const std::vector<detail::BenchmarkResult>& base,
367 const std::vector<detail::BenchmarkResult>& test);
368
369} // namespace folly
370
371/**
372 * Introduces a benchmark function. Used internally, see BENCHMARK and
373 * friends below.
374 */
375
376#define BENCHMARK_IMPL(funName, stringName, rv, paramType, paramName) \
377 static void funName(paramType); \
378 FOLLY_MAYBE_UNUSED static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
379 (::folly::addBenchmark( \
380 __FILE__, \
381 stringName, \
382 [](paramType paramName) -> unsigned { \
383 funName(paramName); \
384 return rv; \
385 }), \
386 true); \
387 static void funName(paramType paramName)
388
389#define BENCHMARK_IMPL_COUNTERS( \
390 funName, stringName, counters, rv, paramType, paramName) \
391 static void funName( \
392 ::folly::UserCounters& FOLLY_PP_DETAIL_APPEND_VA_ARG(paramType)); \
393 FOLLY_MAYBE_UNUSED static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
394 (::folly::addBenchmark( \
395 __FILE__, \
396 stringName, \
397 [](::folly::UserCounters& counters FOLLY_PP_DETAIL_APPEND_VA_ARG( \
398 paramType paramName)) -> unsigned { \
399 funName(counters FOLLY_PP_DETAIL_APPEND_VA_ARG(paramName)); \
400 return rv; \
401 }), \
402 true); \
403 static void funName(::folly::UserCounters& counters \
404 FOLLY_PP_DETAIL_APPEND_VA_ARG(paramType paramName))
405
406/**
407 * Introduces a benchmark function with support for returning the actual
408 * number of iterations. Used internally, see BENCHMARK_MULTI and friends
409 * below.
410 */
411#define BENCHMARK_MULTI_IMPL(funName, stringName, paramType, paramName) \
412 static unsigned funName(paramType); \
413 FOLLY_MAYBE_UNUSED static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
414 (::folly::addBenchmark( \
415 __FILE__, \
416 stringName, \
417 [](paramType paramName) { return funName(paramName); }), \
418 true); \
419 static unsigned funName(paramType paramName)
420
421/**
422 * Introduces a benchmark function. Use with either one or two arguments.
423 * The first is the name of the benchmark. Use something descriptive, such
424 * as insertVectorBegin. The second argument may be missing, or could be a
425 * symbolic counter. The counter dictates how many internal iteration the
426 * benchmark does. Example:
427 *
428 * BENCHMARK(vectorPushBack) {
429 * vector<int> v;
430 * v.push_back(42);
431 * }
432 *
433 * BENCHMARK(insertVectorBegin, iters) {
434 * vector<int> v;
435 * FOR_EACH_RANGE (i, 0, iters) {
436 * v.insert(v.begin(), 42);
437 * }
438 * }
439 */
440#define BENCHMARK(name, ...) \
441 BENCHMARK_IMPL( \
442 name, \
443 FOLLY_PP_STRINGIZE(name), \
444 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
445 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
446 __VA_ARGS__)
447
448/**
449 * Allow users to record customized counter during benchmarking,
450 * there will be one extra column showing in the output result for each counter
451 *
452 * BENCHMARK_COUNTERS(insertVectorBegin, couters, iters) {
453 * vector<int> v;
454 * FOR_EACH_RANGE (i, 0, iters) {
455 * v.insert(v.begin(), 42);
456 * }
457 * BENCHMARK_SUSPEND {
458 * counters["foo"] = 10;
459 * }
460 * }
461 */
462#define BENCHMARK_COUNTERS(name, counters, ...) \
463 BENCHMARK_IMPL_COUNTERS( \
464 name, \
465 FOLLY_PP_STRINGIZE(name), \
466 counters, \
467 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
468 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
469 __VA_ARGS__)
470/**
471 * Like BENCHMARK above, but allows the user to return the actual
472 * number of iterations executed in the function body. This can be
473 * useful if the benchmark function doesn't know upfront how many
474 * iterations it's going to run or if it runs through a certain
475 * number of test cases, e.g.:
476 *
477 * BENCHMARK_MULTI(benchmarkSomething) {
478 * std::vector<int> testCases { 0, 1, 1, 2, 3, 5 };
479 * for (int c : testCases) {
480 * doSomething(c);
481 * }
482 * return testCases.size();
483 * }
484 */
485#define BENCHMARK_MULTI(name, ...) \
486 BENCHMARK_MULTI_IMPL( \
487 name, \
488 FOLLY_PP_STRINGIZE(name), \
489 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
490 __VA_ARGS__)
491
492/**
493 * Defines a benchmark that passes a parameter to another one. This is
494 * common for benchmarks that need a "problem size" in addition to
495 * "number of iterations". Consider:
496 *
497 * void pushBack(uint32_t n, size_t initialSize) {
498 * vector<int> v;
499 * BENCHMARK_SUSPEND {
500 * v.resize(initialSize);
501 * }
502 * FOR_EACH_RANGE (i, 0, n) {
503 * v.push_back(i);
504 * }
505 * }
506 * BENCHMARK_PARAM(pushBack, 0)
507 * BENCHMARK_PARAM(pushBack, 1000)
508 * BENCHMARK_PARAM(pushBack, 1000000)
509 *
510 * The benchmark above estimates the speed of push_back at different
511 * initial sizes of the vector. The framework will pass 0, 1000, and
512 * 1000000 for initialSize, and the iteration count for n.
513 */
514#define BENCHMARK_PARAM(name, param) BENCHMARK_NAMED_PARAM(name, param, param)
515
516/**
517 * Same as BENCHMARK_PARAM, but allows one to return the actual number of
518 * iterations that have been run.
519 */
520#define BENCHMARK_PARAM_MULTI(name, param) \
521 BENCHMARK_NAMED_PARAM_MULTI(name, param, param)
522
523/*
524 * Like BENCHMARK_PARAM(), but allows a custom name to be specified for each
525 * parameter, rather than using the parameter value.
526 *
527 * Useful when the parameter value is not a valid token for string pasting,
528 * of when you want to specify multiple parameter arguments.
529 *
530 * For example:
531 *
532 * void addValue(uint32_t n, int64_t bucketSize, int64_t min, int64_t max) {
533 * Histogram<int64_t> hist(bucketSize, min, max);
534 * int64_t num = min;
535 * FOR_EACH_RANGE (i, 0, n) {
536 * hist.addValue(num);
537 * ++num;
538 * if (num > max) { num = min; }
539 * }
540 * }
541 *
542 * BENCHMARK_NAMED_PARAM(addValue, 0_to_100, 1, 0, 100)
543 * BENCHMARK_NAMED_PARAM(addValue, 0_to_1000, 10, 0, 1000)
544 * BENCHMARK_NAMED_PARAM(addValue, 5k_to_20k, 250, 5000, 20000)
545 */
546#define BENCHMARK_NAMED_PARAM(name, param_name, ...) \
547 BENCHMARK_IMPL( \
548 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
549 FOLLY_PP_STRINGIZE(name) "(" FOLLY_PP_STRINGIZE(param_name) ")", \
550 iters, \
551 unsigned, \
552 iters) { \
553 name(iters, ##__VA_ARGS__); \
554 }
555
556/**
557 * Same as BENCHMARK_NAMED_PARAM, but allows one to return the actual number
558 * of iterations that have been run.
559 */
560#define BENCHMARK_NAMED_PARAM_MULTI(name, param_name, ...) \
561 BENCHMARK_MULTI_IMPL( \
562 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
563 FOLLY_PP_STRINGIZE(name) "(" FOLLY_PP_STRINGIZE(param_name) ")", \
564 unsigned, \
565 iters) { \
566 return name(iters, ##__VA_ARGS__); \
567 }
568
569/**
570 * Just like BENCHMARK, but prints the time relative to a
571 * baseline. The baseline is the most recent BENCHMARK() seen in
572 * the current scope. Example:
573 *
574 * // This is the baseline
575 * BENCHMARK(insertVectorBegin, n) {
576 * vector<int> v;
577 * FOR_EACH_RANGE (i, 0, n) {
578 * v.insert(v.begin(), 42);
579 * }
580 * }
581 *
582 * BENCHMARK_RELATIVE(insertListBegin, n) {
583 * list<int> s;
584 * FOR_EACH_RANGE (i, 0, n) {
585 * s.insert(s.begin(), 42);
586 * }
587 * }
588 *
589 * Any number of relative benchmark can be associated with a
590 * baseline. Another BENCHMARK() occurrence effectively establishes a
591 * new baseline.
592 */
593#define BENCHMARK_RELATIVE(name, ...) \
594 BENCHMARK_IMPL( \
595 name, \
596 "%" FOLLY_PP_STRINGIZE(name), \
597 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
598 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
599 __VA_ARGS__)
600
601#define BENCHMARK_COUNTERS_RELATIVE(name, counters, ...) \
602 BENCHMARK_IMPL_COUNTERS( \
603 name, \
604 "%" FOLLY_PP_STRINGIZE(name), \
605 counters, \
606 FB_ARG_2_OR_1(1, ##__VA_ARGS__), \
607 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
608 __VA_ARGS__)
609/**
610 * Same as BENCHMARK_RELATIVE, but allows one to return the actual number
611 * of iterations that have been run.
612 */
613#define BENCHMARK_RELATIVE_MULTI(name, ...) \
614 BENCHMARK_MULTI_IMPL( \
615 name, \
616 "%" FOLLY_PP_STRINGIZE(name), \
617 FB_ONE_OR_NONE(unsigned, ##__VA_ARGS__), \
618 __VA_ARGS__)
619
620/**
621 * A combination of BENCHMARK_RELATIVE and BENCHMARK_PARAM.
622 */
623#define BENCHMARK_RELATIVE_PARAM(name, param) \
624 BENCHMARK_RELATIVE_NAMED_PARAM(name, param, param)
625
626/**
627 * Same as BENCHMARK_RELATIVE_PARAM, but allows one to return the actual
628 * number of iterations that have been run.
629 */
630#define BENCHMARK_RELATIVE_PARAM_MULTI(name, param) \
631 BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param, param)
632
633/**
634 * A combination of BENCHMARK_RELATIVE and BENCHMARK_NAMED_PARAM.
635 */
636#define BENCHMARK_RELATIVE_NAMED_PARAM(name, param_name, ...) \
637 BENCHMARK_IMPL( \
638 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
639 "%" FOLLY_PP_STRINGIZE(name) "(" FOLLY_PP_STRINGIZE(param_name) ")", \
640 iters, \
641 unsigned, \
642 iters) { \
643 name(iters, ##__VA_ARGS__); \
644 }
645
646/**
647 * Same as BENCHMARK_RELATIVE_NAMED_PARAM, but allows one to return the
648 * actual number of iterations that have been run.
649 */
650#define BENCHMARK_RELATIVE_NAMED_PARAM_MULTI(name, param_name, ...) \
651 BENCHMARK_MULTI_IMPL( \
652 FB_CONCATENATE(name, FB_CONCATENATE(_, param_name)), \
653 "%" FOLLY_PP_STRINGIZE(name) "(" FOLLY_PP_STRINGIZE(param_name) ")", \
654 unsigned, \
655 iters) { \
656 return name(iters, ##__VA_ARGS__); \
657 }
658
659/**
660 * Draws a line of dashes.
661 */
662#define BENCHMARK_DRAW_LINE() \
663 FOLLY_MAYBE_UNUSED static bool FB_ANONYMOUS_VARIABLE(follyBenchmarkUnused) = \
664 (::folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }), \
665 true)
666
667/**
668 * Allows execution of code that doesn't count torward the benchmark's
669 * time budget. Example:
670 *
671 * BENCHMARK_START_GROUP(insertVectorBegin, n) {
672 * vector<int> v;
673 * BENCHMARK_SUSPEND {
674 * v.reserve(n);
675 * }
676 * FOR_EACH_RANGE (i, 0, n) {
677 * v.insert(v.begin(), 42);
678 * }
679 * }
680 */
681#define BENCHMARK_SUSPEND \
682 if (auto FB_ANONYMOUS_VARIABLE(BENCHMARK_SUSPEND) = \
683 ::folly::BenchmarkSuspender()) { \
684 } else
685