nanobenchmark.h (6775B)
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef HIGHWAY_HWY_NANOBENCHMARK_H_ 17 #define HIGHWAY_HWY_NANOBENCHMARK_H_ 18 19 // Benchmarks functions of a single integer argument with realistic branch 20 // prediction hit rates. Uses a robust estimator to summarize the measurements. 21 // The precision is about 0.2%. 22 // 23 // Examples: see nanobenchmark_test.cc. 24 // 25 // Background: Microbenchmarks such as http://github.com/google/benchmark 26 // can measure elapsed times on the order of a microsecond. Shorter functions 27 // are typically measured by repeating them thousands of times and dividing 28 // the total elapsed time by this count. Unfortunately, repetition (especially 29 // with the same input parameter!) influences the runtime. In time-critical 30 // code, it is reasonable to expect warm instruction/data caches and TLBs, 31 // but a perfect record of which branches will be taken is unrealistic. 32 // Unless the application also repeatedly invokes the measured function with 33 // the same parameter, the benchmark is measuring something very different - 34 // a best-case result, almost as if the parameter were made a compile-time 35 // constant. This may lead to erroneous conclusions about branch-heavy 36 // algorithms outperforming branch-free alternatives. 37 // 38 // Our approach differs in three ways. Adding fences to the timer functions 39 // reduces variability due to instruction reordering, improving the timer 40 // resolution to about 40 CPU cycles. However, shorter functions must still 41 // be invoked repeatedly. For more realistic branch prediction performance, 42 // we vary the input parameter according to a user-specified distribution. 43 // Thus, instead of VaryInputs(Measure(Repeat(func))), we change the 44 // loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the 45 // central tendency of the measurement samples with the "half sample mode", 46 // which is more robust to outliers and skewed data than the mean or median. 47 48 #include <stddef.h> 49 #include <stdint.h> 50 51 #include "hwy/highway_export.h" 52 #include "hwy/timer.h" // IWYU pragma: export 53 54 namespace hwy { 55 56 // Returns 1, but without the compiler knowing what the value is. This prevents 57 // optimizing out code. 58 HWY_DLLEXPORT int Unpredictable1(); 59 60 // Input influencing the function being measured (e.g. number of bytes to copy). 61 using FuncInput = size_t; 62 63 // "Proof of work" returned by Func to ensure the compiler does not elide it. 64 using FuncOutput = uint64_t; 65 66 // Function to measure: either 1) a captureless lambda or function with two 67 // arguments or 2) a lambda with capture, in which case the first argument 68 // is reserved for use by MeasureClosure. 69 using Func = FuncOutput (*)(const void*, FuncInput); 70 71 // Internal parameters that determine precision/resolution/measuring time. 72 struct Params { 73 // Best-case precision, expressed as a divisor of the timer resolution. 74 // Larger => more calls to Func and higher precision. 75 size_t precision_divisor = 1024; 76 77 // Ratio between full and subset input distribution sizes. Cannot be less 78 // than 2; larger values increase measurement time but more faithfully 79 // model the given input distribution. 80 size_t subset_ratio = 2; 81 82 // Together with the estimated Func duration, determines how many times to 83 // call Func before checking the sample variability. Larger values increase 84 // measurement time, memory/cache use and precision. 85 double seconds_per_eval = 4E-3; 86 87 // The minimum number of samples before estimating the central tendency. 88 size_t min_samples_per_eval = 7; 89 90 // The mode is better than median for estimating the central tendency of 91 // skewed/fat-tailed distributions, but it requires sufficient samples 92 // relative to the width of half-ranges. 93 size_t min_mode_samples = 64; 94 95 // Maximum permissible variability (= median absolute deviation / center). 96 double target_rel_mad = 0.002; 97 98 // Abort after this many evals without reaching target_rel_mad. This 99 // prevents infinite loops. 100 size_t max_evals = 9; 101 102 // Whether to print additional statistics to stdout. 103 bool verbose = true; 104 }; 105 106 // Measurement result for each unique input. 107 struct Result { 108 FuncInput input; 109 110 // Robust estimate (mode or median) of duration. 111 float ticks; 112 113 // Measure of variability (median absolute deviation relative to "ticks"). 114 float variability; 115 }; 116 117 // Precisely measures the number of ticks elapsed when calling "func" with the 118 // given inputs, shuffled to ensure realistic branch prediction hit rates. 119 // 120 // "func" returns a 'proof of work' to ensure its computations are not elided. 121 // "arg" is passed to Func, or reserved for internal use by MeasureClosure. 122 // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to 123 // "func". The values should be chosen to maximize coverage of "func". This 124 // represents a distribution, so a value's frequency should reflect its 125 // probability in the real application. Order does not matter; for example, a 126 // uniform distribution over [0, 4) could be represented as {3,0,2,1}. 127 // Returns how many Result were written to "results": one per unique input, or 128 // zero if the measurement failed (an error message goes to stderr). 129 HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg, 130 const FuncInput* inputs, size_t num_inputs, 131 Result* results, const Params& p = Params()); 132 133 // Calls operator() of the given closure (lambda function). 134 template <class Closure> 135 static FuncOutput CallClosure(const void* f, const FuncInput input) { 136 return (*reinterpret_cast<const Closure*>(f))(input); 137 } 138 139 // Same as Measure, except "closure" is typically a lambda function of 140 // FuncInput -> FuncOutput with a capture list. 141 template <class Closure> 142 static inline size_t MeasureClosure(const Closure& closure, 143 const FuncInput* inputs, 144 const size_t num_inputs, Result* results, 145 const Params& p = Params()) { 146 return Measure(static_cast<Func>(&CallClosure<Closure>), 147 reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs, 148 results, p); 149 } 150 151 } // namespace hwy 152 153 #endif // HIGHWAY_HWY_NANOBENCHMARK_H_