nanobenchmark.cc (10999B)
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include "hwy/nanobenchmark.h" 17 18 #include <stdio.h> 19 #include <stdlib.h> 20 #include <time.h> // clock_gettime 21 22 #include <algorithm> // std::sort, std::find_if 23 #include <numeric> // std::iota 24 #include <random> 25 #include <vector> 26 27 #include "hwy/base.h" 28 #include "hwy/robust_statistics.h" 29 #include "hwy/timer.h" 30 31 namespace hwy { 32 namespace { 33 const timer::Ticks& GetTimerResolution() { 34 static const timer::Ticks timer_resolution = platform::TimerResolution(); 35 return timer_resolution; 36 } 37 38 // Estimates the expected value of "lambda" values with a variable number of 39 // samples until the variability "rel_mad" is less than "max_rel_mad". 40 template <class Lambda> 41 timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, 42 const Params& p, const Lambda& lambda) { 43 // Choose initial samples_per_eval based on a single estimated duration. 44 timer::Ticks t0 = timer::Start(); 45 lambda(); 46 timer::Ticks t1 = timer::Stop(); // Caller checks HaveTimerStop 47 timer::Ticks est = t1 - t0; 48 static const double ticks_per_second = platform::InvariantTicksPerSecond(); 49 const size_t ticks_per_eval = 50 static_cast<size_t>(ticks_per_second * p.seconds_per_eval); 51 size_t samples_per_eval = est == 0 52 ? p.min_samples_per_eval 53 : static_cast<size_t>(ticks_per_eval / est); 54 samples_per_eval = HWY_MAX(samples_per_eval, p.min_samples_per_eval); 55 56 std::vector<timer::Ticks> samples; 57 samples.reserve(1 + samples_per_eval); 58 samples.push_back(est); 59 60 // Percentage is too strict for tiny differences, so also allow a small 61 // absolute "median absolute deviation". 62 const timer::Ticks max_abs_mad = (GetTimerResolution() + 99) / 100; 63 *rel_mad = 0.0; // ensure initialized 64 65 for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { 66 samples.reserve(samples.size() + samples_per_eval); 67 for (size_t i = 0; i < samples_per_eval; ++i) { 68 t0 = timer::Start(); 69 lambda(); 70 t1 = timer::Stop(); // Caller checks HaveTimerStop 71 samples.push_back(t1 - t0); 72 } 73 74 if (samples.size() >= p.min_mode_samples) { 75 est = robust_statistics::Mode(samples.data(), samples.size()); 76 } else { 77 // For "few" (depends also on the variance) samples, Median is safer. 78 est = robust_statistics::Median(samples.data(), samples.size()); 79 } 80 if (est == 0) { 81 HWY_WARN("estimated duration is 0\n"); 82 } 83 84 // Median absolute deviation (mad) is a robust measure of 'variability'. 85 const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( 86 samples.data(), samples.size(), est); 87 *rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est); 88 89 if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) { 90 if (p.verbose) { 91 printf("%6d samples => %5d (abs_mad=%4d, rel_mad=%4.2f%%)\n", 92 static_cast<int>(samples.size()), static_cast<int>(est), 93 static_cast<int>(abs_mad), *rel_mad * 100.0); 94 } 95 return est; 96 } 97 } 98 99 if (p.verbose) { 100 printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6d samples\n", 101 *rel_mad * 100.0, max_rel_mad * 100.0, 102 static_cast<int>(samples.size())); 103 } 104 return est; 105 } 106 107 using InputVec = std::vector<FuncInput>; 108 109 // Returns vector of unique input values. 110 InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) { 111 InputVec unique(inputs, inputs + num_inputs); 112 std::sort(unique.begin(), unique.end()); 113 unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); 114 return unique; 115 } 116 117 // Returns how often we need to call func for sufficient precision. 118 size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, 119 const Params& p) { 120 // Min elapsed ticks for any input. 121 timer::Ticks min_duration = ~timer::Ticks(0); 122 123 for (const FuncInput input : unique) { 124 double rel_mad; 125 const timer::Ticks total = SampleUntilStable( 126 p.target_rel_mad, &rel_mad, p, 127 [func, arg, input]() { PreventElision(func(arg, input)); }); 128 min_duration = HWY_MIN(min_duration, total - GetTimerResolution()); 129 } 130 131 // Number of repetitions required to reach the target resolution. 132 const size_t max_skip = p.precision_divisor; 133 // Number of repetitions given the estimated duration. 134 const size_t num_skip = 135 min_duration == 0 136 ? 0 137 : static_cast<size_t>((max_skip + min_duration - 1) / min_duration); 138 if (p.verbose) { 139 printf("res=%d max_skip=%d min_dur=%d num_skip=%d\n", 140 static_cast<int>(GetTimerResolution()), static_cast<int>(max_skip), 141 static_cast<int>(min_duration), static_cast<int>(num_skip)); 142 } 143 return num_skip; 144 } 145 146 // Replicates inputs until we can omit "num_skip" occurrences of an input. 147 InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs, 148 const size_t num_unique, const size_t num_skip, 149 const Params& p) { 150 InputVec full; 151 if (num_unique == 1) { 152 full.assign(p.subset_ratio * num_skip, inputs[0]); 153 return full; 154 } 155 156 full.reserve(p.subset_ratio * num_skip * num_inputs); 157 for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) { 158 full.insert(full.end(), inputs, inputs + num_inputs); 159 } 160 std::mt19937 rng; 161 std::shuffle(full.begin(), full.end(), rng); 162 return full; 163 } 164 165 // Copies the "full" to "subset" in the same order, but with "num_skip" 166 // randomly selected occurrences of "input_to_skip" removed. 167 void FillSubset(const InputVec& full, const FuncInput input_to_skip, 168 const size_t num_skip, InputVec* subset) { 169 const size_t count = 170 static_cast<size_t>(std::count(full.begin(), full.end(), input_to_skip)); 171 // Generate num_skip random indices: which occurrence to skip. 172 std::vector<uint32_t> omit(count); 173 std::iota(omit.begin(), omit.end(), 0); 174 // omit[] is the same on every call, but that's OK because they identify the 175 // Nth instance of input_to_skip, so the position within full[] differs. 176 std::mt19937 rng; 177 std::shuffle(omit.begin(), omit.end(), rng); 178 omit.resize(num_skip); 179 std::sort(omit.begin(), omit.end()); 180 181 uint32_t occurrence = ~0u; // 0 after preincrement 182 size_t idx_omit = 0; // cursor within omit[] 183 size_t idx_subset = 0; // cursor within *subset 184 for (const FuncInput next : full) { 185 if (next == input_to_skip) { 186 ++occurrence; 187 // Haven't removed enough already 188 if (idx_omit < num_skip) { 189 // This one is up for removal 190 if (occurrence == omit[idx_omit]) { 191 ++idx_omit; 192 continue; 193 } 194 } 195 } 196 if (idx_subset < subset->size()) { 197 (*subset)[idx_subset++] = next; 198 } 199 } 200 HWY_DASSERT(idx_subset == subset->size()); 201 HWY_DASSERT(idx_omit == omit.size()); 202 HWY_DASSERT(occurrence == count - 1); 203 } 204 205 // Returns total ticks elapsed for all inputs. 206 timer::Ticks TotalDuration(const Func func, const uint8_t* arg, 207 const InputVec* inputs, const Params& p, 208 double* max_rel_mad) { 209 double rel_mad; 210 const timer::Ticks duration = 211 SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() { 212 for (const FuncInput input : *inputs) { 213 PreventElision(func(arg, input)); 214 } 215 }); 216 *max_rel_mad = HWY_MAX(*max_rel_mad, rel_mad); 217 return duration; 218 } 219 220 // (Nearly) empty Func for measuring timer overhead/resolution. 221 HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) { 222 return input; 223 } 224 225 // Returns overhead of accessing inputs[] and calling a function; this will 226 // be deducted from future TotalDuration return values. 227 timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, 228 const Params& p) { 229 double rel_mad; 230 // Zero tolerance because repeatability is crucial and EmptyFunc is fast. 231 return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() { 232 for (const FuncInput input : *inputs) { 233 PreventElision(EmptyFunc(arg, input)); 234 } 235 }); 236 } 237 238 } // namespace 239 240 HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; } 241 242 HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, 243 const FuncInput* inputs, const size_t num_inputs, 244 Result* results, const Params& p) { 245 HWY_DASSERT(num_inputs != 0); 246 247 char cpu100[100]; 248 if (!platform::HaveTimerStop(cpu100)) { 249 HWY_WARN("CPU '%s' does not support RDTSCP, skipping benchmark.\n", cpu100); 250 return 0; 251 } 252 253 const InputVec& unique = UniqueInputs(inputs, num_inputs); 254 255 const size_t num_skip = NumSkip(func, arg, unique, p); // never 0 256 if (num_skip == 0) return 0; // NumSkip already printed error message 257 // (slightly less work on x86 to cast from signed integer) 258 const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip)); 259 260 const InputVec& full = 261 ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); 262 InputVec subset(full.size() - num_skip); 263 264 const timer::Ticks overhead = Overhead(arg, &full, p); 265 const timer::Ticks overhead_skip = Overhead(arg, &subset, p); 266 if (overhead < overhead_skip) { 267 HWY_WARN("Measurement failed: overhead %d < %d\n", 268 static_cast<int>(overhead), static_cast<int>(overhead_skip)); 269 return 0; 270 } 271 272 if (p.verbose) { 273 printf("#inputs=%5d,%5d overhead=%5d,%5d\n", static_cast<int>(full.size()), 274 static_cast<int>(subset.size()), static_cast<int>(overhead), 275 static_cast<int>(overhead_skip)); 276 } 277 278 double max_rel_mad = 0.0; 279 const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); 280 281 for (size_t i = 0; i < unique.size(); ++i) { 282 FillSubset(full, unique[i], num_skip, &subset); 283 const timer::Ticks total_skip = 284 TotalDuration(func, arg, &subset, p, &max_rel_mad); 285 286 if (total < total_skip) { 287 HWY_WARN("Measurement failed: total %f < %f\n", 288 static_cast<double>(total), static_cast<double>(total_skip)); 289 return 0; 290 } 291 292 const timer::Ticks duration = 293 (total - overhead) - (total_skip - overhead_skip); 294 results[i].input = unique[i]; 295 results[i].ticks = static_cast<float>(duration) * mul; 296 results[i].variability = static_cast<float>(max_rel_mad); 297 } 298 299 return unique.size(); 300 } 301 302 } // namespace hwy