benchmark.cc (7867B)
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <stdint.h> 17 #include <stdio.h> 18 #include <stdlib.h> // abort 19 20 #include <cmath> // std::abs 21 #include <memory> 22 #include <numeric> // std::iota, std::inner_product 23 24 #undef HWY_TARGET_INCLUDE 25 #define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc" 26 #include "hwy/foreach_target.h" // IWYU pragma: keep 27 28 // Must come after foreach_target.h to avoid redefinition errors. 29 #include "hwy/aligned_allocator.h" 30 #include "hwy/highway.h" 31 #include "hwy/nanobenchmark.h" 32 33 HWY_BEFORE_NAMESPACE(); 34 namespace hwy { 35 namespace HWY_NAMESPACE { 36 namespace { 37 38 // These templates are not found via ADL. 39 #if HWY_TARGET != HWY_SCALAR 40 using hwy::HWY_NAMESPACE::CombineShiftRightLanes; 41 #endif 42 43 class TwoArray { 44 public: 45 // Must be a multiple of the vector lane count * 8. 46 static size_t NumItems() { return 3456; } 47 48 TwoArray() 49 : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) { 50 // = 1, but compiler doesn't know 51 const float init = static_cast<float>(Unpredictable1()); 52 std::iota(a_.get(), a_.get() + NumItems(), init); 53 std::iota(b_, b_ + NumItems(), init); 54 } 55 56 protected: 57 AlignedFreeUniquePtr<float[]> a_; 58 float* b_; 59 }; 60 61 // Measures durations, verifies results, prints timings. 62 template <class Benchmark> 63 void RunBenchmark(const char* caption) { 64 printf("%10s: ", caption); 65 const size_t kNumInputs = 1; 66 const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1()); 67 const FuncInput inputs[kNumInputs] = {num_items}; 68 Result results[kNumInputs]; 69 70 Benchmark benchmark; 71 72 Params p; 73 p.verbose = false; 74 p.max_evals = 7; 75 p.target_rel_mad = 0.002; 76 const size_t num_results = MeasureClosure( 77 [&benchmark](const FuncInput input) { return benchmark(input); }, inputs, 78 kNumInputs, results, p); 79 if (num_results != kNumInputs) { 80 HWY_WARN("MeasureClosure failed.\n"); 81 } 82 83 benchmark.Verify(num_items); 84 85 for (size_t i = 0; i < num_results; ++i) { 86 const double cycles_per_item = 87 results[i].ticks / static_cast<double>(results[i].input); 88 const double mad = results[i].variability * cycles_per_item; 89 printf("%6d: %6.3f (+/- %5.3f)\n", static_cast<int>(results[i].input), 90 cycles_per_item, mad); 91 } 92 } 93 94 void Intro() { 95 const float in[16] = {1, 2, 3, 4, 5, 6}; 96 float out[16]; 97 const ScalableTag<float> d; // largest possible vector 98 for (size_t i = 0; i < 16; i += Lanes(d)) { 99 const auto vec = LoadU(d, in + i); // no alignment requirement 100 auto result = Mul(vec, vec); 101 result = Add(result, result); // can update if not const 102 StoreU(result, d, out + i); 103 } 104 printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]); 105 } 106 107 // BEGINNER: dot product 108 // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold! 109 class BenchmarkDot : public TwoArray { 110 public: 111 BenchmarkDot() : dot_{-1.0f} {} 112 113 FuncOutput operator()(const size_t num_items) { 114 const ScalableTag<float> d; 115 const size_t N = Lanes(d); 116 using V = decltype(Zero(d)); 117 // Compiler doesn't make independent sum* accumulators, so unroll manually. 118 // We cannot use an array because V might be a sizeless type. For reasonable 119 // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency). 120 V sum0 = Zero(d); 121 V sum1 = Zero(d); 122 V sum2 = Zero(d); 123 V sum3 = Zero(d); 124 const float* const HWY_RESTRICT pa = &a_[0]; 125 const float* const HWY_RESTRICT pb = b_; 126 for (size_t i = 0; i < num_items; i += 4 * N) { 127 const auto a0 = Load(d, pa + i + 0 * N); 128 const auto b0 = Load(d, pb + i + 0 * N); 129 sum0 = MulAdd(a0, b0, sum0); 130 const auto a1 = Load(d, pa + i + 1 * N); 131 const auto b1 = Load(d, pb + i + 1 * N); 132 sum1 = MulAdd(a1, b1, sum1); 133 const auto a2 = Load(d, pa + i + 2 * N); 134 const auto b2 = Load(d, pb + i + 2 * N); 135 sum2 = MulAdd(a2, b2, sum2); 136 const auto a3 = Load(d, pa + i + 3 * N); 137 const auto b3 = Load(d, pb + i + 3 * N); 138 sum3 = MulAdd(a3, b3, sum3); 139 } 140 // Reduction tree: sum of all accumulators by pairs into sum0. 141 sum0 = Add(sum0, sum1); 142 sum2 = Add(sum2, sum3); 143 sum0 = Add(sum0, sum2); 144 // Remember to store the result in `dot_` for verification; see `Verify`. 145 dot_ = ReduceSum(d, sum0); 146 // Return the result so that the benchmarking framework can ensure that the 147 // computation is not elided by the compiler. 148 return static_cast<FuncOutput>(dot_); 149 } 150 void Verify(size_t num_items) { 151 if (dot_ == -1.0f) { 152 HWY_ABORT("Dot: must call Verify after benchmark"); 153 } 154 155 const float expected = 156 std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f); 157 const float rel_err = std::abs(expected - dot_) / expected; 158 if (rel_err > 1.1E-6f) { 159 HWY_ABORT("Dot: expected %e actual %e (%e)\n", expected, dot_, rel_err); 160 } 161 } 162 163 private: 164 float dot_; // for Verify 165 }; 166 167 // INTERMEDIATE: delta coding 168 // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold! 169 struct BenchmarkDelta : public TwoArray { 170 FuncOutput operator()(const size_t num_items) const { 171 #if HWY_TARGET == HWY_SCALAR 172 b_[0] = a_[0]; 173 for (size_t i = 1; i < num_items; ++i) { 174 b_[i] = a_[i] - a_[i - 1]; 175 } 176 #elif HWY_CAP_GE256 177 // Larger vectors are split into 128-bit blocks, easiest to use the 178 // unaligned load support to shift between them. 179 const ScalableTag<float> df; 180 const size_t N = Lanes(df); 181 size_t i; 182 b_[0] = a_[0]; 183 for (i = 1; i < N; ++i) { 184 b_[i] = a_[i] - a_[i - 1]; 185 } 186 for (; i < num_items; i += N) { 187 const auto a = Load(df, &a_[i]); 188 const auto shifted = LoadU(df, &a_[i - 1]); 189 Store(a - shifted, df, &b_[i]); 190 } 191 #else // 128-bit 192 // Slightly better than unaligned loads 193 const HWY_CAPPED(float, 4) df; 194 const size_t N = Lanes(df); 195 size_t i; 196 b_[0] = a_[0]; 197 for (i = 1; i < N; ++i) { 198 b_[i] = a_[i] - a_[i - 1]; 199 } 200 auto prev = Load(df, &a_[0]); 201 for (; i < num_items; i += Lanes(df)) { 202 const auto a = Load(df, &a_[i]); 203 const auto shifted = CombineShiftRightLanes<3>(df, a, prev); 204 prev = a; 205 Store(Sub(a, shifted), df, &b_[i]); 206 } 207 #endif 208 return static_cast<FuncOutput>(b_[num_items - 1]); 209 } 210 211 void Verify(size_t num_items) { 212 for (size_t i = 0; i < num_items; ++i) { 213 const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1]; 214 const float err = std::abs(expected - b_[i]); 215 if (err > 1E-6f) { 216 HWY_WARN("Delta: expected %e, actual %e\n", expected, b_[i]); 217 } 218 } 219 } 220 }; 221 222 void RunBenchmarks() { 223 Intro(); 224 printf("------------------------ %s\n", TargetName(HWY_TARGET)); 225 RunBenchmark<BenchmarkDot>("dot"); 226 RunBenchmark<BenchmarkDelta>("delta"); 227 } 228 229 } // namespace 230 // NOLINTNEXTLINE(google-readability-namespace-comments) 231 } // namespace HWY_NAMESPACE 232 } // namespace hwy 233 HWY_AFTER_NAMESPACE(); 234 235 #if HWY_ONCE 236 namespace hwy { 237 namespace { 238 HWY_EXPORT(RunBenchmarks); 239 240 void Run() { 241 for (int64_t target : SupportedAndGeneratedTargets()) { 242 SetSupportedTargetsForTest(target); 243 HWY_DYNAMIC_DISPATCH(RunBenchmarks)(); 244 } 245 SetSupportedTargetsForTest(0); // Reset the mask afterwards. 246 } 247 248 } // namespace 249 } // namespace hwy 250 251 int main(int /*argc*/, char** /*argv*/) { 252 hwy::Run(); 253 return 0; 254 } 255 #endif // HWY_ONCE