tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

benchmark.cc (7867B)


      1 // Copyright 2019 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include <stdint.h>
     17 #include <stdio.h>
     18 #include <stdlib.h>  // abort
     19 
     20 #include <cmath>  // std::abs
     21 #include <memory>
     22 #include <numeric>  // std::iota, std::inner_product
     23 
     24 #undef HWY_TARGET_INCLUDE
     25 #define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
     26 #include "hwy/foreach_target.h"  // IWYU pragma: keep
     27 
     28 // Must come after foreach_target.h to avoid redefinition errors.
     29 #include "hwy/aligned_allocator.h"
     30 #include "hwy/highway.h"
     31 #include "hwy/nanobenchmark.h"
     32 
     33 HWY_BEFORE_NAMESPACE();
     34 namespace hwy {
     35 namespace HWY_NAMESPACE {
     36 namespace {
     37 
     38 // These templates are not found via ADL.
     39 #if HWY_TARGET != HWY_SCALAR
     40 using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
     41 #endif
     42 
     43 class TwoArray {
     44 public:
     45  // Must be a multiple of the vector lane count * 8.
     46  static size_t NumItems() { return 3456; }
     47 
     48  TwoArray()
     49      : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
     50    // = 1, but compiler doesn't know
     51    const float init = static_cast<float>(Unpredictable1());
     52    std::iota(a_.get(), a_.get() + NumItems(), init);
     53    std::iota(b_, b_ + NumItems(), init);
     54  }
     55 
     56 protected:
     57  AlignedFreeUniquePtr<float[]> a_;
     58  float* b_;
     59 };
     60 
     61 // Measures durations, verifies results, prints timings.
     62 template <class Benchmark>
     63 void RunBenchmark(const char* caption) {
     64  printf("%10s: ", caption);
     65  const size_t kNumInputs = 1;
     66  const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
     67  const FuncInput inputs[kNumInputs] = {num_items};
     68  Result results[kNumInputs];
     69 
     70  Benchmark benchmark;
     71 
     72  Params p;
     73  p.verbose = false;
     74  p.max_evals = 7;
     75  p.target_rel_mad = 0.002;
     76  const size_t num_results = MeasureClosure(
     77      [&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
     78      kNumInputs, results, p);
     79  if (num_results != kNumInputs) {
     80    HWY_WARN("MeasureClosure failed.\n");
     81  }
     82 
     83  benchmark.Verify(num_items);
     84 
     85  for (size_t i = 0; i < num_results; ++i) {
     86    const double cycles_per_item =
     87        results[i].ticks / static_cast<double>(results[i].input);
     88    const double mad = results[i].variability * cycles_per_item;
     89    printf("%6d: %6.3f (+/- %5.3f)\n", static_cast<int>(results[i].input),
     90           cycles_per_item, mad);
     91  }
     92 }
     93 
     94 void Intro() {
     95  const float in[16] = {1, 2, 3, 4, 5, 6};
     96  float out[16];
     97  const ScalableTag<float> d;  // largest possible vector
     98  for (size_t i = 0; i < 16; i += Lanes(d)) {
     99    const auto vec = LoadU(d, in + i);  // no alignment requirement
    100    auto result = Mul(vec, vec);
    101    result = Add(result, result);  // can update if not const
    102    StoreU(result, d, out + i);
    103  }
    104  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
    105 }
    106 
    107 // BEGINNER: dot product
    108 // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
    109 class BenchmarkDot : public TwoArray {
    110 public:
    111  BenchmarkDot() : dot_{-1.0f} {}
    112 
    113  FuncOutput operator()(const size_t num_items) {
    114    const ScalableTag<float> d;
    115    const size_t N = Lanes(d);
    116    using V = decltype(Zero(d));
    117    // Compiler doesn't make independent sum* accumulators, so unroll manually.
    118    // We cannot use an array because V might be a sizeless type. For reasonable
    119    // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
    120    V sum0 = Zero(d);
    121    V sum1 = Zero(d);
    122    V sum2 = Zero(d);
    123    V sum3 = Zero(d);
    124    const float* const HWY_RESTRICT pa = &a_[0];
    125    const float* const HWY_RESTRICT pb = b_;
    126    for (size_t i = 0; i < num_items; i += 4 * N) {
    127      const auto a0 = Load(d, pa + i + 0 * N);
    128      const auto b0 = Load(d, pb + i + 0 * N);
    129      sum0 = MulAdd(a0, b0, sum0);
    130      const auto a1 = Load(d, pa + i + 1 * N);
    131      const auto b1 = Load(d, pb + i + 1 * N);
    132      sum1 = MulAdd(a1, b1, sum1);
    133      const auto a2 = Load(d, pa + i + 2 * N);
    134      const auto b2 = Load(d, pb + i + 2 * N);
    135      sum2 = MulAdd(a2, b2, sum2);
    136      const auto a3 = Load(d, pa + i + 3 * N);
    137      const auto b3 = Load(d, pb + i + 3 * N);
    138      sum3 = MulAdd(a3, b3, sum3);
    139    }
    140    // Reduction tree: sum of all accumulators by pairs into sum0.
    141    sum0 = Add(sum0, sum1);
    142    sum2 = Add(sum2, sum3);
    143    sum0 = Add(sum0, sum2);
    144    // Remember to store the result in `dot_` for verification; see `Verify`.
    145    dot_ = ReduceSum(d, sum0);
    146    // Return the result so that the benchmarking framework can ensure that the
    147    // computation is not elided by the compiler.
    148    return static_cast<FuncOutput>(dot_);
    149  }
    150  void Verify(size_t num_items) {
    151    if (dot_ == -1.0f) {
    152      HWY_ABORT("Dot: must call Verify after benchmark");
    153    }
    154 
    155    const float expected =
    156        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
    157    const float rel_err = std::abs(expected - dot_) / expected;
    158    if (rel_err > 1.1E-6f) {
    159      HWY_ABORT("Dot: expected %e actual %e (%e)\n", expected, dot_, rel_err);
    160    }
    161  }
    162 
    163 private:
    164  float dot_;  // for Verify
    165 };
    166 
    167 // INTERMEDIATE: delta coding
    168 // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
    169 struct BenchmarkDelta : public TwoArray {
    170  FuncOutput operator()(const size_t num_items) const {
    171 #if HWY_TARGET == HWY_SCALAR
    172    b_[0] = a_[0];
    173    for (size_t i = 1; i < num_items; ++i) {
    174      b_[i] = a_[i] - a_[i - 1];
    175    }
    176 #elif HWY_CAP_GE256
    177    // Larger vectors are split into 128-bit blocks, easiest to use the
    178    // unaligned load support to shift between them.
    179    const ScalableTag<float> df;
    180    const size_t N = Lanes(df);
    181    size_t i;
    182    b_[0] = a_[0];
    183    for (i = 1; i < N; ++i) {
    184      b_[i] = a_[i] - a_[i - 1];
    185    }
    186    for (; i < num_items; i += N) {
    187      const auto a = Load(df, &a_[i]);
    188      const auto shifted = LoadU(df, &a_[i - 1]);
    189      Store(a - shifted, df, &b_[i]);
    190    }
    191 #else  // 128-bit
    192    // Slightly better than unaligned loads
    193    const HWY_CAPPED(float, 4) df;
    194    const size_t N = Lanes(df);
    195    size_t i;
    196    b_[0] = a_[0];
    197    for (i = 1; i < N; ++i) {
    198      b_[i] = a_[i] - a_[i - 1];
    199    }
    200    auto prev = Load(df, &a_[0]);
    201    for (; i < num_items; i += Lanes(df)) {
    202      const auto a = Load(df, &a_[i]);
    203      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
    204      prev = a;
    205      Store(Sub(a, shifted), df, &b_[i]);
    206    }
    207 #endif
    208    return static_cast<FuncOutput>(b_[num_items - 1]);
    209  }
    210 
    211  void Verify(size_t num_items) {
    212    for (size_t i = 0; i < num_items; ++i) {
    213      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
    214      const float err = std::abs(expected - b_[i]);
    215      if (err > 1E-6f) {
    216        HWY_WARN("Delta: expected %e, actual %e\n", expected, b_[i]);
    217      }
    218    }
    219  }
    220 };
    221 
    222 void RunBenchmarks() {
    223  Intro();
    224  printf("------------------------ %s\n", TargetName(HWY_TARGET));
    225  RunBenchmark<BenchmarkDot>("dot");
    226  RunBenchmark<BenchmarkDelta>("delta");
    227 }
    228 
    229 }  // namespace
    230 // NOLINTNEXTLINE(google-readability-namespace-comments)
    231 }  // namespace HWY_NAMESPACE
    232 }  // namespace hwy
    233 HWY_AFTER_NAMESPACE();
    234 
    235 #if HWY_ONCE
    236 namespace hwy {
    237 namespace {
    238 HWY_EXPORT(RunBenchmarks);
    239 
    240 void Run() {
    241  for (int64_t target : SupportedAndGeneratedTargets()) {
    242    SetSupportedTargetsForTest(target);
    243    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
    244  }
    245  SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
    246 }
    247 
    248 }  // namespace
    249 }  // namespace hwy
    250 
    251 int main(int /*argc*/, char** /*argv*/) {
    252  hwy::Run();
    253  return 0;
    254 }
    255 #endif  // HWY_ONCE