tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mutex_benchmark.cc (12337B)


      1 // Copyright 2017 The Abseil Authors.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"
#include "absl/base/no_destructor.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"
     27 
     28 namespace {
     29 
     30 void BM_Mutex(benchmark::State& state) {
     31  static absl::NoDestructor<absl::Mutex> mu;
     32  for (auto _ : state) {
     33    absl::MutexLock lock(mu.get());
     34  }
     35 }
     36 BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();
     37 
     38 void BM_ReaderLock(benchmark::State& state) {
     39  static absl::NoDestructor<absl::Mutex> mu;
     40  for (auto _ : state) {
     41    absl::ReaderMutexLock lock(mu.get());
     42  }
     43 }
     44 BENCHMARK(BM_ReaderLock)->UseRealTime()->Threads(1)->ThreadPerCpu();
     45 
     46 void BM_TryLock(benchmark::State& state) {
     47  absl::Mutex mu;
     48  for (auto _ : state) {
     49    if (mu.TryLock()) {
     50      mu.Unlock();
     51    }
     52  }
     53 }
     54 BENCHMARK(BM_TryLock);
     55 
     56 void BM_ReaderTryLock(benchmark::State& state) {
     57  static absl::NoDestructor<absl::Mutex> mu;
     58  for (auto _ : state) {
     59    if (mu->ReaderTryLock()) {
     60      mu->ReaderUnlock();
     61    }
     62  }
     63 }
     64 BENCHMARK(BM_ReaderTryLock)->UseRealTime()->Threads(1)->ThreadPerCpu();
     65 
     66 static void DelayNs(int64_t ns, int* data) {
     67  int64_t end = absl::base_internal::CycleClock::Now() +
     68                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
     69  while (absl::base_internal::CycleClock::Now() < end) {
     70    ++(*data);
     71    benchmark::DoNotOptimize(*data);
     72  }
     73 }
     74 
     75 template <typename MutexType>
     76 class RaiiLocker {
     77 public:
     78  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
     79  ~RaiiLocker() { mu_->Unlock(); }
     80 private:
     81  MutexType* mu_;
     82 };
     83 
     84 template <>
     85 class RaiiLocker<std::mutex> {
     86 public:
     87  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
     88  ~RaiiLocker() { mu_->unlock(); }
     89 private:
     90  std::mutex* mu_;
     91 };
     92 
// RAII object to change the Mutex priority of the running thread.
//
// On construction it overwrites the priority stored in the calling thread's
// ThreadIdentity (the value absl::Mutex uses to order waiters) and pins it by
// pushing the next scheduled priority re-read into the far future. On
// destruction it forces the next Mutex operation to re-read the thread's real
// scheduler priority. NOTE(review): this reaches into absl internal state
// (per_thread_synch), so it is only valid for benchmarking.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};
    116 
// Benchmarks the cost of enqueueing onto a maximally contended absl::Mutex:
// every iteration of every thread goes through the slow (queueing) acquisition
// path. Arg 0 selects whether the waiters carry mixed Mutex priorities.
void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated priority.
  // This benchmarks the performance impact of the presence of a low priority
  // waiter when a higher priority waiter adds itself onto the queue
  // (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index() != 0) ? 1 : 0);

  // State shared by all benchmark threads; lives for the whole process since
  // the benchmark framework may re-enter this function for several runs.
  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};   // threads currently inside the batch loop
    std::atomic<int> blocked_threads{0};   // threads blocked in absl sync code
    std::atomic<bool> thread_has_mutex{false};  // some thread holds `mu`
  };
  static absl::NoDestructor<Shared> shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex, not
  // Abseil synchronization primitives. If at some point the benchmark library
  // merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members, here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    // Announce that this thread is actively iterating before contending.
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the threads
    // complete their benchmark loop together before any of the threads exit
    // the loop. So, we need to remove ourselves from the "looping threads"
    // counter here before potentially blocking on that barrier. Otherwise,
    // another thread spinning above might wait forever for this thread to
    // block on the mutex while we in fact are waiting to exit.
    shared->looping_threads.fetch_add(-1);
  }
  // Detach the blocked-thread counter before returning so it never dangles.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}

BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);
    205 
    206 template <typename MutexType>
    207 void BM_Contended(benchmark::State& state) {
    208  int priority = state.thread_index() % state.range(1);
    209  ScopedThreadMutexPriority priority_setter(priority);
    210 
    211  struct Shared {
    212    MutexType mu;
    213    int data = 0;
    214  };
    215  static absl::NoDestructor<Shared> shared;
    216  int local = 0;
    217  for (auto _ : state) {
    218    // Here we model both local work outside of the critical section as well as
    219    // some work inside of the critical section. The idea is to capture some
    220    // more or less realisitic contention levels.
    221    // If contention is too low, the benchmark won't measure anything useful.
    222    // If contention is unrealistically high, the benchmark will favor
    223    // bad mutex implementations that block and otherwise distract threads
    224    // from the mutex and shared state for as much as possible.
    225    // To achieve this amount of local work is multiplied by number of threads
    226    // to keep ratio between local work and critical section approximately
    227    // equal regardless of number of threads.
    228    DelayNs(100 * state.threads(), &local);
    229    RaiiLocker<MutexType> locker(&shared->mu);
    230    DelayNs(state.range(0), &shared->data);
    231  }
    232 }
    233 void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
    234                        bool do_test_priorities) {
    235  const int max_num_priorities = do_test_priorities ? 2 : 1;
    236  bm->UseRealTime()
    237      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
    238      ->Threads(1)
    239      ->Threads(2)
    240      ->Threads(4)
    241      ->Threads(6)
    242      ->Threads(8)
    243      ->Threads(12)
    244      ->Threads(16)
    245      ->Threads(24)
    246      ->Threads(32)
    247      ->Threads(48)
    248      ->Threads(64)
    249      ->Threads(96)
    250      ->Threads(128)
    251      ->Threads(192)
    252      ->Threads(256)
    253      ->ArgNames({"cs_ns", "num_prios"});
    254  // Some empirically chosen amounts of work in critical section.
    255  // 1 is low contention, 2000 is high contention and few values in between.
    256  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    257    for (int num_priorities = 1; num_priorities <= max_num_priorities;
    258         num_priorities++) {
    259      bm->ArgPair(critical_section_ns, num_priorities);
    260    }
    261  }
    262 }
    263 
// Instantiate BM_Contended for each mutex type. Priorities are only varied
// for the absl::Mutex instantiation (do_test_priorities=true below).
BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });
    278 
// Measure the overhead of conditions on mutex release (when they must be
// evaluated).  Mutex has (some) support for equivalence classes allowing
// Conditions with the same function/argument to potentially not be multiply
// evaluated.
//
// num_classes==0 is used for the special case of every waiter being distinct.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    // Signals readiness via `init`, then blocks on `m` until *p == 0.
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes: give each waiter its own condition argument.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  // One slot per equivalence class; a nonzero value keeps its waiters blocked.
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent, so waiters sharing a slot share an equivalence class.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  // Wait until every waiter has announced itself before timing.
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  // Zero every equivalence class to release all waiters, so the thread pool
  // (declared above) can drain and be destroyed when this frame unwinds.
  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}
    330 
// Some configurations have higher thread limits than others; cap the waiter
// count lower under TSan and on non-Linux platforms.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
// Sweeps num_classes over [0, 2] and num_waiters over [1, kMaxConditionWaiters].
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);
    338 
    339 }  // namespace