mutex_benchmark.cc (12337B)
1 // Copyright 2017 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include <cstdint> 16 #include <mutex> // NOLINT(build/c++11) 17 #include <vector> 18 19 #include "absl/base/config.h" 20 #include "absl/base/internal/cycleclock.h" 21 #include "absl/base/internal/spinlock.h" 22 #include "absl/base/no_destructor.h" 23 #include "absl/synchronization/blocking_counter.h" 24 #include "absl/synchronization/internal/thread_pool.h" 25 #include "absl/synchronization/mutex.h" 26 #include "benchmark/benchmark.h" 27 28 namespace { 29 30 void BM_Mutex(benchmark::State& state) { 31 static absl::NoDestructor<absl::Mutex> mu; 32 for (auto _ : state) { 33 absl::MutexLock lock(mu.get()); 34 } 35 } 36 BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu(); 37 38 void BM_ReaderLock(benchmark::State& state) { 39 static absl::NoDestructor<absl::Mutex> mu; 40 for (auto _ : state) { 41 absl::ReaderMutexLock lock(mu.get()); 42 } 43 } 44 BENCHMARK(BM_ReaderLock)->UseRealTime()->Threads(1)->ThreadPerCpu(); 45 46 void BM_TryLock(benchmark::State& state) { 47 absl::Mutex mu; 48 for (auto _ : state) { 49 if (mu.TryLock()) { 50 mu.Unlock(); 51 } 52 } 53 } 54 BENCHMARK(BM_TryLock); 55 56 void BM_ReaderTryLock(benchmark::State& state) { 57 static absl::NoDestructor<absl::Mutex> mu; 58 for (auto _ : state) { 59 if (mu->ReaderTryLock()) { 60 mu->ReaderUnlock(); 61 } 62 } 63 } 64 BENCHMARK(BM_ReaderTryLock)->UseRealTime()->Threads(1)->ThreadPerCpu(); 65 66 static void DelayNs(int64_t ns, int* data) { 67 int64_t end = absl::base_internal::CycleClock::Now() + 68 ns * absl::base_internal::CycleClock::Frequency() / 1e9; 69 while (absl::base_internal::CycleClock::Now() < end) { 70 ++(*data); 71 benchmark::DoNotOptimize(*data); 72 } 73 } 74 75 template <typename MutexType> 76 class RaiiLocker { 77 public: 78 explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); } 79 ~RaiiLocker() { mu_->Unlock(); } 80 private: 81 MutexType* mu_; 82 }; 83 84 template <> 85 class RaiiLocker<std::mutex> { 86 public: 87 explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); } 88 ~RaiiLocker() { mu_->unlock(); } 89 private: 90 std::mutex* mu_; 91 }; 92 93 // RAII object to change the Mutex priority of the running thread. 94 class ScopedThreadMutexPriority { 95 public: 96 explicit ScopedThreadMutexPriority(int priority) { 97 absl::base_internal::ThreadIdentity* identity = 98 absl::synchronization_internal::GetOrCreateCurrentThreadIdentity(); 99 identity->per_thread_synch.priority = priority; 100 // Bump next_priority_read_cycles to the infinite future so that the 101 // implementation doesn't re-read the thread's actual scheduler priority 102 // and replace our temporary scoped priority. 103 identity->per_thread_synch.next_priority_read_cycles = 104 std::numeric_limits<int64_t>::max(); 105 } 106 ~ScopedThreadMutexPriority() { 107 // Reset the "next priority read time" back to the infinite past so that 108 // the next time the Mutex implementation wants to know this thread's 109 // priority, it re-reads it from the OS instead of using our overridden 110 // priority. 111 absl::synchronization_internal::GetOrCreateCurrentThreadIdentity() 112 ->per_thread_synch.next_priority_read_cycles = 113 std::numeric_limits<int64_t>::min(); 114 } 115 }; 116 117 void BM_MutexEnqueue(benchmark::State& state) { 118 // In the "multiple priorities" variant of the benchmark, one of the 119 // threads runs with Mutex priority 0 while the rest run at elevated priority. 120 // This benchmarks the performance impact of the presence of a low priority 121 // waiter when a higher priority waiter adds itself of the queue 122 // (b/175224064). 123 // 124 // NOTE: The actual scheduler priority is not modified in this benchmark: 125 // all of the threads get CPU slices with the same priority. Only the 126 // Mutex queueing behavior is modified. 127 const bool multiple_priorities = state.range(0); 128 ScopedThreadMutexPriority priority_setter( 129 (multiple_priorities && state.thread_index() != 0) ? 1 : 0); 130 131 struct Shared { 132 absl::Mutex mu; 133 std::atomic<int> looping_threads{0}; 134 std::atomic<int> blocked_threads{0}; 135 std::atomic<bool> thread_has_mutex{false}; 136 }; 137 static absl::NoDestructor<Shared> shared; 138 139 // Set up 'blocked_threads' to count how many threads are currently blocked 140 // in Abseil synchronization code. 141 // 142 // NOTE: Blocking done within the Google Benchmark library itself (e.g. 143 // the barrier which synchronizes threads entering and exiting the benchmark 144 // loop) does _not_ get registered in this counter. This is because Google 145 // Benchmark uses its own synchronization primitives based on std::mutex, not 146 // Abseil synchronization primitives. If at some point the benchmark library 147 // merges into Abseil, this code may break. 148 absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter( 149 &shared->blocked_threads); 150 151 // The benchmark framework may run several iterations in the same process, 152 // reusing the same static-initialized 'shared' object. Given the semantics 153 // of the members, here, we expect everything to be reset to zero by the 154 // end of any iteration. Assert that's the case, just to be sure. 155 ABSL_RAW_CHECK( 156 shared->looping_threads.load(std::memory_order_relaxed) == 0 && 157 shared->blocked_threads.load(std::memory_order_relaxed) == 0 && 158 !shared->thread_has_mutex.load(std::memory_order_relaxed), 159 "Shared state isn't zeroed at start of benchmark iteration"); 160 161 static constexpr int kBatchSize = 1000; 162 while (state.KeepRunningBatch(kBatchSize)) { 163 shared->looping_threads.fetch_add(1); 164 for (int i = 0; i < kBatchSize; i++) { 165 { 166 absl::MutexLock l(&shared->mu); 167 shared->thread_has_mutex.store(true, std::memory_order_relaxed); 168 // Spin until all other threads are either out of the benchmark loop 169 // or blocked on the mutex. This ensures that the mutex queue is kept 170 // at its maximal length to benchmark the performance of queueing on 171 // a highly contended mutex. 172 while (shared->looping_threads.load(std::memory_order_relaxed) - 173 shared->blocked_threads.load(std::memory_order_relaxed) != 174 1) { 175 } 176 shared->thread_has_mutex.store(false); 177 } 178 // Spin until some other thread has acquired the mutex before we block 179 // again. This ensures that we always go through the slow (queueing) 180 // acquisition path rather than reacquiring the mutex we just released. 181 while (!shared->thread_has_mutex.load(std::memory_order_relaxed) && 182 shared->looping_threads.load(std::memory_order_relaxed) > 1) { 183 } 184 } 185 // The benchmark framework uses a barrier to ensure that all of the threads 186 // complete their benchmark loop together before any of the threads exit 187 // the loop. So, we need to remove ourselves from the "looping threads" 188 // counter here before potentially blocking on that barrier. Otherwise, 189 // another thread spinning above might wait forever for this thread to 190 // block on the mutex while we in fact are waiting to exit. 191 shared->looping_threads.fetch_add(-1); 192 } 193 absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter( 194 nullptr); 195 } 196 197 BENCHMARK(BM_MutexEnqueue) 198 ->Threads(4) 199 ->Threads(64) 200 ->Threads(128) 201 ->Threads(512) 202 ->ArgName("multiple_priorities") 203 ->Arg(false) 204 ->Arg(true); 205 206 template <typename MutexType> 207 void BM_Contended(benchmark::State& state) { 208 int priority = state.thread_index() % state.range(1); 209 ScopedThreadMutexPriority priority_setter(priority); 210 211 struct Shared { 212 MutexType mu; 213 int data = 0; 214 }; 215 static absl::NoDestructor<Shared> shared; 216 int local = 0; 217 for (auto _ : state) { 218 // Here we model both local work outside of the critical section as well as 219 // some work inside of the critical section. The idea is to capture some 220 // more or less realisitic contention levels. 221 // If contention is too low, the benchmark won't measure anything useful. 222 // If contention is unrealistically high, the benchmark will favor 223 // bad mutex implementations that block and otherwise distract threads 224 // from the mutex and shared state for as much as possible. 225 // To achieve this amount of local work is multiplied by number of threads 226 // to keep ratio between local work and critical section approximately 227 // equal regardless of number of threads. 228 DelayNs(100 * state.threads(), &local); 229 RaiiLocker<MutexType> locker(&shared->mu); 230 DelayNs(state.range(0), &shared->data); 231 } 232 } 233 void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm, 234 bool do_test_priorities) { 235 const int max_num_priorities = do_test_priorities ? 2 : 1; 236 bm->UseRealTime() 237 // ThreadPerCpu poorly handles non-power-of-two CPU counts. 238 ->Threads(1) 239 ->Threads(2) 240 ->Threads(4) 241 ->Threads(6) 242 ->Threads(8) 243 ->Threads(12) 244 ->Threads(16) 245 ->Threads(24) 246 ->Threads(32) 247 ->Threads(48) 248 ->Threads(64) 249 ->Threads(96) 250 ->Threads(128) 251 ->Threads(192) 252 ->Threads(256) 253 ->ArgNames({"cs_ns", "num_prios"}); 254 // Some empirically chosen amounts of work in critical section. 255 // 1 is low contention, 2000 is high contention and few values in between. 256 for (int critical_section_ns : {1, 20, 50, 200, 2000}) { 257 for (int num_priorities = 1; num_priorities <= max_num_priorities; 258 num_priorities++) { 259 bm->ArgPair(critical_section_ns, num_priorities); 260 } 261 } 262 } 263 264 BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex) 265 ->Apply([](benchmark::internal::Benchmark* bm) { 266 SetupBenchmarkArgs(bm, /*do_test_priorities=*/true); 267 }); 268 269 BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock) 270 ->Apply([](benchmark::internal::Benchmark* bm) { 271 SetupBenchmarkArgs(bm, /*do_test_priorities=*/false); 272 }); 273 274 BENCHMARK_TEMPLATE(BM_Contended, std::mutex) 275 ->Apply([](benchmark::internal::Benchmark* bm) { 276 SetupBenchmarkArgs(bm, /*do_test_priorities=*/false); 277 }); 278 279 // Measure the overhead of conditions on mutex release (when they must be 280 // evaluated). Mutex has (some) support for equivalence classes allowing 281 // Conditions with the same function/argument to potentially not be multiply 282 // evaluated. 283 // 284 // num_classes==0 is used for the special case of every waiter being distinct. 285 void BM_ConditionWaiters(benchmark::State& state) { 286 int num_classes = state.range(0); 287 int num_waiters = state.range(1); 288 289 struct Helper { 290 static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) { 291 init->DecrementCount(); 292 m->LockWhen(absl::Condition( 293 static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p)); 294 m->Unlock(); 295 } 296 }; 297 298 if (num_classes == 0) { 299 // No equivalence classes. 300 num_classes = num_waiters; 301 } 302 303 absl::BlockingCounter init(num_waiters); 304 absl::Mutex mu; 305 std::vector<int> equivalence_classes(num_classes, 1); 306 307 // Must be declared last to be destroyed first. 308 absl::synchronization_internal::ThreadPool pool(num_waiters); 309 310 for (int i = 0; i < num_waiters; i++) { 311 // Mutex considers Conditions with the same function and argument 312 // to be equivalent. 313 pool.Schedule([&, i] { 314 Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]); 315 }); 316 } 317 init.Wait(); 318 319 for (auto _ : state) { 320 mu.Lock(); 321 mu.Unlock(); // Each unlock requires Condition evaluation for our waiters. 322 } 323 324 mu.Lock(); 325 for (int i = 0; i < num_classes; i++) { 326 equivalence_classes[i] = 0; 327 } 328 mu.Unlock(); 329 } 330 331 // Some configurations have higher thread limits than others. 332 #if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER) 333 constexpr int kMaxConditionWaiters = 8192; 334 #else 335 constexpr int kMaxConditionWaiters = 1024; 336 #endif 337 BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters); 338 339 } // namespace