profiler.h (32443B)
1 // Copyright 2017 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef HIGHWAY_HWY_PROFILER_H_ 16 #define HIGHWAY_HWY_PROFILER_H_ 17 18 #include <stddef.h> 19 #include <stdint.h> 20 #include <string.h> // strcmp, strlen 21 22 #include <atomic> 23 #include <functional> 24 25 #include "hwy/base.h" 26 #include "hwy/highway_export.h" 27 28 // High precision, low overhead time measurements. Returns exact call counts and 29 // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). 30 // 31 // Uses RAII to capture begin/end timestamps, with user-specified zone names: 32 // `{ PROFILER_ZONE("name"); /*code*/ }` or the name of the current function: 33 // `void FuncToMeasure() { PROFILER_FUNC; /*code*/ }`. 34 // 35 // You can reduce the overhead by passing `global_idx`, which can be taken from 36 // the argument to the `ThreadPool::Run` lambda (if the pool was constructed 37 // with non-default `PoolWorkerMapping`), or from a saved copy of the 38 // thread-local `Profiler::Thread`: `PROFILER_ZONE2(global_idx, name)`. 39 // 40 // The preferred API allows passing flags, such as requesting inclusive time: 41 // `static const auto zone = profiler.AddZone("name", flags);` and then 42 // `PROFILER_ZONE3(profiler, global_idx, zone)`. 43 // 44 // After all threads exit all zones, call `Profiler::Get().PrintResults()` to 45 // print call counts and average durations [CPU cycles] to stdout, sorted in 46 // descending order of total duration. 47 48 // If zero, mock `Profiler` and `profiler::Zone` will be defined. 49 #ifndef PROFILER_ENABLED 50 #define PROFILER_ENABLED 0 51 #endif 52 53 #if PROFILER_ENABLED 54 #include <stdio.h> 55 56 #include <algorithm> // std::sort 57 #include <utility> 58 #include <vector> 59 60 #include "hwy/aligned_allocator.h" 61 #include "hwy/bit_set.h" 62 #include "hwy/timer.h" 63 #endif // PROFILER_ENABLED 64 65 namespace hwy { 66 67 // Flags: we want type-safety (enum class) to catch mistakes such as confusing 68 // zone with flags. Base type (`uint32_t`) ensures it is safe to cast. Defined 69 // outside the `#if` because callers pass them to `PROFILER_ZONE3`. When adding 70 // flags, also update `kNumFlags` and `ChildTotalMask`. 71 enum class ProfilerFlags : uint32_t { 72 kDefault = 0, 73 // The zone should report cumulative time, including all child zones. If not 74 // specified, zones report self-time, excluding child zones. 75 kInclusive = 1 76 }; 77 78 // Called during `PrintResults` to print results from other modules. 79 using ProfilerFunc = std::function<void(void)>; 80 81 template <size_t kMaxStrings> 82 class StringTable { 83 static constexpr std::memory_order kRelaxed = std::memory_order_relaxed; 84 static constexpr std::memory_order kAcq = std::memory_order_acquire; 85 static constexpr std::memory_order kRel = std::memory_order_release; 86 87 public: 88 // Returns a copy of the `name` passed to `Add` that returned the 89 // given `idx`. 90 const char* Name(size_t idx) const { 91 // `kRelaxed` is sufficient because pointers are immutable once published 92 // via a `kRelease` store. 93 return ptrs_[idx].load(kRelaxed); 94 } 95 96 // Returns `idx < kMaxStrings`. Can be called concurrently. Calls with the 97 // same `name` return the same `idx`. 98 size_t Add(const char* name) { 99 // Linear search if it already exists. `kAcq` ensures we see prior stores. 100 const size_t num_strings = next_ptr_.load(kAcq); 101 HWY_ASSERT(num_strings < kMaxStrings); 102 for (size_t idx = 1; idx < num_strings; ++idx) { 103 const char* existing = ptrs_[idx].load(kAcq); 104 // `next_ptr_` was published after writing `ptr_`, hence it is non-null. 105 HWY_ASSERT(existing != nullptr); 106 if (HWY_UNLIKELY(!strcmp(existing, name))) { 107 return idx; 108 } 109 } 110 111 // Copy `name` into `chars_` before publishing the pointer. 112 const size_t len = strlen(name) + 1; 113 const size_t pos = next_char_.fetch_add(len, kRelaxed); 114 HWY_ASSERT(pos + len <= sizeof(chars_)); 115 strcpy(chars_ + pos, name); // NOLINT 116 117 for (;;) { 118 size_t idx = next_ptr_.load(kRelaxed); 119 HWY_ASSERT(idx < kMaxStrings); 120 121 // Attempt to claim the next `idx` via CAS. 122 const char* expected = nullptr; 123 if (HWY_LIKELY(ptrs_[idx].compare_exchange_weak(expected, chars_ + pos, 124 kRel, kRelaxed))) { 125 // Publish the new count and make the `ptrs_` write visible. 126 next_ptr_.store(idx + 1, kRel); 127 HWY_DASSERT(!strcmp(Name(idx), name)); 128 return idx; 129 } 130 131 // We lost the race. `expected` has been updated. 132 if (HWY_UNLIKELY(!strcmp(expected, name))) { 133 // Done, another thread added the same name. Note that we waste the 134 // extra space in `chars_`, which is fine because it is rare. 135 HWY_DASSERT(!strcmp(Name(idx), name)); 136 return idx; 137 } 138 139 // Other thread added a different name. Retry with the next slot. 140 } 141 } 142 143 private: 144 std::atomic<const char*> ptrs_[kMaxStrings]; 145 std::atomic<size_t> next_ptr_{1}; // next idx 146 std::atomic<size_t> next_char_{0}; 147 char chars_[kMaxStrings * 55]; 148 }; 149 150 #if PROFILER_ENABLED 151 152 // Implementation details. 153 namespace profiler { 154 155 HWY_INLINE_VAR constexpr size_t kNumFlags = 1; 156 157 // Upper bounds for fixed-size data structures, guarded via HWY_DASSERT: 158 159 // Maximum nesting of zones, chosen such that `PerWorker` is 256 bytes. 160 HWY_INLINE_VAR constexpr size_t kMaxDepth = 13; 161 // Reports with more than ~50 are anyway difficult to read. 162 HWY_INLINE_VAR constexpr size_t kMaxZones = 128; 163 // Upper bound on global worker_idx across all pools. Note that fiber libraries 164 // can spawn hundreds of threads. Turin has 128-192 cores. 165 HWY_INLINE_VAR constexpr size_t kMaxWorkers = 256; 166 167 // Type-safe wrapper for zone index plus flags, returned by `AddZone`. 168 class ZoneHandle { 169 public: 170 ZoneHandle() : bits_(0) {} // for Accumulator member initialization 171 172 ZoneHandle(size_t zone_idx, ProfilerFlags flags) { 173 HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones); 174 const uint32_t flags_u = static_cast<uint32_t>(flags); 175 HWY_DASSERT(flags_u < (1u << kNumFlags)); 176 bits_ = (static_cast<uint32_t>(zone_idx) << kNumFlags) | flags_u; 177 HWY_DASSERT(ZoneIdx() == zone_idx); 178 } 179 180 ZoneHandle(const ZoneHandle& other) = default; 181 ZoneHandle& operator=(const ZoneHandle& other) = default; 182 183 bool operator==(const ZoneHandle other) const { return bits_ == other.bits_; } 184 bool operator!=(const ZoneHandle other) const { return bits_ != other.bits_; } 185 186 size_t ZoneIdx() const { 187 HWY_DASSERT(bits_ != 0); 188 const size_t zone_idx = bits_ >> kNumFlags; 189 HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones); 190 return zone_idx; 191 } 192 193 bool IsInclusive() const { 194 HWY_DASSERT(bits_ != 0); 195 return (bits_ & static_cast<uint32_t>(ProfilerFlags::kInclusive)) != 0; 196 } 197 198 // Returns a mask to zero/ignore child totals for inclusive zones. 199 uint64_t ChildTotalMask() const { 200 // With a ternary operator, clang tends to generate a branch. 201 // return IsInclusive() ? 0 : ~uint64_t{0}; 202 const uint32_t bit = 203 bits_ & static_cast<uint32_t>(ProfilerFlags::kInclusive); 204 return uint64_t{bit} - 1; 205 } 206 207 private: 208 uint32_t bits_; 209 }; 210 211 // Storage for zone names. 212 class Zones { 213 public: 214 // Returns a copy of the `name` passed to `AddZone` that returned the 215 // given `zone`. 216 const char* Name(ZoneHandle zone) const { 217 return strings_.Name(zone.ZoneIdx()); 218 } 219 220 // Can be called concurrently. Calls with the same `name` return the same 221 // `ZoneHandle.ZoneIdx()`. 222 ZoneHandle AddZone(const char* name, ProfilerFlags flags) { 223 return ZoneHandle(strings_.Add(name), flags); 224 } 225 226 private: 227 StringTable<kMaxZones> strings_; 228 }; 229 230 // Allows other classes such as `ThreadPool` to register/unregister a function 231 // to call during `PrintResults`. This allows us to gather data from the worker 232 // threads without having to wait until they exit, and decouples the profiler 233 // from other modules. Thread-safe. 234 class Funcs { 235 static constexpr auto kAcq = std::memory_order_acquire; 236 static constexpr auto kRel = std::memory_order_release; 237 238 public: 239 // Can be called concurrently with distinct keys. 240 void Add(intptr_t key, ProfilerFunc func) { 241 HWY_ASSERT(key != 0 && key != kPending); // reserved values 242 HWY_ASSERT(func); // not empty 243 244 for (size_t i = 0; i < kMaxFuncs; ++i) { 245 intptr_t expected = 0; 246 // Lost a race with a concurrent `Add`, try the next slot. 247 if (!keys_[i].compare_exchange_strong(expected, kPending, kRel)) { 248 continue; 249 } 250 // We own the slot: move func there. 251 funcs_[i] = std::move(func); 252 keys_[i].store(key, kRel); // publishes the `func` write. 253 return; 254 } 255 256 HWY_ABORT("Funcs::Add: no free slot, increase kMaxFuncs."); 257 } 258 259 // Can be called concurrently with distinct keys. It is an error to call this 260 // without a prior `Add` of the same key. 261 void Remove(intptr_t key) { 262 HWY_ASSERT(key != 0 && key != kPending); // reserved values 263 264 for (size_t i = 0; i < kMaxFuncs; ++i) { 265 intptr_t actual = keys_[i].load(kAcq); 266 if (actual == key) { 267 // In general, concurrent removal is fine, but in this specific context, 268 // owners are expected to remove their key exactly once, from the same 269 // thread that added it. In that case, CAS should not fail. 270 if (!keys_[i].compare_exchange_strong(actual, kPending, kRel)) { 271 HWY_WARN("Funcs: CAS failed, why is there a concurrent Remove?"); 272 } 273 funcs_[i] = ProfilerFunc(); 274 keys_[i].store(0, kRel); // publishes the `func` write. 275 return; 276 } 277 } 278 HWY_ABORT("Funcs::Remove: failed to find key %p.", 279 reinterpret_cast<void*>(key)); 280 } 281 282 void CallAll() const { 283 for (size_t i = 0; i < kMaxFuncs; ++i) { 284 intptr_t key = keys_[i].load(kAcq); // ensures `funcs_` is visible. 285 // Safely handles concurrent Add/Remove. 286 if (key != 0 && key != kPending) { 287 funcs_[i](); 288 } 289 } 290 } 291 292 private: 293 static constexpr size_t kMaxFuncs = 64; 294 static constexpr intptr_t kPending = -1; 295 296 ProfilerFunc funcs_[kMaxFuncs]; // non-atomic 297 std::atomic<intptr_t> keys_[kMaxFuncs] = {}; 298 }; 299 300 // Holds total duration and number of calls. Worker index is implicit in the 301 // index of this class within the `Accumulators` array. 302 struct Accumulator { 303 void Add(ZoneHandle new_zone, uint64_t self_duration) { 304 duration += self_duration; 305 306 // Only called for valid zones. 307 HWY_DASSERT(new_zone != ZoneHandle()); 308 // Our zone might not have been set yet. 309 HWY_DASSERT(zone == ZoneHandle() || zone == new_zone); 310 zone = new_zone; 311 312 num_calls += 1; 313 } 314 315 void Take(Accumulator& other) { 316 duration += other.duration; 317 other.duration = 0; 318 319 // `ZoneSet` ensures we only call this for non-empty `other`. 320 HWY_DASSERT(other.zone != ZoneHandle()); 321 // Our zone might not have been set yet. 322 HWY_DASSERT(zone == ZoneHandle() || zone == other.zone); 323 zone = other.zone; 324 325 num_calls += other.num_calls; 326 other.num_calls = 0; 327 } 328 329 uint64_t duration = 0; 330 ZoneHandle zone; // flags are used by `Results::Print` 331 uint32_t num_calls = 0; 332 }; 333 static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size"); 334 335 using ZoneSet = hwy::BitSet<kMaxZones>; 336 using WorkerSet = hwy::BitSet<kMaxWorkers>; 337 using AtomicWorkerSet = hwy::AtomicBitSet<kMaxWorkers>; 338 339 // Durations are per-CPU, but end to end performance is defined by wall time. 340 // Assuming fork-join parallelism, zones are entered by multiple threads 341 // concurrently, which means the total number of unique threads is also the 342 // degree of concurrency, so we can estimate wall time as CPU time divided by 343 // the number of unique threads seen. This is facilitated by unique `global_idx` 344 // passed in by callers, or taken from thread-local `GlobalIdx()`. 345 // 346 // We also want to support varying thread counts per call site, because the same 347 // function/zone may be called from multiple pools. `EndRootRun` calls 348 // `CountWorkersAndReset` after each top-level `ThreadPool::Run`, which 349 // generates one data point summarized via descriptive statistics. Here we 350 // implement a simpler version of `Stats` because we do not require 351 // geomean/variance/kurtosis/skewness. Because concurrency is a small integer, 352 // we can simply compute sums rather than online moments. There is also only one 353 // instance across all threads, hence we do not require a `Take`. 354 // 355 // Note that subsequently discovered prior work estimates the number of active 356 // and idle processors by updating atomic counters whenever they start/finish a 357 // task: https://homes.cs.washington.edu/~tom/pubs/quartz.pdf and "Effective 358 // performance measurement and analysis of multithreaded applications". We 359 // instead accumulate zone durations into per-thread storage. 360 // `CountWorkersAndReset` then checks how many were nonzero, which avoids 361 // expensive atomic updates and ensures accurate counts per-zone, rather than 362 // estimates of current activity at each sample. 363 // D. Vyukov's https://github.com/dvyukov/perf-load, also integrated into Linux 364 // perf, also corrects for parallelism without using atomic counters by tracing 365 // context switches. Note that we often pin threads, which avoids migrations, 366 // but reduces the number of context switch events to mainly preemptions. 367 class ConcurrencyStats { 368 public: 369 ConcurrencyStats() { Reset(); } 370 371 void Notify(const size_t x) { 372 sum_ += x; 373 ++n_; 374 min_ = HWY_MIN(min_, x); 375 max_ = HWY_MAX(max_, x); 376 } 377 378 size_t Count() const { return n_; } 379 size_t Min() const { return min_; } 380 size_t Max() const { return max_; } 381 double Mean() const { 382 return static_cast<double>(sum_) / static_cast<double>(n_); 383 } 384 385 void Reset() { 386 sum_ = 0; 387 n_ = 0; 388 min_ = hwy::HighestValue<size_t>(); 389 max_ = hwy::LowestValue<size_t>(); 390 } 391 392 private: 393 uint64_t sum_; 394 size_t n_; 395 size_t min_; 396 size_t max_; 397 }; 398 static_assert(sizeof(ConcurrencyStats) == (8 + 3 * sizeof(size_t)), ""); 399 400 // Holds the final results across all threads, including `ConcurrencyStats` 401 // and `PoolStats`, updated/printed by the main thread. 402 class Results { 403 public: 404 void TakeAccumulator(const size_t global_idx, const size_t zone_idx, 405 Accumulator& other) { 406 HWY_DASSERT(global_idx < kMaxWorkers); 407 HWY_DASSERT(zone_idx < kMaxZones); 408 HWY_DASSERT(other.zone.ZoneIdx() == zone_idx); 409 410 visited_zones_.Set(zone_idx); 411 totals_[zone_idx].Take(other); 412 workers_[zone_idx].Set(global_idx); 413 } 414 415 // Moves the total number of threads seen during the preceding root-level 416 // `ThreadPool::Run` into one data point for `ConcurrencyStats`. 417 void CountWorkersAndReset(const size_t zone_idx) { 418 HWY_DASSERT(zone_idx < kMaxZones); 419 const size_t num_workers = workers_[zone_idx].Count(); 420 // Although workers_[zone_idx] at one point was non-empty, it is reset 421 // below, and so can be empty on the second call to this via `PrintResults`, 422 // after one from `EndRootRun`. Do not add a data point if empty. 423 if (num_workers != 0) { 424 concurrency_[zone_idx].Notify(num_workers); 425 } 426 workers_[zone_idx] = WorkerSet(); 427 } 428 429 void CountWorkersAndReset() { 430 visited_zones_.Foreach( 431 [&](size_t zone_idx) { CountWorkersAndReset(zone_idx); }); 432 } 433 434 void PrintAndReset(const Zones& zones) { 435 const double inv_freq = 1.0 / hwy::platform::InvariantTicksPerSecond(); 436 437 // Sort by decreasing total (self) cost. `totals_` are sparse, so sort an 438 // index vector instead. 439 std::vector<uint32_t> indices; 440 indices.reserve(visited_zones_.Count()); 441 visited_zones_.Foreach([&](size_t zone_idx) { 442 indices.push_back(static_cast<uint32_t>(zone_idx)); 443 // In case the zone exited after `EndRootRun` and was not yet added. 444 CountWorkersAndReset(zone_idx); 445 }); 446 std::sort(indices.begin(), indices.end(), [&](uint32_t a, uint32_t b) { 447 return totals_[a].duration > totals_[b].duration; 448 }); 449 450 for (uint32_t zone_idx : indices) { 451 Accumulator& total = totals_[zone_idx]; // cleared after printing 452 HWY_ASSERT(total.zone.ZoneIdx() == zone_idx); 453 HWY_ASSERT(total.num_calls != 0); // else visited_zones_ is wrong 454 455 ConcurrencyStats& concurrency = concurrency_[zone_idx]; 456 const double duration = static_cast<double>(total.duration); 457 const double per_call = 458 static_cast<double>(total.duration) / total.num_calls; 459 // See comment on `ConcurrencyStats`. 460 const double avg_concurrency = concurrency.Mean(); 461 // Avoid division by zero. 462 const double concurrency_divisor = HWY_MAX(1.0, avg_concurrency); 463 printf("%s%-40s: %10.0f x %15.0f / %5.1f (%5zu %3zu-%3zu) = %9.6f\n", 464 total.zone.IsInclusive() ? "(I)" : " ", zones.Name(total.zone), 465 static_cast<double>(total.num_calls), per_call, avg_concurrency, 466 concurrency.Count(), concurrency.Min(), concurrency.Max(), 467 duration * inv_freq / concurrency_divisor); 468 469 total = Accumulator(); 470 concurrency.Reset(); 471 // `workers_` was already reset by `CountWorkersAndReset`. 472 } 473 visited_zones_ = ZoneSet(); 474 } 475 476 private: 477 // Indicates which of the array entries are in use. 478 ZoneSet visited_zones_; 479 Accumulator totals_[kMaxZones]; 480 WorkerSet workers_[kMaxZones]; 481 ConcurrencyStats concurrency_[kMaxZones]; 482 }; 483 484 // Delay after capturing timestamps before/after the actual zone runs. Even 485 // with frequency throttling disabled, this has a multimodal distribution, 486 // including 32, 34, 48, 52, 59, 62. 487 struct Overheads { 488 uint64_t self = 0; 489 uint64_t child = 0; 490 }; 491 static_assert(sizeof(Overheads) == 16, "Wrong Overheads size"); 492 493 class Accumulators { 494 // We generally want to group threads together because they are often 495 // accessed together during a zone, but also want to avoid threads sharing a 496 // cache line. Hence interleave 8 zones per worker. 497 static constexpr size_t kPerLine = HWY_ALIGNMENT / sizeof(Accumulator); 498 499 public: 500 Accumulator& Get(const size_t global_idx, const size_t zone_idx) { 501 HWY_DASSERT(global_idx < kMaxWorkers); 502 HWY_DASSERT(zone_idx < kMaxZones); 503 const size_t line = zone_idx / kPerLine; 504 const size_t offset = zone_idx % kPerLine; 505 return zones_[(line * kMaxWorkers + global_idx) * kPerLine + offset]; 506 } 507 508 private: 509 Accumulator zones_[kMaxZones * kMaxWorkers]; 510 }; 511 512 // Reacts to zone enter/exit events. Builds a stack of active zones and 513 // accumulates self/child duration for each. 514 class PerWorker { 515 public: 516 template <typename T> 517 static T ClampedSubtract(const T minuend, const T subtrahend) { 518 static_assert(IsUnsigned<T>(), ""); 519 const T difference = minuend - subtrahend; 520 // Clang output for this is verified to CMOV rather than branch. 521 const T no_underflow = (subtrahend > minuend) ? T{0} : ~T{0}; 522 return difference & no_underflow; 523 } 524 525 void SetOverheads(const Overheads& overheads) { overheads_ = overheads; } 526 527 // Entering a zone: push onto stack. 528 void Enter(const uint64_t t_enter) { 529 const size_t depth = depth_; 530 HWY_DASSERT(depth < kMaxDepth); 531 t_enter_[depth] = t_enter; 532 child_total_[1 + depth] = 0; 533 depth_ = 1 + depth; 534 } 535 536 // Exiting the most recently entered zone (top of stack). 537 void Exit(const uint64_t t_exit, const size_t global_idx, 538 const ZoneHandle zone, Accumulators& accumulators) { 539 HWY_DASSERT(depth_ > 0); 540 const size_t depth = depth_ - 1; 541 const size_t zone_idx = zone.ZoneIdx(); 542 const uint64_t duration = t_exit - t_enter_[depth]; 543 // Clang output for this is verified not to branch. This is 0 if inclusive, 544 // otherwise the child total. 545 const uint64_t child_total = 546 child_total_[1 + depth] & zone.ChildTotalMask(); 547 548 const uint64_t self_duration = ClampedSubtract( 549 duration, overheads_.self + overheads_.child + child_total); 550 accumulators.Get(global_idx, zone_idx).Add(zone, self_duration); 551 // For faster TakeAccumulator() - not all zones are encountered. 552 visited_zones_.Set(zone_idx); 553 554 // Adding this nested time to the parent's `child_total` will 555 // cause it to be later subtracted from the parent's `self_duration`. 556 child_total_[1 + depth - 1] += duration + overheads_.child; 557 558 depth_ = depth; 559 } 560 561 // Returns the duration of one enter/exit pair and resets all state. Called 562 // via `DetectSelfOverhead`. 563 uint64_t GetFirstDurationAndReset(size_t global_idx, 564 Accumulators& accumulators) { 565 HWY_DASSERT(depth_ == 0); 566 567 HWY_DASSERT(visited_zones_.Count() == 1); 568 const size_t zone_idx = visited_zones_.First(); 569 HWY_DASSERT(zone_idx <= 3); 570 HWY_DASSERT(visited_zones_.Get(zone_idx)); 571 visited_zones_.Clear(zone_idx); 572 573 Accumulator& zone = accumulators.Get(global_idx, zone_idx); 574 const uint64_t duration = zone.duration; 575 zone = Accumulator(); 576 return duration; 577 } 578 579 // Adds all data to `results` and resets it here. Called from the main thread. 580 void MoveTo(const size_t global_idx, Accumulators& accumulators, 581 Results& results) { 582 visited_zones_.Foreach([&](size_t zone_idx) { 583 results.TakeAccumulator(global_idx, zone_idx, 584 accumulators.Get(global_idx, zone_idx)); 585 }); 586 // OK to reset even if we have active zones, because we set `visited_zones_` 587 // when exiting the zone. 588 visited_zones_ = ZoneSet(); 589 } 590 591 private: 592 // 40 bytes: 593 ZoneSet visited_zones_; // Which `zones_` have been active on this worker. 594 uint64_t depth_ = 0; // Current nesting level for active zones. 595 Overheads overheads_; 596 597 uint64_t t_enter_[kMaxDepth]; 598 // Used to deduct child duration from parent's self time (unless inclusive). 599 // Shifting by one avoids bounds-checks for depth_ = 0 (root zone). 600 uint64_t child_total_[1 + kMaxDepth] = {0}; 601 }; 602 // Enables shift rather than multiplication. 603 static_assert(sizeof(PerWorker) == 256, "Wrong size"); 604 605 } // namespace profiler 606 607 class Profiler { 608 public: 609 static HWY_DLLEXPORT Profiler& Get(); 610 611 // Returns `global_idx` from thread-local storage (0 for the main thread). 612 // Used by `PROFILER_ZONE/PROFILER_FUNC`. It is faster to instead pass the 613 // global_idx from `ThreadPool::Run` (if constructed with non-default 614 // `PoolWorkerMapping`) to `PROFILER_ZONE2/PROFILER_ZONE3`. 615 // DEPRECATED: use `GlobalIdx` instead. 616 static size_t Thread() { return s_global_idx; } 617 static size_t GlobalIdx() { return s_global_idx; } 618 // Must be called from all worker threads, and once also on the main thread, 619 // before any use of `PROFILER_ZONE/PROFILER_FUNC`. 620 static void SetGlobalIdx(size_t global_idx) { s_global_idx = global_idx; } 621 622 void ReserveWorker(size_t global_idx) { 623 HWY_ASSERT(!workers_reserved_.Get(global_idx)); 624 workers_reserved_.Set(global_idx); 625 } 626 627 void FreeWorker(size_t global_idx) { 628 HWY_ASSERT(workers_reserved_.Get(global_idx)); 629 workers_reserved_.Clear(global_idx); 630 } 631 632 // Called by `Zone` from any thread. 633 void Enter(uint64_t t_enter, size_t global_idx) { 634 GetWorker(global_idx).Enter(t_enter); 635 } 636 637 // Called by `~Zone` from any thread. 638 void Exit(uint64_t t_exit, size_t global_idx, profiler::ZoneHandle zone) { 639 GetWorker(global_idx).Exit(t_exit, global_idx, zone, accumulators_); 640 } 641 642 uint64_t GetFirstDurationAndReset(size_t global_idx) { 643 return GetWorker(global_idx) 644 .GetFirstDurationAndReset(global_idx, accumulators_); 645 } 646 647 const char* Name(profiler::ZoneHandle zone) const { 648 return zones_.Name(zone); 649 } 650 651 // Copies `name` into the string table and returns its unique `zone`. Uses 652 // linear search, which is fine because this is called during static init. 653 // Called via static initializer and the result is passed to the `Zone` ctor. 654 profiler::ZoneHandle AddZone(const char* name, 655 ProfilerFlags flags = ProfilerFlags::kDefault) { 656 return zones_.AddZone(name, flags); 657 } 658 659 void AddFunc(void* owner, ProfilerFunc func) { 660 funcs_.Add(reinterpret_cast<intptr_t>(owner), func); 661 } 662 void RemoveFunc(void* owner) { 663 funcs_.Remove(reinterpret_cast<intptr_t>(owner)); 664 } 665 666 // For reporting average concurrency. Called by `ThreadPool::Run` on the main 667 // thread, returns true if this is the first call since the last `EndRootRun`. 668 // 669 // We want to report the concurrency of each separate 'invocation' of a zone. 670 // A unique per-call identifier (could be approximated with the line number 671 // and return address) is not sufficient because the caller may in turn be 672 // called from differing parallel sections. A per-`ThreadPool::Run` counter 673 // also under-reports concurrency because each pool in nested parallelism 674 // (over packages and CCXes) would be considered separate invocations. 675 // 676 // The alternative of detecting overlapping zones via timestamps is not 100% 677 // reliable because timers may not be synchronized across sockets or perhaps 678 // even cores. "Invariant" x86 TSCs are indeed synchronized across cores, but 679 // not across sockets unless the RESET# signal reaches each at the same time. 680 // Linux seems to make an effort to correct this, and Arm's "generic timer" 681 // broadcasts to "all cores", but there is no universal guarantee. 682 // 683 // Under the assumption that all concurrency is via our `ThreadPool`, we can 684 // record all `global_idx` for each outermost (root) `ThreadPool::Run`. This 685 // collapses all nested pools into one 'invocation'. We then compute per-zone 686 // concurrency as the number of unique `global_idx` seen per invocation. 687 bool IsRootRun() { 688 // We are not the root if a Run was already active. 689 return !run_active_.test_and_set(std::memory_order_acquire); 690 } 691 692 // Must be called if `IsRootRun` returned true. Resets the state so that the 693 // next call to `IsRootRun` will again return true. Called from main thread. 694 // Note that some zones may still be active. Their concurrency will be updated 695 // when `PrintResults` is called. 696 void EndRootRun() { 697 UpdateResults(); 698 results_.CountWorkersAndReset(); 699 700 run_active_.clear(std::memory_order_release); 701 } 702 703 // Prints results. Call from main thread after all threads have exited all 704 // zones. Resets all state, can be called again after more zones. 705 void PrintResults() { 706 UpdateResults(); 707 // `CountWorkersAndReset` is fused into `Print`, so do not call it here. 708 709 results_.PrintAndReset(zones_); 710 711 funcs_.CallAll(); 712 } 713 714 // TODO: remove when no longer called. 715 void SetMaxThreads(size_t) {} 716 717 private: 718 // Sets main thread index, computes self-overhead, and checks timer support. 719 Profiler(); 720 721 profiler::PerWorker& GetWorker(size_t global_idx) { 722 HWY_DASSERT(workers_reserved_.Get(global_idx)); 723 return workers_[global_idx]; 724 } 725 726 // Moves accumulators into Results. Called from the main thread. 727 void UpdateResults() { 728 // Ensure we see all writes from before the workers' release fence. 729 std::atomic_thread_fence(std::memory_order_acquire); 730 731 workers_reserved_.Foreach([&](size_t global_idx) { 732 workers_[global_idx].MoveTo(global_idx, accumulators_, results_); 733 }); 734 } 735 736 static thread_local size_t s_global_idx; 737 738 // These are atomic because `ThreadFunc` reserves its slot(s) and even 739 // `ThreadPool::ThreadPool` may be called concurrently. Both have bit `i` set 740 // between calls to `Reserve*(i)` and `Free*(i)`. They are consulted in 741 // `UpdateResults` and to validate arguments in debug builds, and only updated 742 // in the pool/thread init/shutdown. 743 profiler::AtomicWorkerSet workers_reserved_; 744 745 std::atomic_flag run_active_ = ATOMIC_FLAG_INIT; 746 747 profiler::Funcs funcs_; 748 749 // To avoid locking, each worker has its own working set. We could access this 750 // through `thread_local` pointers, but that is slow to read on x86. Because 751 // our `ThreadPool` anyway passes a `global_idx` argument, we can instead pass 752 // that through the `PROFILER_ZONE2/PROFILER_ZONE3` macros. 753 profiler::PerWorker workers_[profiler::kMaxWorkers]; 754 755 profiler::Accumulators accumulators_; 756 757 profiler::Results results_; 758 759 profiler::Zones zones_; 760 }; 761 762 namespace profiler { 763 764 // RAII for zone entry/exit. 765 class Zone { 766 public: 767 // Thread-compatible; must not call concurrently with the same `global_idx`, 768 // which is either: 769 // - passed from `ThreadPool::Run` (if it was constructed with non-default 770 // `PoolWorkerMapping`) to `PROFILER_ZONE2/PROFILER_ZONE3`; 771 // - obtained from `Profiler::GlobalIdx()`; or 772 // - 0 if running on the main thread. 773 Zone(Profiler& profiler, size_t global_idx, ZoneHandle zone) 774 : profiler_(profiler) { 775 HWY_FENCE; 776 const uint64_t t_enter = timer::Start(); 777 HWY_FENCE; 778 global_idx_ = static_cast<uint32_t>(global_idx); 779 zone_ = zone; 780 profiler.Enter(t_enter, global_idx); 781 HWY_FENCE; 782 } 783 784 ~Zone() { 785 HWY_FENCE; 786 const uint64_t t_exit = timer::Stop(); 787 profiler_.Exit(t_exit, static_cast<size_t>(global_idx_), zone_); 788 HWY_FENCE; 789 } 790 791 private: 792 Profiler& profiler_; 793 uint32_t global_idx_; 794 ZoneHandle zone_; 795 }; 796 797 } // namespace profiler 798 #else // profiler disabled: stub implementation 799 800 namespace profiler { 801 struct ZoneHandle {}; 802 } // namespace profiler 803 804 struct Profiler { 805 static HWY_DLLEXPORT Profiler& Get(); 806 807 // DEPRECATED: use `GlobalIdx` instead. 808 static size_t Thread() { return 0; } 809 static size_t GlobalIdx() { return 0; } 810 static void SetGlobalIdx(size_t) {} 811 void ReserveWorker(size_t) {} 812 void FreeWorker(size_t) {} 813 void Enter(uint64_t, size_t) {} 814 void Exit(uint64_t, size_t, profiler::ZoneHandle) {} 815 uint64_t GetFirstDurationAndReset(size_t) { return 0; } 816 817 const char* Name(profiler::ZoneHandle) const { return nullptr; } 818 profiler::ZoneHandle AddZone(const char*, 819 ProfilerFlags = ProfilerFlags::kDefault) { 820 return profiler::ZoneHandle(); 821 } 822 823 void AddFunc(void*, ProfilerFunc) {} 824 void RemoveFunc(void*) {} 825 826 bool IsRootRun() { return false; } 827 void EndRootRun() {} 828 void PrintResults() {} 829 830 // TODO: remove when no longer called. 831 void SetMaxThreads(size_t) {} 832 }; 833 834 namespace profiler { 835 struct Zone { 836 Zone(Profiler&, size_t, ZoneHandle) {} 837 }; 838 839 } // namespace profiler 840 #endif // PROFILER_ENABLED || HWY_IDE 841 842 } // namespace hwy 843 844 // Creates a `Zone` lvalue with a line-dependent name, which records the elapsed 845 // time from here until the end of the current scope. `p` is from 846 // `Profiler::Get()` or a cached reference. `global_idx < kMaxWorkers`. `zone` 847 // is the return value of `AddZone`. Separating its static init from the `Zone` 848 // may be more efficient than `PROFILER_ZONE2`. 849 #define PROFILER_ZONE3(p, global_idx, zone) \ 850 HWY_FENCE; \ 851 const hwy::profiler::Zone HWY_CONCAT(Z, __LINE__)(p, global_idx, zone); \ 852 HWY_FENCE 853 854 // For compatibility with old callers that do not pass `p` nor `flags`. 855 // Also calls AddZone. Usage: `PROFILER_ZONE2(global_idx, "MyZone");` 856 #define PROFILER_ZONE2(global_idx, name) \ 857 static const hwy::profiler::ZoneHandle HWY_CONCAT(zone, __LINE__) = \ 858 hwy::Profiler::Get().AddZone(name); \ 859 PROFILER_ZONE3(hwy::Profiler::Get(), global_idx, HWY_CONCAT(zone, __LINE__)) 860 #define PROFILER_FUNC2(global_idx) PROFILER_ZONE2(global_idx, __func__) 861 862 // OBSOLETE: it is more efficient to pass `global_idx` from `ThreadPool` to 863 // `PROFILER_ZONE2/PROFILER_ZONE3`. Here we get it from thread_local storage. 864 #define PROFILER_ZONE(name) PROFILER_ZONE2(hwy::Profiler::GlobalIdx(), name) 865 #define PROFILER_FUNC PROFILER_FUNC2(hwy::Profiler::GlobalIdx()) 866 867 // DEPRECATED: Use `hwy::Profiler::Get()` directly instead. 868 #define PROFILER_ADD_ZONE(name) hwy::Profiler::Get().AddZone(name) 869 #define PROFILER_IS_ROOT_RUN() hwy::Profiler::Get().IsRootRun() 870 #define PROFILER_END_ROOT_RUN() hwy::Profiler::Get().EndRootRun() 871 #define PROFILER_PRINT_RESULTS() hwy::Profiler::Get().PrintResults() 872 873 #endif // HIGHWAY_HWY_PROFILER_H_