tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

profiler.h (32443B)


      1 // Copyright 2017 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #ifndef HIGHWAY_HWY_PROFILER_H_
     16 #define HIGHWAY_HWY_PROFILER_H_
     17 
     18 #include <stddef.h>
     19 #include <stdint.h>
     20 #include <string.h>  // strcmp, strlen
     21 
     22 #include <atomic>
     23 #include <functional>
     24 
     25 #include "hwy/base.h"
     26 #include "hwy/highway_export.h"
     27 
     28 // High precision, low overhead time measurements. Returns exact call counts and
     29 // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
     30 //
     31 // Uses RAII to capture begin/end timestamps, with user-specified zone names:
     32 // `{ PROFILER_ZONE("name"); /*code*/ }` or the name of the current function:
     33 // `void FuncToMeasure() { PROFILER_FUNC; /*code*/ }`.
     34 //
     35 // You can reduce the overhead by passing `global_idx`, which can be taken from
     36 // the argument to the `ThreadPool::Run` lambda (if the pool was constructed
     37 // with non-default `PoolWorkerMapping`), or from a saved copy of the
     38 // thread-local `Profiler::Thread`: `PROFILER_ZONE2(global_idx, name)`.
     39 //
     40 // The preferred API allows passing flags, such as requesting inclusive time:
     41 // `static const auto zone = profiler.AddZone("name", flags);` and then
     42 // `PROFILER_ZONE3(profiler, global_idx, zone)`.
     43 //
     44 // After all threads exit all zones, call `Profiler::Get().PrintResults()` to
     45 // print call counts and average durations [CPU cycles] to stdout, sorted in
     46 // descending order of total duration.
     47 
     48 // If zero, mock `Profiler` and `profiler::Zone` will be defined.
     49 #ifndef PROFILER_ENABLED
     50 #define PROFILER_ENABLED 0
     51 #endif
     52 
     53 #if PROFILER_ENABLED
     54 #include <stdio.h>
     55 
     56 #include <algorithm>  // std::sort
     57 #include <utility>
     58 #include <vector>
     59 
     60 #include "hwy/aligned_allocator.h"
     61 #include "hwy/bit_set.h"
     62 #include "hwy/timer.h"
     63 #endif  // PROFILER_ENABLED
     64 
     65 namespace hwy {
     66 
     67 // Flags: we want type-safety (enum class) to catch mistakes such as confusing
     68 // zone with flags. Base type (`uint32_t`) ensures it is safe to cast. Defined
     69 // outside the `#if` because callers pass them to `PROFILER_ZONE3`. When adding
     70 // flags, also update `kNumFlags` and `ChildTotalMask`.
enum class ProfilerFlags : uint32_t {
  // Default behavior: the zone reports self-time, excluding child zones.
  kDefault = 0,
  // The zone should report cumulative time, including all child zones. If not
  // specified, zones report self-time, excluding child zones.
  kInclusive = 1
};
     77 
     78 // Called during `PrintResults` to print results from other modules.
     79 using ProfilerFunc = std::function<void(void)>;
     80 
// Append-only, lock-free table of up to `kMaxStrings` unique strings. `Add`
// copies the string into an internal buffer and returns a stable index that
// `Name` maps back to the copy. Slot 0 is reserved (`next_ptr_` starts at 1)
// so that index 0 can serve as an 'invalid' value for callers.
template <size_t kMaxStrings>
class StringTable {
  static constexpr std::memory_order kRelaxed = std::memory_order_relaxed;
  static constexpr std::memory_order kAcq = std::memory_order_acquire;
  static constexpr std::memory_order kRel = std::memory_order_release;

 public:
  // Returns a copy of the `name` passed to `Add` that returned the
  // given `idx`.
  const char* Name(size_t idx) const {
    // `kRelaxed` is sufficient because pointers are immutable once published
    // via a `kRelease` store.
    return ptrs_[idx].load(kRelaxed);
  }

  // Returns `idx < kMaxStrings`. Can be called concurrently. Calls with the
  // same `name` return the same `idx`.
  size_t Add(const char* name) {
    // Linear search if it already exists. `kAcq` ensures we see prior stores.
    const size_t num_strings = next_ptr_.load(kAcq);
    HWY_ASSERT(num_strings < kMaxStrings);
    for (size_t idx = 1; idx < num_strings; ++idx) {
      const char* existing = ptrs_[idx].load(kAcq);
      // `next_ptr_` was published after writing `ptr_`, hence it is non-null.
      HWY_ASSERT(existing != nullptr);
      if (HWY_UNLIKELY(!strcmp(existing, name))) {
        return idx;
      }
    }

    // Not found: copy `name` into `chars_` before publishing the pointer.
    // `fetch_add` reserves a disjoint range even under concurrent `Add`.
    const size_t len = strlen(name) + 1;
    const size_t pos = next_char_.fetch_add(len, kRelaxed);
    HWY_ASSERT(pos + len <= sizeof(chars_));
    strcpy(chars_ + pos, name);  // NOLINT

    for (;;) {
      size_t idx = next_ptr_.load(kRelaxed);
      HWY_ASSERT(idx < kMaxStrings);

      // Attempt to claim the next `idx` via CAS.
      const char* expected = nullptr;
      if (HWY_LIKELY(ptrs_[idx].compare_exchange_weak(expected, chars_ + pos,
                                                      kRel, kRelaxed))) {
        // Publish the new count and make the `ptrs_` write visible.
        next_ptr_.store(idx + 1, kRel);
        HWY_DASSERT(!strcmp(Name(idx), name));
        return idx;
      }

      // We lost the race. `expected` has been updated to the winner's pointer.
      if (HWY_UNLIKELY(!strcmp(expected, name))) {
        // Done, another thread added the same name. Note that we waste the
        // extra space in `chars_`, which is fine because it is rare.
        HWY_DASSERT(!strcmp(Name(idx), name));
        return idx;
      }

      // Other thread added a different name. Retry with the next slot.
    }
  }

 private:
  // Published pointers into `chars_`; slot 0 is intentionally unused.
  std::atomic<const char*> ptrs_[kMaxStrings];
  std::atomic<size_t> next_ptr_{1};  // next idx
  std::atomic<size_t> next_char_{0};
  // Backing storage; budget of 55 bytes per string on average.
  char chars_[kMaxStrings * 55];
};
    149 
    150 #if PROFILER_ENABLED
    151 
    152 // Implementation details.
    153 namespace profiler {
    154 
    155 HWY_INLINE_VAR constexpr size_t kNumFlags = 1;
    156 
    157 // Upper bounds for fixed-size data structures, guarded via HWY_DASSERT:
    158 
    159 // Maximum nesting of zones, chosen such that `PerWorker` is 256 bytes.
    160 HWY_INLINE_VAR constexpr size_t kMaxDepth = 13;
    161 // Reports with more than ~50 are anyway difficult to read.
    162 HWY_INLINE_VAR constexpr size_t kMaxZones = 128;
    163 // Upper bound on global worker_idx across all pools. Note that fiber libraries
    164 // can spawn hundreds of threads. Turin has 128-192 cores.
    165 HWY_INLINE_VAR constexpr size_t kMaxWorkers = 256;
    166 
    167 // Type-safe wrapper for zone index plus flags, returned by `AddZone`.
    168 class ZoneHandle {
    169 public:
    170  ZoneHandle() : bits_(0) {}  // for Accumulator member initialization
    171 
    172  ZoneHandle(size_t zone_idx, ProfilerFlags flags) {
    173    HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones);
    174    const uint32_t flags_u = static_cast<uint32_t>(flags);
    175    HWY_DASSERT(flags_u < (1u << kNumFlags));
    176    bits_ = (static_cast<uint32_t>(zone_idx) << kNumFlags) | flags_u;
    177    HWY_DASSERT(ZoneIdx() == zone_idx);
    178  }
    179 
    180  ZoneHandle(const ZoneHandle& other) = default;
    181  ZoneHandle& operator=(const ZoneHandle& other) = default;
    182 
    183  bool operator==(const ZoneHandle other) const { return bits_ == other.bits_; }
    184  bool operator!=(const ZoneHandle other) const { return bits_ != other.bits_; }
    185 
    186  size_t ZoneIdx() const {
    187    HWY_DASSERT(bits_ != 0);
    188    const size_t zone_idx = bits_ >> kNumFlags;
    189    HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones);
    190    return zone_idx;
    191  }
    192 
    193  bool IsInclusive() const {
    194    HWY_DASSERT(bits_ != 0);
    195    return (bits_ & static_cast<uint32_t>(ProfilerFlags::kInclusive)) != 0;
    196  }
    197 
    198  // Returns a mask to zero/ignore child totals for inclusive zones.
    199  uint64_t ChildTotalMask() const {
    200    // With a ternary operator, clang tends to generate a branch.
    201    // return IsInclusive() ? 0 : ~uint64_t{0};
    202    const uint32_t bit =
    203        bits_ & static_cast<uint32_t>(ProfilerFlags::kInclusive);
    204    return uint64_t{bit} - 1;
    205  }
    206 
    207 private:
    208  uint32_t bits_;
    209 };
    210 
    211 // Storage for zone names.
    212 class Zones {
    213 public:
    214  // Returns a copy of the `name` passed to `AddZone` that returned the
    215  // given `zone`.
    216  const char* Name(ZoneHandle zone) const {
    217    return strings_.Name(zone.ZoneIdx());
    218  }
    219 
    220  // Can be called concurrently. Calls with the same `name` return the same
    221  // `ZoneHandle.ZoneIdx()`.
    222  ZoneHandle AddZone(const char* name, ProfilerFlags flags) {
    223    return ZoneHandle(strings_.Add(name), flags);
    224  }
    225 
    226 private:
    227  StringTable<kMaxZones> strings_;
    228 };
    229 
    230 // Allows other classes such as `ThreadPool` to register/unregister a function
    231 // to call during `PrintResults`. This allows us to gather data from the worker
    232 // threads without having to wait until they exit, and decouples the profiler
    233 // from other modules. Thread-safe.
class Funcs {
  static constexpr auto kAcq = std::memory_order_acquire;
  static constexpr auto kRel = std::memory_order_release;

 public:
  // Registers `func` under `key`; called during `CallAll`. Can be called
  // concurrently with distinct keys. Aborts if all slots are taken.
  void Add(intptr_t key, ProfilerFunc func) {
    HWY_ASSERT(key != 0 && key != kPending);  // reserved values
    HWY_ASSERT(func);                         // not empty

    for (size_t i = 0; i < kMaxFuncs; ++i) {
      intptr_t expected = 0;
      // Lost a race with a concurrent `Add`, try the next slot.
      if (!keys_[i].compare_exchange_strong(expected, kPending, kRel)) {
        continue;
      }
      // We own the slot (marked `kPending` so readers skip it): move func in.
      funcs_[i] = std::move(func);
      keys_[i].store(key, kRel);  // publishes the `func` write.
      return;
    }

    HWY_ABORT("Funcs::Add: no free slot, increase kMaxFuncs.");
  }

  // Can be called concurrently with distinct keys. It is an error to call this
  // without a prior `Add` of the same key.
  void Remove(intptr_t key) {
    HWY_ASSERT(key != 0 && key != kPending);  // reserved values

    for (size_t i = 0; i < kMaxFuncs; ++i) {
      intptr_t actual = keys_[i].load(kAcq);
      if (actual == key) {
        // In general, concurrent removal is fine, but in this specific context,
        // owners are expected to remove their key exactly once, from the same
        // thread that added it. In that case, CAS should not fail.
        if (!keys_[i].compare_exchange_strong(actual, kPending, kRel)) {
          HWY_WARN("Funcs: CAS failed, why is there a concurrent Remove?");
        }
        funcs_[i] = ProfilerFunc();  // release captured state eagerly
        keys_[i].store(0, kRel);  // publishes the `func` write.
        return;
      }
    }
    HWY_ABORT("Funcs::Remove: failed to find key %p.",
              reinterpret_cast<void*>(key));
  }

  // Invokes every currently registered function.
  void CallAll() const {
    for (size_t i = 0; i < kMaxFuncs; ++i) {
      intptr_t key = keys_[i].load(kAcq);  // ensures `funcs_` is visible.
      // Safely handles concurrent Add/Remove.
      // NOTE(review): a `Remove` racing between this load and the call below
      // would overwrite `funcs_[i]` while it runs; presumably Add/Remove only
      // happen during pool init/shutdown — confirm against callers.
      if (key != 0 && key != kPending) {
        funcs_[i]();
      }
    }
  }

 private:
  static constexpr size_t kMaxFuncs = 64;
  static constexpr intptr_t kPending = -1;  // slot claimed, func not yet valid

  ProfilerFunc funcs_[kMaxFuncs];  // non-atomic
  std::atomic<intptr_t> keys_[kMaxFuncs] = {};
};
    299 
    300 // Holds total duration and number of calls. Worker index is implicit in the
    301 // index of this class within the `Accumulators` array.
    302 struct Accumulator {
    303  void Add(ZoneHandle new_zone, uint64_t self_duration) {
    304    duration += self_duration;
    305 
    306    // Only called for valid zones.
    307    HWY_DASSERT(new_zone != ZoneHandle());
    308    // Our zone might not have been set yet.
    309    HWY_DASSERT(zone == ZoneHandle() || zone == new_zone);
    310    zone = new_zone;
    311 
    312    num_calls += 1;
    313  }
    314 
    315  void Take(Accumulator& other) {
    316    duration += other.duration;
    317    other.duration = 0;
    318 
    319    // `ZoneSet` ensures we only call this for non-empty `other`.
    320    HWY_DASSERT(other.zone != ZoneHandle());
    321    // Our zone might not have been set yet.
    322    HWY_DASSERT(zone == ZoneHandle() || zone == other.zone);
    323    zone = other.zone;
    324 
    325    num_calls += other.num_calls;
    326    other.num_calls = 0;
    327  }
    328 
    329  uint64_t duration = 0;
    330  ZoneHandle zone;  // flags are used by `Results::Print`
    331  uint32_t num_calls = 0;
    332 };
    333 static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");
    334 
    335 using ZoneSet = hwy::BitSet<kMaxZones>;
    336 using WorkerSet = hwy::BitSet<kMaxWorkers>;
    337 using AtomicWorkerSet = hwy::AtomicBitSet<kMaxWorkers>;
    338 
    339 // Durations are per-CPU, but end to end performance is defined by wall time.
    340 // Assuming fork-join parallelism, zones are entered by multiple threads
    341 // concurrently, which means the total number of unique threads is also the
    342 // degree of concurrency, so we can estimate wall time as CPU time divided by
    343 // the number of unique threads seen. This is facilitated by unique `global_idx`
    344 // passed in by callers, or taken from thread-local `GlobalIdx()`.
    345 //
    346 // We also want to support varying thread counts per call site, because the same
    347 // function/zone may be called from multiple pools. `EndRootRun` calls
    348 // `CountWorkersAndReset` after each top-level `ThreadPool::Run`, which
    349 // generates one data point summarized via descriptive statistics. Here we
    350 // implement a simpler version of `Stats` because we do not require
    351 // geomean/variance/kurtosis/skewness. Because concurrency is a small integer,
    352 // we can simply compute sums rather than online moments. There is also only one
    353 // instance across all threads, hence we do not require a `Take`.
    354 //
    355 // Note that subsequently discovered prior work estimates the number of active
    356 // and idle processors by updating atomic counters whenever they start/finish a
    357 // task: https://homes.cs.washington.edu/~tom/pubs/quartz.pdf and "Effective
    358 // performance measurement and analysis of multithreaded applications". We
    359 // instead accumulate zone durations into per-thread storage.
    360 // `CountWorkersAndReset` then checks how many were nonzero, which avoids
    361 // expensive atomic updates and ensures accurate counts per-zone, rather than
    362 // estimates of current activity at each sample.
    363 // D. Vyukov's https://github.com/dvyukov/perf-load, also integrated into Linux
    364 // perf, also corrects for parallelism without using atomic counters by tracing
    365 // context switches. Note that we often pin threads, which avoids migrations,
    366 // but reduces the number of context switch events to mainly preemptions.
    367 class ConcurrencyStats {
    368 public:
    369  ConcurrencyStats() { Reset(); }
    370 
    371  void Notify(const size_t x) {
    372    sum_ += x;
    373    ++n_;
    374    min_ = HWY_MIN(min_, x);
    375    max_ = HWY_MAX(max_, x);
    376  }
    377 
    378  size_t Count() const { return n_; }
    379  size_t Min() const { return min_; }
    380  size_t Max() const { return max_; }
    381  double Mean() const {
    382    return static_cast<double>(sum_) / static_cast<double>(n_);
    383  }
    384 
    385  void Reset() {
    386    sum_ = 0;
    387    n_ = 0;
    388    min_ = hwy::HighestValue<size_t>();
    389    max_ = hwy::LowestValue<size_t>();
    390  }
    391 
    392 private:
    393  uint64_t sum_;
    394  size_t n_;
    395  size_t min_;
    396  size_t max_;
    397 };
    398 static_assert(sizeof(ConcurrencyStats) == (8 + 3 * sizeof(size_t)), "");
    399 
    400 // Holds the final results across all threads, including `ConcurrencyStats`
    401 // and `PoolStats`, updated/printed by the main thread.
    402 class Results {
    403 public:
    404  void TakeAccumulator(const size_t global_idx, const size_t zone_idx,
    405                       Accumulator& other) {
    406    HWY_DASSERT(global_idx < kMaxWorkers);
    407    HWY_DASSERT(zone_idx < kMaxZones);
    408    HWY_DASSERT(other.zone.ZoneIdx() == zone_idx);
    409 
    410    visited_zones_.Set(zone_idx);
    411    totals_[zone_idx].Take(other);
    412    workers_[zone_idx].Set(global_idx);
    413  }
    414 
    415  // Moves the total number of threads seen during the preceding root-level
    416  // `ThreadPool::Run` into one data point for `ConcurrencyStats`.
    417  void CountWorkersAndReset(const size_t zone_idx) {
    418    HWY_DASSERT(zone_idx < kMaxZones);
    419    const size_t num_workers = workers_[zone_idx].Count();
    420    // Although workers_[zone_idx] at one point was non-empty, it is reset
    421    // below, and so can be empty on the second call to this via `PrintResults`,
    422    // after one from `EndRootRun`. Do not add a data point if empty.
    423    if (num_workers != 0) {
    424      concurrency_[zone_idx].Notify(num_workers);
    425    }
    426    workers_[zone_idx] = WorkerSet();
    427  }
    428 
    429  void CountWorkersAndReset() {
    430    visited_zones_.Foreach(
    431        [&](size_t zone_idx) { CountWorkersAndReset(zone_idx); });
    432  }
    433 
    434  void PrintAndReset(const Zones& zones) {
    435    const double inv_freq = 1.0 / hwy::platform::InvariantTicksPerSecond();
    436 
    437    // Sort by decreasing total (self) cost. `totals_` are sparse, so sort an
    438    // index vector instead.
    439    std::vector<uint32_t> indices;
    440    indices.reserve(visited_zones_.Count());
    441    visited_zones_.Foreach([&](size_t zone_idx) {
    442      indices.push_back(static_cast<uint32_t>(zone_idx));
    443      // In case the zone exited after `EndRootRun` and was not yet added.
    444      CountWorkersAndReset(zone_idx);
    445    });
    446    std::sort(indices.begin(), indices.end(), [&](uint32_t a, uint32_t b) {
    447      return totals_[a].duration > totals_[b].duration;
    448    });
    449 
    450    for (uint32_t zone_idx : indices) {
    451      Accumulator& total = totals_[zone_idx];  // cleared after printing
    452      HWY_ASSERT(total.zone.ZoneIdx() == zone_idx);
    453      HWY_ASSERT(total.num_calls != 0);  // else visited_zones_ is wrong
    454 
    455      ConcurrencyStats& concurrency = concurrency_[zone_idx];
    456      const double duration = static_cast<double>(total.duration);
    457      const double per_call =
    458          static_cast<double>(total.duration) / total.num_calls;
    459      // See comment on `ConcurrencyStats`.
    460      const double avg_concurrency = concurrency.Mean();
    461      // Avoid division by zero.
    462      const double concurrency_divisor = HWY_MAX(1.0, avg_concurrency);
    463      printf("%s%-40s: %10.0f x %15.0f / %5.1f (%5zu %3zu-%3zu) = %9.6f\n",
    464             total.zone.IsInclusive() ? "(I)" : "   ", zones.Name(total.zone),
    465             static_cast<double>(total.num_calls), per_call, avg_concurrency,
    466             concurrency.Count(), concurrency.Min(), concurrency.Max(),
    467             duration * inv_freq / concurrency_divisor);
    468 
    469      total = Accumulator();
    470      concurrency.Reset();
    471      // `workers_` was already reset by `CountWorkersAndReset`.
    472    }
    473    visited_zones_ = ZoneSet();
    474  }
    475 
    476 private:
    477  // Indicates which of the array entries are in use.
    478  ZoneSet visited_zones_;
    479  Accumulator totals_[kMaxZones];
    480  WorkerSet workers_[kMaxZones];
    481  ConcurrencyStats concurrency_[kMaxZones];
    482 };
    483 
    484 // Delay after capturing timestamps before/after the actual zone runs. Even
    485 // with frequency throttling disabled, this has a multimodal distribution,
    486 // including 32, 34, 48, 52, 59, 62.
struct Overheads {
  // Deducted from each zone's own measured duration (see `PerWorker::Exit`).
  uint64_t self = 0;
  // Additionally charged to the parent zone per child enter/exit pair.
  uint64_t child = 0;
};
    491 static_assert(sizeof(Overheads) == 16, "Wrong Overheads size");
    492 
    493 class Accumulators {
    494  // We generally want to group threads together because they are often
    495  // accessed together during a zone, but also want to avoid threads sharing a
    496  // cache line. Hence interleave 8 zones per worker.
    497  static constexpr size_t kPerLine = HWY_ALIGNMENT / sizeof(Accumulator);
    498 
    499 public:
    500  Accumulator& Get(const size_t global_idx, const size_t zone_idx) {
    501    HWY_DASSERT(global_idx < kMaxWorkers);
    502    HWY_DASSERT(zone_idx < kMaxZones);
    503    const size_t line = zone_idx / kPerLine;
    504    const size_t offset = zone_idx % kPerLine;
    505    return zones_[(line * kMaxWorkers + global_idx) * kPerLine + offset];
    506  }
    507 
    508 private:
    509  Accumulator zones_[kMaxZones * kMaxWorkers];
    510 };
    511 
    512 // Reacts to zone enter/exit events. Builds a stack of active zones and
    513 // accumulates self/child duration for each.
// Reacts to zone enter/exit events. Builds a stack of active zones and
// accumulates self/child duration for each. One instance per worker; not
// thread-safe, each worker only touches its own instance.
class PerWorker {
 public:
  // Returns `minuend - subtrahend`, clamped to zero on underflow. Prevents
  // negative self-durations when subtracted overheads exceed the measurement.
  template <typename T>
  static T ClampedSubtract(const T minuend, const T subtrahend) {
    static_assert(IsUnsigned<T>(), "");
    const T difference = minuend - subtrahend;
    // Clang output for this is verified to CMOV rather than branch.
    const T no_underflow = (subtrahend > minuend) ? T{0} : ~T{0};
    return difference & no_underflow;
  }

  void SetOverheads(const Overheads& overheads) { overheads_ = overheads; }

  // Entering a zone: push onto stack.
  void Enter(const uint64_t t_enter) {
    const size_t depth = depth_;
    HWY_DASSERT(depth < kMaxDepth);
    t_enter_[depth] = t_enter;
    // Start a fresh child total for the zone being entered; stored shifted by
    // one, see `child_total_` below.
    child_total_[1 + depth] = 0;
    depth_ = 1 + depth;
  }

  // Exiting the most recently entered zone (top of stack).
  void Exit(const uint64_t t_exit, const size_t global_idx,
            const ZoneHandle zone, Accumulators& accumulators) {
    HWY_DASSERT(depth_ > 0);
    const size_t depth = depth_ - 1;
    const size_t zone_idx = zone.ZoneIdx();
    const uint64_t duration = t_exit - t_enter_[depth];
    // Clang output for this is verified not to branch. This is 0 if inclusive,
    // otherwise the child total.
    const uint64_t child_total =
        child_total_[1 + depth] & zone.ChildTotalMask();

    // Deduct measurement overhead and (unless inclusive) child time.
    const uint64_t self_duration = ClampedSubtract(
        duration, overheads_.self + overheads_.child + child_total);
    accumulators.Get(global_idx, zone_idx).Add(zone, self_duration);
    // For faster TakeAccumulator() - not all zones are encountered.
    visited_zones_.Set(zone_idx);

    // Adding this nested time to the parent's `child_total` will
    // cause it to be later subtracted from the parent's `self_duration`.
    child_total_[1 + depth - 1] += duration + overheads_.child;

    depth_ = depth;
  }

  // Returns the duration of one enter/exit pair and resets all state. Called
  // via `DetectSelfOverhead`.
  uint64_t GetFirstDurationAndReset(size_t global_idx,
                                    Accumulators& accumulators) {
    HWY_DASSERT(depth_ == 0);

    // Exactly one zone must have been entered and exited.
    HWY_DASSERT(visited_zones_.Count() == 1);
    const size_t zone_idx = visited_zones_.First();
    // NOTE(review): assumes the calibration zone is among the first few
    // registered zones — confirm against `DetectSelfOverhead`.
    HWY_DASSERT(zone_idx <= 3);
    HWY_DASSERT(visited_zones_.Get(zone_idx));
    visited_zones_.Clear(zone_idx);

    Accumulator& zone = accumulators.Get(global_idx, zone_idx);
    const uint64_t duration = zone.duration;
    zone = Accumulator();
    return duration;
  }

  // Adds all data to `results` and resets it here. Called from the main thread.
  void MoveTo(const size_t global_idx, Accumulators& accumulators,
              Results& results) {
    visited_zones_.Foreach([&](size_t zone_idx) {
      results.TakeAccumulator(global_idx, zone_idx,
                              accumulators.Get(global_idx, zone_idx));
    });
    // OK to reset even if we have active zones, because we set `visited_zones_`
    // when exiting the zone.
    visited_zones_ = ZoneSet();
  }

 private:
  // 40 bytes:
  ZoneSet visited_zones_;  // Which `zones_` have been active on this worker.
  uint64_t depth_ = 0;     // Current nesting level for active zones.
  Overheads overheads_;

  // Enter timestamps, indexed by depth (stack).
  uint64_t t_enter_[kMaxDepth];
  // Used to deduct child duration from parent's self time (unless inclusive).
  // Shifting by one avoids bounds-checks for depth_ = 0 (root zone).
  uint64_t child_total_[1 + kMaxDepth] = {0};
};
    602 // Enables shift rather than multiplication.
    603 static_assert(sizeof(PerWorker) == 256, "Wrong size");
    604 
    605 }  // namespace profiler
    606 
    607 class Profiler {
    608 public:
    609  static HWY_DLLEXPORT Profiler& Get();
    610 
    611  // Returns `global_idx` from thread-local storage (0 for the main thread).
    612  // Used by `PROFILER_ZONE/PROFILER_FUNC`. It is faster to instead pass the
    613  // global_idx from `ThreadPool::Run` (if constructed with non-default
    614  // `PoolWorkerMapping`) to `PROFILER_ZONE2/PROFILER_ZONE3`.
    615  // DEPRECATED: use `GlobalIdx` instead.
    616  static size_t Thread() { return s_global_idx; }
    617  static size_t GlobalIdx() { return s_global_idx; }
    618  // Must be called from all worker threads, and once also on the main thread,
    619  // before any use of `PROFILER_ZONE/PROFILER_FUNC`.
    620  static void SetGlobalIdx(size_t global_idx) { s_global_idx = global_idx; }
    621 
  // Marks `global_idx` as in use. A slot must not be reserved twice without
  // an intervening `FreeWorker`.
  void ReserveWorker(size_t global_idx) {
    HWY_ASSERT(!workers_reserved_.Get(global_idx));
    workers_reserved_.Set(global_idx);
  }
    626 
  // Releases a slot previously claimed via `ReserveWorker`.
  void FreeWorker(size_t global_idx) {
    HWY_ASSERT(workers_reserved_.Get(global_idx));
    workers_reserved_.Clear(global_idx);
  }
    631 
    632  // Called by `Zone` from any thread.
  void Enter(uint64_t t_enter, size_t global_idx) {
    // Pushes the timestamp onto this worker's zone stack.
    GetWorker(global_idx).Enter(t_enter);
  }
    636 
    637  // Called by `~Zone` from any thread.
  void Exit(uint64_t t_exit, size_t global_idx, profiler::ZoneHandle zone) {
    // Pops the zone and accumulates its self-duration for this worker.
    GetWorker(global_idx).Exit(t_exit, global_idx, zone, accumulators_);
  }
    641 
  // Returns the duration of the single enter/exit pair recorded for
  // `global_idx` and clears that state; used to calibrate zone overhead.
  uint64_t GetFirstDurationAndReset(size_t global_idx) {
    return GetWorker(global_idx)
        .GetFirstDurationAndReset(global_idx, accumulators_);
  }
    646 
    647  const char* Name(profiler::ZoneHandle zone) const {
    648    return zones_.Name(zone);
    649  }
    650 
    651  // Copies `name` into the string table and returns its unique `zone`. Uses
    652  // linear search, which is fine because this is called during static init.
    653  // Called via static initializer and the result is passed to the `Zone` ctor.
    654  profiler::ZoneHandle AddZone(const char* name,
    655                               ProfilerFlags flags = ProfilerFlags::kDefault) {
    656    return zones_.AddZone(name, flags);
    657  }
    658 
    659  void AddFunc(void* owner, ProfilerFunc func) {
    660    funcs_.Add(reinterpret_cast<intptr_t>(owner), func);
    661  }
  // Unregisters the function previously added with the same `owner`.
  void RemoveFunc(void* owner) {
    funcs_.Remove(reinterpret_cast<intptr_t>(owner));
  }
    665 
    666  // For reporting average concurrency. Called by `ThreadPool::Run` on the main
    667  // thread, returns true if this is the first call since the last `EndRootRun`.
    668  //
    669  // We want to report the concurrency of each separate 'invocation' of a zone.
    670  // A unique per-call identifier (could be approximated with the line number
    671  // and return address) is not sufficient because the caller may in turn be
    672  // called from differing parallel sections. A per-`ThreadPool::Run` counter
    673  // also under-reports concurrency because each pool in nested parallelism
    674  // (over packages and CCXes) would be considered separate invocations.
    675  //
    676  // The alternative of detecting overlapping zones via timestamps is not 100%
    677  // reliable because timers may not be synchronized across sockets or perhaps
    678  // even cores. "Invariant" x86 TSCs are indeed synchronized across cores, but
    679  // not across sockets unless the RESET# signal reaches each at the same time.
    680  // Linux seems to make an effort to correct this, and Arm's "generic timer"
    681  // broadcasts to "all cores", but there is no universal guarantee.
    682  //
    683  // Under the assumption that all concurrency is via our `ThreadPool`, we can
    684  // record all `global_idx` for each outermost (root) `ThreadPool::Run`. This
    685  // collapses all nested pools into one 'invocation'. We then compute per-zone
    686  // concurrency as the number of unique `global_idx` seen per invocation.
  bool IsRootRun() {
    // We are not the root if a Run was already active. `test_and_set` returns
    // the previous value; acquire pairs with the release in `EndRootRun`.
    return !run_active_.test_and_set(std::memory_order_acquire);
  }
    691 
    692  // Must be called if `IsRootRun` returned true. Resets the state so that the
    693  // next call to `IsRootRun` will again return true. Called from main thread.
    694  // Note that some zones may still be active. Their concurrency will be updated
    695  // when `PrintResults` is called.
  void EndRootRun() {
    // Fold per-worker accumulators into `results_`, then convert the set of
    // workers seen during this root Run into one concurrency data point.
    UpdateResults();
    results_.CountWorkersAndReset();

    // Pairs with the acquire in `IsRootRun`; allows the next root Run.
    run_active_.clear(std::memory_order_release);
  }
    702 
  // Prints results. Call from main thread after all threads have exited all
  // zones. Resets all state, can be called again after more zones.
  void PrintResults() {
    // First collect any remaining per-worker data into `results_`.
    UpdateResults();
    // `CountWorkersAndReset` is fused into `Print`, so do not call it here.

    // Prints totals (using `zones_` for names) and clears accumulated state.
    results_.PrintAndReset(zones_);

    // Invoke registered callbacks — presumably those added via `AddFunc`;
    // confirm against the `Funcs` definition earlier in this file.
    funcs_.CallAll();
  }
    713 
    714  // TODO: remove when no longer called.
    715  void SetMaxThreads(size_t) {}
    716 
 private:
  // Sets main thread index, computes self-overhead, and checks timer support.
  Profiler();

  // Returns the worker's mutable state. Debug builds verify the slot was
  // reserved beforehand (see `workers_reserved_`).
  profiler::PerWorker& GetWorker(size_t global_idx) {
    HWY_DASSERT(workers_reserved_.Get(global_idx));
    return workers_[global_idx];
  }
    725 
    726  // Moves accumulators into Results. Called from the main thread.
    727  void UpdateResults() {
    728    // Ensure we see all writes from before the workers' release fence.
    729    std::atomic_thread_fence(std::memory_order_acquire);
    730 
    731    workers_reserved_.Foreach([&](size_t global_idx) {
    732      workers_[global_idx].MoveTo(global_idx, accumulators_, results_);
    733    });
    734  }
    735 
  // Per-thread worker index; read by `GlobalIdx()` for the OBSOLETE
  // `PROFILER_ZONE/PROFILER_FUNC` macros. Presumably written by
  // `SetGlobalIdx` — its definition is earlier in this file; confirm there.
  static thread_local size_t s_global_idx;

  // This is atomic because `ThreadFunc` reserves its slot(s) and even
  // `ThreadPool::ThreadPool` may be called concurrently. Both callers have bit
  // `i` set between their calls to `Reserve*(i)` and `Free*(i)`. The set is
  // consulted in `UpdateResults` and to validate arguments in debug builds,
  // and only updated in the pool/thread init/shutdown.
  profiler::AtomicWorkerSet workers_reserved_;

  // Set while a root `ThreadPool::Run` is active; see `IsRootRun`/`EndRootRun`.
  std::atomic_flag run_active_ = ATOMIC_FLAG_INIT;

  // Callbacks invoked by `PrintResults` (registered via `AddFunc`/`RemoveFunc`).
  profiler::Funcs funcs_;

  // To avoid locking, each worker has its own working set. We could access this
  // through `thread_local` pointers, but that is slow to read on x86. Because
  // our `ThreadPool` anyway passes a `global_idx` argument, we can instead pass
  // that through the `PROFILER_ZONE2/PROFILER_ZONE3` macros.
  profiler::PerWorker workers_[profiler::kMaxWorkers];

  // Per-zone totals merged from all workers by `UpdateResults`.
  profiler::Accumulators accumulators_;

  // Aggregated output; printed and cleared by `PrintResults`.
  profiler::Results results_;

  // Zone registry passed to `PrintAndReset` — presumably maps `ZoneHandle` to
  // the names given to `AddZone`; confirm against its definition above.
  profiler::Zones zones_;
    760 };
    761 
    762 namespace profiler {
    763 
// RAII for zone entry/exit: the constructor records entry, the destructor
// records exit, yielding per-zone call counts and total elapsed time.
class Zone {
 public:
  // Thread-compatible; must not call concurrently with the same `global_idx`,
  // which is either:
  // - passed from `ThreadPool::Run` (if it was constructed with non-default
  //   `PoolWorkerMapping`) to `PROFILER_ZONE2/PROFILER_ZONE3`;
  // - obtained from `Profiler::GlobalIdx()`; or
  // - 0 if running on the main thread.
  Zone(Profiler& profiler, size_t global_idx, ZoneHandle zone)
      : profiler_(profiler) {
    // Fences bracket the timestamp so the compiler does not move surrounding
    // work into or out of the measured region.
    HWY_FENCE;
    const uint64_t t_enter = timer::Start();
    HWY_FENCE;
    // Member stores happen after taking the timestamp; NOTE(review): their
    // cost is presumably part of the self-overhead computed in `Profiler()` —
    // confirm against that implementation.
    global_idx_ = static_cast<uint32_t>(global_idx);
    zone_ = zone;
    profiler.Enter(t_enter, global_idx);
    HWY_FENCE;
  }

  ~Zone() {
    HWY_FENCE;
    const uint64_t t_exit = timer::Stop();
    // Credits the elapsed time since `Enter` to `zone_` for this worker.
    profiler_.Exit(t_exit, static_cast<size_t>(global_idx_), zone_);
    HWY_FENCE;
  }

 private:
  Profiler& profiler_;
  // Narrowed from size_t; assumes `kMaxWorkers` fits in 32 bits (callers
  // guarantee `global_idx < kMaxWorkers`, see the macro contract below).
  uint32_t global_idx_;
  ZoneHandle zone_;
};
    796 
    797 }  // namespace profiler
    798 #else   // profiler disabled: stub implementation
    799 
namespace profiler {
// Empty placeholder so `ZoneHandle` parameters and returns still compile when
// the profiler is disabled.
struct ZoneHandle {};
}  // namespace profiler
    803 
// No-op stand-in with the same interface as the enabled `Profiler`, so that
// callers compile unchanged when profiling is disabled.
struct Profiler {
  static HWY_DLLEXPORT Profiler& Get();

  // DEPRECATED: use `GlobalIdx` instead.
  static size_t Thread() { return 0; }
  static size_t GlobalIdx() { return 0; }
  static void SetGlobalIdx(size_t) {}
  void ReserveWorker(size_t) {}
  void FreeWorker(size_t) {}
  void Enter(uint64_t, size_t) {}
  void Exit(uint64_t, size_t, profiler::ZoneHandle) {}
  uint64_t GetFirstDurationAndReset(size_t) { return 0; }

  const char* Name(profiler::ZoneHandle) const { return nullptr; }
  profiler::ZoneHandle AddZone(const char*,
                               ProfilerFlags = ProfilerFlags::kDefault) {
    return profiler::ZoneHandle();
  }

  void AddFunc(void*, ProfilerFunc) {}
  void RemoveFunc(void*) {}

  // Always false, so well-behaved callers never take their `EndRootRun` path.
  bool IsRootRun() { return false; }
  void EndRootRun() {}
  void PrintResults() {}

  // TODO: remove when no longer called.
  void SetMaxThreads(size_t) {}
};
    833 
namespace profiler {
// No-op RAII zone matching the enabled `profiler::Zone` constructor signature;
// records nothing.
struct Zone {
  Zone(Profiler&, size_t, ZoneHandle) {}
};

}  // namespace profiler
    840 #endif  // PROFILER_ENABLED || HWY_IDE
    841 
    842 }  // namespace hwy
    843 
// Creates a `Zone` lvalue with a line-dependent name, which records the elapsed
// time from here until the end of the current scope. `p` is from
// `Profiler::Get()` or a cached reference. `global_idx < kMaxWorkers`. `zone`
// is the return value of `AddZone`. Separating its static init from the `Zone`
// may be more efficient than `PROFILER_ZONE2`.
#define PROFILER_ZONE3(p, global_idx, zone)                               \
  HWY_FENCE;                                                              \
  const hwy::profiler::Zone HWY_CONCAT(Z, __LINE__)(p, global_idx, zone); \
  HWY_FENCE

// For compatibility with old callers that do not pass `p` nor `flags`.
// Also calls `AddZone` — once per call site, via a function-local static.
// Usage: `PROFILER_ZONE2(global_idx, "MyZone");`
#define PROFILER_ZONE2(global_idx, name)                              \
  static const hwy::profiler::ZoneHandle HWY_CONCAT(zone, __LINE__) = \
      hwy::Profiler::Get().AddZone(name);                             \
  PROFILER_ZONE3(hwy::Profiler::Get(), global_idx, HWY_CONCAT(zone, __LINE__))
// Same, but the zone is named after the enclosing function.
#define PROFILER_FUNC2(global_idx) PROFILER_ZONE2(global_idx, __func__)

// OBSOLETE: it is more efficient to pass `global_idx` from `ThreadPool` to
// `PROFILER_ZONE2/PROFILER_ZONE3`. Here we get it from thread_local storage.
#define PROFILER_ZONE(name) PROFILER_ZONE2(hwy::Profiler::GlobalIdx(), name)
#define PROFILER_FUNC PROFILER_FUNC2(hwy::Profiler::GlobalIdx())

// DEPRECATED: Use `hwy::Profiler::Get()` directly instead.
#define PROFILER_ADD_ZONE(name) hwy::Profiler::Get().AddZone(name)
#define PROFILER_IS_ROOT_RUN() hwy::Profiler::Get().IsRootRun()
#define PROFILER_END_ROOT_RUN() hwy::Profiler::Get().EndRootRun()
#define PROFILER_PRINT_RESULTS() hwy::Profiler::Get().PrintResults()
    873 #endif  // HIGHWAY_HWY_PROFILER_H_