[ tor-browser ].git.dasho

perf_counters.cc (13437B)
      1 // Copyright 2024 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include "hwy/perf_counters.h"
     17 
     18 #include "hwy/detect_compiler_arch.h"  // HWY_OS_LINUX
     19 
     20 #if HWY_OS_LINUX || HWY_IDE
     21 #include <errno.h>
     22 #include <fcntl.h>  // open
     23 #include <linux/perf_event.h>
     24 #include <stddef.h>
     25 #include <stdint.h>
     26 #include <stdio.h>
     27 #include <string.h>  // strcmp
     28 #include <sys/ioctl.h>
     29 #include <sys/prctl.h>
     30 #include <sys/stat.h>  // O_RDONLY
     31 #include <sys/syscall.h>
     32 #include <sys/utsname.h>
     33 #include <unistd.h>
     34 
     35 #include <string>
     36 #include <vector>
     37 
     38 #include "hwy/base.h"  // HWY_ASSERT
     39 #include "hwy/bit_set.h"
     40 #include "hwy/timer.h"
     41 
     42 #endif  // HWY_OS_LINUX || HWY_IDE
     43 
     44 namespace hwy {
     45 namespace platform {
     46 
     47 #if HWY_OS_LINUX || HWY_IDE
     48 
     49 namespace {
     50 
     51 bool PerfCountersSupported() {
     52  // This is the documented way.
     53  struct stat s;
     54  return stat("/proc/sys/kernel/perf_event_paranoid", &s) == 0;
     55 }
     56 
     57 // If we detect Linux < 6.9 and AMD EPYC, use cycles instead of ref-cycles
     58 // because the latter is not supported and returns 0, see
     59 // https://lwn.net/Articles/967791/.
     60 uint64_t RefCyclesOrCycles() {
     61  const uint32_t ref_cycles = PERF_COUNT_HW_REF_CPU_CYCLES;
     62 
     63  utsname buf;
     64  if (uname(&buf) != 0) return ref_cycles;
     65  if (std::string(buf.sysname) != "Linux") return ref_cycles;
     66  int major, minor;
     67  if (sscanf(buf.release, "%d.%d", &major, &minor) != 2) return ref_cycles;
     68  if (major > 6 || (major == 6 && minor >= 9)) return ref_cycles;
     69 
     70  // AMD Zen4 CPU
     71  char cpu100[100];
     72  if (!GetCpuString(cpu100)) return ref_cycles;
     73  if (std::string(cpu100).rfind("AMD EPYC", 0) != 0) return ref_cycles;
     74 
     75  return PERF_COUNT_HW_CPU_CYCLES;
     76 }
     77 
// Describes one counter to open: the `config`/`type` pair that
// `PMU::MakeAttr` copies into `perf_event_attr`, plus the
// `PerfCounters::Counter` that the opened event is recorded under.
// Field order matters: `AllCounterConfigs` brace-initializes in this order.
struct CounterConfig {  // for perf_event_open
  uint64_t config;  // copied to `perf_event_attr::config`
  uint32_t type;  // copied to `perf_event_attr::type` (a PERF_TYPE_* value)
  PerfCounters::Counter c;  // counter this event is reported as
};
     83 
     84 std::vector<CounterConfig> AllCounterConfigs() {
     85  constexpr uint32_t kHW = PERF_TYPE_HARDWARE;
     86  constexpr uint32_t kSW = PERF_TYPE_SOFTWARE;
     87  constexpr uint32_t kC = PERF_TYPE_HW_CACHE;
     88  constexpr uint64_t kL3 = PERF_COUNT_HW_CACHE_LL;
     89  constexpr uint64_t kLoad = uint64_t{PERF_COUNT_HW_CACHE_OP_READ} << 8;
     90  constexpr uint64_t kStore = uint64_t{PERF_COUNT_HW_CACHE_OP_WRITE} << 8;
     91  constexpr uint64_t kAcc = uint64_t{PERF_COUNT_HW_CACHE_RESULT_ACCESS} << 16;
     92 
     93  // Order is important for bin-packing event groups. x86 can only handle two
     94  // LLC-related events per group, so spread them out and arrange SW events
     95  // such that do not start a new group. This list of counters may change.
     96  return {{RefCyclesOrCycles(), kHW, PerfCounters::kRefCycles},
     97          {PERF_COUNT_HW_INSTRUCTIONS, kHW, PerfCounters::kInstructions},
     98          {PERF_COUNT_SW_PAGE_FAULTS, kSW, PerfCounters::kPageFaults},
     99          {kL3 | kLoad | kAcc, kC, PerfCounters::kL3Loads},
    100          {kL3 | kStore | kAcc, kC, PerfCounters::kL3Stores},
    101          {PERF_COUNT_HW_BRANCH_INSTRUCTIONS, kHW, PerfCounters::kBranches},
    102          {PERF_COUNT_HW_BRANCH_MISSES, kHW, PerfCounters::kBranchMispredicts},
    103          // Second group:
    104          {PERF_COUNT_HW_BUS_CYCLES, kHW, PerfCounters::kBusCycles},
    105          {PERF_COUNT_SW_CPU_MIGRATIONS, kSW, PerfCounters::kMigrations},
    106          {PERF_COUNT_HW_CACHE_REFERENCES, kHW, PerfCounters::kCacheRefs},
    107          {PERF_COUNT_HW_CACHE_MISSES, kHW, PerfCounters::kCacheMisses}};
    108 }
    109 
    110 size_t& PackedIdx(PerfCounters::Counter c) {
    111  static size_t packed_idx[64];
    112  return packed_idx[static_cast<size_t>(c)];
    113 }
    114 
    115 class PMU {
    116  static perf_event_attr MakeAttr(const CounterConfig& cc) {
    117    perf_event_attr attr = {};
    118    attr.type = cc.type;
    119    attr.size = sizeof(attr);
    120    attr.config = cc.config;
    121    // We request more counters than the HW may support. If so, they are
    122    // multiplexed and only active for a fraction of the runtime. Recording the
    123    // times lets us extrapolate. GROUP enables a single syscall to reduce the
    124    // cost of reading.
    125    attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
    126                       PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_GROUP;
    127    // Do not set inherit=1 because that conflicts with PERF_FORMAT_GROUP.
    128    // Do not set disable=1, so that perf_event_open verifies all events in the
    129    // group can be scheduled together.
    130    attr.exclude_kernel = 1;  // required if perf_event_paranoid == 1
    131    attr.exclude_hv = 1;      // = hypervisor
    132    return attr;
    133  }
    134 
    135  static int SysPerfEventOpen(const CounterConfig& cc, int leader_fd) {
    136    perf_event_attr attr = MakeAttr(cc);
    137    const int pid = 0;   // current process (cannot also be -1)
    138    const int cpu = -1;  // any CPU
    139    // Retry if interrupted by signals; this actually happens (b/64774091).
    140    for (int retry = 0; retry < 10; ++retry) {
    141      const int flags = 0;
    142      const int fd = static_cast<int>(
    143          syscall(__NR_perf_event_open, &attr, pid, cpu, leader_fd, flags));
    144      if (!(fd == -1 && errno == EINTR)) return fd;
    145    }
    146    HWY_WARN("perf_event_open retries were insufficient.");
    147    return -1;
    148  }
    149 
    150  // Reads from `fd`; recovers from interruptions before/during the read.
    151  static bool ReadBytes(int fd, ssize_t size, void* to) {
    152    uint8_t* bytes = reinterpret_cast<uint8_t*>(to);
    153    ssize_t pos = 0;
    154    for (int retry = 0; retry < 10; ++retry) {
    155      const ssize_t bytes_read =
    156          read(fd, bytes + pos, static_cast<size_t>(size - pos));
    157      if (HWY_UNLIKELY(bytes_read <= 0)) {
    158        if (errno == EINTR) continue;
    159        HWY_WARN("perf read() failed, errno %d.", errno);
    160        return false;
    161      }
    162      pos += bytes_read;
    163      HWY_ASSERT(pos <= size);
    164      if (HWY_LIKELY(pos == size)) return true;  // success
    165    }
    166    HWY_WARN("perf read() wanted %d bytes, got %d.", static_cast<int>(size),
    167             static_cast<int>(pos));
    168    return false;
    169  }
    170 
    171  // Array size in Buf; this is another upper bound on group size. It should be
    172  // loose because it only wastes a bit of stack space, whereas an unnecessary
    173  // extra group decreases coverage. Most HW supports 4-8 counters per group.
    174  static constexpr size_t kMaxEventsPerGroup = PerfCounters::kCapacity;
    175 
    176 #pragma pack(push, 1)
    177  struct Buf {
    178    uint64_t num_events;
    179    uint64_t time_enabled;
    180    uint64_t time_running;
    181    uint64_t values[kMaxEventsPerGroup];
    182  };
    183 #pragma pack(pop)
    184 
    185  // Returns false on error, otherwise sets `extrapolate` and `values`.
    186  static bool ReadAndExtrapolate(int fd, size_t num_events, double& extrapolate,
    187                                 double* HWY_RESTRICT values) {
    188    Buf buf;
    189    const ssize_t want_bytes =  // size of var-len `Buf`
    190        static_cast<ssize_t>(24 + num_events * sizeof(uint64_t));
    191    if (HWY_UNLIKELY(!ReadBytes(fd, want_bytes, &buf))) return false;
    192 
    193    HWY_DASSERT(num_events == buf.num_events);
    194    HWY_DASSERT(buf.time_running <= buf.time_enabled);
    195    // If the group was not yet scheduled, we must avoid division by zero.
    196    // In case counters were previously running and not reset, their current
    197    // values may be nonzero. Returning zero could be interpreted as counters
    198    // running backwards, so we instead treat this as a failure and mark the
    199    // counters as invalid.
    200    if (HWY_UNLIKELY(buf.time_running == 0)) return false;
    201 
    202    // Extrapolate each value.
    203    extrapolate = static_cast<double>(buf.time_enabled) /
    204                  static_cast<double>(buf.time_running);
    205    for (size_t i = 0; i < buf.num_events; ++i) {
    206      values[i] = static_cast<double>(buf.values[i]) * extrapolate;
    207    }
    208    return true;
    209  }
    210 
    211 public:
    212  bool Init() {
    213    // Allow callers who do not know about each other to each call `Init`.
    214    // If this already succeeded, we're done; if not, we will try again.
    215    if (HWY_UNLIKELY(!fds_.empty())) return true;
    216    if (HWY_UNLIKELY(!PerfCountersSupported())) {
    217      HWY_WARN(
    218          "This Linux does not support perf counters. The program will"
    219          "continue, but counters will return zero.");
    220      return false;
    221    }
    222 
    223    groups_.push_back(Group());
    224    fds_.reserve(PerfCounters::kCapacity);
    225 
    226    for (const CounterConfig& config : AllCounterConfigs()) {
    227      // If the group is limited by our buffer size, add a new one.
    228      if (HWY_UNLIKELY(groups_.back().num_events == kMaxEventsPerGroup)) {
    229        groups_.push_back(Group());
    230      }
    231 
    232      int fd = SysPerfEventOpen(config, groups_.back().leader_fd);
    233      // Retry in case the group is limited by HW capacity. Do not check
    234      // errno because it is too inconsistent (ENOSPC, EINVAL, others?).
    235      if (HWY_UNLIKELY(fd < 0)) {
    236        fd = SysPerfEventOpen(config, /*leader_fd=*/-1);
    237        if (fd >= 0 && groups_.back().num_events != 0) {
    238          groups_.push_back(Group());
    239        }
    240      }
    241 
    242      if (HWY_UNLIKELY(fd < 0)) {
    243        HWY_WARN("perf_event_open %d errno %d for counter %s.", fd, errno,
    244                 PerfCounters::Name(config.c));
    245      } else {
    246        // Add to group and set as leader if empty.
    247        if (groups_.back().leader_fd == -1) {
    248          groups_.back().leader_fd = fd;
    249 
    250          // Ensure the leader is not a SW event, because adding an HW
    251          // event to a group with only SW events is slow, and starting
    252          // with SW may trigger a bug, see
    253          // https://lore.kernel.org/lkml/tip-a1150c202207cc8501bebc45b63c264f91959260@git.kernel.org/
    254          if (HWY_UNLIKELY(config.type == PERF_TYPE_SOFTWARE)) {
    255            HWY_WARN("SW event %s should not be leader.",
    256                     PerfCounters::Name(config.c));
    257          }
    258        }
    259 
    260        PackedIdx(config.c) = fds_.size();
    261        groups_.back().num_events += 1;
    262        valid_.Set(static_cast<size_t>(config.c));
    263        fds_.push_back(fd);
    264      }
    265    }
    266 
    267    // If no counters are available, remove the empty group.
    268    if (HWY_UNLIKELY(fds_.empty())) {
    269      HWY_ASSERT(groups_.size() == 1);
    270      HWY_ASSERT(groups_.back().num_events == 0);
    271      HWY_ASSERT(groups_.back().leader_fd == -1);
    272      groups_.clear();
    273    }
    274 
    275    size_t num_valid = 0;
    276    for (const Group& group : groups_) {
    277      num_valid += group.num_events;
    278      // All groups have a leader and are not empty.
    279      HWY_ASSERT(group.leader_fd >= 0);
    280      HWY_ASSERT(0 != group.num_events &&
    281                 group.num_events <= kMaxEventsPerGroup);
    282    }
    283    // Total `num_events` matches `fds_` and `Valid()`.
    284    HWY_ASSERT(num_valid == fds_.size());
    285    HWY_ASSERT(num_valid == valid_.Count());
    286    HWY_ASSERT(num_valid <= PerfCounters::kCapacity);
    287 
    288    if (num_valid) {
    289      StopAllAndReset();
    290      return true;
    291    } else {
    292      HWY_WARN("No valid counters found.");
    293      return true;
    294    }
    295  }
    296 
    297  bool StartAll() {
    298    if (HWY_UNLIKELY(fds_.empty())) return false;
    299    HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_ENABLE) == 0);
    300    return true;
    301  }
    302 
    303  void StopAllAndReset() {
    304    HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_DISABLE) == 0);
    305    for (int fd : fds_) {
    306      HWY_ASSERT(ioctl(fd, PERF_EVENT_IOC_RESET, 0) == 0);
    307    }
    308  }
    309 
    310  // Returns false on error, otherwise sets `valid`, `max_extrapolate`, and
    311  // `values`.
    312  bool Read(BitSet64& valid, double& max_extrapolate, double* values) {
    313    if (HWY_UNLIKELY(!valid_.Any())) return false;
    314 
    315    // Read all counters into buffer in the order in which they were opened.
    316    max_extrapolate = 1.0;
    317    double* pos = values;
    318    for (const Group& group : groups_) {
    319      double extrapolate;
    320      if (HWY_UNLIKELY(!ReadAndExtrapolate(group.leader_fd, group.num_events,
    321                                           extrapolate, pos))) {
    322        return false;
    323      }
    324      max_extrapolate = HWY_MAX(max_extrapolate, extrapolate);
    325      pos += group.num_events;
    326    }
    327 
    328    valid = valid_;
    329    HWY_DASSERT(pos == values + valid.Count());
    330    return true;
    331  }
    332 
    333 private:
    334  std::vector<int> fds_;  // one per valid_
    335  BitSet64 valid_;
    336 
    337  struct Group {
    338    size_t num_events = 0;
    339    int leader_fd = -1;
    340  };
    341  std::vector<Group> groups_;
    342 };
    343 
    344 // Monostate, see header.
    345 PMU& GetPMU() {
    346  static PMU& pmu = *new PMU();  // avoids exit-dtor warning (no dtor required)
    347  return pmu;
    348 }
    349 
    350 }  // namespace
    351 
    352 HWY_DLLEXPORT bool PerfCounters::Init() { return GetPMU().Init(); }
    353 HWY_DLLEXPORT bool PerfCounters::StartAll() { return GetPMU().StartAll(); }
    354 HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {
    355  GetPMU().StopAllAndReset();
    356 }
    357 HWY_DLLEXPORT PerfCounters::PerfCounters() {
    358  if (HWY_UNLIKELY(!GetPMU().Read(valid_, max_extrapolate_, values_))) {
    359    valid_ = BitSet64();
    360    max_extrapolate_ = 0.0;
    361    hwy::ZeroBytes(values_, sizeof(values_));
    362  }
    363 }
    364 HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter c) {
    365  return PackedIdx(c);
    366 }
    367 #else
    368 HWY_DLLEXPORT bool PerfCounters::Init() { return false; }
    369 HWY_DLLEXPORT bool PerfCounters::StartAll() { return false; }
    370 HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {}
    371 HWY_DLLEXPORT PerfCounters::PerfCounters()
    372    : max_extrapolate_(1.0), values_{0.0} {}
    373 HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter) { return 0; }
    374 #endif  // HWY_OS_LINUX || HWY_IDE
    375 
    376 }  // namespace platform
    377 }  // namespace hwy
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE