perf_counters.cc (13437B)
1 // Copyright 2024 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include "hwy/perf_counters.h" 17 18 #include "hwy/detect_compiler_arch.h" // HWY_OS_LINUX 19 20 #if HWY_OS_LINUX || HWY_IDE 21 #include <errno.h> 22 #include <fcntl.h> // open 23 #include <linux/perf_event.h> 24 #include <stddef.h> 25 #include <stdint.h> 26 #include <stdio.h> 27 #include <string.h> // strcmp 28 #include <sys/ioctl.h> 29 #include <sys/prctl.h> 30 #include <sys/stat.h> // O_RDONLY 31 #include <sys/syscall.h> 32 #include <sys/utsname.h> 33 #include <unistd.h> 34 35 #include <string> 36 #include <vector> 37 38 #include "hwy/base.h" // HWY_ASSERT 39 #include "hwy/bit_set.h" 40 #include "hwy/timer.h" 41 42 #endif // HWY_OS_LINUX || HWY_IDE 43 44 namespace hwy { 45 namespace platform { 46 47 #if HWY_OS_LINUX || HWY_IDE 48 49 namespace { 50 51 bool PerfCountersSupported() { 52 // This is the documented way. 53 struct stat s; 54 return stat("/proc/sys/kernel/perf_event_paranoid", &s) == 0; 55 } 56 57 // If we detect Linux < 6.9 and AMD EPYC, use cycles instead of ref-cycles 58 // because the latter is not supported and returns 0, see 59 // https://lwn.net/Articles/967791/. 60 uint64_t RefCyclesOrCycles() { 61 const uint32_t ref_cycles = PERF_COUNT_HW_REF_CPU_CYCLES; 62 63 utsname buf; 64 if (uname(&buf) != 0) return ref_cycles; 65 if (std::string(buf.sysname) != "Linux") return ref_cycles; 66 int major, minor; 67 if (sscanf(buf.release, "%d.%d", &major, &minor) != 2) return ref_cycles; 68 if (major > 6 || (major == 6 && minor >= 9)) return ref_cycles; 69 70 // AMD Zen4 CPU 71 char cpu100[100]; 72 if (!GetCpuString(cpu100)) return ref_cycles; 73 if (std::string(cpu100).rfind("AMD EPYC", 0) != 0) return ref_cycles; 74 75 return PERF_COUNT_HW_CPU_CYCLES; 76 } 77 78 struct CounterConfig { // for perf_event_open 79 uint64_t config; 80 uint32_t type; 81 PerfCounters::Counter c; 82 }; 83 84 std::vector<CounterConfig> AllCounterConfigs() { 85 constexpr uint32_t kHW = PERF_TYPE_HARDWARE; 86 constexpr uint32_t kSW = PERF_TYPE_SOFTWARE; 87 constexpr uint32_t kC = PERF_TYPE_HW_CACHE; 88 constexpr uint64_t kL3 = PERF_COUNT_HW_CACHE_LL; 89 constexpr uint64_t kLoad = uint64_t{PERF_COUNT_HW_CACHE_OP_READ} << 8; 90 constexpr uint64_t kStore = uint64_t{PERF_COUNT_HW_CACHE_OP_WRITE} << 8; 91 constexpr uint64_t kAcc = uint64_t{PERF_COUNT_HW_CACHE_RESULT_ACCESS} << 16; 92 93 // Order is important for bin-packing event groups. x86 can only handle two 94 // LLC-related events per group, so spread them out and arrange SW events 95 // such that do not start a new group. This list of counters may change. 96 return {{RefCyclesOrCycles(), kHW, PerfCounters::kRefCycles}, 97 {PERF_COUNT_HW_INSTRUCTIONS, kHW, PerfCounters::kInstructions}, 98 {PERF_COUNT_SW_PAGE_FAULTS, kSW, PerfCounters::kPageFaults}, 99 {kL3 | kLoad | kAcc, kC, PerfCounters::kL3Loads}, 100 {kL3 | kStore | kAcc, kC, PerfCounters::kL3Stores}, 101 {PERF_COUNT_HW_BRANCH_INSTRUCTIONS, kHW, PerfCounters::kBranches}, 102 {PERF_COUNT_HW_BRANCH_MISSES, kHW, PerfCounters::kBranchMispredicts}, 103 // Second group: 104 {PERF_COUNT_HW_BUS_CYCLES, kHW, PerfCounters::kBusCycles}, 105 {PERF_COUNT_SW_CPU_MIGRATIONS, kSW, PerfCounters::kMigrations}, 106 {PERF_COUNT_HW_CACHE_REFERENCES, kHW, PerfCounters::kCacheRefs}, 107 {PERF_COUNT_HW_CACHE_MISSES, kHW, PerfCounters::kCacheMisses}}; 108 } 109 110 size_t& PackedIdx(PerfCounters::Counter c) { 111 static size_t packed_idx[64]; 112 return packed_idx[static_cast<size_t>(c)]; 113 } 114 115 class PMU { 116 static perf_event_attr MakeAttr(const CounterConfig& cc) { 117 perf_event_attr attr = {}; 118 attr.type = cc.type; 119 attr.size = sizeof(attr); 120 attr.config = cc.config; 121 // We request more counters than the HW may support. If so, they are 122 // multiplexed and only active for a fraction of the runtime. Recording the 123 // times lets us extrapolate. GROUP enables a single syscall to reduce the 124 // cost of reading. 125 attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 126 PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_GROUP; 127 // Do not set inherit=1 because that conflicts with PERF_FORMAT_GROUP. 128 // Do not set disable=1, so that perf_event_open verifies all events in the 129 // group can be scheduled together. 130 attr.exclude_kernel = 1; // required if perf_event_paranoid == 1 131 attr.exclude_hv = 1; // = hypervisor 132 return attr; 133 } 134 135 static int SysPerfEventOpen(const CounterConfig& cc, int leader_fd) { 136 perf_event_attr attr = MakeAttr(cc); 137 const int pid = 0; // current process (cannot also be -1) 138 const int cpu = -1; // any CPU 139 // Retry if interrupted by signals; this actually happens (b/64774091). 140 for (int retry = 0; retry < 10; ++retry) { 141 const int flags = 0; 142 const int fd = static_cast<int>( 143 syscall(__NR_perf_event_open, &attr, pid, cpu, leader_fd, flags)); 144 if (!(fd == -1 && errno == EINTR)) return fd; 145 } 146 HWY_WARN("perf_event_open retries were insufficient."); 147 return -1; 148 } 149 150 // Reads from `fd`; recovers from interruptions before/during the read. 151 static bool ReadBytes(int fd, ssize_t size, void* to) { 152 uint8_t* bytes = reinterpret_cast<uint8_t*>(to); 153 ssize_t pos = 0; 154 for (int retry = 0; retry < 10; ++retry) { 155 const ssize_t bytes_read = 156 read(fd, bytes + pos, static_cast<size_t>(size - pos)); 157 if (HWY_UNLIKELY(bytes_read <= 0)) { 158 if (errno == EINTR) continue; 159 HWY_WARN("perf read() failed, errno %d.", errno); 160 return false; 161 } 162 pos += bytes_read; 163 HWY_ASSERT(pos <= size); 164 if (HWY_LIKELY(pos == size)) return true; // success 165 } 166 HWY_WARN("perf read() wanted %d bytes, got %d.", static_cast<int>(size), 167 static_cast<int>(pos)); 168 return false; 169 } 170 171 // Array size in Buf; this is another upper bound on group size. It should be 172 // loose because it only wastes a bit of stack space, whereas an unnecessary 173 // extra group decreases coverage. Most HW supports 4-8 counters per group. 174 static constexpr size_t kMaxEventsPerGroup = PerfCounters::kCapacity; 175 176 #pragma pack(push, 1) 177 struct Buf { 178 uint64_t num_events; 179 uint64_t time_enabled; 180 uint64_t time_running; 181 uint64_t values[kMaxEventsPerGroup]; 182 }; 183 #pragma pack(pop) 184 185 // Returns false on error, otherwise sets `extrapolate` and `values`. 186 static bool ReadAndExtrapolate(int fd, size_t num_events, double& extrapolate, 187 double* HWY_RESTRICT values) { 188 Buf buf; 189 const ssize_t want_bytes = // size of var-len `Buf` 190 static_cast<ssize_t>(24 + num_events * sizeof(uint64_t)); 191 if (HWY_UNLIKELY(!ReadBytes(fd, want_bytes, &buf))) return false; 192 193 HWY_DASSERT(num_events == buf.num_events); 194 HWY_DASSERT(buf.time_running <= buf.time_enabled); 195 // If the group was not yet scheduled, we must avoid division by zero. 196 // In case counters were previously running and not reset, their current 197 // values may be nonzero. Returning zero could be interpreted as counters 198 // running backwards, so we instead treat this as a failure and mark the 199 // counters as invalid. 200 if (HWY_UNLIKELY(buf.time_running == 0)) return false; 201 202 // Extrapolate each value. 203 extrapolate = static_cast<double>(buf.time_enabled) / 204 static_cast<double>(buf.time_running); 205 for (size_t i = 0; i < buf.num_events; ++i) { 206 values[i] = static_cast<double>(buf.values[i]) * extrapolate; 207 } 208 return true; 209 } 210 211 public: 212 bool Init() { 213 // Allow callers who do not know about each other to each call `Init`. 214 // If this already succeeded, we're done; if not, we will try again. 215 if (HWY_UNLIKELY(!fds_.empty())) return true; 216 if (HWY_UNLIKELY(!PerfCountersSupported())) { 217 HWY_WARN( 218 "This Linux does not support perf counters. The program will" 219 "continue, but counters will return zero."); 220 return false; 221 } 222 223 groups_.push_back(Group()); 224 fds_.reserve(PerfCounters::kCapacity); 225 226 for (const CounterConfig& config : AllCounterConfigs()) { 227 // If the group is limited by our buffer size, add a new one. 228 if (HWY_UNLIKELY(groups_.back().num_events == kMaxEventsPerGroup)) { 229 groups_.push_back(Group()); 230 } 231 232 int fd = SysPerfEventOpen(config, groups_.back().leader_fd); 233 // Retry in case the group is limited by HW capacity. Do not check 234 // errno because it is too inconsistent (ENOSPC, EINVAL, others?). 235 if (HWY_UNLIKELY(fd < 0)) { 236 fd = SysPerfEventOpen(config, /*leader_fd=*/-1); 237 if (fd >= 0 && groups_.back().num_events != 0) { 238 groups_.push_back(Group()); 239 } 240 } 241 242 if (HWY_UNLIKELY(fd < 0)) { 243 HWY_WARN("perf_event_open %d errno %d for counter %s.", fd, errno, 244 PerfCounters::Name(config.c)); 245 } else { 246 // Add to group and set as leader if empty. 247 if (groups_.back().leader_fd == -1) { 248 groups_.back().leader_fd = fd; 249 250 // Ensure the leader is not a SW event, because adding an HW 251 // event to a group with only SW events is slow, and starting 252 // with SW may trigger a bug, see 253 // https://lore.kernel.org/lkml/tip-a1150c202207cc8501bebc45b63c264f91959260@git.kernel.org/ 254 if (HWY_UNLIKELY(config.type == PERF_TYPE_SOFTWARE)) { 255 HWY_WARN("SW event %s should not be leader.", 256 PerfCounters::Name(config.c)); 257 } 258 } 259 260 PackedIdx(config.c) = fds_.size(); 261 groups_.back().num_events += 1; 262 valid_.Set(static_cast<size_t>(config.c)); 263 fds_.push_back(fd); 264 } 265 } 266 267 // If no counters are available, remove the empty group. 268 if (HWY_UNLIKELY(fds_.empty())) { 269 HWY_ASSERT(groups_.size() == 1); 270 HWY_ASSERT(groups_.back().num_events == 0); 271 HWY_ASSERT(groups_.back().leader_fd == -1); 272 groups_.clear(); 273 } 274 275 size_t num_valid = 0; 276 for (const Group& group : groups_) { 277 num_valid += group.num_events; 278 // All groups have a leader and are not empty. 279 HWY_ASSERT(group.leader_fd >= 0); 280 HWY_ASSERT(0 != group.num_events && 281 group.num_events <= kMaxEventsPerGroup); 282 } 283 // Total `num_events` matches `fds_` and `Valid()`. 284 HWY_ASSERT(num_valid == fds_.size()); 285 HWY_ASSERT(num_valid == valid_.Count()); 286 HWY_ASSERT(num_valid <= PerfCounters::kCapacity); 287 288 if (num_valid) { 289 StopAllAndReset(); 290 return true; 291 } else { 292 HWY_WARN("No valid counters found."); 293 return true; 294 } 295 } 296 297 bool StartAll() { 298 if (HWY_UNLIKELY(fds_.empty())) return false; 299 HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_ENABLE) == 0); 300 return true; 301 } 302 303 void StopAllAndReset() { 304 HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_DISABLE) == 0); 305 for (int fd : fds_) { 306 HWY_ASSERT(ioctl(fd, PERF_EVENT_IOC_RESET, 0) == 0); 307 } 308 } 309 310 // Returns false on error, otherwise sets `valid`, `max_extrapolate`, and 311 // `values`. 312 bool Read(BitSet64& valid, double& max_extrapolate, double* values) { 313 if (HWY_UNLIKELY(!valid_.Any())) return false; 314 315 // Read all counters into buffer in the order in which they were opened. 316 max_extrapolate = 1.0; 317 double* pos = values; 318 for (const Group& group : groups_) { 319 double extrapolate; 320 if (HWY_UNLIKELY(!ReadAndExtrapolate(group.leader_fd, group.num_events, 321 extrapolate, pos))) { 322 return false; 323 } 324 max_extrapolate = HWY_MAX(max_extrapolate, extrapolate); 325 pos += group.num_events; 326 } 327 328 valid = valid_; 329 HWY_DASSERT(pos == values + valid.Count()); 330 return true; 331 } 332 333 private: 334 std::vector<int> fds_; // one per valid_ 335 BitSet64 valid_; 336 337 struct Group { 338 size_t num_events = 0; 339 int leader_fd = -1; 340 }; 341 std::vector<Group> groups_; 342 }; 343 344 // Monostate, see header. 345 PMU& GetPMU() { 346 static PMU& pmu = *new PMU(); // avoids exit-dtor warning (no dtor required) 347 return pmu; 348 } 349 350 } // namespace 351 352 HWY_DLLEXPORT bool PerfCounters::Init() { return GetPMU().Init(); } 353 HWY_DLLEXPORT bool PerfCounters::StartAll() { return GetPMU().StartAll(); } 354 HWY_DLLEXPORT void PerfCounters::StopAllAndReset() { 355 GetPMU().StopAllAndReset(); 356 } 357 HWY_DLLEXPORT PerfCounters::PerfCounters() { 358 if (HWY_UNLIKELY(!GetPMU().Read(valid_, max_extrapolate_, values_))) { 359 valid_ = BitSet64(); 360 max_extrapolate_ = 0.0; 361 hwy::ZeroBytes(values_, sizeof(values_)); 362 } 363 } 364 HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter c) { 365 return PackedIdx(c); 366 } 367 #else 368 HWY_DLLEXPORT bool PerfCounters::Init() { return false; } 369 HWY_DLLEXPORT bool PerfCounters::StartAll() { return false; } 370 HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {} 371 HWY_DLLEXPORT PerfCounters::PerfCounters() 372 : max_extrapolate_(1.0), values_{0.0} {} 373 HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter) { return 0; } 374 #endif // HWY_OS_LINUX || HWY_IDE 375 376 } // namespace platform 377 } // namespace hwy