perf_counters.h (5554B)
1 // Copyright 2024 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef HIGHWAY_HWY_PERF_COUNTERS_H_ 17 #define HIGHWAY_HWY_PERF_COUNTERS_H_ 18 19 // Reads OS/CPU performance counters. 20 21 #include <stddef.h> 22 23 #include "hwy/base.h" // HWY_ABORT 24 #include "hwy/bit_set.h" 25 26 namespace hwy { 27 namespace platform { 28 29 // Avoid padding in case callers such as profiler.h store many instances. 30 #pragma pack(push, 1) 31 // Provides access to CPU/OS performance counters. Each instance has space for 32 // multiple counter values; which counters these are may change in future. 33 // Although counters are per-CPU, Linux accesses them via a syscall, hence we 34 // use the monostate pattern to avoid callers having to pass around a pointer. 35 // Note that this is not thread-safe, so the static member functions should only 36 // be called from the main thread. 37 class PerfCounters { 38 public: 39 // Chosen such that this class occupies one or two cache lines. 40 static constexpr size_t kCapacity = 14; 41 42 // Bit indices used to identify counters. The ordering is arbitrary. Some of 43 // these counters may be 'removed' in the sense of not being visited by 44 // `Foreach`, but their enumerators will remain. New counters may be appended. 45 enum Counter { 46 kRefCycles = 0, 47 kInstructions, 48 kBranches, 49 kBranchMispredicts, 50 kBusCycles, 51 kCacheRefs, 52 kCacheMisses, 53 kL3Loads, 54 kL3Stores, 55 kPageFaults, // SW 56 kMigrations // SW 57 }; // BitSet64 requires these values to be less than 64. 58 59 // Strings for user-facing messages, not used in the implementation. 60 static inline const char* Name(Counter c) { 61 switch (c) { 62 case kRefCycles: 63 return "ref_cycles"; 64 case kInstructions: 65 return "instructions"; 66 case kBranches: 67 return "branches"; 68 case kBranchMispredicts: 69 return "branch_mispredicts"; 70 case kBusCycles: 71 return "bus_cycles"; 72 case kCacheRefs: 73 return "cache_refs"; 74 case kCacheMisses: 75 return "cache_misses"; 76 case kL3Loads: 77 return "l3_load"; 78 case kL3Stores: 79 return "l3_store"; 80 case kPageFaults: 81 return "page_fault"; 82 case kMigrations: 83 return "migration"; 84 default: 85 HWY_UNREACHABLE; 86 } 87 } 88 89 // Returns false if counters are unavailable. Must be called at least once 90 // before `StartAll`; it is separate to reduce the overhead of repeatedly 91 // stopping/starting counters. 92 HWY_DLLEXPORT static bool Init(); 93 94 // Returns false if counters are unavailable, otherwise starts them. Note that 95 // they default to stopped. Unless this is called, the values read may be 0. 96 HWY_DLLEXPORT static bool StartAll(); 97 98 // Stops and zeros all counters. This is not necessary if users subtract the 99 // previous counter values, but can increase precision because floating-point 100 // has more precision near zero. 101 HWY_DLLEXPORT static void StopAllAndReset(); 102 103 // Reads the current (extrapolated, in case of multiplexing) counter values. 104 HWY_DLLEXPORT PerfCounters(); 105 106 // Returns whether any counters were successfully read. 107 bool AnyValid() const { return valid_.Any(); } 108 109 // Returns whether the given counter was successfully read. 110 bool IsValid(Counter c) const { 111 const size_t bit_idx = static_cast<size_t>(c); 112 return valid_.Get(bit_idx); 113 } 114 115 // Returns the maximum extrapolation factor for any counter, which is the 116 // total time between `StartAll` and now or the last `StopAllAndReset`, 117 // divided by the time that the counter was actually running. This 118 // approximates the number of counter groups that the CPU multiplexes onto the 119 // actual counter hardware. It is only meaningful if AnyValid(). 120 double MaxExtrapolate() const { return max_extrapolate_; } 121 122 // Returns the value of the given counter, or zero if it is not valid. 123 double Get(Counter c) const { 124 return IsValid(c) ? values_[IndexForCounter(c)] : 0.0; 125 } 126 127 // For each valid counter in increasing numerical order, calls `visitor` with 128 // the value and `Counter`. 129 template <class Visitor> 130 void Foreach(const Visitor& visitor) { 131 valid_.Foreach([&](size_t bit_idx) { 132 const Counter c = static_cast<Counter>(bit_idx); 133 visitor(values_[IndexForCounter(c)], c); 134 }); 135 } 136 137 private: 138 // Index within `values_` for a given counter. 139 HWY_DLLEXPORT static size_t IndexForCounter(Counter c); 140 141 BitSet64 valid_; 142 double max_extrapolate_; 143 // Floating-point because these are extrapolated (multiplexing). It would be 144 // nice for this to fit in one cache line to reduce the cost of reading 145 // counters in profiler.h, but some of the values are too large for float and 146 // we want more than 8 counters. Ensure all values are sums, not ratios, so 147 // that profiler.h can add/subtract them. These are contiguous in memory, in 148 // the order that counters were initialized. 149 double values_[kCapacity]; 150 }; 151 #pragma pack(pop) 152 153 } // namespace platform 154 } // namespace hwy 155 156 #endif // HIGHWAY_HWY_PERF_COUNTERS_H_