timer.h (9878B)
1 // Copyright 2023 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef HIGHWAY_HWY_TIMER_H_ 17 #define HIGHWAY_HWY_TIMER_H_ 18 19 // Platform-specific timer functions. Provides Now() and functions for 20 // interpreting and converting Ticks. 21 22 #include <stdint.h> 23 #include <time.h> // clock_gettime 24 25 #include "hwy/base.h" 26 27 #if defined(_WIN32) || defined(_WIN64) 28 #ifndef NOMINMAX 29 #define NOMINMAX 30 #endif // NOMINMAX 31 #ifndef WIN32_LEAN_AND_MEAN 32 #define WIN32_LEAN_AND_MEAN 33 #endif // WIN32_LEAN_AND_MEAN 34 #include <windows.h> 35 #endif 36 37 #if defined(__APPLE__) 38 #include <mach/mach.h> 39 #include <mach/mach_time.h> 40 #endif 41 42 #if defined(__HAIKU__) 43 #include <OS.h> 44 #endif 45 46 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) 47 #include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq 48 #endif 49 50 #if HWY_ARCH_X86 && HWY_COMPILER_MSVC 51 #include <intrin.h> 52 #endif 53 54 namespace hwy { 55 namespace platform { 56 57 // Returns current timestamp [in seconds] relative to an unspecified origin. 58 // Features: monotonic (no negative elapsed time), steady (unaffected by system 59 // time changes), high-resolution (on the order of microseconds). 60 // Uses InvariantTicksPerSecond and the baseline version of timer::Start(). 61 HWY_DLLEXPORT double Now(); 62 63 // Functions related to `Ticks` below. 64 65 // Returns whether it is safe to call timer::Stop without executing an illegal 66 // instruction; if false, fills cpu100 (a pointer to a 100 character buffer) 67 // via GetCpuString(). 68 HWY_DLLEXPORT bool HaveTimerStop(char* cpu100); 69 70 // Returns tick rate, useful for converting timer::Ticks to seconds. Invariant 71 // means the tick counter frequency is independent of CPU throttling or sleep. 72 // This call may be expensive, callers should cache the result. 73 HWY_DLLEXPORT double InvariantTicksPerSecond(); 74 75 // Returns ticks elapsed in back to back timer calls, i.e. a function of the 76 // timer resolution (minimum measurable difference) and overhead. 77 // This call is expensive, callers should cache the result. 78 HWY_DLLEXPORT uint64_t TimerResolution(); 79 80 // Returns false if no detailed description is available, otherwise fills 81 // `cpu100` with up to 100 characters (including \0) identifying the CPU model. 82 HWY_DLLEXPORT bool GetCpuString(char* cpu100); 83 84 } // namespace platform 85 86 struct Timestamp { 87 Timestamp() { t = platform::Now(); } 88 double t; 89 }; 90 91 static inline double SecondsSince(const Timestamp& t0) { 92 const Timestamp t1; 93 return t1.t - t0.t; 94 } 95 96 // Low-level Start/Stop functions, previously in timer-inl.h. 97 98 namespace timer { 99 100 // Ticks := platform-specific timer values (CPU cycles on x86). Must be 101 // unsigned to guarantee wraparound on overflow. 102 using Ticks = uint64_t; 103 104 // Start/Stop return absolute timestamps and must be placed immediately before 105 // and after the region to measure. We provide separate Start/Stop functions 106 // because they use different fences. 107 // 108 // Background: RDTSC is not 'serializing'; earlier instructions may complete 109 // after it, and/or later instructions may complete before it. 'Fences' ensure 110 // regions' elapsed times are independent of such reordering. The only 111 // documented unprivileged serializing instruction is CPUID, which acts as a 112 // full fence (no reordering across it in either direction). Unfortunately 113 // the latency of CPUID varies wildly (perhaps made worse by not initializing 114 // its EAX input). Because it cannot reliably be deducted from the region's 115 // elapsed time, it must not be included in the region to measure (i.e. 116 // between the two RDTSC). 117 // 118 // The newer RDTSCP is sometimes described as serializing, but it actually 119 // only serves as a half-fence with release semantics. Although all 120 // instructions in the region will complete before the final timestamp is 121 // captured, subsequent instructions may leak into the region and increase the 122 // elapsed time. Inserting another fence after the final `RDTSCP` would prevent 123 // such reordering without affecting the measured region. 124 // 125 // Fortunately, such a fence exists. The LFENCE instruction is only documented 126 // to delay later loads until earlier loads are visible. However, Intel's 127 // reference manual says it acts as a full fence (waiting until all earlier 128 // instructions have completed, and delaying later instructions until it 129 // completes). AMD assigns the same behavior to MFENCE. 130 // 131 // We need a fence before the initial RDTSC to prevent earlier instructions 132 // from leaking into the region, and arguably another after RDTSC to avoid 133 // region instructions from completing before the timestamp is recorded. 134 // When surrounded by fences, the additional `RDTSCP` half-fence provides no 135 // benefit, so the initial timestamp can be recorded via RDTSC, which has 136 // lower overhead than `RDTSCP` because it does not read TSC_AUX. In summary, 137 // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. 138 // 139 // Using Start+Start leads to higher variance and overhead than Stop+Stop. 140 // However, Stop+Stop includes an LFENCE in the region measurements, which 141 // adds a delay dependent on earlier loads. The combination of Start+Stop 142 // is faster than Start+Start and more consistent than Stop+Stop because 143 // the first LFENCE already delayed subsequent loads before the measured 144 // region. This combination seems not to have been considered in prior work: 145 // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c 146 // 147 // Note: performance counters can measure 'exact' instructions-retired or 148 // (unhalted) cycle counts. The RDPMC instruction is not serializing and also 149 // requires fences. Unfortunately, it is not accessible on all OSes and we 150 // prefer to avoid kernel-mode drivers. Performance counters are also affected 151 // by several under/over-count errata, so we use the TSC instead. 152 153 // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, 154 // divide by InvariantTicksPerSecond. 155 static HWY_INLINE Ticks Start() { 156 Ticks t; 157 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) 158 asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); 159 #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC 160 // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. 161 asm volatile("mrs %0, cntvct_el0" : "=r"(t)); 162 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC 163 _ReadWriteBarrier(); 164 _mm_lfence(); 165 _ReadWriteBarrier(); 166 t = __rdtsc(); 167 _ReadWriteBarrier(); 168 _mm_lfence(); 169 _ReadWriteBarrier(); 170 #elif HWY_ARCH_X86_64 171 asm volatile( 172 "lfence\n\t" 173 "rdtsc\n\t" 174 "shl $32, %%rdx\n\t" 175 "or %%rdx, %0\n\t" 176 "lfence" 177 : "=a"(t) 178 : 179 // "memory" avoids reordering. rdx = TSC >> 32. 180 // "cc" = flags modified by SHL. 181 : "rdx", "memory", "cc"); 182 #elif HWY_ARCH_RISCV 183 asm volatile("fence; rdtime %0" : "=r"(t)); 184 #elif defined(_WIN32) || defined(_WIN64) 185 LARGE_INTEGER counter; 186 (void)QueryPerformanceCounter(&counter); 187 t = counter.QuadPart; 188 #elif defined(__APPLE__) 189 t = mach_absolute_time(); 190 #elif defined(__HAIKU__) 191 t = system_time_nsecs(); // since boot 192 #else // POSIX 193 timespec ts; 194 clock_gettime(CLOCK_MONOTONIC, &ts); 195 t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec); 196 #endif 197 return t; 198 } 199 200 // WARNING: on x86, caller must check `HaveTimerStop()` before using this! 201 static HWY_INLINE Ticks Stop() { 202 uint64_t t; 203 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) 204 asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); 205 #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC 206 // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. 207 asm volatile("mrs %0, cntvct_el0" : "=r"(t)); 208 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC 209 _ReadWriteBarrier(); 210 unsigned aux; 211 t = __rdtscp(&aux); 212 _ReadWriteBarrier(); 213 _mm_lfence(); 214 _ReadWriteBarrier(); 215 #elif HWY_ARCH_X86_64 216 // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). 217 asm volatile( 218 "rdtscp\n\t" 219 "shl $32, %%rdx\n\t" 220 "or %%rdx, %0\n\t" 221 "lfence" 222 : "=a"(t) 223 : 224 // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. 225 // "cc" = flags modified by SHL. 226 : "rcx", "rdx", "memory", "cc"); 227 #else 228 t = Start(); 229 #endif 230 return t; 231 } 232 233 } // namespace timer 234 235 // Wrapper around Start/Stop that checks whether the CPU supports Stop. 236 class Timer { 237 public: 238 Timer() { 239 char cpu100[100]; 240 have_timer_stop_ = platform::HaveTimerStop(cpu100); 241 } 242 243 // Before/After have fences to prevent the measured code 'leaking out'. 244 timer::Ticks Before() const { return timer::Start(); } 245 timer::Ticks After() const { 246 return have_timer_stop_ ? timer::Stop() : timer::Start(); 247 } 248 249 private: 250 bool have_timer_stop_; 251 }; 252 253 static inline double Seconds(timer::Ticks ticks) { 254 return static_cast<double>(ticks) / platform::InvariantTicksPerSecond(); 255 } 256 257 // Measures elapsed time since construction, with automatic reset. 258 class Stopwatch { 259 public: 260 explicit Stopwatch(const Timer& timestamps) : timer_(timestamps) { Reset(); } 261 262 timer::Ticks Origin() const { return t0_; } 263 void Reset() { t0_ = timer_.Before(); } 264 265 // Also resets the start time to the current time to enable reuse without a 266 // second call to the timer. 267 timer::Ticks Elapsed() { 268 const timer::Ticks t1 = timer_.After(); 269 const timer::Ticks elapsed = t1 - t0_; 270 t0_ = t1; 271 return elapsed; 272 } 273 274 private: 275 const Timer& timer_; 276 timer::Ticks t0_; 277 }; 278 279 } // namespace hwy 280 281 #endif // HIGHWAY_HWY_TIMER_H_