tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

timer.h (9878B)


      1 // Copyright 2023 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef HIGHWAY_HWY_TIMER_H_
     17 #define HIGHWAY_HWY_TIMER_H_
     18 
     19 // Platform-specific timer functions. Provides Now() and functions for
     20 // interpreting and converting Ticks.
     21 
     22 #include <stdint.h>
     23 #include <time.h>  // clock_gettime
     24 
     25 #include "hwy/base.h"
     26 
     27 #if defined(_WIN32) || defined(_WIN64)
     28 #ifndef NOMINMAX
     29 #define NOMINMAX
     30 #endif  // NOMINMAX
     31 #ifndef WIN32_LEAN_AND_MEAN
     32 #define WIN32_LEAN_AND_MEAN
     33 #endif  // WIN32_LEAN_AND_MEAN
     34 #include <windows.h>
     35 #endif
     36 
     37 #if defined(__APPLE__)
     38 #include <mach/mach.h>
     39 #include <mach/mach_time.h>
     40 #endif
     41 
     42 #if defined(__HAIKU__)
     43 #include <OS.h>
     44 #endif
     45 
     46 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
     47 #include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
     48 #endif
     49 
     50 #if HWY_ARCH_X86 && HWY_COMPILER_MSVC
     51 #include <intrin.h>
     52 #endif
     53 
     54 namespace hwy {
     55 namespace platform {
        // Timer and CPU queries. Only declarations appear in this header: every
        // function below is marked HWY_DLLEXPORT and has no body here.
     56 
     57 // Returns current timestamp [in seconds] relative to an unspecified origin.
     58 // Features: monotonic (no negative elapsed time), steady (unaffected by system
     59 // time changes), high-resolution (on the order of microseconds).
     60 // Uses InvariantTicksPerSecond and the baseline version of timer::Start().
        // Because the origin is unspecified, only the difference between two results
        // is meaningful; hwy::Timestamp and hwy::SecondsSince() wrap this call.
     61 HWY_DLLEXPORT double Now();
     62 
     63 // Functions related to `Ticks` below.
     64 
     65 // Returns whether it is safe to call timer::Stop without executing an illegal
     66 // instruction; if false, fills cpu100 (a pointer to a 100 character buffer)
     67 // via GetCpuString().
        // hwy::Timer calls this once in its constructor and, when the result is
        // false, uses timer::Start() instead of timer::Stop() in After().
     68 HWY_DLLEXPORT bool HaveTimerStop(char* cpu100);
     69 
     70 // Returns tick rate, useful for converting timer::Ticks to seconds. Invariant
     71 // means the tick counter frequency is independent of CPU throttling or sleep.
     72 // This call may be expensive, callers should cache the result.
        // Note: hwy::Seconds() in this header calls it on every invocation.
     73 HWY_DLLEXPORT double InvariantTicksPerSecond();
     74 
     75 // Returns ticks elapsed in back-to-back timer calls, i.e. a function of the
     76 // timer resolution (minimum measurable difference) and overhead.
     77 // This call is expensive, callers should cache the result.
     78 HWY_DLLEXPORT uint64_t TimerResolution();
     79 
     80 // Returns false if no detailed description is available, otherwise fills
     81 // `cpu100` with up to 100 characters (including \0) identifying the CPU model.
        // The caller therefore has to supply a buffer of at least 100 chars.
     82 HWY_DLLEXPORT bool GetCpuString(char* cpu100);
     83 
     84 }  // namespace platform
     85 
     86 struct Timestamp {
     87  Timestamp() { t = platform::Now(); }
     88  double t;
     89 };
     90 
     91 static inline double SecondsSince(const Timestamp& t0) {
     92  const Timestamp t1;
     93  return t1.t - t0.t;
     94 }
     95 
     96 // Low-level Start/Stop functions, previously in timer-inl.h.
     97 
     98 namespace timer {
     99 
    100 // Ticks := platform-specific timer values (CPU cycles on x86). Must be
    101 // unsigned to guarantee wraparound on overflow.
        // Unsigned subtraction is modular, so the difference of two timestamps (as
        // computed in Stopwatch::Elapsed()) stays well defined even if the counter
        // wrapped between them.
    102 using Ticks = uint64_t;
    103 
    104 // Start/Stop return absolute timestamps and must be placed immediately before
    105 // and after the region to measure. We provide separate Start/Stop functions
    106 // because they use different fences.
    107 //
    108 // Background: RDTSC is not 'serializing'; earlier instructions may complete
    109 // after it, and/or later instructions may complete before it. 'Fences' ensure
    110 // regions' elapsed times are independent of such reordering. The only
    111 // documented unprivileged serializing instruction is CPUID, which acts as a
    112 // full fence (no reordering across it in either direction). Unfortunately
    113 // the latency of CPUID varies wildly (perhaps made worse by not initializing
    114 // its EAX input). Because it cannot reliably be deducted from the region's
    115 // elapsed time, it must not be included in the region to measure (i.e.
    116 // between the two RDTSC).
    117 //
    118 // The newer RDTSCP is sometimes described as serializing, but it actually
    119 // only serves as a half-fence with release semantics. Although all
    120 // instructions in the region will complete before the final timestamp is
    121 // captured, subsequent instructions may leak into the region and increase the
    122 // elapsed time. Inserting another fence after the final `RDTSCP` would prevent
    123 // such reordering without affecting the measured region.
    124 //
    125 // Fortunately, such a fence exists. The LFENCE instruction is only documented
    126 // to delay later loads until earlier loads are visible. However, Intel's
    127 // reference manual says it acts as a full fence (waiting until all earlier
    128 // instructions have completed, and delaying later instructions until it
    129 // completes). AMD assigns the same behavior to MFENCE.
    130 //
    131 // We need a fence before the initial RDTSC to prevent earlier instructions
    132 // from leaking into the region, and arguably another after RDTSC to prevent
    133 // region instructions from completing before the timestamp is recorded.
    134 // When surrounded by fences, the additional `RDTSCP` half-fence provides no
    135 // benefit, so the initial timestamp can be recorded via RDTSC, which has
    136 // lower overhead than `RDTSCP` because it does not read TSC_AUX. In summary,
    137 // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
    138 //
    139 // Using Start+Start leads to higher variance and overhead than Stop+Stop.
    140 // However, Stop+Stop includes an LFENCE in the region measurements, which
    141 // adds a delay dependent on earlier loads. The combination of Start+Stop
    142 // is faster than Start+Start and more consistent than Stop+Stop because
    143 // the first LFENCE already delayed subsequent loads before the measured
    144 // region. This combination seems not to have been considered in prior work:
    145 // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
    146 //
    147 // Note: performance counters can measure 'exact' instructions-retired or
    148 // (unhalted) cycle counts. The RDPMC instruction is not serializing and also
    149 // requires fences. Unfortunately, it is not accessible on all OSes and we
    150 // prefer to avoid kernel-mode drivers. Performance counters are also affected
    151 // by several under/over-count errata, so we use the TSC instead.
    152 
    153 // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
    154 // divide by InvariantTicksPerSecond.
    155 static HWY_INLINE Ticks Start() {
    156  Ticks t;
    157 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
    158  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
    159 #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
    160  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
    161  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
    162 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
    163  _ReadWriteBarrier();
    164  _mm_lfence();
    165  _ReadWriteBarrier();
    166  t = __rdtsc();
    167  _ReadWriteBarrier();
    168  _mm_lfence();
    169  _ReadWriteBarrier();
    170 #elif HWY_ARCH_X86_64
    171  asm volatile(
    172      "lfence\n\t"
    173      "rdtsc\n\t"
    174      "shl $32, %%rdx\n\t"
    175      "or %%rdx, %0\n\t"
    176      "lfence"
    177      : "=a"(t)
    178      :
    179      // "memory" avoids reordering. rdx = TSC >> 32.
    180      // "cc" = flags modified by SHL.
    181      : "rdx", "memory", "cc");
    182 #elif HWY_ARCH_RISCV
    183  asm volatile("fence; rdtime %0" : "=r"(t));
    184 #elif defined(_WIN32) || defined(_WIN64)
    185  LARGE_INTEGER counter;
    186  (void)QueryPerformanceCounter(&counter);
    187  t = counter.QuadPart;
    188 #elif defined(__APPLE__)
    189  t = mach_absolute_time();
    190 #elif defined(__HAIKU__)
    191  t = system_time_nsecs();  // since boot
    192 #else  // POSIX
    193  timespec ts;
    194  clock_gettime(CLOCK_MONOTONIC, &ts);
    195  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
    196 #endif
    197  return t;
    198 }
    199 
    200 // WARNING: on x86, caller must check `HaveTimerStop()` before using this!
    201 static HWY_INLINE Ticks Stop() {
    202  uint64_t t;
    203 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
    204  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
    205 #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
    206  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
    207  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
    208 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
    209  _ReadWriteBarrier();
    210  unsigned aux;
    211  t = __rdtscp(&aux);
    212  _ReadWriteBarrier();
    213  _mm_lfence();
    214  _ReadWriteBarrier();
    215 #elif HWY_ARCH_X86_64
    216  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
    217  asm volatile(
    218      "rdtscp\n\t"
    219      "shl $32, %%rdx\n\t"
    220      "or %%rdx, %0\n\t"
    221      "lfence"
    222      : "=a"(t)
    223      :
    224      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
    225      // "cc" = flags modified by SHL.
    226      : "rcx", "rdx", "memory", "cc");
    227 #else
    228  t = Start();
    229 #endif
    230  return t;
    231 }
    232 
    233 }  // namespace timer
    234 
    235 // Wrapper around Start/Stop that checks whether the CPU supports Stop.
    236 class Timer {
    237 public:
    238  Timer() {
    239    char cpu100[100];
    240    have_timer_stop_ = platform::HaveTimerStop(cpu100);
    241  }
    242 
    243  // Before/After have fences to prevent the measured code 'leaking out'.
    244  timer::Ticks Before() const { return timer::Start(); }
    245  timer::Ticks After() const {
    246    return have_timer_stop_ ? timer::Stop() : timer::Start();
    247  }
    248 
    249 private:
    250  bool have_timer_stop_;
    251 };
    252 
    253 static inline double Seconds(timer::Ticks ticks) {
    254  return static_cast<double>(ticks) / platform::InvariantTicksPerSecond();
    255 }
    256 
    257 // Measures elapsed time since construction, with automatic reset.
    258 class Stopwatch {
    259 public:
    260  explicit Stopwatch(const Timer& timestamps) : timer_(timestamps) { Reset(); }
    261 
    262  timer::Ticks Origin() const { return t0_; }
    263  void Reset() { t0_ = timer_.Before(); }
    264 
    265  // Also resets the start time to the current time to enable reuse without a
    266  // second call to the timer.
    267  timer::Ticks Elapsed() {
    268    const timer::Ticks t1 = timer_.After();
    269    const timer::Ticks elapsed = t1 - t0_;
    270    t0_ = t1;
    271    return elapsed;
    272  }
    273 
    274 private:
    275  const Timer& timer_;
    276  timer::Ticks t0_;
    277 };
    278 
    279 }  // namespace hwy
    280 
    281 #endif  // HIGHWAY_HWY_TIMER_H_