tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

timer.cc (6180B)


      1 // Copyright 2019 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include "hwy/timer.h"
     17 
     18 #include <stdlib.h>
     19 
     20 #include <chrono>  // NOLINT
     21 #include <ratio>   // NOLINT
     22 
     23 #include "hwy/base.h"
     24 #include "hwy/robust_statistics.h"
     25 #include "hwy/x86_cpuid.h"
     26 
     27 namespace hwy {
     28 
     29 #if HWY_ARCH_X86
     30 namespace x86 {
     31 
     32 static bool HasRDTSCP() {
     33  uint32_t abcd[4];
     34  Cpuid(0x80000001U, 0, abcd);                    // Extended feature flags
     35  if ((abcd[3] & (1u << 27)) == 0) return false;  // RDTSCP
     36 
     37  Cpuid(0x80000007U, 0, abcd);
     38  if ((abcd[3] & (1u << 8)) == 0) {
     39    HWY_WARN("TSC not constant/invariant, may vary frequency or jump.");
     40  }
     41  return true;
     42 }
     43 
     44 }  // namespace x86
     45 #endif  // HWY_ARCH_X86
     46 
     47 // Measures the actual current frequency of Ticks. We cannot rely on the nominal
     48 // frequency encoded in x86 GetCpuString because it is misleading on M1 Rosetta,
     49 // and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also
     50 // used on RISC-V and aarch64.
     51 static HWY_MAYBE_UNUSED double MeasureNominalClockRate() {
     52  double max_ticks_per_sec = 0.0;
     53  // Arbitrary, enough to ignore 2 outliers without excessive init time.
     54  for (int rep = 0; rep < 3; ++rep) {
     55    auto time0 = std::chrono::steady_clock::now();
     56    using Time = decltype(time0);
     57    const timer::Ticks ticks0 = timer::Start();
     58    const Time time_min = time0 + std::chrono::milliseconds(10);
     59 
     60    Time time1;
     61    timer::Ticks ticks1;
     62    for (;;) {
     63      time1 = std::chrono::steady_clock::now();
     64      // Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
     65      // another codepath, just use Start instead. now() presumably has its own
     66      // fence-like behavior.
     67      ticks1 = timer::Start();  // Do not use Stop, see comment above
     68      if (time1 >= time_min) break;
     69    }
     70 
     71    const double dticks = static_cast<double>(ticks1 - ticks0);
     72    std::chrono::duration<double, std::ratio<1>> dtime = time1 - time0;
     73    const double ticks_per_sec = dticks / dtime.count();
     74    max_ticks_per_sec = HWY_MAX(max_ticks_per_sec, ticks_per_sec);
     75  }
     76  return max_ticks_per_sec;
     77 }
     78 
     79 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
     80 namespace ppc {
     81 
     82 static HWY_INLINE double GetTimebaseFreq() {
     83  const auto timebase_freq = __ppc_get_timebase_freq();
     84  // If timebase_freq is greater than 0, then return timebase_freq.
     85 
     86  // Otherwise, if timebase_freq is less than or equal to 0, fall back to
     87  // MeasureNominalClockRate(). This works around issues if running on QEMU on
     88  // non-PPC CPU's.
     89  return (timebase_freq > 0) ? static_cast<double>(timebase_freq)
     90                             : MeasureNominalClockRate();
     91 }
     92 
     93 }  // namespace ppc
     94 #endif
     95 
     96 namespace platform {
     97 
     98 HWY_DLLEXPORT bool GetCpuString(char* cpu100) {
     99 #if HWY_ARCH_X86
    100  uint32_t abcd[4];
    101 
    102  // Check if brand string is supported (it is on all reasonable Intel/AMD)
    103  x86::Cpuid(0x80000000U, 0, abcd);
    104  if (abcd[0] < 0x80000004U) {
    105    cpu100[0] = '\0';
    106    return false;
    107  }
    108 
    109  for (size_t i = 0; i < 3; ++i) {
    110    x86::Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd);
    111    CopyBytes<sizeof(abcd)>(&abcd[0], cpu100 + i * 16);  // not same size
    112  }
    113  cpu100[48] = '\0';
    114  return true;
    115 #else
    116  cpu100[0] = '?';
    117  cpu100[1] = '\0';
    118  return false;
    119 #endif
    120 }
    121 
    122 HWY_DLLEXPORT double Now() {
    123  static const double mul = 1.0 / InvariantTicksPerSecond();
    124  return static_cast<double>(timer::Start()) * mul;
    125 }
    126 
    127 HWY_DLLEXPORT bool HaveTimerStop(char* cpu100) {
    128 #if HWY_ARCH_X86
    129  if (!x86::HasRDTSCP()) {
    130    (void)GetCpuString(cpu100);
    131    return false;
    132  }
    133 #endif
    134  *cpu100 = '\0';
    135  return true;
    136 }
    137 
    138 HWY_DLLEXPORT double InvariantTicksPerSecond() {
    139 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
    140  static const double freq = ppc::GetTimebaseFreq();
    141  return freq;
    142 #elif HWY_ARCH_X86 || HWY_ARCH_RISCV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
    143  // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
    144  static const double freq = MeasureNominalClockRate();
    145  return freq;
    146 #elif defined(_WIN32) || defined(_WIN64)
    147  LARGE_INTEGER freq;
    148  (void)QueryPerformanceFrequency(&freq);
    149  return static_cast<double>(freq.QuadPart);
    150 #elif defined(__APPLE__)
    151  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
    152  mach_timebase_info_data_t timebase;
    153  (void)mach_timebase_info(&timebase);
    154  return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
    155 #else
    156  return 1E9;  // Haiku and clock_gettime return nanoseconds.
    157 #endif
    158 }
    159 
    160 HWY_DLLEXPORT uint64_t TimerResolution() {
    161  char cpu100[100];
    162  bool can_use_stop = HaveTimerStop(cpu100);
    163 
    164  // For measuring timer overhead/resolution. Used in a nested loop =>
    165  // quadratic time, acceptable because we know timer overhead is "low".
    166  // constexpr because this is used to define array bounds.
    167  constexpr size_t kTimerSamples = 256;
    168 
    169  // Nested loop avoids exceeding stack/L1 capacity.
    170  timer::Ticks repetitions[kTimerSamples];
    171  for (size_t rep = 0; rep < kTimerSamples; ++rep) {
    172    timer::Ticks samples[kTimerSamples];
    173    if (can_use_stop) {
    174      for (size_t i = 0; i < kTimerSamples; ++i) {
    175        const timer::Ticks t0 = timer::Start();
    176        const timer::Ticks t1 = timer::Stop();  // we checked HasRDTSCP above
    177        samples[i] = t1 - t0;
    178      }
    179    } else {
    180      for (size_t i = 0; i < kTimerSamples; ++i) {
    181        const timer::Ticks t0 = timer::Start();
    182        const timer::Ticks t1 = timer::Start();  // do not use Stop, see above
    183        samples[i] = t1 - t0;
    184      }
    185    }
    186    repetitions[rep] = robust_statistics::Mode(samples);
    187  }
    188  return robust_statistics::Mode(repetitions);
    189 }
    190 
    191 }  // namespace platform
    192 }  // namespace hwy