timer.cc (6180B)
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include "hwy/timer.h" 17 18 #include <stdlib.h> 19 20 #include <chrono> // NOLINT 21 #include <ratio> // NOLINT 22 23 #include "hwy/base.h" 24 #include "hwy/robust_statistics.h" 25 #include "hwy/x86_cpuid.h" 26 27 namespace hwy { 28 29 #if HWY_ARCH_X86 30 namespace x86 { 31 32 static bool HasRDTSCP() { 33 uint32_t abcd[4]; 34 Cpuid(0x80000001U, 0, abcd); // Extended feature flags 35 if ((abcd[3] & (1u << 27)) == 0) return false; // RDTSCP 36 37 Cpuid(0x80000007U, 0, abcd); 38 if ((abcd[3] & (1u << 8)) == 0) { 39 HWY_WARN("TSC not constant/invariant, may vary frequency or jump."); 40 } 41 return true; 42 } 43 44 } // namespace x86 45 #endif // HWY_ARCH_X86 46 47 // Measures the actual current frequency of Ticks. We cannot rely on the nominal 48 // frequency encoded in x86 GetCpuString because it is misleading on M1 Rosetta, 49 // and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also 50 // used on RISC-V and aarch64. 51 static HWY_MAYBE_UNUSED double MeasureNominalClockRate() { 52 double max_ticks_per_sec = 0.0; 53 // Arbitrary, enough to ignore 2 outliers without excessive init time. 54 for (int rep = 0; rep < 3; ++rep) { 55 auto time0 = std::chrono::steady_clock::now(); 56 using Time = decltype(time0); 57 const timer::Ticks ticks0 = timer::Start(); 58 const Time time_min = time0 + std::chrono::milliseconds(10); 59 60 Time time1; 61 timer::Ticks ticks1; 62 for (;;) { 63 time1 = std::chrono::steady_clock::now(); 64 // Ideally this would be Stop, but that requires RDTSCP on x86. To avoid 65 // another codepath, just use Start instead. now() presumably has its own 66 // fence-like behavior. 67 ticks1 = timer::Start(); // Do not use Stop, see comment above 68 if (time1 >= time_min) break; 69 } 70 71 const double dticks = static_cast<double>(ticks1 - ticks0); 72 std::chrono::duration<double, std::ratio<1>> dtime = time1 - time0; 73 const double ticks_per_sec = dticks / dtime.count(); 74 max_ticks_per_sec = HWY_MAX(max_ticks_per_sec, ticks_per_sec); 75 } 76 return max_ticks_per_sec; 77 } 78 79 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) 80 namespace ppc { 81 82 static HWY_INLINE double GetTimebaseFreq() { 83 const auto timebase_freq = __ppc_get_timebase_freq(); 84 // If timebase_freq is greater than 0, then return timebase_freq. 85 86 // Otherwise, if timebase_freq is less than or equal to 0, fall back to 87 // MeasureNominalClockRate(). This works around issues if running on QEMU on 88 // non-PPC CPU's. 89 return (timebase_freq > 0) ? static_cast<double>(timebase_freq) 90 : MeasureNominalClockRate(); 91 } 92 93 } // namespace ppc 94 #endif 95 96 namespace platform { 97 98 HWY_DLLEXPORT bool GetCpuString(char* cpu100) { 99 #if HWY_ARCH_X86 100 uint32_t abcd[4]; 101 102 // Check if brand string is supported (it is on all reasonable Intel/AMD) 103 x86::Cpuid(0x80000000U, 0, abcd); 104 if (abcd[0] < 0x80000004U) { 105 cpu100[0] = '\0'; 106 return false; 107 } 108 109 for (size_t i = 0; i < 3; ++i) { 110 x86::Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd); 111 CopyBytes<sizeof(abcd)>(&abcd[0], cpu100 + i * 16); // not same size 112 } 113 cpu100[48] = '\0'; 114 return true; 115 #else 116 cpu100[0] = '?'; 117 cpu100[1] = '\0'; 118 return false; 119 #endif 120 } 121 122 HWY_DLLEXPORT double Now() { 123 static const double mul = 1.0 / InvariantTicksPerSecond(); 124 return static_cast<double>(timer::Start()) * mul; 125 } 126 127 HWY_DLLEXPORT bool HaveTimerStop(char* cpu100) { 128 #if HWY_ARCH_X86 129 if (!x86::HasRDTSCP()) { 130 (void)GetCpuString(cpu100); 131 return false; 132 } 133 #endif 134 *cpu100 = '\0'; 135 return true; 136 } 137 138 HWY_DLLEXPORT double InvariantTicksPerSecond() { 139 #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) 140 static const double freq = ppc::GetTimebaseFreq(); 141 return freq; 142 #elif HWY_ARCH_X86 || HWY_ARCH_RISCV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) 143 // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs. 144 static const double freq = MeasureNominalClockRate(); 145 return freq; 146 #elif defined(_WIN32) || defined(_WIN64) 147 LARGE_INTEGER freq; 148 (void)QueryPerformanceFrequency(&freq); 149 return static_cast<double>(freq.QuadPart); 150 #elif defined(__APPLE__) 151 // https://developer.apple.com/library/mac/qa/qa1398/_index.html 152 mach_timebase_info_data_t timebase; 153 (void)mach_timebase_info(&timebase); 154 return static_cast<double>(timebase.denom) / timebase.numer * 1E9; 155 #else 156 return 1E9; // Haiku and clock_gettime return nanoseconds. 157 #endif 158 } 159 160 HWY_DLLEXPORT uint64_t TimerResolution() { 161 char cpu100[100]; 162 bool can_use_stop = HaveTimerStop(cpu100); 163 164 // For measuring timer overhead/resolution. Used in a nested loop => 165 // quadratic time, acceptable because we know timer overhead is "low". 166 // constexpr because this is used to define array bounds. 167 constexpr size_t kTimerSamples = 256; 168 169 // Nested loop avoids exceeding stack/L1 capacity. 170 timer::Ticks repetitions[kTimerSamples]; 171 for (size_t rep = 0; rep < kTimerSamples; ++rep) { 172 timer::Ticks samples[kTimerSamples]; 173 if (can_use_stop) { 174 for (size_t i = 0; i < kTimerSamples; ++i) { 175 const timer::Ticks t0 = timer::Start(); 176 const timer::Ticks t1 = timer::Stop(); // we checked HasRDTSCP above 177 samples[i] = t1 - t0; 178 } 179 } else { 180 for (size_t i = 0; i < kTimerSamples; ++i) { 181 const timer::Ticks t0 = timer::Start(); 182 const timer::Ticks t1 = timer::Start(); // do not use Stop, see above 183 samples[i] = t1 - t0; 184 } 185 } 186 repetitions[rep] = robust_statistics::Mode(samples); 187 } 188 return robust_statistics::Mode(repetitions); 189 } 190 191 } // namespace platform 192 } // namespace hwy