spin.h (10902B)
1 // Copyright 2025 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_ 17 #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_ 18 19 // Relatively power-efficient spin lock for low-latency synchronization. 20 21 #include <stdint.h> 22 23 #include <atomic> 24 25 #include "hwy/base.h" 26 #include "hwy/cache_control.h" // Pause 27 28 #ifndef HWY_ENABLE_MONITORX // allow override 29 // Clang 3.9 suffices for mwaitx, but the target pragma requires 9.0. 30 #if HWY_ARCH_X86 && ((HWY_COMPILER_CLANG >= 900) || \ 31 (HWY_COMPILER_GCC_ACTUAL >= 502) || defined(__MWAITX__)) 32 #define HWY_ENABLE_MONITORX 1 33 #else 34 #define HWY_ENABLE_MONITORX 0 35 #endif 36 #endif // HWY_ENABLE_MONITORX 37 38 #ifndef HWY_ENABLE_UMONITOR // allow override 39 #if HWY_ARCH_X86 && ((HWY_COMPILER_CLANG >= 900) || \ 40 (HWY_COMPILER_GCC_ACTUAL >= 901) || defined(__WAITPKG__)) 41 #define HWY_ENABLE_UMONITOR 1 42 #else 43 #define HWY_ENABLE_UMONITOR 0 44 #endif 45 #endif // HWY_ENABLE_UMONITOR 46 47 // Inline assembly is preferred because it allows inlining of `UntilDifferent` 48 // etc, but we also support intrinsics for MSVC. 49 #ifndef HWY_ENABLE_SPIN_ASM // allow override 50 #if (HWY_COMPILER_CLANG || HWY_COMPILER_GCC) && HWY_ARCH_X86_64 51 #define HWY_ENABLE_SPIN_ASM 1 52 #else 53 #define HWY_ENABLE_SPIN_ASM 0 54 #endif 55 #endif // HWY_ENABLE_SPIN_ASM 56 57 #if HWY_ENABLE_MONITORX || HWY_ENABLE_UMONITOR 58 #if HWY_ENABLE_SPIN_ASM 59 #define HWY_INLINE_SPIN HWY_INLINE // can inline functions with inline assembly 60 #else 61 // Intrinsics require attributes, which prevent inlining. 62 #define HWY_INLINE_SPIN 63 #include <x86intrin.h> 64 #endif // HWY_ENABLE_SPIN_ASM 65 66 #include "hwy/x86_cpuid.h" 67 #endif // HWY_ENABLE_MONITORX || HWY_ENABLE_UMONITOR 68 69 namespace hwy { 70 71 // Returned by `UntilDifferent` in a single register. 72 struct SpinResult { 73 // We also use u32 because that is all that futex.h supports. 74 uint32_t current; 75 // Number of retries before returning, useful for checking that the 76 // monitor/wait did not just return immediately. 77 uint32_t reps; 78 }; 79 80 // User-space monitor/wait are supported on Zen2+ AMD and SPR+ Intel. Spin waits 81 // are rarely called from SIMD code, hence we do not integrate this into 82 // `HWY_TARGET` and its runtime dispatch mechanism. Returned by `Type()`, also 83 // used by callers to set the `disabled` argument for `DetectSpin`. 84 enum class SpinType : uint8_t { 85 #if HWY_ENABLE_MONITORX 86 kMonitorX = 1, // AMD 87 #endif 88 #if HWY_ENABLE_UMONITOR 89 kUMonitor = 2, // Intel 90 #endif 91 kPause = 3, 92 kSentinel // for iterating over all enumerators. Must be last. 93 }; 94 95 // For printing which is in use. 96 static inline const char* ToString(SpinType type) { 97 switch (type) { 98 #if HWY_ENABLE_MONITORX 99 case SpinType::kMonitorX: 100 return "MonitorX_C1"; 101 #endif 102 #if HWY_ENABLE_UMONITOR 103 case SpinType::kUMonitor: 104 return "UMonitor_C0.2"; 105 #endif 106 case SpinType::kPause: 107 return "Pause"; 108 case SpinType::kSentinel: 109 default: 110 return nullptr; 111 } 112 } 113 114 // Indirect function calls turn out to be too expensive because this is called 115 // multiple times per ThreadPool barrier. We will instead inline the spin and 116 // barrier using policy classes. This one is always available; use it as a 117 // reference for the interface. Note that Pause varies across CPUs: it can be 118 // a no-op, or wait 140 cycles. 119 struct SpinPause { 120 SpinType Type() const { return SpinType::kPause; } 121 122 // Spins until `watched != prev` and returns the new value, similar to 123 // `BlockUntilDifferent` in `futex.h`. 124 HWY_INLINE SpinResult UntilDifferent( 125 const uint32_t prev, const std::atomic<uint32_t>& watched) const { 126 for (uint32_t reps = 0;; ++reps) { 127 const uint32_t current = watched.load(std::memory_order_acquire); 128 if (current != prev) return SpinResult{current, reps}; 129 hwy::Pause(); 130 } 131 } 132 133 // Returns number of retries until `watched == expected`. 134 HWY_INLINE size_t UntilEqual(const uint32_t expected, 135 const std::atomic<uint32_t>& watched) const { 136 for (size_t reps = 0;; ++reps) { 137 const uint32_t current = watched.load(std::memory_order_acquire); 138 if (current == expected) return reps; 139 hwy::Pause(); 140 } 141 } 142 }; 143 144 #if HWY_ENABLE_MONITORX || HWY_IDE 145 #if !HWY_ENABLE_SPIN_ASM 146 HWY_PUSH_ATTRIBUTES("mwaitx") 147 #endif 148 149 // AMD's user-mode monitor/wait (Zen2+). 150 class SpinMonitorX { 151 public: 152 SpinType Type() const { return SpinType::kMonitorX; } 153 154 HWY_INLINE_SPIN SpinResult UntilDifferent( 155 const uint32_t prev, const std::atomic<uint32_t>& watched) const { 156 for (uint32_t reps = 0;; ++reps) { 157 uint32_t current = watched.load(std::memory_order_acquire); 158 if (current != prev) return SpinResult{current, reps}; 159 Monitor(&watched); 160 // Double-checked 'lock' to avoid missed events: 161 current = watched.load(std::memory_order_acquire); 162 if (current != prev) return SpinResult{current, reps}; 163 Wait(); 164 } 165 } 166 167 HWY_INLINE_SPIN size_t UntilEqual( 168 const uint32_t expected, const std::atomic<uint32_t>& watched) const { 169 for (size_t reps = 0;; ++reps) { 170 uint32_t current = watched.load(std::memory_order_acquire); 171 if (current == expected) return reps; 172 Monitor(&watched); 173 // Double-checked 'lock' to avoid missed events: 174 current = watched.load(std::memory_order_acquire); 175 if (current == expected) return reps; 176 Wait(); 177 } 178 } 179 180 private: 181 static HWY_INLINE void Monitor(const void* addr) { 182 // No extensions/hints currently defined. 183 #if HWY_ENABLE_SPIN_ASM 184 asm volatile("monitorx" ::"a"(addr), "c"(0), "d"(0)); 185 #else 186 _mm_monitorx(const_cast<void*>(addr), 0, 0); 187 #endif 188 } 189 190 static HWY_INLINE void Wait() { 191 #if HWY_ENABLE_SPIN_ASM 192 // EBX=0 cycles means no timeout/infinite. 193 asm volatile("mwaitx" ::"a"(kHints), "b"(0), "c"(kExtensions)); 194 #else 195 _mm_mwaitx(kExtensions, kHints, /*cycles=*/0); 196 #endif 197 } 198 199 // 0xF would be C0. Its wakeup latency is less than 0.1 us shorter, and 200 // package power is sometimes actually higher than with Pause. The 201 // difference in spurious wakeups is minor. 202 static constexpr unsigned kHints = 0x0; // C1: a bit deeper than C0 203 // No timeout required, we assume the mwaitx does not miss stores, see 204 // https://www.usenix.org/system/files/usenixsecurity23-zhang-ruiyi.pdf.] 205 static constexpr unsigned kExtensions = 0; 206 }; 207 208 #if !HWY_ENABLE_SPIN_ASM 209 HWY_POP_ATTRIBUTES 210 #endif 211 #endif // HWY_ENABLE_MONITORX 212 213 #if HWY_ENABLE_UMONITOR || HWY_IDE 214 #if !HWY_ENABLE_SPIN_ASM 215 HWY_PUSH_ATTRIBUTES("waitpkg") 216 #endif 217 218 // Intel's user-mode monitor/wait (SPR+). 219 class SpinUMonitor { 220 public: 221 SpinType Type() const { return SpinType::kUMonitor; } 222 223 HWY_INLINE_SPIN SpinResult UntilDifferent( 224 const uint32_t prev, const std::atomic<uint32_t>& watched) const { 225 for (uint32_t reps = 0;; ++reps) { 226 uint32_t current = watched.load(std::memory_order_acquire); 227 if (current != prev) return SpinResult{current, reps}; 228 Monitor(&watched); 229 // Double-checked 'lock' to avoid missed events: 230 current = watched.load(std::memory_order_acquire); 231 if (current != prev) return SpinResult{current, reps}; 232 Wait(); 233 } 234 } 235 236 HWY_INLINE_SPIN size_t UntilEqual( 237 const uint32_t expected, const std::atomic<uint32_t>& watched) const { 238 for (size_t reps = 0;; ++reps) { 239 uint32_t current = watched.load(std::memory_order_acquire); 240 if (current == expected) return reps; 241 Monitor(&watched); 242 // Double-checked 'lock' to avoid missed events: 243 current = watched.load(std::memory_order_acquire); 244 if (current == expected) return reps; 245 Wait(); 246 } 247 } 248 249 private: 250 static HWY_INLINE void Monitor(const void* addr) { 251 #if HWY_ENABLE_SPIN_ASM 252 asm volatile("umonitor %%rcx" ::"c"(addr)); 253 #else 254 _umonitor(const_cast<void*>(addr)); 255 #endif 256 } 257 258 static HWY_INLINE void Wait() { 259 #if HWY_ENABLE_SPIN_ASM 260 asm volatile("umwait %%ecx" ::"c"(kControl), "d"(kDeadline >> 32), 261 "a"(kDeadline & 0xFFFFFFFFu)); 262 #else 263 _umwait(kControl, kDeadline); 264 #endif 265 } 266 267 // 1 would be C0.1. C0.2 has 20x fewer spurious wakeups and additional 4% 268 // package power savings vs Pause on SPR. It comes at the cost of 269 // 0.4-0.6us higher wake latency, but the total is comparable to Zen4. 270 static constexpr unsigned kControl = 0; // C0.2 for deeper sleep 271 static constexpr uint64_t kDeadline = ~uint64_t{0}; // no timeout, see above 272 }; 273 274 #if !HWY_ENABLE_SPIN_ASM 275 HWY_POP_ATTRIBUTES 276 #endif 277 #endif // HWY_ENABLE_UMONITOR 278 279 // TODO(janwas): add WFE on Arm. May wake at 10 kHz, but still worthwhile. 280 281 // Returns the best-available type whose bit in `disabled` is not set. Example: 282 // to disable kUMonitor, pass `1 << static_cast<int>(SpinType::kUMonitor)`. 283 // Ignores `disabled` for `kPause` if it is the only supported and enabled type. 284 // Somewhat expensive, typically called during initialization. 285 static inline SpinType DetectSpin(int disabled = 0) { 286 const auto HWY_MAYBE_UNUSED enabled = [disabled](SpinType type) { 287 return (disabled & (1 << static_cast<int>(type))) == 0; 288 }; 289 290 #if HWY_ENABLE_MONITORX 291 if (enabled(SpinType::kMonitorX) && x86::IsAMD()) { 292 uint32_t abcd[4]; 293 x86::Cpuid(0x80000001U, 0, abcd); 294 if (x86::IsBitSet(abcd[2], 29)) return SpinType::kMonitorX; 295 } 296 #endif // HWY_ENABLE_MONITORX 297 298 #if HWY_ENABLE_UMONITOR 299 if (enabled(SpinType::kUMonitor) && x86::MaxLevel() >= 7) { 300 uint32_t abcd[4]; 301 x86::Cpuid(7, 0, abcd); 302 if (x86::IsBitSet(abcd[2], 5)) return SpinType::kUMonitor; 303 } 304 #endif // HWY_ENABLE_UMONITOR 305 306 if (!enabled(SpinType::kPause)) { 307 HWY_WARN("Ignoring attempt to disable Pause, it is the only option left."); 308 } 309 return SpinType::kPause; 310 } 311 312 // Calls `func(spin, args)` for the given `spin_type`. 313 template <class Func, typename... Args> 314 HWY_INLINE void CallWithSpin(SpinType spin_type, Func&& func, Args&&... args) { 315 switch (spin_type) { 316 #if HWY_ENABLE_MONITORX 317 case SpinType::kMonitorX: 318 func(SpinMonitorX(), std::forward<Args>(args)...); 319 break; 320 #endif 321 #if HWY_ENABLE_UMONITOR 322 case SpinType::kUMonitor: 323 func(SpinUMonitor(), std::forward<Args>(args)...); 324 break; 325 #endif 326 case SpinType::kPause: 327 default: 328 func(SpinPause(), std::forward<Args>(args)...); 329 break; 330 } 331 } 332 333 } // namespace hwy 334 335 #endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_