tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

spin.h (10902B)


      1 // Copyright 2025 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_
     17 #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_
     18 
     19 // Relatively power-efficient spin lock for low-latency synchronization.
     20 
     21 #include <stdint.h>
     22 
     23 #include <atomic>
     24 
     25 #include "hwy/base.h"
     26 #include "hwy/cache_control.h"  // Pause
     27 
     28 #ifndef HWY_ENABLE_MONITORX  // allow override
     29 // Clang 3.9 suffices for mwaitx, but the target pragma requires 9.0.
     30 #if HWY_ARCH_X86 && ((HWY_COMPILER_CLANG >= 900) || \
     31                     (HWY_COMPILER_GCC_ACTUAL >= 502) || defined(__MWAITX__))
     32 #define HWY_ENABLE_MONITORX 1
     33 #else
     34 #define HWY_ENABLE_MONITORX 0
     35 #endif
     36 #endif  // HWY_ENABLE_MONITORX
     37 
     38 #ifndef HWY_ENABLE_UMONITOR  // allow override
     39 #if HWY_ARCH_X86 && ((HWY_COMPILER_CLANG >= 900) || \
     40                     (HWY_COMPILER_GCC_ACTUAL >= 901) || defined(__WAITPKG__))
     41 #define HWY_ENABLE_UMONITOR 1
     42 #else
     43 #define HWY_ENABLE_UMONITOR 0
     44 #endif
     45 #endif  // HWY_ENABLE_UMONITOR
     46 
     47 // Inline assembly is preferred because it allows inlining of `UntilDifferent`
     48 // etc, but we also support intrinsics for MSVC.
     49 #ifndef HWY_ENABLE_SPIN_ASM  // allow override
     50 #if (HWY_COMPILER_CLANG || HWY_COMPILER_GCC) && HWY_ARCH_X86_64
     51 #define HWY_ENABLE_SPIN_ASM 1
     52 #else
     53 #define HWY_ENABLE_SPIN_ASM 0
     54 #endif
     55 #endif  // HWY_ENABLE_SPIN_ASM
     56 
     57 #if HWY_ENABLE_MONITORX || HWY_ENABLE_UMONITOR
     58 #if HWY_ENABLE_SPIN_ASM
     59 #define HWY_INLINE_SPIN HWY_INLINE  // can inline functions with inline assembly
     60 #else
     61 // Intrinsics require attributes, which prevent inlining.
     62 #define HWY_INLINE_SPIN
     63 #include <x86intrin.h>
     64 #endif  // HWY_ENABLE_SPIN_ASM
     65 
     66 #include "hwy/x86_cpuid.h"
     67 #endif  // HWY_ENABLE_MONITORX || HWY_ENABLE_UMONITOR
     68 
     69 namespace hwy {
     70 
     71 // Returned by `UntilDifferent` in a single register.
     72 struct SpinResult {
     73  // We also use u32 because that is all that futex.h supports.
     74  uint32_t current;
     75  // Number of retries before returning, useful for checking that the
     76  // monitor/wait did not just return immediately.
     77  uint32_t reps;
     78 };
     79 
     80 // User-space monitor/wait are supported on Zen2+ AMD and SPR+ Intel. Spin waits
     81 // are rarely called from SIMD code, hence we do not integrate this into
     82 // `HWY_TARGET` and its runtime dispatch mechanism. Returned by `Type()`, also
     83 // used by callers to set the `disabled` argument for `DetectSpin`.
     84 enum class SpinType : uint8_t {
     85 #if HWY_ENABLE_MONITORX
     86  kMonitorX = 1,  // AMD
     87 #endif
     88 #if HWY_ENABLE_UMONITOR
     89  kUMonitor = 2,  // Intel
     90 #endif
     91  kPause = 3,
     92  kSentinel  // for iterating over all enumerators. Must be last.
     93 };
     94 
     95 // For printing which is in use.
     96 static inline const char* ToString(SpinType type) {
     97  switch (type) {
     98 #if HWY_ENABLE_MONITORX
     99    case SpinType::kMonitorX:
    100      return "MonitorX_C1";
    101 #endif
    102 #if HWY_ENABLE_UMONITOR
    103    case SpinType::kUMonitor:
    104      return "UMonitor_C0.2";
    105 #endif
    106    case SpinType::kPause:
    107      return "Pause";
    108    case SpinType::kSentinel:
    109    default:
    110      return nullptr;
    111  }
    112 }
    113 
    114 // Indirect function calls turn out to be too expensive because this is called
    115 // multiple times per ThreadPool barrier. We will instead inline the spin and
    116 // barrier using policy classes. This one is always available; use it as a
    117 // reference for the interface. Note that Pause varies across CPUs: it can be
    118 // a no-op, or wait 140 cycles.
    119 struct SpinPause {
    120  SpinType Type() const { return SpinType::kPause; }
    121 
    122  // Spins until `watched != prev` and returns the new value, similar to
    123  // `BlockUntilDifferent` in `futex.h`.
    124  HWY_INLINE SpinResult UntilDifferent(
    125      const uint32_t prev, const std::atomic<uint32_t>& watched) const {
    126    for (uint32_t reps = 0;; ++reps) {
    127      const uint32_t current = watched.load(std::memory_order_acquire);
    128      if (current != prev) return SpinResult{current, reps};
    129      hwy::Pause();
    130    }
    131  }
    132 
    133  // Returns number of retries until `watched == expected`.
    134  HWY_INLINE size_t UntilEqual(const uint32_t expected,
    135                               const std::atomic<uint32_t>& watched) const {
    136    for (size_t reps = 0;; ++reps) {
    137      const uint32_t current = watched.load(std::memory_order_acquire);
    138      if (current == expected) return reps;
    139      hwy::Pause();
    140    }
    141  }
    142 };
    143 
    144 #if HWY_ENABLE_MONITORX || HWY_IDE
    145 #if !HWY_ENABLE_SPIN_ASM
    146 HWY_PUSH_ATTRIBUTES("mwaitx")
    147 #endif
    148 
    149 // AMD's user-mode monitor/wait (Zen2+).
    150 class SpinMonitorX {
    151 public:
    152  SpinType Type() const { return SpinType::kMonitorX; }
    153 
    154  HWY_INLINE_SPIN SpinResult UntilDifferent(
    155      const uint32_t prev, const std::atomic<uint32_t>& watched) const {
    156    for (uint32_t reps = 0;; ++reps) {
    157      uint32_t current = watched.load(std::memory_order_acquire);
    158      if (current != prev) return SpinResult{current, reps};
    159      Monitor(&watched);
    160      // Double-checked 'lock' to avoid missed events:
    161      current = watched.load(std::memory_order_acquire);
    162      if (current != prev) return SpinResult{current, reps};
    163      Wait();
    164    }
    165  }
    166 
    167  HWY_INLINE_SPIN size_t UntilEqual(
    168      const uint32_t expected, const std::atomic<uint32_t>& watched) const {
    169    for (size_t reps = 0;; ++reps) {
    170      uint32_t current = watched.load(std::memory_order_acquire);
    171      if (current == expected) return reps;
    172      Monitor(&watched);
    173      // Double-checked 'lock' to avoid missed events:
    174      current = watched.load(std::memory_order_acquire);
    175      if (current == expected) return reps;
    176      Wait();
    177    }
    178  }
    179 
    180 private:
    181  static HWY_INLINE void Monitor(const void* addr) {
    182    // No extensions/hints currently defined.
    183 #if HWY_ENABLE_SPIN_ASM
    184    asm volatile("monitorx" ::"a"(addr), "c"(0), "d"(0));
    185 #else
    186    _mm_monitorx(const_cast<void*>(addr), 0, 0);
    187 #endif
    188  }
    189 
    190  static HWY_INLINE void Wait() {
    191 #if HWY_ENABLE_SPIN_ASM
    192    // EBX=0 cycles means no timeout/infinite.
    193    asm volatile("mwaitx" ::"a"(kHints), "b"(0), "c"(kExtensions));
    194 #else
    195    _mm_mwaitx(kExtensions, kHints, /*cycles=*/0);
    196 #endif
    197  }
    198 
    199  // 0xF would be C0. Its wakeup latency is less than 0.1 us shorter, and
    200  // package power is sometimes actually higher than with Pause. The
    201  // difference in spurious wakeups is minor.
    202  static constexpr unsigned kHints = 0x0;  // C1: a bit deeper than C0
    203  // No timeout required, we assume the mwaitx does not miss stores, see
    204  // https://www.usenix.org/system/files/usenixsecurity23-zhang-ruiyi.pdf.]
    205  static constexpr unsigned kExtensions = 0;
    206 };
    207 
    208 #if !HWY_ENABLE_SPIN_ASM
    209 HWY_POP_ATTRIBUTES
    210 #endif
    211 #endif  // HWY_ENABLE_MONITORX
    212 
    213 #if HWY_ENABLE_UMONITOR || HWY_IDE
    214 #if !HWY_ENABLE_SPIN_ASM
    215 HWY_PUSH_ATTRIBUTES("waitpkg")
    216 #endif
    217 
    218 // Intel's user-mode monitor/wait (SPR+).
    219 class SpinUMonitor {
    220 public:
    221  SpinType Type() const { return SpinType::kUMonitor; }
    222 
    223  HWY_INLINE_SPIN SpinResult UntilDifferent(
    224      const uint32_t prev, const std::atomic<uint32_t>& watched) const {
    225    for (uint32_t reps = 0;; ++reps) {
    226      uint32_t current = watched.load(std::memory_order_acquire);
    227      if (current != prev) return SpinResult{current, reps};
    228      Monitor(&watched);
    229      // Double-checked 'lock' to avoid missed events:
    230      current = watched.load(std::memory_order_acquire);
    231      if (current != prev) return SpinResult{current, reps};
    232      Wait();
    233    }
    234  }
    235 
    236  HWY_INLINE_SPIN size_t UntilEqual(
    237      const uint32_t expected, const std::atomic<uint32_t>& watched) const {
    238    for (size_t reps = 0;; ++reps) {
    239      uint32_t current = watched.load(std::memory_order_acquire);
    240      if (current == expected) return reps;
    241      Monitor(&watched);
    242      // Double-checked 'lock' to avoid missed events:
    243      current = watched.load(std::memory_order_acquire);
    244      if (current == expected) return reps;
    245      Wait();
    246    }
    247  }
    248 
    249 private:
    250  static HWY_INLINE void Monitor(const void* addr) {
    251 #if HWY_ENABLE_SPIN_ASM
    252    asm volatile("umonitor %%rcx" ::"c"(addr));
    253 #else
    254    _umonitor(const_cast<void*>(addr));
    255 #endif
    256  }
    257 
    258  static HWY_INLINE void Wait() {
    259 #if HWY_ENABLE_SPIN_ASM
    260    asm volatile("umwait %%ecx" ::"c"(kControl), "d"(kDeadline >> 32),
    261                 "a"(kDeadline & 0xFFFFFFFFu));
    262 #else
    263    _umwait(kControl, kDeadline);
    264 #endif
    265  }
    266 
    267  // 1 would be C0.1. C0.2 has 20x fewer spurious wakeups and additional 4%
    268  // package power savings vs Pause on SPR. It comes at the cost of
    269  // 0.4-0.6us higher wake latency, but the total is comparable to Zen4.
    270  static constexpr unsigned kControl = 0;              // C0.2 for deeper sleep
    271  static constexpr uint64_t kDeadline = ~uint64_t{0};  // no timeout, see above
    272 };
    273 
    274 #if !HWY_ENABLE_SPIN_ASM
    275 HWY_POP_ATTRIBUTES
    276 #endif
    277 #endif  // HWY_ENABLE_UMONITOR
    278 
    279 // TODO(janwas): add WFE on Arm. May wake at 10 kHz, but still worthwhile.
    280 
    281 // Returns the best-available type whose bit in `disabled` is not set. Example:
    282 // to disable kUMonitor, pass `1 << static_cast<int>(SpinType::kUMonitor)`.
    283 // Ignores `disabled` for `kPause` if it is the only supported and enabled type.
    284 // Somewhat expensive, typically called during initialization.
    285 static inline SpinType DetectSpin(int disabled = 0) {
    286  const auto HWY_MAYBE_UNUSED enabled = [disabled](SpinType type) {
    287    return (disabled & (1 << static_cast<int>(type))) == 0;
    288  };
    289 
    290 #if HWY_ENABLE_MONITORX
    291  if (enabled(SpinType::kMonitorX) && x86::IsAMD()) {
    292    uint32_t abcd[4];
    293    x86::Cpuid(0x80000001U, 0, abcd);
    294    if (x86::IsBitSet(abcd[2], 29)) return SpinType::kMonitorX;
    295  }
    296 #endif  // HWY_ENABLE_MONITORX
    297 
    298 #if HWY_ENABLE_UMONITOR
    299  if (enabled(SpinType::kUMonitor) && x86::MaxLevel() >= 7) {
    300    uint32_t abcd[4];
    301    x86::Cpuid(7, 0, abcd);
    302    if (x86::IsBitSet(abcd[2], 5)) return SpinType::kUMonitor;
    303  }
    304 #endif  // HWY_ENABLE_UMONITOR
    305 
    306  if (!enabled(SpinType::kPause)) {
    307    HWY_WARN("Ignoring attempt to disable Pause, it is the only option left.");
    308  }
    309  return SpinType::kPause;
    310 }
    311 
    312 // Calls `func(spin, args)` for the given `spin_type`.
    313 template <class Func, typename... Args>
    314 HWY_INLINE void CallWithSpin(SpinType spin_type, Func&& func, Args&&... args) {
    315  switch (spin_type) {
    316 #if HWY_ENABLE_MONITORX
    317    case SpinType::kMonitorX:
    318      func(SpinMonitorX(), std::forward<Args>(args)...);
    319      break;
    320 #endif
    321 #if HWY_ENABLE_UMONITOR
    322    case SpinType::kUMonitor:
    323      func(SpinUMonitor(), std::forward<Args>(args)...);
    324      break;
    325 #endif
    326    case SpinType::kPause:
    327    default:
    328      func(SpinPause(), std::forward<Args>(args)...);
    329      break;
    330  }
    331 }
    332 
    333 }  // namespace hwy
    334 
    335 #endif  // HIGHWAY_HWY_CONTRIB_THREAD_POOL_SPIN_H_