futex.h (8834B)
1 // Copyright 2024 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_ 17 #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_ 18 19 // Keyed event (futex): kernel queue of blocked threads, identified by the 20 // address of an atomic u32 called `current` within the same process (do NOT 21 // use with shared-memory mappings). 22 // 23 // Futex equivalents: https://outerproduct.net/futex-dictionary.html; we 24 // support Linux/Emscripten/Apple/Windows and C++20 std::atomic::wait, plus a 25 // NanoSleep fallback. 26 27 #include <time.h> 28 29 #include <atomic> 30 #include <climits> // INT_MAX 31 32 #include "hwy/base.h" 33 34 #if HWY_OS_APPLE 35 #include <AvailabilityMacros.h> 36 // __ulock* were added in OS X 10.12 (Sierra, 2016). 37 #if MAC_OS_X_VERSION_MAX_ALLOWED < 101200 && !defined(HWY_DISABLE_FUTEX) 38 #define HWY_DISABLE_FUTEX 39 #endif 40 #endif // HWY_OS_APPLE 41 42 #if HWY_OS_WIN 43 // Need to include <windows.h> on Windows, even if HWY_DISABLE_FUTEX is defined, 44 // since hwy::NanoSleep uses Windows API's that are defined in windows.h. 45 #ifndef NOMINMAX 46 #define NOMINMAX 47 #endif // NOMINMAX 48 #ifndef WIN32_LEAN_AND_MEAN 49 #define WIN32_LEAN_AND_MEAN 50 #endif // WIN32_LEAN_AND_MEAN 51 #include <windows.h> 52 #endif 53 54 #if HWY_ARCH_WASM 55 #include <emscripten/threading.h> 56 #include <math.h> // INFINITY 57 58 #elif HWY_OS_LINUX 59 #include <errno.h> // IWYU pragma: keep 60 #include <linux/futex.h> // FUTEX_* 61 #include <pthread.h> 62 #include <sys/syscall.h> // SYS_* 63 #include <unistd.h> 64 // Android may not declare these: 65 #ifndef SYS_futex 66 #ifdef SYS_futex_time64 // 32-bit with 64-bit time_t 67 #define SYS_futex SYS_futex_time64 68 #else 69 #define SYS_futex __NR_futex 70 #endif // SYS_futex_time64 71 #endif // SYS_futex 72 #ifndef FUTEX_WAIT_PRIVATE 73 #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | 128) 74 #endif 75 #ifndef FUTEX_WAKE_PRIVATE 76 #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | 128) 77 #endif 78 79 #elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX) 80 // These are private APIs, so add an opt-out. 81 extern "C" { 82 int __ulock_wait(uint32_t op, void* address, uint64_t val, uint32_t max_us); 83 int __ulock_wake(uint32_t op, void* address, uint64_t zero); 84 } // extern "C" 85 #define UL_COMPARE_AND_WAIT 1 86 #define ULF_WAKE_ALL 0x00000100 87 88 #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX) 89 // WakeByAddressAll requires Windows 8, so add an opt-out. 90 #if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL 91 #pragma comment(lib, "synchronization.lib") 92 #endif 93 94 #elif HWY_CXX_LANG < 202002L // NOT C++20, which has native support 95 #define HWY_FUTEX_SLEEP 96 #endif 97 98 namespace hwy { 99 100 // Attempts to pause for the specified nanoseconds, though the resolution is 101 // closer to 0.1 microseconds. Returns false if no wait happened. Thread-safe. 102 static inline bool NanoSleep(uint64_t ns) { 103 #if HWY_OS_WIN 104 static thread_local HANDLE hTimer = nullptr; 105 if (HWY_UNLIKELY(hTimer == nullptr)) { 106 // Must be manual reset: auto-reset would immediately signal after the next 107 // SetWaitableTimer. 108 hTimer = CreateWaitableTimer(nullptr, TRUE, nullptr); 109 if (hTimer == nullptr) return false; 110 } 111 112 // Negative means relative, in units of 100 ns. 113 LARGE_INTEGER time; 114 time.QuadPart = -static_cast<LONGLONG>(ns / 100); 115 const LONG period = 0; // signal once 116 if (!SetWaitableTimer(hTimer, &time, period, nullptr, nullptr, FALSE)) { 117 return false; 118 } 119 120 (void)WaitForSingleObject(hTimer, INFINITE); 121 return true; 122 #else 123 timespec duration; 124 duration.tv_sec = static_cast<time_t>(ns / 1000000000); 125 duration.tv_nsec = static_cast<decltype(duration.tv_nsec)>(ns % 1000000000); 126 timespec remainder; 127 // Repeat if interrupted by a signal. Note that the remainder may be rounded 128 // up, which could cause an infinite loop if continually interrupted. Using 129 // clock_nanosleep would work, but we'd have to get the current time. We 130 // assume durations are short, and instead just cap the number of retries. 131 for (int rep = 0; rep < 3; ++rep) { 132 if (nanosleep(&duration, &remainder) == 0 || errno != EINTR) break; 133 duration = remainder; 134 } 135 return true; 136 #endif 137 } 138 139 // Waits until `current != prev` and returns the new value. May return 140 // immediately if `current` already changed, or after blocking and waking. 141 static inline uint32_t BlockUntilDifferent( 142 const uint32_t prev, const std::atomic<uint32_t>& current) { 143 const auto acq = std::memory_order_acquire; 144 145 #if HWY_ARCH_WASM 146 // It is always safe to cast to void. 147 volatile void* address = 148 const_cast<volatile void*>(static_cast<const volatile void*>(¤t)); 149 const double max_ms = INFINITY; 150 for (;;) { 151 const uint32_t next = current.load(acq); 152 if (next != prev) return next; 153 const int ret = emscripten_futex_wait(address, prev, max_ms); 154 HWY_DASSERT(ret >= 0); 155 (void)ret; 156 } 157 158 #elif HWY_OS_LINUX 159 // Safe to cast because std::atomic is a standard layout type. 160 const uint32_t* address = reinterpret_cast<const uint32_t*>(¤t); 161 // _PRIVATE requires this only be used in the same process, and avoids 162 // virtual->physical lookups and atomic reference counting. 163 const int op = FUTEX_WAIT_PRIVATE; 164 for (;;) { 165 const uint32_t next = current.load(acq); 166 if (next != prev) return next; 167 // timeout=null may prevent interrupts via signal. No lvalue because 168 // the timespec type is only standardized since C++17 or C11. 169 const auto ret = syscall(SYS_futex, address, op, prev, nullptr, nullptr, 0); 170 if (ret == -1) { 171 HWY_DASSERT(errno == EAGAIN); // otherwise an actual error 172 } 173 } 174 175 #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX) 176 // It is always safe to cast to void. 177 volatile void* address = 178 const_cast<volatile void*>(static_cast<const volatile void*>(¤t)); 179 // API is not const-correct, but only loads from the pointer. 180 PVOID pprev = const_cast<void*>(static_cast<const void*>(&prev)); 181 const DWORD max_ms = INFINITE; 182 for (;;) { 183 const uint32_t next = current.load(acq); 184 if (next != prev) return next; 185 const BOOL ok = WaitOnAddress(address, pprev, sizeof(prev), max_ms); 186 HWY_DASSERT(ok); 187 (void)ok; 188 } 189 190 #elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX) 191 // It is always safe to cast to void. 192 void* address = const_cast<void*>(static_cast<const void*>(¤t)); 193 for (;;) { 194 const uint32_t next = current.load(acq); 195 if (next != prev) return next; 196 __ulock_wait(UL_COMPARE_AND_WAIT, address, prev, 0); 197 } 198 199 #elif defined(HWY_FUTEX_SLEEP) 200 for (;;) { 201 const uint32_t next = current.load(acq); 202 if (next != prev) return next; 203 NanoSleep(2000); 204 } 205 206 #elif HWY_CXX_LANG >= 202002L 207 current.wait(prev, acq); // No spurious wakeup. 208 const uint32_t next = current.load(acq); 209 HWY_DASSERT(next != prev); 210 return next; 211 212 #else 213 #error "Logic error, should have reached HWY_FUTEX_SLEEP" 214 #endif // HWY_OS_* 215 } // BlockUntilDifferent 216 217 // Wakes all threads, if any, that are waiting because they called 218 // `BlockUntilDifferent` with the same `current`. 219 static inline void WakeAll(std::atomic<uint32_t>& current) { 220 #if HWY_ARCH_WASM 221 // It is always safe to cast to void. 222 volatile void* address = static_cast<volatile void*>(¤t); 223 const int max_to_wake = INT_MAX; // actually signed 224 const int ret = emscripten_futex_wake(address, max_to_wake); 225 HWY_DASSERT(ret >= 0); 226 (void)ret; 227 228 #elif HWY_OS_LINUX 229 // Safe to cast because std::atomic is a standard layout type. 230 uint32_t* address = reinterpret_cast<uint32_t*>(¤t); 231 const int max_to_wake = INT_MAX; // actually signed 232 const auto ret = syscall(SYS_futex, address, FUTEX_WAKE_PRIVATE, max_to_wake, 233 nullptr, nullptr, 0); 234 HWY_DASSERT(ret >= 0); // number woken 235 (void)ret; 236 237 #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX) 238 // It is always safe to cast to void. 239 void* address = static_cast<void*>(¤t); 240 WakeByAddressAll(address); 241 242 #elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX) 243 // It is always safe to cast to void. 244 void* address = static_cast<void*>(¤t); 245 __ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, address, 0); 246 247 #elif defined(HWY_FUTEX_SLEEP) 248 // NanoSleep loop does not require wakeup. 249 (void)current; 250 #elif HWY_CXX_LANG >= 202002L 251 current.notify_all(); 252 253 #else 254 #error "Logic error, should have reached HWY_FUTEX_SLEEP" 255 #endif 256 } // WakeAll 257 258 } // namespace hwy 259 260 #endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_