base.h (113541B)
1 // Copyright 2020 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef HIGHWAY_HWY_BASE_H_ 17 #define HIGHWAY_HWY_BASE_H_ 18 19 // Target-independent definitions. 20 21 // IWYU pragma: begin_exports 22 #include <stddef.h> 23 #include <stdint.h> 24 #if defined(HWY_HEADER_ONLY) 25 #include <stdarg.h> 26 #include <stdio.h> 27 #endif 28 29 #if !defined(HWY_NO_LIBCXX) 30 #include <ostream> 31 #endif 32 33 #include "hwy/detect_compiler_arch.h" 34 #include "hwy/highway_export.h" 35 36 #include <mozilla/Attributes.h> 37 38 // API version (https://semver.org/); keep in sync with CMakeLists.txt and 39 // meson.build. 40 #define HWY_MAJOR 1 41 #define HWY_MINOR 3 42 #define HWY_PATCH 0 43 44 // True if the Highway version >= major.minor.0. Added in 1.2.0. 45 #define HWY_VERSION_GE(major, minor) \ 46 (HWY_MAJOR > (major) || (HWY_MAJOR == (major) && HWY_MINOR >= (minor))) 47 // True if the Highway version < major.minor.0. Added in 1.2.0. 48 #define HWY_VERSION_LT(major, minor) \ 49 (HWY_MAJOR < (major) || (HWY_MAJOR == (major) && HWY_MINOR < (minor))) 50 51 // "IWYU pragma: keep" does not work for these includes, so hide from the IDE. 
#if !HWY_IDE

#if !defined(HWY_NO_LIBCXX)
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS  // before inttypes.h
#endif
#include <inttypes.h>
#endif

#endif  // !HWY_IDE

// <atomic> is required for HWY_FENCE below (and on MSVC regardless).
#if !defined(HWY_NO_LIBCXX) || HWY_COMPILER_MSVC
#include <atomic>
#endif

#ifndef HWY_HAVE_COMPARE_HEADER  // allow override
#define HWY_HAVE_COMPARE_HEADER 0
#if defined(__has_include)  // note: wrapper macro fails on Clang ~17
#if __has_include(<compare>)
#undef HWY_HAVE_COMPARE_HEADER
#define HWY_HAVE_COMPARE_HEADER 1
#endif  // __has_include
#endif  // defined(__has_include)
#endif  // HWY_HAVE_COMPARE_HEADER

#ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE  // allow override
#if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
    __cpp_impl_three_way_comparison >= 201907L && HWY_HAVE_COMPARE_HEADER
#include <compare>
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
#else
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
#endif
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE

// IWYU pragma: end_exports

#if HWY_COMPILER_MSVC
#include <string.h>  // memcpy
#endif

//------------------------------------------------------------------------------
// Compiler-specific definitions

#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)

#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_FUNCTION __FUNCSIG__  // function name + template args
#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
// MSVC has no equivalent of __builtin_expect; expand to the bare expression.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_UNREACHABLE __assume(false)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_FUNCTION __PRETTY_FUNCTION__  // function name + template args
#define HWY_RESTRICT __restrict__
// force inlining without optimization enabled creates very inefficient code
// that can cause compiler timeout
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#if HWY_COMPILER_GCC || HWY_HAS_BUILTIN(__builtin_unreachable)
#define HWY_UNREACHABLE __builtin_unreachable()
#else
#define HWY_UNREACHABLE
#endif
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC

#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \
    (HWY_COMPILER_ICC && !HWY_COMPILER_ICX)
// The use of __attribute__((unused)) in private class member variables triggers
// a compiler warning with GCC 11 and earlier and ICC

// GCC 11 and earlier and ICC also do not emit -Wunused-private-field warnings
// for unused private class member variables
#define HWY_MEMBER_VAR_MAYBE_UNUSED
#else
// Clang and ICX need __attribute__((unused)) in unused private class member
// variables to suppress -Wunused-private-field warnings unless this warning is
// ignored by using HWY_DIAGNOSTICS_OFF
#define HWY_MEMBER_VAR_MAYBE_UNUSED HWY_MAYBE_UNUSED
#endif

//------------------------------------------------------------------------------
// Builtin/attributes (no more #include after this point due to namespace!)

namespace hwy {

// Enables error-checking of format strings.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif

// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif

// Returns a pointer whose type is `type` (T*), while allowing the compiler to
// assume that the untyped pointer `ptr` is aligned to a multiple of sizeof(T).
#define HWY_RCAST_ALIGNED(type, ptr) \
  reinterpret_cast<type>(            \
      HWY_ASSUME_ALIGNED((ptr), alignof(hwy::RemovePtr<type>)))

// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
#if HWY_COMPILER_ICC
// As of ICC 2021.{1-9} the pragma is neither implemented nor required.
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#elif HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC_ACTUAL
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif

//------------------------------------------------------------------------------
// Macros

// Note: it is safe to remove `static` for users who want to use modules, but
// that might be a breaking change for some users, hence we do not by default.
#define HWY_API static HWY_INLINE HWY_FLATTEN

#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))

#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif

// Tell a compiler that the expression always evaluates to true.
// The expression should be free from any side effects.
// Some older compilers may have trouble with complex expressions, therefore
// it is advisable to split multiple conditions into separate assume statements,
// and manually check the generated code.
// OK but could fail:
//   HWY_ASSUME(x == 2 && y == 3);
// Better:
//   HWY_ASSUME(x == 2);
//   HWY_ASSUME(y == 3);
#if (HWY_CXX_LANG >= 202302L) && HWY_HAS_CPP_ATTRIBUTE(assume)
#define HWY_ASSUME(expr) [[assume(expr)]]
#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
#define HWY_ASSUME(expr) __assume(expr)
// __builtin_assume() was added in clang 3.6.
#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
#define HWY_ASSUME(expr) __builtin_assume(expr)
// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
// later, so check for the compiler version directly.
#elif HWY_COMPILER_GCC_ACTUAL >= 405
#define HWY_ASSUME(expr) \
  ((expr) ? static_cast<void>(0) : __builtin_unreachable())
#else
#define HWY_ASSUME(expr) static_cast<void>(0)
#endif

// Compile-time fence to prevent undesirable code reordering. On Clang, the
// typical `asm volatile("" : : : "memory")` seems to be ignored. Note that
// `std::atomic_thread_fence` affects other threads, hence might generate a
// barrier instruction, but this does not.
#if !defined(HWY_NO_LIBCXX)
#define HWY_FENCE std::atomic_signal_fence(std::memory_order_seq_cst)
#elif HWY_COMPILER_GCC
#define HWY_FENCE asm volatile("" : : : "memory")
#else
#define HWY_FENCE
#endif

// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal

//------------------------------------------------------------------------------
// Abort / Warn

#if defined(HWY_HEADER_ONLY)
// Header-only mode: formats the message and prints it to stderr directly;
// no handler registration is available in this configuration.
HWY_DLLEXPORT inline void HWY_FORMAT(3, 4)
    Warn(const char* file, int line, const char* format, ...) {
  char buf[800];
  va_list args;
  va_start(args, format);
  vsnprintf(buf, sizeof(buf), format, args);
  va_end(args);

  fprintf(stderr, "Warn at %s:%d: %s\n", file, line, buf);
}

// Header-only mode: prints the formatted message, then terminates the program.
HWY_DLLEXPORT HWY_NORETURN inline void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...) {
  char buf[800];
  va_list args;
  va_start(args, format);
  vsnprintf(buf, sizeof(buf), format, args);
  va_end(args);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);

  fflush(stderr);

  // Now terminate the program:
#if HWY_ARCH_RISCV
  exit(1);  // trap/abort just freeze Spike.
#else
  abort();  // Compile error without this due to HWY_NORETURN.
#endif
}
#else   // !HWY_HEADER_ONLY
// Interfaces for custom Warn/Abort handlers.
typedef void (*WarnFunc)(const char* file, int line, const char* message);

typedef void (*AbortFunc)(const char* file, int line, const char* message);

// Returns current Warn() handler, or nullptr if no handler was yet registered,
// indicating Highway should print to stderr.
// DEPRECATED because this is thread-hostile and prone to misuse (modifying the
// underlying pointer through the reference).
HWY_DLLEXPORT WarnFunc& GetWarnFunc();

// Returns current Abort() handler, or nullptr if no handler was yet registered,
// indicating Highway should print to stderr and abort.
// DEPRECATED because this is thread-hostile and prone to misuse (modifying the
// underlying pointer through the reference).
HWY_DLLEXPORT AbortFunc& GetAbortFunc();

// Sets a new Warn() handler and returns the previous handler, which is nullptr
// if no previous handler was registered, and should otherwise be called from
// the new handler. Thread-safe.
HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func);

// Sets a new Abort() handler and returns the previous handler, which is nullptr
// if no previous handler was registered, and should otherwise be called from
// the new handler. If all handlers return, then Highway will terminate the app.
// Thread-safe.
HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func);

HWY_DLLEXPORT void HWY_FORMAT(3, 4)
    Warn(const char* file, int line, const char* format, ...);

HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);

#endif  // HWY_HEADER_ONLY

// Convenience wrappers that pass the current file/line to Warn/Abort.
#define HWY_WARN(format, ...) \
  ::hwy::Warn(__FILE__, __LINE__, format, ##__VA_ARGS__)

#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled.
// Always-on assertion: calls HWY_ABORT (i.e. terminates) if `condition` is
// false, printing the condition text and `msg`.
#define HWY_ASSERT_M(condition, msg)               \
  do {                                             \
    if (!(condition)) {                            \
      HWY_ABORT("Assert %s: %s", #condition, msg); \
    }                                              \
  } while (0)
#define HWY_ASSERT(condition) HWY_ASSERT_M(condition, "")

// Sanitizer detection: each HWY_IS_* is 1 when the corresponding sanitizer is
// enabled, detected via compiler feature macros or build-defined symbols.
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
    defined(__SANITIZE_MEMORY__)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif

#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
    defined(__SANITIZE_ADDRESS__)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif

#if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
    defined(__SANITIZE_HWADDRESS__)
#define HWY_IS_HWASAN 1
#else
#define HWY_IS_HWASAN 0
#endif

#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
    defined(__SANITIZE_THREAD__)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif

#if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
    defined(UNDEFINED_BEHAVIOR_SANITIZER)
#define HWY_IS_UBSAN 1
#else
#define HWY_IS_UBSAN 0
#endif

// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
// You can disable MSAN by adding this attribute to the function that fails.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif

// 1 if any sanitizer is enabled.
#if HWY_IS_ASAN || HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN
#define HWY_IS_SANITIZER 1
#else
#define HWY_IS_SANITIZER 0
#endif

// For enabling HWY_DASSERT and shortening tests in slower debug builds
//
// Note: `HWY_IS_UBSAN` is specifically excluded from engaging debug
// builds. This is in service of Chromium's `-fsanitize=array-bounds` by
// default, where we don't want Highway to unconditionally build in
// debug mode.
//
// See also:
// https://docs.google.com/document/d/1eCtY4AZF-SiFHxhIYWzEytdIx3C24de7ccD6Y5Gn2H8/edit?tab=t.9zkn85hr82ms#heading=h.efcshvfql42c
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) ||         \
    (HWY_IS_ASAN || (HWY_IS_SANITIZER && !HWY_IS_UBSAN)) || \
    defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

// Debug-only assertion: same as HWY_ASSERT in debug builds, expands to an
// empty statement otherwise.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT_M(condition, msg) HWY_ASSERT_M(condition, msg)
#define HWY_DASSERT(condition) HWY_ASSERT_M(condition, "")
#else
#define HWY_DASSERT_M(condition, msg) \
  do {                                \
  } while (0)
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif

//------------------------------------------------------------------------------
// CopyBytes / ZeroBytes

#if HWY_COMPILER_MSVC
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#endif

// Copies kBytes from `from` to `to` via memcpy, which is well-defined even for
// overlapping-type-unsafe copies; the compile-time size allows the compiler to
// replace the call with inline loads/stores.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
#if HWY_COMPILER_MSVC
  memcpy(to, from, kBytes);
#else
  __builtin_memcpy(to, from, kBytes);
#endif
}

// Runtime-size overload of CopyBytes.
HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
                       size_t num_of_bytes_to_copy) {
#if HWY_COMPILER_MSVC
  memcpy(to, from, num_of_bytes_to_copy);
#else
  __builtin_memcpy(to, from, num_of_bytes_to_copy);
#endif
}

// Same as CopyBytes, but for same-sized objects; avoids a size argument.
// Copies sizeof(From) bytes; From and To must have equal size (checked at
// compile time), which makes this a safe bit-copy between same-sized objects.
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
  static_assert(sizeof(From) == sizeof(To), "");
  CopyBytes<sizeof(From)>(from, to);
}

// Sets kBytes bytes at `to` to zero; compile-time size enables inlining.
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
  memset(to, 0, kBytes);
#else
  __builtin_memset(to, 0, kBytes);
#endif
}

// Runtime-size overload of ZeroBytes.
HWY_API void ZeroBytes(void* to, size_t num_bytes) {
#if HWY_COMPILER_MSVC
  memset(to, 0, num_bytes);
#else
  __builtin_memset(to, 0, num_bytes);
#endif
}

//------------------------------------------------------------------------------
// kMaxVectorSize (undocumented, pending removal)

#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif

//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
523 #if HWY_ARCH_X86 524 #define HWY_ALIGN_MAX alignas(64) 525 #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \ 526 __riscv_v_intrinsic >= 11000 527 #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned 528 #else 529 #define HWY_ALIGN_MAX alignas(16) 530 #endif 531 532 //------------------------------------------------------------------------------ 533 // Lane types 534 535 // hwy::float16_t and hwy::bfloat16_t are forward declared here to allow 536 // BitCastScalar to be implemented before the implementations of the 537 // hwy::float16_t and hwy::bfloat16_t types 538 struct float16_t; 539 struct bfloat16_t; 540 541 using float32_t = float; 542 using float64_t = double; 543 544 #pragma pack(push, 1) 545 546 // Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it: 547 // https://reviews.llvm.org/D86310 548 struct alignas(16) uint128_t { 549 uint64_t lo; // little-endian layout 550 uint64_t hi; 551 }; 552 553 // 64 bit key plus 64 bit value. Faster than using uint128_t when only the key 554 // field is to be compared (Lt128Upper instead of Lt128). 555 struct alignas(16) K64V64 { 556 uint64_t value; // little-endian layout 557 uint64_t key; 558 }; 559 560 // 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier 561 // than when considering both to be a 64-bit key. 562 struct alignas(8) K32V32 { 563 uint32_t value; // little-endian layout 564 uint32_t key; 565 }; 566 567 #pragma pack(pop) 568 569 static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a, 570 const uint128_t& b) { 571 return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi; 572 } 573 // Required for std::greater. 
574 static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a, 575 const uint128_t& b) { 576 return b < a; 577 } 578 static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a, 579 const uint128_t& b) { 580 return a.lo == b.lo && a.hi == b.hi; 581 } 582 583 #if !defined(HWY_NO_LIBCXX) 584 static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os, 585 const uint128_t& n) { 586 return os << "[hi=" << n.hi << ",lo=" << n.lo << "]"; 587 } 588 #endif 589 590 static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a, 591 const K64V64& b) { 592 return a.key < b.key; 593 } 594 // Required for std::greater. 595 static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a, 596 const K64V64& b) { 597 return b < a; 598 } 599 static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a, 600 const K64V64& b) { 601 return a.key == b.key; 602 } 603 604 #if !defined(HWY_NO_LIBCXX) 605 static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os, 606 const K64V64& n) { 607 return os << "[k=" << n.key << ",v=" << n.value << "]"; 608 } 609 #endif 610 611 static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a, 612 const K32V32& b) { 613 return a.key < b.key; 614 } 615 // Required for std::greater. 
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
                                               const K32V32& b) {
  return a.key == b.key;
}

#if !defined(HWY_NO_LIBCXX)
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
                                                        const K32V32& n) {
  return os << "[k=" << n.key << ",v=" << n.value << "]";
}
#endif

//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

// Analog of std::enable_if: `type` exists only when Condition is true.
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;

// Analog of std::is_same.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};

template <typename T, typename U>
HWY_API constexpr bool IsSame() {
  return IsSameT<T, U>::value;
}

// Returns whether T matches either of U1 or U2
template <typename T, typename U1, typename U2>
HWY_API constexpr bool IsSameEither() {
  return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
}

// Analog of std::conditional: selects Then when Condition, else Else.
template <bool Condition, typename Then, typename Else>
struct IfT {
  using type = Then;
};

template <class Then, class Else>
struct IfT<false, Then, Else> {
  using type = Else;
};

template <bool Condition, typename Then, typename Else>
using If = typename IfT<Condition, Then, Else>::type;

// Analog of std::is_const.
template <typename T>
struct IsConstT {
  enum { value = 0 };
};

template <typename T>
struct IsConstT<const T> {
  enum { value = 1 };
};

template <typename T>
HWY_API constexpr bool IsConst() {
  return IsConstT<T>::value;
}

// Analogs of std::remove_const / remove_volatile / remove_reference.
template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;

template <class T>
struct RemoveVolatileT {
  using type = T;
};
template <class T>
struct RemoveVolatileT<volatile T> {
  using type = T;
};

template <class T>
using RemoveVolatile = typename RemoveVolatileT<T>::type;

template <class T>
struct RemoveRefT {
  using type = T;
};
template <class T>
struct RemoveRefT<T&> {
  using type = T;
};
template <class T>
struct RemoveRefT<T&&> {
  using type = T;
};

template <class T>
using RemoveRef = typename RemoveRefT<T>::type;

// Analog of std::remove_cvref_t.
template <class T>
using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;

// Strips one level of (possibly cv-qualified) pointer; non-pointers unchanged.
template <class T>
struct RemovePtrT {
  using type = T;
};
template <class T>
struct RemovePtrT<T*> {
  using type = T;
};
template <class T>
struct RemovePtrT<const T*> {
  using type = T;
};
template <class T>
struct RemovePtrT<volatile T*> {
  using type = T;
};
template <class T>
struct RemovePtrT<const volatile T*> {
  using type = T;
};

template <class T>
using RemovePtr = typename RemovePtrT<T>::type;

// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr

// Enable only for a given lane count (or bound on it).
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr

// Enable by lane-type category (unsigned/signed/float/special-float/integer).
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T)                                    \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
                !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_SPECIAL_FLOAT(T) \
  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr

// Enable by lane size in bytes.
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
#define HWY_IF_T_SIZE_LE(T, bytes) \
  hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
#define HWY_IF_T_SIZE_GT(T, bytes) \
  hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr

// Enable for exactly (or anything but) one lane type, ignoring cv/ref.
#define HWY_IF_SAME(T, expected) \
  hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
#define HWY_IF_NOT_SAME(T, expected) \
  hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr

// One of two expected types
#define HWY_IF_SAME2(T, expected1, expected2)                            \
  hwy::EnableIf<                                                         \
      hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
      nullptr

#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)

#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)

#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)

#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)

#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
#define HWY_IF_F64(T) HWY_IF_SAME(T, double)

// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)

#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr

// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

// Helper for DeclVal: `type` is T&& when that is well-formed, else T itself.
template <class T>
class DeclValT {
 private:
  template <class U, class URef = U&&>
  static URef TryAddRValRef(int);
  template <class U, class Arg>
  static U TryAddRValRef(Arg);

 public:
  using type = decltype(TryAddRValRef<T>(0));
  enum { kDisableDeclValEvaluation = 1 };
};

// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
// expression of a decltype specifier.

// hwy::DeclVal<T>() does not require that T have a public default constructor
template <class T>
HWY_API typename DeclValT<T>::type DeclVal() noexcept {
  static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
                "DeclVal() cannot be used in an evaluated context");
}

// Analog of std::is_array: true for bounded and unbounded array types.
template <class T>
struct IsArrayT {
  enum { value = 0 };
};

template <class T>
struct IsArrayT<T[]> {
  enum { value = 1 };
};

template <class T, size_t N>
struct IsArrayT<T[N]> {
  enum { value = 1 };
};

template <class T>
static constexpr bool IsArray() {
  return IsArrayT<T>::value;
}

#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
#endif

// Analog of std::is_convertible, implemented via overload selection:
// TryConvTest(int) is viable only when From converts to To.
template <class From, class To>
class IsConvertibleT {
 private:
  template <class T>
  static hwy::SizeTag<1> TestFuncWithToArg(T);

  template <class T, class U>
  static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
      DeclVal<T>()))
  TryConvTest(int);

  template <class T, class U, class Arg>
  static hwy::SizeTag<0> TryConvTest(Arg);

 public:
  enum {
    value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
             IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
            (!IsArray<To>() &&
             (IsSame<To, decltype(DeclVal<To>())>() ||
              !IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
             IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
  };
};

#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(pop)
#endif

template <class From, class To>
HWY_API constexpr bool IsConvertible() {
  return IsConvertibleT<From, To>::value;
}

// True when static_cast<To>(From) is well-formed.
template <class From, class To>
class IsStaticCastableT {
 private:
  template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
  static hwy::SizeTag<1> TryStaticCastTest(int);

  template <class T, class U, class Arg>
  static hwy::SizeTag<0> TryStaticCastTest(Arg);

 public:
  enum {
    value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
  };
};

template <class From, class To>
static constexpr bool IsStaticCastable() {
  return IsStaticCastableT<From, To>::value;
}

#define HWY_IF_CASTABLE(From, To) \
  hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr

#define HWY_IF_OP_CASTABLE(op, T, Native) \
  HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)

// True when `DeclVal<T>() = DeclVal<From>()` is well-formed.
template <class T, class From>
class IsAssignableT {
 private:
  template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
  static hwy::SizeTag<1> TryAssignTest(int);

  template <class T1, class T2, class Arg>
  static hwy::SizeTag<0> TryAssignTest(Arg);

 public:
  enum {
    value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
  };
};

template <class T, class From>
static constexpr bool IsAssignable() {
  return IsAssignableT<T, From>::value;
}

#define HWY_IF_ASSIGNABLE(T, From) \
  hwy::EnableIf<IsAssignable<T, From>()>* = nullptr

// ----------------------------------------------------------------------------
// IsSpecialFloat

// These types are often special-cased and not supported in all ops.
template <typename T>
HWY_API constexpr bool IsSpecialFloat() {
  return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
}

// -----------------------------------------------------------------------------
// IsIntegerLaneType and IsInteger

// True only for the eight fixed-width integer lane types; false for all other
// types (specializations below enumerate the supported types).
template <class T>
HWY_API constexpr bool IsIntegerLaneType() {
  return false;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
  return true;
}

namespace detail {

// True for any built-in integer type (without cv-qualifiers), including the
// character, bool and pointer-sized types that are not lane types.
template <class T>
static HWY_INLINE constexpr bool IsNonCvInteger() {
  // NOTE: Do not add a IsNonCvInteger<wchar_t>() specialization below as it is
  // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
  // with the /Zc:wchar_t- option.
  return IsIntegerLaneType<T>() || IsSame<T, wchar_t>() ||
         IsSameEither<T, size_t, ptrdiff_t>() ||
         IsSameEither<T, intptr_t, uintptr_t>();
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<bool>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<signed char>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned char>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<short>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned short>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<int>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<long>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<long long>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long long>() {  // NOLINT
  return true;
}
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char8_t>() {
  return true;
}
#endif
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char16_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char32_t>() {
  return true;
}

}  // namespace detail

// True for any built-in integer type, ignoring cv-qualifiers and references.
template <class T>
HWY_API constexpr bool IsInteger() {
  return detail::IsNonCvInteger<RemoveCvRef<T>>();
}
// -----------------------------------------------------------------------------
// BitCastScalar

// constexpr only when the compiler provides __builtin_bit_cast
// (Clang/GCC builtin, or MSVC >= 19.26).
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
#define HWY_BITCASTSCALAR_CONSTEXPR constexpr
#else
#define HWY_BITCASTSCALAR_CONSTEXPR
#endif

// Additionally requires C++14 relaxed constexpr.
#if __cpp_constexpr >= 201304L
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#else
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif

#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Identity by default; specialized for hwy::float16_t/bfloat16_t (below, after
// those types are defined) to expose their wrapped representation.
template <class From>
struct BitCastScalarSrcCastHelper {
  static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
    return val;
  }
};

#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
// Workaround for Clang 9 constexpr __builtin_bit_cast bug:
// integer-to-integer casts go through static_cast instead.
template <class To, class From,
          hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
                        hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
  static_assert(sizeof(To) == sizeof(From),
                "sizeof(To) == sizeof(From) must be true");
  return static_cast<To>(val);
}

template <class To, class From,
          hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
                          hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
  return __builtin_bit_cast(To, val);
}
#endif  // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000

}  // namespace detail

// Reinterprets the bits of `val` as a To; like std::bit_cast but available
// pre-C++20 and aware of the hwy half-float wrapper types.
template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
  // If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
  // const typename From::Native& or const uint16_t& using
  // detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
  // allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
  // if To is not a pointer type, union type, or a struct/class containing a
  // pointer, union, or reference subobject
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
  return detail::BuiltinBitCastScalar<To>(
      detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
          val));
#else
  return __builtin_bit_cast(
      To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
              val));
#endif
}
template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
  // If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
  // to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
  // as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
  // be constexpr if the __builtin_bit_cast intrinsic is available.
  return To::FromBits(BitCastScalar<uint16_t>(val));
}
#else
// Fallback without __builtin_bit_cast: byte copy (not constexpr).
template <class To, class From>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
  To result;
  CopySameSize(&val, &result);
  return result;
}
#endif

//------------------------------------------------------------------------------
// F16 lane type

// Ensure the wrapper structs below are exactly 2 bytes with no padding.
#pragma pack(push, 1)

#ifndef HWY_NEON_HAVE_F16C  // allow override
// Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
// included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
// __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
#define HWY_NEON_HAVE_F16C 1
#else
#define HWY_NEON_HAVE_F16C 0
#endif
#endif  // HWY_NEON_HAVE_F16C

// RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
// HWY_HAVE_FLOAT16.
#if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
#define HWY_RVV_HAVE_F16_VEC 1
#else
#define HWY_RVV_HAVE_F16_VEC 0
#endif

// x86 compiler supports _Float16, not necessarily with operators.
// Avoid clang-cl because it lacks __extendhfsf2.
#if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
    ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) ||      \
     HWY_COMPILER_GCC_ACTUAL >= 1200)
#define HWY_SSE2_HAVE_F16_TYPE 1
#else
#define HWY_SSE2_HAVE_F16_TYPE 0
#endif

#ifndef HWY_HAVE_SCALAR_F16_TYPE  // allow override
// Compiler supports _Float16, not necessarily with operators.
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || \
    __SPIRV_DEVICE__
#define HWY_HAVE_SCALAR_F16_TYPE 1
#else
#define HWY_HAVE_SCALAR_F16_TYPE 0
#endif
#endif  // HWY_HAVE_SCALAR_F16_TYPE

#ifndef HWY_HAVE_SCALAR_F16_OPERATORS
// Recent enough compiler also has operators.
#if HWY_HAVE_SCALAR_F16_TYPE &&                                       \
    (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
     (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL &&          \
      !defined(_WIN32)) ||                                            \
     (HWY_ARCH_ARM &&                                                 \
      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_F16_OPERATORS 0
#endif
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

namespace detail {

// Maps an arithmetic-operator operand type to the type actually passed to the
// native operator. The primary template has no `type`, which SFINAE-disables
// mixed-operand overloads for special floats unless specialized below.
template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
struct SpecialFloatUnwrapArithOpOperandT {};

// Non-special-float operands pass through unchanged.
template <class T, class TVal>
struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
  using type = T;
};

template <class T>
using SpecialFloatUnwrapArithOpOperand =
    typename SpecialFloatUnwrapArithOpOperandT<T>::type;

// Maps a native half-float result type back to its hwy wrapper type; identity
// by default, specialized after float16_t/bfloat16_t are defined.
template <class T, class TVal = RemoveCvRef<T>>
struct NativeSpecialFloatToWrapperT {
  using type = T;
};

template <class T>
using NativeSpecialFloatToWrapper =
    typename NativeSpecialFloatToWrapperT<T>::type;

}  // namespace detail

// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits. We use a wrapper class instead of a
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
// are generated regardless of F16 support; see #1684.
// IEEE binary16 wrapper. Backed by the compiler's native half type when
// available, otherwise by a plain uint16_t holding the bit pattern.
struct alignas(2) float16_t {
#if HWY_HAVE_SCALAR_F16_TYPE
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || __SPIRV_DEVICE__
  using Native = _Float16;
#elif HWY_NEON_HAVE_F16C
  using Native = __fp16;
#else
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
#endif
#elif HWY_IDE
  using Native = uint16_t;
#endif  // HWY_HAVE_SCALAR_F16_TYPE

  union {
#if HWY_HAVE_SCALAR_F16_TYPE || HWY_IDE
    // Accessed via NativeLaneType, and used directly if
    // HWY_HAVE_SCALAR_F16_OPERATORS.
    Native native;
#endif
    // Only accessed via NativeLaneType or U16LaneType.
    uint16_t bits;
  };

  // Default init and copying.
  float16_t() noexcept = default;
  constexpr float16_t(const float16_t&) noexcept = default;
  constexpr float16_t(float16_t&&) noexcept = default;
  float16_t& operator=(const float16_t&) noexcept = default;
  float16_t& operator=(float16_t&&) noexcept = default;

#if HWY_HAVE_SCALAR_F16_TYPE
  // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
  // float16_t(intrinsic()), but user code expects implicit conversions.
  MOZ_IMPLICIT constexpr float16_t(Native arg) noexcept : native(arg) {}
  constexpr operator Native() const noexcept { return native; }
#endif

#if HWY_HAVE_SCALAR_F16_TYPE
  // Constructs from a raw binary16 bit pattern.
  static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
    return float16_t(BitCastScalar<Native>(bits));
  }
#else

 private:
  // Tag to select the bits-initializing constructor without colliding with
  // the arithmetic constructors.
  struct F16FromU16BitsTag {};
  constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
      : bits(u16_bits) {}

 public:
  static constexpr float16_t FromBits(uint16_t bits) {
    return float16_t(F16FromU16BitsTag(), bits);
  }
#endif

  // When backed by a native type, ensure the wrapper behaves like the native
  // type by forwarding all operators. Unfortunately it seems difficult to reuse
  // this code in a base class, so we repeat it in float16_t.
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  // Implicit construction from anything implicitly convertible to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                      IsConvertible<T, Native>()>* = nullptr>
  MOZ_IMPLICIT constexpr float16_t(T&& arg) noexcept
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  // Explicit construction from anything only static_cast-able to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                      !IsConvertible<T, Native>() &&
                                      IsStaticCastable<T, Native>()>* = nullptr>
  explicit constexpr float16_t(T&& arg) noexcept
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  // pre-decrement operator (--x)
  HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
    native = static_cast<Native>(native - Native{1});
    return *this;
  }

  // post-decrement operator (x--)
  HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
    float16_t result = *this;
    native = static_cast<Native>(native - Native{1});
    return result;
  }

  // pre-increment operator (++x)
  HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
    native = static_cast<Native>(native + Native{1});
    return *this;
  }

  // post-increment operator (x++)
  HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
    float16_t result = *this;
    native = static_cast<Native>(native + Native{1});
    return result;
  }

  constexpr float16_t operator-() const noexcept {
    return float16_t(static_cast<Native>(-native));
  }
  constexpr float16_t operator+() const noexcept { return *this; }

  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
  // Generates four members per op: f16 (op) f16, f16 (op) other, and the two
  // corresponding compound assignments.
#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func)                      \
  constexpr float16_t op_func(const float16_t& rhs) const noexcept {         \
    return float16_t(static_cast<Native>(native op rhs.native));             \
  }                                                                          \
  template <typename T, HWY_IF_NOT_F16(T),                                   \
            typename UnwrappedT =                                            \
                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
            typename RawResultT =                                            \
                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
            typename ResultT =                                               \
                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
  }                                                                          \
  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(                           \
      const hwy::float16_t& rhs) noexcept {                                  \
    native = static_cast<Native>(native op rhs.native);                      \
    return *this;                                                            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_F16(T),                                   \
            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
            HWY_IF_ASSIGNABLE(                                               \
                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept(    \
      noexcept(                                                              \
          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
    native = static_cast<Native>(native op rhs);                             \
    return *this;                                                            \
  }

  HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
  HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
  HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
  HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_FLOAT16_BINARY_OP

#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
};
static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");

#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {

#if HWY_HAVE_SCALAR_F16_OPERATORS
// Unwrap float16_t operands to the native type for the generated operators.
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
  using type = hwy::float16_t::Native;
};
#endif

// Wrap native results back into float16_t.
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
  using type = hwy::float16_t;
};

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_F16_TYPE

#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Lets BitCastScalar read the wrapped representation of float16_t.
template <>
struct BitCastScalarSrcCastHelper<hwy::float16_t> {
#if HWY_HAVE_SCALAR_F16_TYPE
  static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
      const hwy::float16_t& val) {
    return val.native;
  }
#else
  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
      const hwy::float16_t& val) {
    return val.bits;
  }
#endif
};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926

#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_F16_CONSTEXPR constexpr
#else
#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

// Converts binary16 to float, via the native type when available, otherwise
// by assembling the binary32 representation from the f16 bit fields.
HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  return static_cast<float>(f16);
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

  // Subnormal or zero
  if (biased_exp == 0) {
    // 2^-14 * (mantissa / 2^10), computed with exact float constants.
    const float subnormal =
        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
    return sign ? -subnormal : subnormal;
  }

  // Normalized, infinity or NaN: convert the representation directly
  // (faster than ldexp/tables).
  const uint32_t biased_exp32 =
      biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
  const uint32_t mantissa32 = mantissa << (23 - 10);
  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;

  return BitCastScalar<float>(bits32);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}

#if HWY_IS_DEBUG_BUILD && \
    (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
// If C++23 if !consteval support is available, only execute
// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
// context to avoid compilation errors.
#define HWY_F16_FROM_F32_DASSERT(condition) \
  do {                                      \
    if !consteval {                         \
      HWY_DASSERT(condition);               \
    }                                       \
  } while (0)
#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
    HWY_COMPILER_MSVC >= 1926
// If the __builtin_is_constant_evaluated() intrinsic is available,
// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
// false to avoid compilation errors if F16FromF32 is called from a
// constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT(condition)   \
  do {                                        \
    if (!__builtin_is_constant_evaluated()) { \
      HWY_DASSERT(condition);                 \
    }                                         \
  } while (0)
#else
// If C++23 if !consteval support is not available,
// the __builtin_is_constant_evaluated() intrinsic is not available,
// HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
// do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
// called from a constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT(condition) \
  do {                                      \
  } while (0)
#endif  // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
#else
// If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
// available, define HWY_F16_FROM_F32_DASSERT(condition) as
// HWY_DASSERT(condition)
#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
#endif  // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
        // HWY_COMPILER_MSVC >= 1926)

// Converts float to binary16 with round-to-nearest-even, via the native type
// when available, otherwise by bit manipulation of the binary32 encoding.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  return float16_t(static_cast<float16_t::Native>(f32));
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  constexpr uint32_t kMantissaMask = 0x7FFFFF;
  const uint32_t mantissa32 = bits32 & kMantissaMask;

  // Before shifting (truncation), round to nearest even to reduce bias. If
  // the lowest remaining mantissa bit is odd, increase the offset. Example
  // with the lowest remaining bit (left) and next lower two bits; the
  // latter, plus two more, will be truncated.
  // 0[00] + 1 = 0[01]
  // 0[01] + 1 = 0[10]
  // 0[10] + 1 = 0[11] (round down toward even)
  // 0[11] + 1 = 1[00] (round up)
  // 1[00] + 10 = 1[10]
  // 1[01] + 10 = 1[11]
  // 1[10] + 10 = C0[00] (round up toward even with C=1 carry out)
  // 1[11] + 10 = C0[01] (round up toward even with C=1 carry out)

  // If |f32| >= 2^-24, f16_ulp_bit_idx is the index of the F32 mantissa bit
  // that will be shifted down into the ULP bit of the rounded down F16 result

  // The biased F32 exponent of 2^-14 (the smallest positive normal F16 value)
  // is 113, and bit 13 of the F32 mantissa will be shifted down to into the ULP
  // bit of the rounded down F16 result if |f32| >= 2^14

  // If |f32| < 2^-24, f16_ulp_bit_idx is equal to 24 as there are 24 mantissa
  // bits (including the implied 1 bit) in the mantissa of a normal F32 value
  // and as we want to round up the mantissa if |f32| > 2^-25 && |f32| < 2^-24
  const int32_t f16_ulp_bit_idx =
      HWY_MIN(HWY_MAX(126 - static_cast<int32_t>(biased_exp32), 13), 24);
  // Odd ULP bit (with the implicit leading 1 restored) biases rounding up.
  const uint32_t odd_bit = ((mantissa32 | 0x800000u) >> f16_ulp_bit_idx) & 1;
  const uint32_t rounded =
      mantissa32 + odd_bit + (uint32_t{1} << (f16_ulp_bit_idx - 1)) - 1u;
  // Carry out of the 23-bit mantissa bumps the exponent by one.
  const bool carry = rounded >= (1u << 23);

  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;

  // Tiny or zero => zero.
  if (exp < -24) {
    // restore original sign
    return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
  }

  // If biased_exp16 would be >= 31, first check whether the input was NaN so we
  // can set the mantissa to nonzero.
  const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
  const bool overflowed = exp >= 16;
  const uint32_t biased_exp16 =
      static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
  // exp = [-24, -15] => subnormal, shift the mantissa.
  const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
  HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
  const uint32_t shifted_mantissa =
      (rounded & kMantissaMask) >> (23 - 10 + sub_exp);
  // `leading` is the formerly-implicit 1 bit, shifted into the subnormal
  // mantissa; zero for normal results.
  const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
  const uint32_t mantissa16 = is_nan       ? 0x3FF
                              : overflowed ? 0u
                                           : (leading + shifted_mantissa);

#if HWY_IS_DEBUG_BUILD
  if (exp < -14) {
    HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
    HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
  } else if (exp <= 15) {
    HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
  }
#endif

  HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  return float16_t::FromBits(narrowed);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}

// Converts double to binary16; double-rounding through float is made correct
// by first applying round-to-odd in the double representation.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return float16_t(static_cast<float16_t::Native>(f64));
#else
  // The mantissa bits of f64 are first rounded using round-to-odd rounding
  // to the nearest f64 value that has the lower 29 bits zeroed out to
  // ensure that the result is correctly rounded to a F16.

  // The F64 round-to-odd operation below will round a normal F64 value
  // (using round-to-odd rounding) to a F64 value that has 24 bits of precision.

  // It is okay if the magnitude of a denormal F64 value is rounded up in the
  // F64 round-to-odd step below as the magnitude of a denormal F64 value is
  // much smaller than 2^(-24) (the smallest positive denormal F16 value).

  // It is also okay if bit 29 of a NaN F64 value is changed by the F64
  // round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
  // discarded or ignored by the conversion of a F32 NaN value to a F16.

  // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
  // NaN value as the result of the F64 round-to-odd step will have at least one
  // mantissa bit if f64 is a NaN value.

  // The F64 round-to-odd step will ensure that the F64 to F32 conversion is
  // exact if the magnitude of the rounded F64 value (using round-to-odd
  // rounding) is between 2^(-126) (the smallest normal F32 value) and
  // HighestValue<float>() (the largest finite F32 value)

  // It is okay if the F64 to F32 conversion is inexact for F64 values that have
  // a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
  // value is much smaller than 2^(-24) (the smallest positive denormal F16
  // value).

  return F16FromF32(
      static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
          (BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
          ((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
           0x0000000020000000ULL)))));
#endif
}

// More convenient to define outside float16_t because these may use
// F32FromF16, which is defined after the struct.
// Comparisons use the native operators when available, otherwise compare after
// widening to float, which preserves ordering and NaN semantics.
HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native == rhs.native;
#else
  return F32FromF16(lhs) == F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native != rhs.native;
#else
  return F32FromF16(lhs) != F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native < rhs.native;
#else
  return F32FromF16(lhs) < F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native <= rhs.native;
#else
  return F32FromF16(lhs) <= F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native > rhs.native;
#else
  return F32FromF16(lhs) > F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native >= rhs.native;
#else
  return F32FromF16(lhs) >= F32FromF16(rhs);
#endif
}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
// partial_ordering because NaN compares unordered.
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
    float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native <=> rhs.native;
#else
  return F32FromF16(lhs) <=> F32FromF16(rhs);
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE

//------------------------------------------------------------------------------
// BF16 lane type

// Compiler supports ACLE __bf16, not necessarily with operators.

// Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
// in GCC 13 and earlier that sometimes causes BF16 constant values to be
// incorrectly loaded on AArch64, and this GCC bug on AArch64 is
// described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.

#if HWY_ARCH_ARM_A64 && \
    (HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
#endif

// x86 compiler supports __bf16, not necessarily with operators.
// Disable in debug builds due to clang miscompiles as of 2025-07-22: casting
// bf16 <-> f32 in convert_test results in 0x2525 for 1.0 instead of 0x3f80.
// Reported at https://github.com/llvm/llvm-project/issues/151692.
#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#if HWY_ARCH_X86 && defined(__SSE2__) &&                       \
    ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL &&   \
      (!HWY_IS_DEBUG_BUILD || HWY_COMPILER3_CLANG >= 220101)) || \
     HWY_COMPILER_GCC_ACTUAL >= 1300)
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
#endif
#endif  // HWY_SSE2_HAVE_SCALAR_BF16_TYPE

// Compiler supports __bf16, not necessarily with operators.
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#define HWY_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_HAVE_SCALAR_BF16_TYPE 0
#endif

#ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
// Recent enough compiler also has operators. aarch64 clang 18 hits internal
// compiler errors on bf16 ToString, hence only enable on GCC for now.
// GCC >= 13 will insert a function call to the __extendbfsf2 helper function
// for scalar conversions from __bf16 to float. This is prohibitively expensive,
// so refrain from using scalar BF16 operators on these compiler versions.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121853
#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1700)
#define HWY_HAVE_SCALAR_BF16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_BF16_OPERATORS 0
#endif
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS

#if HWY_HAVE_SCALAR_BF16_OPERATORS
#define HWY_BF16_CONSTEXPR constexpr
#else
#define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#endif

// bfloat16 wrapper, mirroring float16_t above: backed by the compiler's
// native __bf16 when available, otherwise by the raw uint16_t bit pattern.
struct alignas(2) bfloat16_t {
#if HWY_HAVE_SCALAR_BF16_TYPE
  using Native = __bf16;
#elif HWY_IDE
  using Native = uint16_t;
#endif

  union {
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
    // Accessed via NativeLaneType, and used directly if
    // HWY_HAVE_SCALAR_BF16_OPERATORS.
    Native native;
#endif
    // Only accessed via NativeLaneType or U16LaneType.
    uint16_t bits;
  };

  // Default init and copying
  bfloat16_t() noexcept = default;
  constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
  constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
  bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
  bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;

  // Only enable implicit conversions if we have a native type.
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
  MOZ_IMPLICIT constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
  constexpr operator Native() const noexcept { return native; }
#endif

#if HWY_HAVE_SCALAR_BF16_TYPE
  // Constructs from a raw bfloat16 bit pattern.
  static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
    return bfloat16_t(BitCastScalar<Native>(bits));
  }
#else

 private:
  // Tag to select the bits-initializing constructor without colliding with
  // the arithmetic constructors.
  struct BF16FromU16BitsTag {};
  constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
      : bits(u16_bits) {}

 public:
  static constexpr bfloat16_t FromBits(uint16_t bits) {
    return bfloat16_t(BF16FromU16BitsTag(), bits);
  }
#endif

  // When backed by a native type, ensure the wrapper behaves like the native
  // type by forwarding all operators. Unfortunately it seems difficult to reuse
  // this code in a base class, so it is repeated here from float16_t.
#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
  // Implicit construction from anything implicitly convertible to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
                                      IsConvertible<T, Native>()>* = nullptr>
  constexpr bfloat16_t(T&& arg) noexcept(
      noexcept(static_cast<Native>(DeclVal<T>())))
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  // Explicit construction from anything only static_cast-able to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
                                      !IsConvertible<T, Native>() &&
                                      IsStaticCastable<T, Native>()>* = nullptr>
  explicit constexpr bfloat16_t(T&& arg) noexcept(
      noexcept(static_cast<Native>(DeclVal<T>())))
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
    native = arg;
    return *this;
  }

  // pre-decrement operator (--x)
  HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
    native = static_cast<Native>(native - Native{1});
    return *this;
  }

  // post-decrement operator (x--)
  HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
    bfloat16_t result = *this;
    native = static_cast<Native>(native - Native{1});
    return result;
  }

  // pre-increment operator (++x)
  HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
    native = static_cast<Native>(native + Native{1});
    return *this;
  }

  // post-increment operator (x++)
  HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
    bfloat16_t result = *this;
    native = static_cast<Native>(native + Native{1});
    return result;
  }

  constexpr bfloat16_t operator-() const noexcept {
    return bfloat16_t(static_cast<Native>(-native));
  }
  constexpr bfloat16_t operator+() const noexcept { return *this; }

  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
  // Generates four members per op: bf16 (op) bf16, bf16 (op) other, and the
  // two corresponding compound assignments.
#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func)                     \
  constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept {       \
    return bfloat16_t(static_cast<Native>(native op rhs.native));            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_BF16(T),                                  \
            typename UnwrappedT =                                            \
                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
            typename RawResultT =                                            \
                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
            typename ResultT =                                               \
                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
  }                                                                          \
  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(                          \
      const hwy::bfloat16_t& rhs) noexcept {                                 \
    native = static_cast<Native>(native op rhs.native);                      \
    return *this;                                                            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_BF16(T),                                  \
            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
            HWY_IF_ASSIGNABLE(                                               \
                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept(   \
      noexcept(                                                              \
          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
    native = static_cast<Native>(native op rhs);                             \
    return *this;                                                            \
  }
  HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
  HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
  HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
  HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_BFLOAT16_BINARY_OP

#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS
};
static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");

#pragma pack(pop)

#if HWY_HAVE_SCALAR_BF16_TYPE
namespace detail {

#if HWY_HAVE_SCALAR_BF16_OPERATORS
// Unwrap bfloat16_t operands to the native type for the generated operators.
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
  using type = hwy::bfloat16_t::Native;
};
#endif

// Wrap native results back into bfloat16_t.
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
  using type = hwy::bfloat16_t;
};

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_BF16_TYPE

#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Lets BitCastScalar read the wrapped representation of bfloat16_t.
template <>
struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
#if HWY_HAVE_SCALAR_BF16_TYPE
  static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
      const hwy::bfloat16_t& val) {
    return val.native;
  }
#else
  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
      const hwy::bfloat16_t& val) {
    return val.bits;
  }
#endif
};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926

HWY_API HWY_BF16_CONSTEXPR float
F32FromBF16(bfloat16_t bf) { 1974 #if HWY_HAVE_SCALAR_BF16_OPERATORS 1975 return static_cast<float>(bf); 1976 #else 1977 return BitCastScalar<float>(static_cast<uint32_t>( 1978 static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16)); 1979 #endif 1980 } 1981 1982 namespace detail { 1983 1984 // Returns the increment to add to the bits of a finite F32 value to round a 1985 // finite F32 to the nearest BF16 value 1986 static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr( 1987 const uint32_t f32_bits) { 1988 return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u) 1989 ? (0x7FFFu + ((f32_bits >> 16) & 1u)) 1990 : 0u); 1991 } 1992 1993 // If f32_bits is the bit representation of a NaN F32 value, make sure that 1994 // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16 1995 // values and to prevent NaN F32 values from being converted to an infinite 1996 // BF16 value 1997 static HWY_INLINE constexpr uint32_t BF16BitsIfSNAN(uint32_t f32_bits) { 1998 return ((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) ? (uint32_t{1} << 6) : 0; 1999 } 2000 2001 // Converts f32_bits (which is the bits of a F32 value) to BF16 bits, 2002 // rounded to the nearest F16 value 2003 static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits( 2004 const uint32_t f32_bits) { 2005 return static_cast<uint16_t>( 2006 BF16BitsIfSNAN(f32_bits) | 2007 ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16)); 2008 } 2009 2010 } // namespace detail 2011 2012 HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) { 2013 // The rounding mode is not specified in the C++ standard, so ignore 2014 // `HWY_HAVE_SCALAR_BF16_OPERATORS` and only use our round to nearest. 
2015 return bfloat16_t::FromBits( 2016 detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f))); 2017 } 2018 2019 HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) { 2020 // The mantissa bits of f64 are first rounded using round-to-odd rounding 2021 // to the nearest f64 value that has the lower 38 bits zeroed out to 2022 // ensure that the result is correctly rounded to a BF16. 2023 2024 // The F64 round-to-odd operation below will round a normal F64 value 2025 // (using round-to-odd rounding) to a F64 value that has 15 bits of precision. 2026 2027 // It is okay if the magnitude of a denormal F64 value is rounded up in the 2028 // F64 round-to-odd step below as the magnitude of a denormal F64 value is 2029 // much smaller than 2^(-133) (the smallest positive denormal BF16 value). 2030 2031 // It is also okay if bit 38 of a NaN F64 value is changed by the F64 2032 // round-to-odd step below as the lower 16 bits of a F32 NaN value are usually 2033 // discarded or ignored by the conversion of a F32 NaN value to a BF16. 2034 2035 // If f64 is a NaN value, the result of the F64 round-to-odd step will be a 2036 // NaN value as the result of the F64 round-to-odd step will have at least one 2037 // mantissa bit if f64 is a NaN value. 2038 2039 // The F64 round-to-odd step below will ensure that the F64 to F32 conversion 2040 // is exact if the magnitude of the rounded F64 value (using round-to-odd 2041 // rounding) is between 2^(-135) (one-fourth of the smallest positive denormal 2042 // BF16 value) and HighestValue<float>() (the largest finite F32 value). 2043 2044 // If |f64| is less than 2^(-135), the magnitude of the result of the F64 to 2045 // F32 conversion is guaranteed to be less than or equal to 2^(-135), which 2046 // ensures that the F32 to BF16 conversion is correctly rounded, even if the 2047 // conversion of a rounded F64 value whose magnitude is less than 2^(-135) 2048 // to a F32 is inexact. 
2049 2050 return BF16FromF32( 2051 static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>( 2052 (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) | 2053 ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) & 2054 0x0000004000000000ULL))))); 2055 } 2056 2057 // More convenient to define outside bfloat16_t because these may use 2058 // F32FromBF16, which is defined after the struct. 2059 2060 HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs, 2061 bfloat16_t rhs) noexcept { 2062 #if HWY_HAVE_SCALAR_BF16_OPERATORS 2063 return lhs.native == rhs.native; 2064 #else 2065 return F32FromBF16(lhs) == F32FromBF16(rhs); 2066 #endif 2067 } 2068 2069 HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs, 2070 bfloat16_t rhs) noexcept { 2071 #if HWY_HAVE_SCALAR_BF16_OPERATORS 2072 return lhs.native != rhs.native; 2073 #else 2074 return F32FromBF16(lhs) != F32FromBF16(rhs); 2075 #endif 2076 } 2077 HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs, 2078 bfloat16_t rhs) noexcept { 2079 #if HWY_HAVE_SCALAR_BF16_OPERATORS 2080 return lhs.native < rhs.native; 2081 #else 2082 return F32FromBF16(lhs) < F32FromBF16(rhs); 2083 #endif 2084 } 2085 HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs, 2086 bfloat16_t rhs) noexcept { 2087 #if HWY_HAVE_SCALAR_BF16_OPERATORS 2088 return lhs.native <= rhs.native; 2089 #else 2090 return F32FromBF16(lhs) <= F32FromBF16(rhs); 2091 #endif 2092 } 2093 HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs, 2094 bfloat16_t rhs) noexcept { 2095 #if HWY_HAVE_SCALAR_BF16_OPERATORS 2096 return lhs.native > rhs.native; 2097 #else 2098 return F32FromBF16(lhs) > F32FromBF16(rhs); 2099 #endif 2100 } 2101 HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs, 2102 bfloat16_t rhs) noexcept { 2103 #if HWY_HAVE_SCALAR_BF16_OPERATORS 2104 return lhs.native >= rhs.native; 2105 #else 2106 return F32FromBF16(lhs) >= F32FromBF16(rhs); 2107 #endif 2108 } 2109 #if HWY_HAVE_CXX20_THREE_WAY_COMPARE 2110 HWY_BF16_CONSTEXPR inline 
std::partial_ordering operator<=>(
    bfloat16_t lhs, bfloat16_t rhs) noexcept {
  // partial_ordering because NaN compares unordered.
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native <=> rhs.native;
#else
  return F32FromBF16(lhs) <=> F32FromBF16(rhs);
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE

//------------------------------------------------------------------------------
// Type relations

namespace detail {

// Per-lane-type trait table: same-size Unsigned/Signed/Float aliases, the
// double-width Wide and half-width Narrow types, plus category flags.
// Members are only defined where such a type exists (e.g. int64_t has no
// Wide because there is no int128 lane type here).
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Wide = uint128_t;
  using Narrow = uint32_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint128_t> {
  using Unsigned = uint128_t;
  using Narrow = uint64_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};

// Maps a size in bytes to the unsigned/signed/float type of that size.
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
template <>
struct TypeFromSize<16> {
  using Unsigned = uint128_t;
};

}  // namespace detail

// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

// Avoid confusion with SizeTag where the parameter is a lane size.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>;  // integer
using FloatTag = SizeTag<0x200>;
using SpecialTag = SizeTag<0x300>;

// Dispatch tag for a type's category: the flag sum yields 0 (unsigned),
// 1 (signed int), 2 (float) or 3 (bf16), shifted into the tag value.
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag()
    -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
  return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>();
}

// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;

template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}

//------------------------------------------------------------------------------
// Type traits

template <typename T>
HWY_API constexpr bool IsFloat3264() {
  return IsSameEither<RemoveCvRef<T>, float, double>();
}

template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
  // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
  return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
}

// Detects signedness by value comparison; specialized below for types where
// that expression does not compile or would give the wrong answer.
template <typename T>
HWY_API constexpr bool IsSigned() {
  return static_cast<T>(0) > static_cast<T>(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<hwy::uint128_t>() {
  return false;
}
template <>
constexpr bool IsSigned<hwy::K64V64>() {
  return false;
}
template <>
constexpr bool IsSigned<hwy::K32V32>() {
  return false;
}

template <typename T>
HWY_API constexpr bool IsUnsigned() {
  return IsInteger<T>() && !IsSigned<T>();
}

// Maps non-lane integer types (e.g. plain `long`) to the fixed-width lane
// type of the same size and signedness; lane types pass through unchanged.
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
struct MakeLaneTypeIfIntegerT {
  using type = T;
};

template <typename T>
struct MakeLaneTypeIfIntegerT<T, true> {
  using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
                       UnsignedFromSize<sizeof(T)>>;
};

template <typename T>
using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;

// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
  static_assert(IsInteger<T>(), "Only for integer types");
  using TU = UnsignedFromSize<sizeof(T)>;
  // All-ones for unsigned; all-ones shifted right by one for signed.
  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
                                      : static_cast<TU>(~TU(0)));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
  static_assert(IsInteger<T>(), "Only for integer types");
  // -1 - max avoids signed overflow (two's complement min = -max - 1).
  return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
                       : static_cast<T>(0);
}

// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
  return LimitsMin<T>();
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0xFF7Fu});  // -1.1111111 x 2^127
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
  return float16_t::FromBits(uint16_t{0xFBFFu});  // -1.1111111111 x 2^15
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
  return -3.402823466e+38F;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
  return -1.7976931348623158e+308;
}

template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
  return LimitsMax<T>();
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0x7F7Fu});  // 1.1111111 x 2^127
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
  return float16_t::FromBits(uint16_t{0x7BFFu});  // 1.1111111111 x 2^15
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
  return 3.402823466e+38F;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
  return 1.7976931348623158e+308;
}

// Difference between 1.0 and the next representable value. Equal to
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
  return 1;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0x3C00u});  // 0.0078125
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
  return float16_t::FromBits(uint16_t{0x1400u});  // 0.0009765625
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
  return 1.192092896e-7f;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
  return 2.2204460492503131e-16;
}

// Returns width in bits of the mantissa field in IEEE binary16/32/64.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<bfloat16_t>() {
  return 7;
}
template <>
constexpr int MantissaBits<float16_t>() {
  return 10;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}

// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}

// Returns bitmask of the sign bit in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}

// Returns bitmask of the exponent field in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
  // All bits above the mantissa, minus the sign bit.
  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
         static_cast<MakeUnsigned<T>>(~SignMask<T>());
}

// Returns bitmask of the mantissa field in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}

// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0x4300u});  // 1.0 x 2^7
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
  return float16_t::FromBits(uint16_t{0x6400u});  // 1.0 x 2^10
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}

// Returns width in bits of the exponent field in IEEE binary16/32/64.
template <typename T>
constexpr int ExponentBits() {
  // Exponent := remaining bits after deducting sign and mantissa.
  return 8 * sizeof(T) - 1 - MantissaBits<T>();
}

// Returns largest value of the biased exponent field in IEEE binary16/32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}

namespace detail {

// Tag-dispatched: +/-infinity for float types, the finite extreme otherwise.
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
NegativeInfOrLowestValue(hwy::FloatTag /* tag */) {
  return BitCastScalar<T>(
      static_cast<MakeUnsigned<T>>(SignMask<T>() | ExponentMask<T>()));
}

template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
NegativeInfOrLowestValue(hwy::NonFloatTag /* tag */) {
  return LowestValue<T>();
}

template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
PositiveInfOrHighestValue(hwy::FloatTag /* tag */) {
  return BitCastScalar<T>(ExponentMask<T>());
}

template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
PositiveInfOrHighestValue(hwy::NonFloatTag /* tag */) {
  return HighestValue<T>();
}

}  // namespace detail

template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T NegativeInfOrLowestValue() {
  return detail::NegativeInfOrLowestValue<T>(IsFloatTag<T>());
}

template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T PositiveInfOrHighestValue() {
  return detail::PositiveInfOrHighestValue<T>(IsFloatTag<T>());
}

//------------------------------------------------------------------------------
// Additional F16/BF16 operators

#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS

// Generates `scalar op wrapper` (wrapper on the right-hand side); the result
// type follows the native promotion of the underlying arithmetic.
#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2)                       \
  template <                                                                  \
      typename T1,                                                            \
      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() ||                      \
                    hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr,          \
      typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
    return static_cast<ResultT>(a op b.native);                               \
  }

// Generates `scalar op= wrapper` compound assignment.
#define HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(op, assign_op, T2)                 \
  template <typename T1,                                                   \
            hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() ||             \
                          hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
            typename ResultT =                                             \
                decltype(DeclVal<T1&>() assign_op DeclVal<T2::Native>())>  \
  static HWY_INLINE constexpr ResultT operator assign_op(T1& a,            \
                                                         T2 b) noexcept {  \
    return (a assign_op b.native);                                         \
  }

// Generates both orders of a comparison between wrapper and scalar.
#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1)         \
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1)                             \
  template <                                                                  \
      typename T2,                                                            \
      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() ||                      \
                    hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr,          \
      typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
    return static_cast<ResultT>(a.native op b);                               \
  }

#if HWY_HAVE_SCALAR_F16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
#endif
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

#if HWY_HAVE_SCALAR_BF16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
#endif
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS

#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
#undef HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP
#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP

#endif  // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS

//------------------------------------------------------------------------------
// Type conversions (after IsSpecialFloat)

// Loads a possibly-unaligned-to-4 F16 from memory and widens it to F32.
HWY_API float F32FromF16Mem(const void* ptr) {
  float16_t f16;
  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
  return F32FromF16(f16);
}

// Loads a possibly-unaligned-to-4 BF16 from memory and widens it to F32.
HWY_API float F32FromBF16Mem(const void* ptr) {
  bfloat16_t bf;
  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
  return F32FromBF16(bf);
}

#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
#else
#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
#endif

namespace detail {

// Second stage of ConvertScalarTo: converts the (possibly widened) value to
// the requested destination. Tag selects a plain static_cast (SizeTag<0>),
// narrowing to f16 (FloatTag) or narrowing to bf16 (SpecialTag).
template <class TTo, class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TTo ConvertScalarToResult(
    hwy::SizeTag<0> /*conv_to_tag*/, TFrom in) {
  return static_cast<TTo>(static_cast<TFrom>(in));
}

template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, float in) {
  return F16FromF32(in);
}

template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, double in) {
  return F16FromF64(in);
}

template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, float in) {
  return BF16FromF32(in);
}

template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, double in) {
  return BF16FromF64(in);
}

template
<class TFrom, HWY_IF_BF16(TFrom)>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR float
ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
  return F32FromBF16(in);
}

template <class TFrom, HWY_IF_F16(TFrom)>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR float
ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
  return F32FromF16(in);
}

// float passes through as float; double is kept as double so that narrowing
// happens only once, in ConvertScalarToResult.
template <class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr auto
ConvertScalarSpecialFloatToF32(hwy::FloatTag /*conv_from_tag*/, TFrom in)
    -> hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float> {
  return static_cast<
      hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float>>(
      in);
}

// No special-float involved: identity.
template <class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TFrom
ConvertScalarSpecialFloatToF32(hwy::SizeTag<0> /*conv_from_tag*/, TFrom in) {
  return static_cast<TFrom>(in);
}

}  // namespace detail

// Converts a scalar of any lane type to any other, routing [b]float16 through
// F32/F64 as needed. The inner tag selects the source widening, the outer tag
// selects the destination narrowing.
template <typename TTo, typename TFrom>
HWY_API constexpr TTo ConvertScalarTo(TFrom in) {
  return detail::ConvertScalarToResult<TTo>(
      hwy::SizeTag<
          (!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
           hwy::IsSpecialFloat<TTo>())
              ? (hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>() ? 0x300
                                                                  : 0x200)
              : 0>(),
      detail::ConvertScalarSpecialFloatToF32(
          hwy::SizeTag<
              (!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
               (hwy::IsSpecialFloat<TFrom>() || hwy::IsSpecialFloat<TTo>()))
                  ? (hwy::IsSpecialFloat<TFrom>() ? 0x300 : 0x200)
                  : 0>(),
          static_cast<TFrom&&>(in)));
}

//------------------------------------------------------------------------------
// Helper functions

template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
#if HWY_CXX_LANG >= 201703L
  HWY_DASSERT(b != T2{0});
#endif
  return (a + b - 1) / b;
}

// Works for any non-zero `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  return DivCeil(what, align) * align;
}

// Works for any `align`; if a power of two, compiler emits AND.
constexpr inline size_t RoundDownTo(size_t what, size_t align) {
  return what - (what % align);
}

namespace detail {

// T is unsigned or T is signed and (val >> shift_amt) is an arithmetic right
// shift
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
                                        int shift_amt) {
  return static_cast<T>(val >> shift_amt);
}

// T is signed and (val >> shift_amt) is a non-arithmetic right shift
// Emulates sign extension: shift the complement of negative values so the
// vacated high bits come back as ones after the final complement.
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
                                        int shift_amt) {
  using TU = MakeUnsigned<MakeLaneTypeIfInteger<T>>;
  return static_cast<T>(
      (val < 0) ? static_cast<TU>(
                      ~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
                : static_cast<TU>(static_cast<TU>(val) >> shift_amt));
}

}  // namespace detail

// If T is an signed integer type, ScalarShr is guaranteed to perform an
// arithmetic right shift

// Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
// perform a logical right shift
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
  using NonCvRefT = RemoveCvRef<T>;
  // Tag is 0x100 (SignedTag) only if the implementation's native >> on this
  // signed type is not arithmetic; otherwise use the plain shift.
  return detail::ScalarShr(
      hwy::SizeTag<((IsSigned<NonCvRefT>() &&
                     (LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
                         static_cast<NonCvRefT>(-1))
                        ? 0x100
                        : 0)>(),
      static_cast<NonCvRefT>(val), shift_amt);
}

// Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanForward64(&index, x);
  return index;
#else   // HWY_ARCH_X86_64
  // _BitScanForward64 not available
  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
  unsigned long index;  // NOLINT
  if (lsb == 0) {
    uint32_t msb = static_cast<uint32_t>(x >> 32u);
    _BitScanForward(&index, msb);
    return 32 + index;
  } else {
    _BitScanForward(&index, lsb);
    return index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}

// Undefined
// Returns the number of leading zero bits. Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanReverse(&index, x);
  // _BitScanReverse returns the bit index of the MS1 bit; convert to a count
  // of zero bits above it.
  return 31 - index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}

// 64-bit variant of Num0BitsAboveMS1Bit_Nonzero32. Undefined for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanReverse64(&index, x);
  return 63 - index;
#else   // HWY_ARCH_X86_64
  // _BitScanReverse64 not available: scan the upper half first; if it is
  // zero, the MS1 bit must be in the lower half.
  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
  unsigned long index;  // NOLINT
  if (msb == 0) {
    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
    _BitScanReverse(&index, lsb);
    return 63 - index;
  } else {
    _BitScanReverse(&index, msb);
    return 31 - index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}

// Returns the number of 1-bits in x for 1/2/4-byte integer types. Signed
// inputs are first converted to the same-sized unsigned type, so the sign bit
// counts as one bit.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
          HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API size_t PopCount(T x) {
  uint32_t u32_x = static_cast<uint32_t>(
      static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));

#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
  return static_cast<size_t>(__builtin_popcountl(u32_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return static_cast<size_t>(_mm_popcnt_u32(u32_x));
#else
  // Portable bit-parallel popcount: sum adjacent 1-, 2-, then 4-bit fields,
  // then accumulate the byte sums.
  u32_x -= ((u32_x >> 1) & 0x55555555u);
  u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
  u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
  u32_x += (u32_x >> 8);
  u32_x += (u32_x >> 16);
  return static_cast<size_t>(u32_x & 0x3Fu);
#endif
}

// 8-byte variant of PopCount above.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
          HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
HWY_API size_t PopCount(T x) {
  uint64_t u64_x = static_cast<uint64_t>(
      static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));

#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
  return static_cast<size_t>(__builtin_popcountll(u64_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(u64_x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
         _mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
#else
  // Portable bit-parallel popcount, 64-bit version of the 32-bit fallback.
  u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
  u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
           (u64_x & 0x3333333333333333ULL));
  u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
  u64_x += (u64_x >> 8);
  u64_x += (u64_x >> 16);
  u64_x += (u64_x >> 32);
  return static_cast<size_t>(u64_x & 0x7Fu);
#endif
}

// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
// Returns floor(log2(x)). Precondition: x >= 1 (the recursion only
// terminates once x reaches 1).
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}

// Returns ceil(log2(x)). Precondition: x >= 1.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}

// Adds `increment` to `t`; for non-special floats this is a plain addition.
template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
  return t + static_cast<T>(increment);
}

// f16/bf16: widen both operands to float, add, then narrow back.
template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
  return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
                            ConvertScalarTo<float>(increment));
}

// Integers: adds with two's-complement wraparound, avoiding signed-overflow
// UB by computing in an unsigned type.
template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
  using TU = MakeUnsigned<T>;
  // Sub-int types would promote to int, not unsigned, which would trigger
  // warnings, so first promote to the largest unsigned type. Due to
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
  // until fixed in 9.3, we use built-in types rather than uint64_t.
  return static_cast<T>(static_cast<TU>(
      static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
                                      static_cast<unsigned long long>(n)) &
      uint64_t{hwy::LimitsMax<TU>()}));
}

#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_mul128)
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication. Returns the lower 64 bits of a * b and
// stores the upper 64 bits in *upper.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __uint128_t product =
      static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
  *upper = static_cast<uint64_t>(product >> 64);
  return static_cast<uint64_t>(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  // Portable schoolbook multiplication via four 32x32 -> 64 partial products.
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
  return (t << 32) | (lo_lo & kLo32);
#endif
}

// Signed variant: the lower 64 bits equal the unsigned product's; only the
// upper half needs correction for negative operands.
HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __int128_t product = static_cast<__int128_t>(a) * static_cast<__int128_t>(b);
  *upper = static_cast<int64_t>(product >> 64);
  return static_cast<int64_t>(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _mul128(a, b, upper);
#else
  uint64_t unsigned_upper;
  const int64_t lower = static_cast<int64_t>(Mul128(
      static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
  // signed_hi = unsigned_hi - (a < 0 ? b : 0) - (b < 0 ? a : 0);
  // ScalarShr(x, 63) yields an all-ones mask exactly when x is negative.
  *upper = static_cast<int64_t>(
      unsigned_upper -
      (static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
      (static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
  return lower;
#endif
}

// Precomputation for fast n / divisor and n % divisor, where n is a variable
// and divisor is unchanging but unknown at compile-time.
class Divisor {
 public:
  explicit Divisor(uint32_t divisor) : divisor_(divisor) {
    // For divisor <= 1 the member defaults (mul_ = 1, shifts = 0) make
    // Divide the identity. NOTE(review): divisor == 0 thus returns n rather
    // than trapping — confirm callers never pass 0.
    if (divisor <= 1) return;

    // len = ceil(log2(divisor)) - 1; the magic multiplier is the (rounded-up)
    // fixed-point reciprocal 2^(32+len) / divisor.
    const uint32_t len =
        static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
    const uint64_t u_hi = (2ULL << len) - divisor;
    const uint32_t q = Truncate((u_hi << 32) / divisor);

    mul_ = q + 1;
    shift1_ = 1;
    shift2_ = len;
  }

  uint32_t GetDivisor() const { return divisor_; }

  // Returns n / divisor_.
  uint32_t Divide(uint32_t n) const {
    const uint64_t mul = mul_;
    const uint32_t t = Truncate((mul * n) >> 32);
    return (t + ((n - t) >> shift1_)) >> shift2_;
  }

  // Returns n % divisor_.
  uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }

 private:
  // Returns the lower 32 bits of x.
  static uint32_t Truncate(uint64_t x) {
    return static_cast<uint32_t>(x & 0xFFFFFFFFu);
  }

  uint32_t divisor_;
  uint32_t mul_ = 1;
  uint32_t shift1_ = 0;
  uint32_t shift2_ = 0;
};

#ifndef HWY_HAVE_DIV128  // allow override
// Exclude clang-cl because it calls __divti3 from clang_rt.builtins-x86_64,
// which is not linked in.
#if (HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64) || \
    (defined(__SIZEOF_INT128__) && !HWY_COMPILER_CLANGCL)
#define HWY_HAVE_DIV128 1
#else
#define HWY_HAVE_DIV128 0
#endif
#endif  // HWY_HAVE_DIV128

// Divisor64 can precompute the multiplicative inverse.
#if HWY_HAVE_DIV128

#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
#pragma intrinsic(_udiv128)
#pragma intrinsic(__umulh)
#endif

// As above, but for 64-bit divisors: more expensive to compute and initialize.
class Divisor64 {
 public:
  explicit Divisor64(uint64_t divisor) : divisor_(divisor) {
    // As in Divisor: divisor <= 1 keeps the identity defaults.
    if (divisor <= 1) return;

    const uint64_t len =
        static_cast<uint64_t>(63 - Num0BitsAboveMS1Bit_Nonzero64(divisor - 1));
    const uint64_t u_hi = (2ULL << len) - divisor;
    const uint64_t q = Div128(u_hi, divisor);

    mul_ = q + 1;
    shift1_ = 1;
    shift2_ = len;
  }

  uint64_t GetDivisor() const { return divisor_; }

  // Returns n / divisor_.
  uint64_t Divide(uint64_t n) const {
    const uint64_t t = MulHigh(mul_, n);
    return (t + ((n - t) >> shift1_)) >> shift2_;
  }

  // Returns n % divisor_.
  uint64_t Remainder(uint64_t n) const { return n - (Divide(n) * divisor_); }

 private:
  uint64_t divisor_;

  // Returns (hi << 64) / div.
  static uint64_t Div128(uint64_t hi, uint64_t div) {
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
    unsigned __int64 remainder;  // unused
    return _udiv128(hi, uint64_t{0}, div, &remainder);
#else
    using u128 = unsigned __int128;
    const u128 hi128 = static_cast<u128>(hi) << 64;
    return static_cast<uint64_t>(hi128 / static_cast<u128>(div));
#endif
  }

  // Returns the upper 64 bits of a * b.
  static uint64_t MulHigh(uint64_t a, uint64_t b) {
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
    return __umulh(a, b);
#else
    using u128 = unsigned __int128;
    const u128 a128 = static_cast<u128>(a);
    const u128 b128 = static_cast<u128>(b);
    return static_cast<uint64_t>((a128 * b128) >> 64);
#endif
  }

  uint64_t mul_ = 1;
  uint64_t shift1_ = 0;
  uint64_t shift2_ = 0;
};
#else
// No Div128 available, use built-in 64-bit division on each call.
3147 class Divisor64 { 3148 public: 3149 explicit Divisor64(uint64_t divisor) : divisor_(divisor) {} 3150 3151 uint64_t GetDivisor() const { return divisor_; } 3152 3153 uint64_t Divide(uint64_t n) const { return n / divisor_; } 3154 uint64_t Remainder(uint64_t n) const { return n % divisor_; } 3155 3156 private: 3157 uint64_t divisor_; 3158 }; 3159 #endif // HWY_HAVE_DIV128 3160 3161 namespace detail { 3162 3163 template <typename T> 3164 static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/, 3165 T val) { 3166 using TU = MakeUnsigned<T>; 3167 return BitCastScalar<T>( 3168 static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>()))); 3169 } 3170 3171 template <typename T> 3172 static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T 3173 ScalarAbs(hwy::SpecialTag /*tag*/, T val) { 3174 return ScalarAbs(hwy::FloatTag(), val); 3175 } 3176 3177 template <typename T> 3178 static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T 3179 ScalarAbs(hwy::SignedTag /*tag*/, T val) { 3180 using TU = MakeUnsigned<T>; 3181 return (val < T{0}) ? 
static_cast<T>(TU{0} - static_cast<TU>(val)) : val; 3182 } 3183 3184 template <typename T> 3185 static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T 3186 ScalarAbs(hwy::UnsignedTag /*tag*/, T val) { 3187 return val; 3188 } 3189 3190 } // namespace detail 3191 3192 template <typename T> 3193 HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) { 3194 using TVal = MakeLaneTypeIfInteger< 3195 detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>; 3196 return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val)); 3197 } 3198 3199 template <typename T> 3200 HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) { 3201 using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>; 3202 using TU = MakeUnsigned<TF>; 3203 return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>()); 3204 } 3205 3206 template <typename T> 3207 HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) { 3208 using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>; 3209 using TU = MakeUnsigned<TF>; 3210 return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) == 3211 static_cast<TU>(MaxExponentTimes2<TF>()); 3212 } 3213 3214 namespace detail { 3215 3216 template <typename T> 3217 static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite( 3218 hwy::FloatTag /*tag*/, T val) { 3219 using TU = MakeUnsigned<T>; 3220 return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>()); 3221 } 3222 3223 template <typename T> 3224 static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite( 3225 hwy::NonFloatTag /*tag*/, T /*val*/) { 3226 // Integer values are always finite 3227 return true; 3228 } 3229 3230 } // namespace detail 3231 3232 template <typename T> 3233 HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) { 3234 using TVal = MakeLaneTypeIfInteger< 3235 detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>; 3236 return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(), 3237 static_cast<TVal>(val)); 3238 } 3239 3240 
// Returns a value with the magnitude bits of `magn` and the sign bit of
// `sign`, assembled from their unsigned bit representations.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
                                                                  T sign) {
  using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
  using TU = MakeUnsigned<TF>;
  return BitCastScalar<TF>(static_cast<TU>(
      (BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
      (BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
}

// Returns whether the sign bit of `val` is set; works for integer and
// floating-point types including f16/bf16.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
  using TVal = MakeLaneTypeIfInteger<
      detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
  using TU = MakeUnsigned<TVal>;
  return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
}

// Prevents the compiler from eliding the computations that led to "output".
#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
    !defined(_SOFT_FLOAT)
// Workaround to avoid test failures on PPC if compiled with Clang:
// f32/f64 values use floating-point register constraints ("f"/"d") instead of
// the general-purpose "r" used below.
template <class T, HWY_IF_F32(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+f"(output)::"memory");
}
template <class T, HWY_IF_F64(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+d"(output)::"memory");
}
// All remaining (non-f32/f64) types go through a general-purpose register.
template <class T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+r"(output)::"memory");
}
#else
template <class T>
HWY_API void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  static std::atomic<RemoveCvRef<T>> sink;
  sink.store(output, std::memory_order_relaxed);
#else
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#endif
}
#endif

}  // namespace hwy

#endif  // HIGHWAY_HWY_BASE_H_