tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

base.h (113541B)


      1 // Copyright 2020 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef HIGHWAY_HWY_BASE_H_
     17 #define HIGHWAY_HWY_BASE_H_
     18 
     19 // Target-independent definitions.
     20 
     21 // IWYU pragma: begin_exports
     22 #include <stddef.h>
     23 #include <stdint.h>
     24 #if defined(HWY_HEADER_ONLY)
     25 #include <stdarg.h>
     26 #include <stdio.h>
     27 #endif
     28 
     29 #if !defined(HWY_NO_LIBCXX)
     30 #include <ostream>
     31 #endif
     32 
     33 #include "hwy/detect_compiler_arch.h"
     34 #include "hwy/highway_export.h"
     35 
     36 #include <mozilla/Attributes.h>
     37 
     38 // API version (https://semver.org/); keep in sync with CMakeLists.txt and
     39 // meson.build.
     40 #define HWY_MAJOR 1
     41 #define HWY_MINOR 3
     42 #define HWY_PATCH 0
     43 
     44 // True if the Highway version >= major.minor.0. Added in 1.2.0.
     45 #define HWY_VERSION_GE(major, minor) \
     46  (HWY_MAJOR > (major) || (HWY_MAJOR == (major) && HWY_MINOR >= (minor)))
     47 // True if the Highway version < major.minor.0. Added in 1.2.0.
     48 #define HWY_VERSION_LT(major, minor) \
     49  (HWY_MAJOR < (major) || (HWY_MAJOR == (major) && HWY_MINOR < (minor)))
     50 
     51 // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
     52 #if !HWY_IDE
     53 
     54 #if !defined(HWY_NO_LIBCXX)
     55 #ifndef __STDC_FORMAT_MACROS
     56 #define __STDC_FORMAT_MACROS  // before inttypes.h
     57 #endif
     58 #include <inttypes.h>
     59 #endif
     60 
     61 #endif  // !HWY_IDE
     62 
     63 #if !defined(HWY_NO_LIBCXX) || HWY_COMPILER_MSVC
     64 #include <atomic>
     65 #endif
     66 
     67 #ifndef HWY_HAVE_COMPARE_HEADER  // allow override
     68 #define HWY_HAVE_COMPARE_HEADER 0
     69 #if defined(__has_include)  // note: wrapper macro fails on Clang ~17
     70 #if __has_include(<compare>)
     71 #undef HWY_HAVE_COMPARE_HEADER
     72 #define HWY_HAVE_COMPARE_HEADER 1
     73 #endif  // __has_include
     74 #endif  // defined(__has_include)
     75 #endif  // HWY_HAVE_COMPARE_HEADER
     76 
     77 #ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE  // allow override
     78 #if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
     79    __cpp_impl_three_way_comparison >= 201907L && HWY_HAVE_COMPARE_HEADER
     80 #include <compare>
     81 #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
     82 #else
     83 #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
     84 #endif
     85 #endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
     86 
     87 // IWYU pragma: end_exports
     88 
     89 #if HWY_COMPILER_MSVC
     90 #include <string.h>  // memcpy
     91 #endif
     92 
     93 //------------------------------------------------------------------------------
     94 // Compiler-specific definitions
     95 
     96 #define HWY_STR_IMPL(macro) #macro
     97 #define HWY_STR(macro) HWY_STR_IMPL(macro)
     98 
     99 #if HWY_COMPILER_MSVC
    100 
    101 #include <intrin.h>
    102 
    103 #define HWY_FUNCTION __FUNCSIG__  // function name + template args
    104 #define HWY_RESTRICT __restrict
    105 #define HWY_INLINE __forceinline
    106 #define HWY_NOINLINE __declspec(noinline)
    107 #define HWY_FLATTEN
    108 #define HWY_NORETURN __declspec(noreturn)
    109 #define HWY_LIKELY(expr) (expr)
    110 #define HWY_UNLIKELY(expr) (expr)
    111 #define HWY_UNREACHABLE __assume(false)
    112 #define HWY_PRAGMA(tokens) __pragma(tokens)
    113 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
    114 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
    115 #define HWY_MAYBE_UNUSED
    116 #define HWY_HAS_ASSUME_ALIGNED 0
    117 #if (_MSC_VER >= 1700)
    118 #define HWY_MUST_USE_RESULT _Check_return_
    119 #else
    120 #define HWY_MUST_USE_RESULT
    121 #endif
    122 
    123 #else
    124 
    125 #define HWY_FUNCTION __PRETTY_FUNCTION__  // function name + template args
    126 #define HWY_RESTRICT __restrict__
    127 // force inlining without optimization enabled creates very inefficient code
    128 // that can cause compiler timeout
    129 #ifdef __OPTIMIZE__
    130 #define HWY_INLINE inline __attribute__((always_inline))
    131 #else
    132 #define HWY_INLINE inline
    133 #endif
    134 #define HWY_NOINLINE __attribute__((noinline))
    135 #define HWY_FLATTEN __attribute__((flatten))
    136 #define HWY_NORETURN __attribute__((noreturn))
    137 #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
    138 #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
    139 #if HWY_COMPILER_GCC || HWY_HAS_BUILTIN(__builtin_unreachable)
    140 #define HWY_UNREACHABLE __builtin_unreachable()
    141 #else
    142 #define HWY_UNREACHABLE
    143 #endif
    144 #define HWY_PRAGMA(tokens) _Pragma(#tokens)
    145 #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
    146 #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
    147 // Encountered "attribute list cannot appear here" when using the C++17
    148 // [[maybe_unused]], so only use the old style attribute for now.
    149 #define HWY_MAYBE_UNUSED __attribute__((unused))
    150 #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
    151 
    152 #endif  // !HWY_COMPILER_MSVC
    153 
    154 #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \
    155    (HWY_COMPILER_ICC && !HWY_COMPILER_ICX)
    156 // The use of __attribute__((unused)) in private class member variables triggers
    157 // a compiler warning with GCC 11 and earlier and ICC
    158 
    159 // GCC 11 and earlier and ICC also do not emit -Wunused-private-field warnings
    160 // for unused private class member variables
    161 #define HWY_MEMBER_VAR_MAYBE_UNUSED
    162 #else
    163 // Clang and ICX need __attribute__((unused)) in unused private class member
    164 // variables to suppress -Wunused-private-field warnings unless this warning is
    165 // ignored by using HWY_DIAGNOSTICS_OFF
    166 #define HWY_MEMBER_VAR_MAYBE_UNUSED HWY_MAYBE_UNUSED
    167 #endif
    168 
    169 //------------------------------------------------------------------------------
    170 // Builtin/attributes (no more #include after this point due to namespace!)
    171 
    172 namespace hwy {
    173 
    174 // Enables error-checking of format strings.
    175 #if HWY_HAS_ATTRIBUTE(__format__)
    176 #define HWY_FORMAT(idx_fmt, idx_arg) \
    177  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
    178 #else
    179 #define HWY_FORMAT(idx_fmt, idx_arg)
    180 #endif
    181 
    182 // Returns a void* pointer which the compiler then assumes is N-byte aligned.
    183 // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
    184 //
    185 // The assignment semantics are required by GCC/Clang. ICC provides an in-place
    186 // __assume_aligned, whereas MSVC's __assume appears unsuitable.
    187 #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
    188 #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
    189 #else
    190 #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
    191 #endif
    192 
    193 // Returns a pointer whose type is `type` (T*), while allowing the compiler to
    194 // assume that the untyped pointer `ptr` is aligned to a multiple of sizeof(T).
    195 #define HWY_RCAST_ALIGNED(type, ptr) \
    196  reinterpret_cast<type>(            \
    197      HWY_ASSUME_ALIGNED((ptr), alignof(hwy::RemovePtr<type>)))
    198 
    199 // Clang and GCC require attributes on each function into which SIMD intrinsics
    200 // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
    201 // automatic annotation via pragmas.
    202 #if HWY_COMPILER_ICC
    203 // As of ICC 2021.{1-9} the pragma is neither implemented nor required.
    204 #define HWY_PUSH_ATTRIBUTES(targets_str)
    205 #define HWY_POP_ATTRIBUTES
    206 #elif HWY_COMPILER_CLANG
    207 #define HWY_PUSH_ATTRIBUTES(targets_str)                                \
    208  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
    209                                  apply_to = function))
    210 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
    211 #elif HWY_COMPILER_GCC_ACTUAL
    212 #define HWY_PUSH_ATTRIBUTES(targets_str) \
    213  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
    214 #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
    215 #else
    216 #define HWY_PUSH_ATTRIBUTES(targets_str)
    217 #define HWY_POP_ATTRIBUTES
    218 #endif
    219 
    220 //------------------------------------------------------------------------------
    221 // Macros
    222 
    223 // Note: it is safe to remove `static` for users who want to use modules, but
    224 // that might be a breaking change for some users, hence we do not by default.
    225 #define HWY_API static HWY_INLINE HWY_FLATTEN
    226 
    227 #define HWY_CONCAT_IMPL(a, b) a##b
    228 #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
    229 
    230 #define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
    231 #define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
    232 
    233 #if HWY_COMPILER_GCC_ACTUAL
    234 // nielskm: GCC does not support '#pragma GCC unroll' without the factor.
    235 #define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
    236 #define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
    237 #elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
    238 #define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
    239 #define HWY_DEFAULT_UNROLL HWY_UNROLL()
    240 #else
    241 #define HWY_UNROLL(factor)
    242 #define HWY_DEFAULT_UNROLL
    243 #endif
    244 
    245 // Tell a compiler that the expression always evaluates to true.
    246 // The expression should be free from any side effects.
    247 // Some older compilers may have trouble with complex expressions, therefore
    248 // it is advisable to split multiple conditions into separate assume statements,
    249 // and manually check the generated code.
    250 // OK but could fail:
    251 //   HWY_ASSUME(x == 2 && y == 3);
    252 // Better:
    253 //   HWY_ASSUME(x == 2);
    254 //   HWY_ASSUME(y == 3);
    255 #if (HWY_CXX_LANG >= 202302L) && HWY_HAS_CPP_ATTRIBUTE(assume)
    256 #define HWY_ASSUME(expr) [[assume(expr)]]
    257 #elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
    258 #define HWY_ASSUME(expr) __assume(expr)
    259 // __builtin_assume() was added in clang 3.6.
    260 #elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
    261 #define HWY_ASSUME(expr) __builtin_assume(expr)
    262 // __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
    263 // later, so check for the compiler version directly.
    264 #elif HWY_COMPILER_GCC_ACTUAL >= 405
    265 #define HWY_ASSUME(expr) \
    266  ((expr) ? static_cast<void>(0) : __builtin_unreachable())
    267 #else
    268 #define HWY_ASSUME(expr) static_cast<void>(0)
    269 #endif
    270 
    271 // Compile-time fence to prevent undesirable code reordering. On Clang, the
    272 // typical `asm volatile("" : : : "memory")` seems to be ignored. Note that
    273 // `std::atomic_thread_fence` affects other threads, hence might generate a
    274 // barrier instruction, but this does not.
    275 #if !defined(HWY_NO_LIBCXX)
    276 #define HWY_FENCE std::atomic_signal_fence(std::memory_order_seq_cst)
    277 #elif HWY_COMPILER_GCC
    278 #define HWY_FENCE asm volatile("" : : : "memory")
    279 #else
    280 #define HWY_FENCE
    281 #endif
    282 
    283 // 4 instances of a given literal value, useful as input to LoadDup128.
    284 #define HWY_REP4(literal) literal, literal, literal, literal
    285 
    286 //------------------------------------------------------------------------------
    287 // Abort / Warn
    288 
    289 #if defined(HWY_HEADER_ONLY)
// Prints "Warn at file:line: <formatted message>" to stderr. This definition
// is only compiled in HWY_HEADER_ONLY mode; otherwise the out-of-line
// declaration in the #else branch below is used instead.
// `format` + varargs are printf-style (checked via HWY_FORMAT(3, 4)).
HWY_DLLEXPORT inline void HWY_FORMAT(3, 4)
    Warn(const char* file, int line, const char* format, ...) {
  char buf[800];  // vsnprintf bounds writes to sizeof(buf); longer messages
                  // are truncated rather than overflowing.
  va_list args;
  va_start(args, format);
  vsnprintf(buf, sizeof(buf), format, args);
  va_end(args);

  fprintf(stderr, "Warn at %s:%d: %s\n", file, line, buf);
}
    300 
// Prints "Abort at file:line: <formatted message>" to stderr, flushes, and
// terminates the program (never returns). This definition is only compiled in
// HWY_HEADER_ONLY mode; otherwise the out-of-line declaration in the #else
// branch below is used. `format` + varargs are printf-style.
HWY_DLLEXPORT HWY_NORETURN inline void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...) {
  char buf[800];  // vsnprintf truncates instead of overflowing.
  va_list args;
  va_start(args, format);
  vsnprintf(buf, sizeof(buf), format, args);
  va_end(args);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);

  // Ensure the message is visible before terminating.
  fflush(stderr);

// Now terminate the program:
#if HWY_ARCH_RISCV
  exit(1);  // trap/abort just freeze Spike.
#else
  abort();  // Compile error without this due to HWY_NORETURN.
#endif
}
    320 #else  // !HWY_HEADER_ONLY
    321 // Interfaces for custom Warn/Abort handlers.
    322 typedef void (*WarnFunc)(const char* file, int line, const char* message);
    323 
    324 typedef void (*AbortFunc)(const char* file, int line, const char* message);
    325 
    326 // Returns current Warn() handler, or nullptr if no handler was yet registered,
    327 // indicating Highway should print to stderr.
    328 // DEPRECATED because this is thread-hostile and prone to misuse (modifying the
    329 // underlying pointer through the reference).
    330 HWY_DLLEXPORT WarnFunc& GetWarnFunc();
    331 
    332 // Returns current Abort() handler, or nullptr if no handler was yet registered,
    333 // indicating Highway should print to stderr and abort.
    334 // DEPRECATED because this is thread-hostile and prone to misuse (modifying the
    335 // underlying pointer through the reference).
    336 HWY_DLLEXPORT AbortFunc& GetAbortFunc();
    337 
    338 // Sets a new Warn() handler and returns the previous handler, which is nullptr
    339 // if no previous handler was registered, and should otherwise be called from
    340 // the new handler. Thread-safe.
    341 HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func);
    342 
    343 // Sets a new Abort() handler and returns the previous handler, which is nullptr
    344 // if no previous handler was registered, and should otherwise be called from
    345 // the new handler. If all handlers return, then Highway will terminate the app.
    346 // Thread-safe.
    347 HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func);
    348 
    349 HWY_DLLEXPORT void HWY_FORMAT(3, 4)
    350    Warn(const char* file, int line, const char* format, ...);
    351 
    352 HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    353    Abort(const char* file, int line, const char* format, ...);
    354 
    355 #endif  // HWY_HEADER_ONLY
    356 
    357 #define HWY_WARN(format, ...) \
    358  ::hwy::Warn(__FILE__, __LINE__, format, ##__VA_ARGS__)
    359 
    360 #define HWY_ABORT(format, ...) \
    361  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
    362 
    363 // Always enabled.
    364 #define HWY_ASSERT_M(condition, msg)               \
    365  do {                                             \
    366    if (!(condition)) {                            \
    367      HWY_ABORT("Assert %s: %s", #condition, msg); \
    368    }                                              \
    369  } while (0)
    370 #define HWY_ASSERT(condition) HWY_ASSERT_M(condition, "")
    371 
    372 #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
    373    defined(__SANITIZE_MEMORY__)
    374 #define HWY_IS_MSAN 1
    375 #else
    376 #define HWY_IS_MSAN 0
    377 #endif
    378 
    379 #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
    380    defined(__SANITIZE_ADDRESS__)
    381 #define HWY_IS_ASAN 1
    382 #else
    383 #define HWY_IS_ASAN 0
    384 #endif
    385 
    386 #if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
    387    defined(__SANITIZE_HWADDRESS__)
    388 #define HWY_IS_HWASAN 1
    389 #else
    390 #define HWY_IS_HWASAN 0
    391 #endif
    392 
    393 #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
    394    defined(__SANITIZE_THREAD__)
    395 #define HWY_IS_TSAN 1
    396 #else
    397 #define HWY_IS_TSAN 0
    398 #endif
    399 
    400 #if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
    401    defined(UNDEFINED_BEHAVIOR_SANITIZER)
    402 #define HWY_IS_UBSAN 1
    403 #else
    404 #define HWY_IS_UBSAN 0
    405 #endif
    406 
    407 // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
    408 // You can disable MSAN by adding this attribute to the function that fails.
    409 #if HWY_IS_MSAN
    410 #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
    411 #else
    412 #define HWY_ATTR_NO_MSAN
    413 #endif
    414 
    415 #if HWY_IS_ASAN || HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN
    416 #define HWY_IS_SANITIZER 1
    417 #else
    418 #define HWY_IS_SANITIZER 0
    419 #endif
    420 
    421 // For enabling HWY_DASSERT and shortening tests in slower debug builds
    422 //
    423 // Note: `HWY_IS_UBSAN` is specifically excluded from engaging debug
    424 // builds. This is in service of Chromium's `-fsanitize=array-bounds` by
    425 // default, where we don't want Highway to unconditionally build in
    426 // debug mode.
    427 //
    428 // See also:
    429 // https://docs.google.com/document/d/1eCtY4AZF-SiFHxhIYWzEytdIx3C24de7ccD6Y5Gn2H8/edit?tab=t.9zkn85hr82ms#heading=h.efcshvfql42c
    430 #if !defined(HWY_IS_DEBUG_BUILD)
    431 // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
    432 // MSVC defines NDEBUG (if not, could instead check _DEBUG).
    433 #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) ||         \
    434    (HWY_IS_ASAN || (HWY_IS_SANITIZER && !HWY_IS_UBSAN)) || \
    435    defined(__clang_analyzer__)
    436 #define HWY_IS_DEBUG_BUILD 1
    437 #else
    438 #define HWY_IS_DEBUG_BUILD 0
    439 #endif
    440 #endif  // HWY_IS_DEBUG_BUILD
    441 
    442 #if HWY_IS_DEBUG_BUILD
    443 #define HWY_DASSERT_M(condition, msg) HWY_ASSERT_M(condition, msg)
    444 #define HWY_DASSERT(condition) HWY_ASSERT_M(condition, "")
    445 #else
    446 #define HWY_DASSERT_M(condition, msg) \
    447  do {                                \
    448  } while (0)
    449 #define HWY_DASSERT(condition) \
    450  do {                         \
    451  } while (0)
    452 #endif
    453 
    454 //------------------------------------------------------------------------------
    455 // CopyBytes / ZeroBytes
    456 
    457 #if HWY_COMPILER_MSVC
    458 #pragma intrinsic(memcpy)
    459 #pragma intrinsic(memset)
    460 #endif
    461 
// Copies exactly `kBytes` (a compile-time constant) from `from` to `to`.
// Unlike assignment, this is well-defined even when From/To differ, and the
// compiler can lower the fixed-size memcpy to inline loads/stores.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
#if HWY_COMPILER_MSVC
  // memcpy is declared intrinsic above via #pragma intrinsic(memcpy).
  memcpy(to, from, kBytes);
#else
  __builtin_memcpy(to, from, kBytes);
#endif
}
    470 
// Runtime-size overload of CopyBytes: copies `num_of_bytes_to_copy` bytes
// from `from` to `to`. The regions must not overlap (HWY_RESTRICT).
HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
                       size_t num_of_bytes_to_copy) {
#if HWY_COMPILER_MSVC
  // memcpy is declared intrinsic above via #pragma intrinsic(memcpy).
  memcpy(to, from, num_of_bytes_to_copy);
#else
  __builtin_memcpy(to, from, num_of_bytes_to_copy);
#endif
}
    479 
    480 // Same as CopyBytes, but for same-sized objects; avoids a size argument.
    481 template <typename From, typename To>
    482 HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
    483  static_assert(sizeof(From) == sizeof(To), "");
    484  CopyBytes<sizeof(From)>(from, to);
    485 }
    486 
// Sets exactly `kBytes` (a compile-time constant) at `to` to zero. The fixed
// size lets the compiler lower the memset to inline stores.
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
  // memset is declared intrinsic above via #pragma intrinsic(memset).
  memset(to, 0, kBytes);
#else
  __builtin_memset(to, 0, kBytes);
#endif
}
    495 
// Runtime-size overload of ZeroBytes: zeroes `num_bytes` bytes at `to`.
HWY_API void ZeroBytes(void* to, size_t num_bytes) {
#if HWY_COMPILER_MSVC
  // memset is declared intrinsic above via #pragma intrinsic(memset).
  memset(to, 0, num_bytes);
#else
  __builtin_memset(to, 0, num_bytes);
#endif
}
    503 
    504 //------------------------------------------------------------------------------
    505 // kMaxVectorSize (undocumented, pending removal)
    506 
// Per-target compile-time vector size constant, in bytes. Note: the RVV
// comment below states this is not actually an upper bound there.
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;  // RVV
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;  // 128-bit
#endif
    516 
    517 //------------------------------------------------------------------------------
    518 // Alignment
    519 
    520 // Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
    521 // should be allocated dynamically via aligned_allocator.h because Lanes() may
    522 // exceed the stack size.
    523 #if HWY_ARCH_X86
    524 #define HWY_ALIGN_MAX alignas(64)
    525 #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    526    __riscv_v_intrinsic >= 11000
    527 #define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
    528 #else
    529 #define HWY_ALIGN_MAX alignas(16)
    530 #endif
    531 
    532 //------------------------------------------------------------------------------
    533 // Lane types
    534 
    535 // hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
    536 // BitCastScalar to be implemented before the implementations of the
    537 // hwy::float16_t and hwy::bfloat16_t types
    538 struct float16_t;
    539 struct bfloat16_t;
    540 
    541 using float32_t = float;
    542 using float64_t = double;
    543 
    544 #pragma pack(push, 1)
    545 
// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align
// it: https://reviews.llvm.org/D86310
// Within #pragma pack(push, 1); alignas(16) still governs overall alignment.
struct alignas(16) uint128_t {
  uint64_t lo;  // least-significant half; little-endian layout
  uint64_t hi;  // most-significant half (compared first by operator< below)
};
    552 
// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
// Layout mirrors uint128_t: value occupies the low half, key the high half.
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;    // sole field considered by the comparison operators below
};
    559 
// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
  uint32_t value;  // little-endian layout
  uint32_t key;    // sole field considered by the comparison operators below
};
    566 
    567 #pragma pack(pop)
    568 
    569 static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
    570                                              const uint128_t& b) {
    571  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
    572 }
    573 // Required for std::greater.
    574 static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
    575                                              const uint128_t& b) {
    576  return b < a;
    577 }
    578 static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
    579                                               const uint128_t& b) {
    580  return a.lo == b.lo && a.hi == b.hi;
    581 }
    582 
#if !defined(HWY_NO_LIBCXX)
// Prints both halves as "[hi=..,lo=..]", e.g. for test failure messages.
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
                                                        const uint128_t& n) {
  return os << "[hi=" << n.hi << ",lo=" << n.lo << "]";
}
#endif
    589 
    590 static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
    591                                              const K64V64& b) {
    592  return a.key < b.key;
    593 }
    594 // Required for std::greater.
    595 static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
    596                                              const K64V64& b) {
    597  return b < a;
    598 }
    599 static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
    600                                               const K64V64& b) {
    601  return a.key == b.key;
    602 }
    603 
#if !defined(HWY_NO_LIBCXX)
// Prints key and value as "[k=..,v=..]", e.g. for test failure messages.
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
                                                        const K64V64& n) {
  return os << "[k=" << n.key << ",v=" << n.value << "]";
}
#endif
    610 
    611 static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
    612                                              const K32V32& b) {
    613  return a.key < b.key;
    614 }
    615 // Required for std::greater.
    616 static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
    617                                              const K32V32& b) {
    618  return b < a;
    619 }
    620 static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
    621                                               const K32V32& b) {
    622  return a.key == b.key;
    623 }
    624 
#if !defined(HWY_NO_LIBCXX)
// Prints key and value as "[k=..,v=..]", e.g. for test failure messages.
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
                                                        const K32V32& n) {
  return os << "[k=" << n.key << ",v=" << n.value << "]";
}
#endif
    631 
    632 //------------------------------------------------------------------------------
    633 // Controlling overload resolution (SFINAE)
    634 
// Hand-rolled equivalent of std::enable_if. The primary template lacks a
// `type` member, so EnableIf<false> is a substitution failure (SFINAE).
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

// `void` when Condition is true; otherwise removes the overload via SFINAE.
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
    644 
// Hand-rolled equivalent of std::is_same: value is 1 iff T and U are the
// exact same type (no reference/cv stripping; see RemoveCvRef for that).
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};

// Returns whether T and U are exactly the same type.
template <typename T, typename U>
HWY_API constexpr bool IsSame() {
  return IsSameT<T, U>::value;
}

// Returns whether T matches either of U1 or U2
template <typename T, typename U1, typename U2>
HWY_API constexpr bool IsSameEither() {
  return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
}
    665 
// Hand-rolled equivalent of std::conditional: selects Then or Else by
// Condition. The primary template handles true; the specialization, false.
template <bool Condition, typename Then, typename Else>
struct IfT {
  using type = Then;
};

template <class Then, class Else>
struct IfT<false, Then, Else> {
  using type = Else;
};

// Alias: Then if Condition is true, otherwise Else.
template <bool Condition, typename Then, typename Else>
using If = typename IfT<Condition, Then, Else>::type;
    678 
// Hand-rolled equivalent of std::is_const: detects a top-level const
// qualifier only (not const pointees or references-to-const).
template <typename T>
struct IsConstT {
  enum { value = 0 };
};

template <typename T>
struct IsConstT<const T> {
  enum { value = 1 };
};

// Returns whether T has a top-level const qualifier.
template <typename T>
HWY_API constexpr bool IsConst() {
  return IsConstT<T>::value;
}

// Hand-rolled equivalent of std::remove_const: strips top-level const.
template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
    705 
// Hand-rolled equivalent of std::remove_volatile: strips top-level volatile.
template <class T>
struct RemoveVolatileT {
  using type = T;
};
template <class T>
struct RemoveVolatileT<volatile T> {
  using type = T;
};

template <class T>
using RemoveVolatile = typename RemoveVolatileT<T>::type;

// Hand-rolled equivalent of std::remove_reference: strips T& and T&&.
template <class T>
struct RemoveRefT {
  using type = T;
};
template <class T>
struct RemoveRefT<T&> {
  using type = T;
};
template <class T>
struct RemoveRefT<T&&> {
  using type = T;
};

template <class T>
using RemoveRef = typename RemoveRefT<T>::type;

// Hand-rolled equivalent of C++20 std::remove_cvref_t: strips the reference
// first, then const and volatile qualifiers.
template <class T>
using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
    736 
// Hand-rolled equivalent of std::remove_pointer: strips one level of pointer,
// including pointers to cv-qualified pointees. Non-pointers pass through.
template <class T>
struct RemovePtrT {
  using type = T;
};
template <class T>
struct RemovePtrT<T*> {
  using type = T;
};
template <class T>
struct RemovePtrT<const T*> {
  using type = T;
};
template <class T>
struct RemovePtrT<volatile T*> {
  using type = T;
};
template <class T>
struct RemovePtrT<const volatile T*> {
  using type = T;
};

// Used by HWY_RCAST_ALIGNED above to obtain the pointee's alignment.
template <class T>
using RemovePtr = typename RemovePtrT<T>::type;
    760 
    761 // Insert into template/function arguments to enable this overload only for
    762 // vectors of exactly, at most (LE), or more than (GT) this many bytes.
    763 //
    764 // As an example, checking for a total size of 16 bytes will match both
    765 // Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
    766 #define HWY_IF_V_SIZE(T, kN, bytes) \
    767  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
    768 #define HWY_IF_V_SIZE_LE(T, kN, bytes) \
    769  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
    770 #define HWY_IF_V_SIZE_GT(T, kN, bytes) \
    771  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
    772 
    773 #define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
    774 #define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
    775 #define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
    776 
    777 #define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
    778 #define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
    779 #define HWY_IF_SIGNED(T)                                    \
    780  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
    781                !hwy::IsSpecialFloat<T>()>* = nullptr
    782 #define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
    783 #define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
    784 #define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
    785 #define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
    786 #define HWY_IF_SPECIAL_FLOAT(T) \
    787  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
    788 #define HWY_IF_NOT_SPECIAL_FLOAT(T) \
    789  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
    790 #define HWY_IF_FLOAT_OR_SPECIAL(T) \
    791  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
    792 #define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
    793  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
    794 #define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
    795 
    796 #define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
    797 #define HWY_IF_NOT_T_SIZE(T, bytes) \
    798  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
    799 // bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
    800 // too similar. If you want the opposite of this (2 or 4 bytes), ask for those
    801 // bits explicitly (0x14) instead of attempting to 'negate' 0x102.
    802 #define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
    803  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
    804 #define HWY_IF_T_SIZE_LE(T, bytes) \
    805  hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
    806 #define HWY_IF_T_SIZE_GT(T, bytes) \
    807  hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
    808 
    809 #define HWY_IF_SAME(T, expected) \
    810  hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
    811 #define HWY_IF_NOT_SAME(T, expected) \
    812  hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
    813 
    814 // One of two expected types
    815 #define HWY_IF_SAME2(T, expected1, expected2)                            \
    816  hwy::EnableIf<                                                         \
    817      hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
    818      nullptr
    819 
    820 #define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
    821 #define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
    822 #define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
    823 #define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)
    824 
    825 #define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
    826 #define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
    827 #define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
    828 #define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)
    829 
    830 #define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
    831 #define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)
    832 
    833 #define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
    834 #define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)
    835 
    836 #define HWY_IF_F32(T) HWY_IF_SAME(T, float)
    837 #define HWY_IF_F64(T) HWY_IF_SAME(T, double)
    838 
    839 // Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
    840 // overloads.
    841 #define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
    842 #define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
    843 #define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
    844 #define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)
    845 
    846 #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
    847  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
    848 
// Empty struct used as a size tag type: it carries the compile-time constant
// N only in its type, enabling overload selection by size (e.g. the
// SizeTag<0> vs SizeTag<1> probes in the SFINAE traits below).
template <size_t N>
struct SizeTag {};
    852 
// Computes the type returned by hwy::DeclVal<T>(): T&& when an rvalue
// reference to T can be formed, otherwise T itself (e.g. for T = void, where
// forming U&& is ill-formed and SFINAE removes the first overload).
template <class T>
class DeclValT {
private:
 // Preferred overload (exact match for the int argument 0); participates in
 // overload resolution only when the default argument type U&& is valid.
 template <class U, class URef = U&&>
 static URef TryAddRValRef(int);
 // Fallback overload, chosen when U&& cannot be formed.
 template <class U, class Arg>
 static U TryAddRValRef(Arg);

public:
 using type = decltype(TryAddRValRef<T>(0));
 // Always 1. DeclVal()'s static_assert negates this dependent constant so
 // that the assert only fires if DeclVal's body is actually instantiated
 // (i.e. DeclVal is odr-used in an evaluated context).
 enum { kDisableDeclValEvaluation = 1 };
};
    865 
// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
// expression of a decltype specifier.

// hwy::DeclVal<T>() does not require that T have a public default constructor

// Equivalent of std::declval<T>(): yields a value of type DeclValT<T>::type
// (typically T&&) for use in unevaluated expressions. The body never executes;
// the dependent static_assert fires if the function is instantiated in an
// evaluated context.
template <class T>
HWY_API typename DeclValT<T>::type DeclVal() noexcept {
 static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
               "DeclVal() cannot be used in an evaluated context");
}
    875 
// Detects C-style array types. Primary template: not an array.
template <class T>
struct IsArrayT {
 enum { value = 0 };
};

// Arrays of unknown bound, e.g. T[].
template <class T>
struct IsArrayT<T[]> {
 enum { value = 1 };
};

// Arrays of known bound, e.g. T[N].
template <class T, size_t N>
struct IsArrayT<T[N]> {
 enum { value = 1 };
};

// Equivalent of std::is_array<T>::value.
template <class T>
static constexpr bool IsArray() {
 return IsArrayT<T>::value;
}
    895 
    896 #if HWY_COMPILER_MSVC
    897 HWY_DIAGNOSTICS(push)
    898 HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
    899 #endif
    900 
// Equivalent of std::is_convertible: value is nonzero if From is implicitly
// convertible to To.
template <class From, class To>
class IsConvertibleT {
private:
 // Callable only when the argument is implicitly convertible to T (which is
 // instantiated with U = To below).
 template <class T>
 static hwy::SizeTag<1> TestFuncWithToArg(T);

 // Well-formed (returning SizeTag<1>) only if DeclVal<T>() can be passed to
 // TestFuncWithToArg<U>, i.e. if T converts implicitly to U.
 template <class T, class U>
 static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
     DeclVal<T>()))
 TryConvTest(int);

 // Fallback chosen by overload resolution when the conversion is ill-formed.
 template <class T, class U, class Arg>
 static hwy::SizeTag<0> TryConvTest(Arg);

public:
 enum {
   // Special case: cv-void -> cv-void counts as convertible. Otherwise, To
   // must not be an array type, must be usable as a return type (checked via
   // the DeclVal/const probes), and the TryConvTest SFINAE probe must select
   // the SizeTag<1> overload.
   value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
            IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
           (!IsArray<To>() &&
            (IsSame<To, decltype(DeclVal<To>())>() ||
             !IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
            IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
 };
};
    925 
    926 #if HWY_COMPILER_MSVC
    927 HWY_DIAGNOSTICS(pop)
    928 #endif
    929 
    930 template <class From, class To>
    931 HWY_API constexpr bool IsConvertible() {
    932  return IsConvertibleT<From, To>::value;
    933 }
    934 
// Trait: value is nonzero if static_cast<To>(DeclVal<From>()) is well-formed.
template <class From, class To>
class IsStaticCastableT {
private:
 // Selected (returning SizeTag<1>) when the static_cast in the default
 // template argument compiles; removed by SFINAE otherwise.
 template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
 static hwy::SizeTag<1> TryStaticCastTest(int);

 // Fallback chosen when the cast is ill-formed.
 template <class T, class U, class Arg>
 static hwy::SizeTag<0> TryStaticCastTest(Arg);

public:
 enum {
   value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
 };
};
    949 
    950 template <class From, class To>
    951 static constexpr bool IsStaticCastable() {
    952  return IsStaticCastableT<From, To>::value;
    953 }
    954 
    955 #define HWY_IF_CASTABLE(From, To) \
    956  hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr
    957 
    958 #define HWY_IF_OP_CASTABLE(op, T, Native) \
    959  HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)
    960 
// Trait: value is nonzero if DeclVal<T>() = DeclVal<From>() is well-formed.
template <class T, class From>
class IsAssignableT {
private:
 // Selected (returning SizeTag<1>) when the assignment expression in the
 // default template argument compiles; removed by SFINAE otherwise.
 template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
 static hwy::SizeTag<1> TryAssignTest(int);

 // Fallback chosen when the assignment is ill-formed.
 template <class T1, class T2, class Arg>
 static hwy::SizeTag<0> TryAssignTest(Arg);

public:
 enum {
   value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
 };
};
    975 
    976 template <class T, class From>
    977 static constexpr bool IsAssignable() {
    978  return IsAssignableT<T, From>::value;
    979 }
    980 
    981 #define HWY_IF_ASSIGNABLE(T, From) \
    982  hwy::EnableIf<IsAssignable<T, From>()>* = nullptr
    983 
    984 // ----------------------------------------------------------------------------
    985 // IsSpecialFloat
    986 
    987 // These types are often special-cased and not supported in all ops.
    988 template <typename T>
    989 HWY_API constexpr bool IsSpecialFloat() {
    990  return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
    991 }
    992 
    993 // -----------------------------------------------------------------------------
    994 // IsIntegerLaneType and IsInteger
    995 
// True only for the eight fixed-width integer lane types
// ([u]int{8,16,32,64}_t) usable as vector lanes; false for every other type
// (bool, char, size_t, ... -- see IsInteger for the broader check).
template <class T>
HWY_API constexpr bool IsIntegerLaneType() {
 return false;
}
// Explicit specializations for each supported lane type.
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
 return true;
}
   1032 
namespace detail {

// True for any non-cv-qualified built-in integer type. The primary template
// covers the fixed-width lane types plus wchar_t and the pointer-sized/size
// types; explicit specializations below cover the remaining standard integer
// and character types (which may or may not alias the fixed-width typedefs,
// depending on the platform).
template <class T>
static HWY_INLINE constexpr bool IsNonCvInteger() {
 // NOTE: Do not add a IsNonCvInteger<wchar_t>() specialization below as it is
 // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
 // with the /Zc:wchar_t- option.
 return IsIntegerLaneType<T>() || IsSame<T, wchar_t>() ||
        IsSameEither<T, size_t, ptrdiff_t>() ||
        IsSameEither<T, intptr_t, uintptr_t>();
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<bool>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<signed char>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned char>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<short>() {  // NOLINT
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned short>() {  // NOLINT
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<int>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<long>() {  // NOLINT
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long>() {  // NOLINT
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<long long>() {  // NOLINT
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long long>() {  // NOLINT
 return true;
}
// char8_t only exists in C++20 mode.
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char8_t>() {
 return true;
}
#endif
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char16_t>() {
 return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char32_t>() {
 return true;
}

}  // namespace detail
   1108 
   1109 template <class T>
   1110 HWY_API constexpr bool IsInteger() {
   1111  return detail::IsNonCvInteger<RemoveCvRef<T>>();
   1112 }
   1113 
   1114 // -----------------------------------------------------------------------------
   1115 // BitCastScalar
   1116 
   1117 #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
   1118 #define HWY_BITCASTSCALAR_CONSTEXPR constexpr
   1119 #else
   1120 #define HWY_BITCASTSCALAR_CONSTEXPR
   1121 #endif
   1122 
   1123 #if __cpp_constexpr >= 201304L
   1124 #define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
   1125 #else
   1126 #define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
   1127 #endif
   1128 
// BitCastScalar<To>(val): reinterprets the bytes of val as To (like C++20
// std::bit_cast); constexpr when the compiler provides __builtin_bit_cast.
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Identity by default; specialized below for hwy::float16_t so that the
// active union member is read directly (which keeps the cast constexpr).
template <class From>
struct BitCastScalarSrcCastHelper {
 static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
   return val;
 }
};

#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
// Workaround for Clang 9 constexpr __builtin_bit_cast bug
// Integer->integer casts of equal size can use static_cast instead.
template <class To, class From,
         hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
                       hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
 static_assert(sizeof(To) == sizeof(From),
               "sizeof(To) == sizeof(From) must be true");
 return static_cast<To>(val);
}

// All other type combinations still go through __builtin_bit_cast.
template <class To, class From,
         hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
                         hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
 return __builtin_bit_cast(To, val);
}
#endif  // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000

}  // namespace detail

// Overload for non-special-float destination types.
template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
 // If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
 // const typename From::Native& or const uint16_t& using
 // detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
 // allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
 // if To is not a pointer type, union type, or a struct/class containing a
 // pointer, union, or reference subobject
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
 return detail::BuiltinBitCastScalar<To>(
     detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
         val));
#else
 return __builtin_bit_cast(
     To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
             val));
#endif
}
// Overload for special-float (float16_t/bfloat16_t) destination types.
template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
 // If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
 // to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
 // as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
 // be constexpr if the __builtin_bit_cast intrinsic is available.
 return To::FromBits(BitCastScalar<uint16_t>(val));
}
#else
// Fallback without __builtin_bit_cast: byte copy via CopySameSize. Not
// actually constexpr (HWY_BITCASTSCALAR_CONSTEXPR expands to nothing here).
template <class To, class From>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
 To result;
 CopySameSize(&val, &result);
 return result;
}
#endif
   1196 
   1197 //------------------------------------------------------------------------------
   1198 // F16 lane type
   1199 
   1200 #pragma pack(push, 1)
   1201 
   1202 #ifndef HWY_NEON_HAVE_F16C  // allow override
   1203 // Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
   1204 // included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
   1205 // __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
   1206 #if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
   1207    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
   1208    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
   1209 #define HWY_NEON_HAVE_F16C 1
   1210 #else
   1211 #define HWY_NEON_HAVE_F16C 0
   1212 #endif
   1213 #endif  // HWY_NEON_HAVE_F16C
   1214 
   1215 // RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
   1216 // HWY_HAVE_FLOAT16.
   1217 #if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
   1218 #define HWY_RVV_HAVE_F16_VEC 1
   1219 #else
   1220 #define HWY_RVV_HAVE_F16_VEC 0
   1221 #endif
   1222 
   1223 // x86 compiler supports _Float16, not necessarily with operators.
   1224 // Avoid clang-cl because it lacks __extendhfsf2.
   1225 #if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
   1226    ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) ||      \
   1227     HWY_COMPILER_GCC_ACTUAL >= 1200)
   1228 #define HWY_SSE2_HAVE_F16_TYPE 1
   1229 #else
   1230 #define HWY_SSE2_HAVE_F16_TYPE 0
   1231 #endif
   1232 
   1233 #ifndef HWY_HAVE_SCALAR_F16_TYPE  // allow override
   1234 // Compiler supports _Float16, not necessarily with operators.
   1235 #if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || \
   1236    __SPIRV_DEVICE__
   1237 #define HWY_HAVE_SCALAR_F16_TYPE 1
   1238 #else
   1239 #define HWY_HAVE_SCALAR_F16_TYPE 0
   1240 #endif
   1241 #endif  // HWY_HAVE_SCALAR_F16_TYPE
   1242 
   1243 #ifndef HWY_HAVE_SCALAR_F16_OPERATORS
   1244 // Recent enough compiler also has operators.
   1245 #if HWY_HAVE_SCALAR_F16_TYPE &&                                       \
   1246    (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
   1247     (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL &&          \
   1248      !defined(_WIN32)) ||                                            \
   1249     (HWY_ARCH_ARM &&                                                 \
   1250      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
   1251 #define HWY_HAVE_SCALAR_F16_OPERATORS 1
   1252 #else
   1253 #define HWY_HAVE_SCALAR_F16_OPERATORS 0
   1254 #endif
   1255 #endif  // HWY_HAVE_SCALAR_F16_OPERATORS
   1256 
namespace detail {

// Maps an arithmetic-operator operand type to the type used in the underlying
// native computation. The primary template (selected for special float
// wrappers) is intentionally empty so that, absent a specialization providing
// `type`, SFINAE removes the corresponding operator overload.
template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
struct SpecialFloatUnwrapArithOpOperandT {};

// Non-special types pass through unchanged.
template <class T, class TVal>
struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
 using type = T;
};

template <class T>
using SpecialFloatUnwrapArithOpOperand =
   typename SpecialFloatUnwrapArithOpOperandT<T>::type;

// Maps a native special-float type back to its hwy wrapper; identity for all
// other types. Specialized below once hwy::float16_t is defined.
template <class T, class TVal = RemoveCvRef<T>>
struct NativeSpecialFloatToWrapperT {
 using type = T;
};

template <class T>
using NativeSpecialFloatToWrapper =
   typename NativeSpecialFloatToWrapperT<T>::type;

}  // namespace detail
   1281 
   1282 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
   1283 // by concatenating base type and bits. We use a wrapper class instead of a
   1284 // typedef to the native type to ensure that the same symbols, e.g. for VQSort,
   1285 // are generated regardless of F16 support; see #1684.
struct alignas(2) float16_t {
#if HWY_HAVE_SCALAR_F16_TYPE
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || __SPIRV_DEVICE__
 using Native = _Float16;
#elif HWY_NEON_HAVE_F16C
 using Native = __fp16;
#else
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
#endif
#elif HWY_IDE
 using Native = uint16_t;
#endif  // HWY_HAVE_SCALAR_F16_TYPE

 // Storage: the native F16 type (when available) and its raw bit pattern
 // share the same two bytes.
 union {
#if HWY_HAVE_SCALAR_F16_TYPE || HWY_IDE
   // Accessed via NativeLaneType, and used directly if
   // HWY_HAVE_SCALAR_F16_OPERATORS.
   Native native;
#endif
   // Only accessed via NativeLaneType or U16LaneType.
   uint16_t bits;
 };

 // Default init and copying.
 float16_t() noexcept = default;
 constexpr float16_t(const float16_t&) noexcept = default;
 constexpr float16_t(float16_t&&) noexcept = default;
 float16_t& operator=(const float16_t&) noexcept = default;
 float16_t& operator=(float16_t&&) noexcept = default;

#if HWY_HAVE_SCALAR_F16_TYPE
 // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
 // float16_t(intrinsic()), but user code expects implicit conversions.
 MOZ_IMPLICIT constexpr float16_t(Native arg) noexcept : native(arg) {}
 constexpr operator Native() const noexcept { return native; }
#endif

#if HWY_HAVE_SCALAR_F16_TYPE
 // Reinterprets the given bit pattern (no numeric conversion).
 static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
   return float16_t(BitCastScalar<Native>(bits));
 }
#else

// No native F16 type: construct from the raw bit pattern via a private
// tag-dispatched constructor so FromBits can stay constexpr.
private:
 struct F16FromU16BitsTag {};
 constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
     : bits(u16_bits) {}

public:
 static constexpr float16_t FromBits(uint16_t bits) {
   return float16_t(F16FromU16BitsTag(), bits);
 }
#endif

 // When backed by a native type, ensure the wrapper behaves like the native
 // type by forwarding all operators. Unfortunately it seems difficult to reuse
 // this code in a base class, so we repeat it in float16_t.
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
 // Implicit construction from any non-float16_t type that is implicitly
 // convertible to Native.
 template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                     IsConvertible<T, Native>()>* = nullptr>
 MOZ_IMPLICIT constexpr float16_t(T&& arg) noexcept
     : native(static_cast<Native>(static_cast<T&&>(arg))) {}

 // Explicit construction from types that require a static_cast to Native.
 template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                     !IsConvertible<T, Native>() &&
                                     IsStaticCastable<T, Native>()>* = nullptr>
 explicit constexpr float16_t(T&& arg) noexcept
     : native(static_cast<Native>(static_cast<T&&>(arg))) {}

 // pre-decrement operator (--x)
 HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
   native = static_cast<Native>(native - Native{1});
   return *this;
 }

 // post-decrement operator (x--)
 HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
   float16_t result = *this;
   native = static_cast<Native>(native - Native{1});
   return result;
 }

 // pre-increment operator (++x)
 HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
   native = static_cast<Native>(native + Native{1});
   return *this;
 }

 // post-increment operator (x++)
 HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
   float16_t result = *this;
   native = static_cast<Native>(native + Native{1});
   return result;
 }

 constexpr float16_t operator-() const noexcept {
   return float16_t(static_cast<Native>(-native));
 }
 constexpr float16_t operator+() const noexcept { return *this; }

 // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
 // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func)                      \
 constexpr float16_t op_func(const float16_t& rhs) const noexcept {         \
   return float16_t(static_cast<Native>(native op rhs.native));             \
 }                                                                          \
 template <typename T, HWY_IF_NOT_F16(T),                                   \
           typename UnwrappedT =                                            \
               detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
           typename RawResultT =                                            \
               decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
           typename ResultT =                                               \
               detail::NativeSpecialFloatToWrapper<RawResultT>,             \
           HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
 constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
     static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
   return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
 }                                                                          \
 HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(                           \
     const hwy::float16_t& rhs) noexcept {                                  \
   native = static_cast<Native>(native op rhs.native);                      \
   return *this;                                                            \
 }                                                                          \
 template <typename T, HWY_IF_NOT_F16(T),                                   \
           HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
           HWY_IF_ASSIGNABLE(                                               \
               Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
 HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept(    \
     noexcept(                                                              \
         static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
   native = static_cast<Native>(native op rhs);                             \
   return *this;                                                            \
 }

 HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
 HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
 HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
 HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_FLOAT16_BINARY_OP

#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
};
static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
   1429 
#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {

#if HWY_HAVE_SCALAR_F16_OPERATORS
// In mixed float16_t-vs-other arithmetic, operate on the native type.
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
 using type = hwy::float16_t::Native;
};
#endif

// Results of native F16 arithmetic are wrapped back into hwy::float16_t.
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
 using type = hwy::float16_t;
};

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_F16_TYPE
   1447 
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Lets BitCastScalar read the active union member of float16_t directly,
// keeping the bit cast constexpr (see BitCastScalar above).
template <>
struct BitCastScalarSrcCastHelper<hwy::float16_t> {
#if HWY_HAVE_SCALAR_F16_TYPE
 static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
     const hwy::float16_t& val) {
   return val.native;
 }
#else
 // Without a native F16 type, only the `bits` member exists/is active.
 static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
     const hwy::float16_t& val) {
   return val.bits;
 }
#endif
};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
   1468 
   1469 #if HWY_HAVE_SCALAR_F16_OPERATORS
   1470 #define HWY_F16_CONSTEXPR constexpr
   1471 #else
   1472 #define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
   1473 #endif  // HWY_HAVE_SCALAR_F16_OPERATORS
   1474 
// Widens an IEEE binary16 value to float. With native F16 operator support
// this is a single static_cast; otherwise the conversion is performed
// manually on the bit representation. Note the two #if blocks are mutually
// exclusive, so exactly one return path is compiled.
HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
 return static_cast<float>(f16);
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
 // Decompose the 16-bit pattern: 1 sign bit, 5 exponent bits, 10 mantissa
 // bits.
 const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
 const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
 const uint32_t mantissa = bits16 & 0x3FF;

 // Subnormal or zero
 if (biased_exp == 0) {
   // Value is 2^-14 * (mantissa / 2^10); both factors are exact in float.
   const float subnormal =
       (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
   return sign ? -subnormal : subnormal;
 }

 // Normalized, infinity or NaN: convert the representation directly
 // (faster than ldexp/tables).
 // Re-bias the exponent from 15 (F16) to 127 (F32); exponent 31 (inf/NaN)
 // maps to the all-ones F32 exponent 0xFF.
 const uint32_t biased_exp32 =
     biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
 const uint32_t mantissa32 = mantissa << (23 - 10);
 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;

 return BitCastScalar<float>(bits32);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}
   1502 
   1503 #if HWY_IS_DEBUG_BUILD && \
   1504    (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
   1505 #if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
   1506 // If C++23 if !consteval support is available, only execute
   1507 // HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
   1508 // context to avoid compilation errors.
   1509 #define HWY_F16_FROM_F32_DASSERT(condition) \
   1510  do {                                      \
   1511    if !consteval {                         \
   1512      HWY_DASSERT(condition);               \
   1513    }                                       \
   1514  } while (0)
   1515 #elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
   1516    HWY_COMPILER_MSVC >= 1926
   1517 // If the __builtin_is_constant_evaluated() intrinsic is available,
   1518 // only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
   1519 // false to avoid compilation errors if F16FromF32 is called from a
   1520 // constant-evaluated context.
   1521 #define HWY_F16_FROM_F32_DASSERT(condition)   \
   1522  do {                                        \
   1523    if (!__builtin_is_constant_evaluated()) { \
   1524      HWY_DASSERT(condition);                 \
   1525    }                                         \
   1526  } while (0)
   1527 #else
   1528 // If C++23 if !consteval support is not available,
   1529 // the __builtin_is_constant_evaluated() intrinsic is not available,
   1530 // HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
   1531 // do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
   1532 // called from a constant-evaluated context.
   1533 #define HWY_F16_FROM_F32_DASSERT(condition) \
   1534  do {                                      \
   1535  } while (0)
   1536 #endif  // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
   1537 #else
   1538 // If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
   1539 // available, define HWY_F16_FROM_F32_DASSERT(condition) as
   1540 // HWY_DASSERT(condition)
   1541 #define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
   1542 #endif  // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
   1543        // HWY_COMPILER_MSVC >= 1926)
   1544 
// Converts `f32` to the nearest float16_t using round-to-nearest-even,
// handling subnormal, infinite and NaN inputs. Uses the compiler's native
// conversion when scalar F16 operators are available, else bit manipulation.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  return float16_t(static_cast<float16_t::Native>(f32));
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  // Decompose the F32 into sign, biased exponent and mantissa fields.
  const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  constexpr uint32_t kMantissaMask = 0x7FFFFF;
  const uint32_t mantissa32 = bits32 & kMantissaMask;

  // Before shifting (truncation), round to nearest even to reduce bias. If
  // the lowest remaining mantissa bit is odd, increase the offset. Example
  // with the lowest remaining bit (left) and next lower two bits; the
  // latter, plus two more, will be truncated.
  // 0[00] +  1 =  0[01]
  // 0[01] +  1 =  0[10]
  // 0[10] +  1 =  0[11]  (round down toward even)
  // 0[11] +  1 =  1[00]  (round up)
  // 1[00] + 10 =  1[10]
  // 1[01] + 10 =  1[11]
  // 1[10] + 10 = C0[00]  (round up toward even with C=1 carry out)
  // 1[11] + 10 = C0[01]  (round up toward even with C=1 carry out)

  // If |f32| >= 2^-24, f16_ulp_bit_idx is the index of the F32 mantissa bit
  // that will be shifted down into the ULP bit of the rounded down F16 result

  // The biased F32 exponent of 2^-14 (the smallest positive normal F16 value)
  // is 113, and bit 13 of the F32 mantissa will be shifted down into the ULP
  // bit of the rounded-down F16 result if |f32| >= 2^-14

  // If |f32| < 2^-24, f16_ulp_bit_idx is equal to 24 as there are 24 mantissa
  // bits (including the implied 1 bit) in the mantissa of a normal F32 value
  // and as we want to round up the mantissa if |f32| > 2^-25 && |f32| < 2^-24
  const int32_t f16_ulp_bit_idx =
      HWY_MIN(HWY_MAX(126 - static_cast<int32_t>(biased_exp32), 13), 24);
  // odd_bit: the bit that lands in the F16 ULP position (with the implicit
  // leading 1 restored); it selects between the two round-to-even offsets.
  const uint32_t odd_bit = ((mantissa32 | 0x800000u) >> f16_ulp_bit_idx) & 1;
  const uint32_t rounded =
      mantissa32 + odd_bit + (uint32_t{1} << (f16_ulp_bit_idx - 1)) - 1u;
  // carry: rounding overflowed the 23-bit mantissa field, bumping the exponent.
  const bool carry = rounded >= (1u << 23);

  // Unbiased F16 exponent (F32 bias is 127), adjusted for rounding carry.
  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;

  // Tiny or zero => zero.
  if (exp < -24) {
    // restore original sign
    return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
  }

  // If biased_exp16 would be >= 31, first check whether the input was NaN so we
  // can set the mantissa to nonzero.
  const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
  const bool overflowed = exp >= 16;
  const uint32_t biased_exp16 =
      static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
  // exp = [-24, -15] => subnormal, shift the mantissa.
  const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
  HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
  const uint32_t shifted_mantissa =
      (rounded & kMantissaMask) >> (23 - 10 + sub_exp);
  // leading: the formerly-implicit leading 1, shifted into the subnormal
  // mantissa (zero for normal results).
  const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
  // NaN keeps a nonzero mantissa; overflow becomes infinity (mantissa 0).
  const uint32_t mantissa16 = is_nan       ? 0x3FF
                              : overflowed ? 0u
                                           : (leading + shifted_mantissa);

#if HWY_IS_DEBUG_BUILD
  // Sanity checks: subnormal results have a zero biased exponent, normal
  // (non-overflow) results have an in-range one.
  if (exp < -14) {
    HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
    HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
  } else if (exp <= 15) {
    HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
  }
#endif

  HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  return float16_t::FromBits(narrowed);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}
   1627 
// Converts `f64` to the nearest float16_t. Without native operators this is
// done as a round-to-odd pre-rounding of the F64 mantissa followed by
// F16FromF32, which makes the double rounding F64 -> F32 -> F16 correct; the
// comments below justify each case.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return float16_t(static_cast<float16_t::Native>(f64));
#else
  // The mantissa bits of f64 are first rounded using round-to-odd rounding
  // to the nearest f64 value that has the lower 29 bits zeroed out to
  // ensure that the result is correctly rounded to a F16.

  // The F64 round-to-odd operation below will round a normal F64 value
  // (using round-to-odd rounding) to a F64 value that has 24 bits of precision.

  // It is okay if the magnitude of a denormal F64 value is rounded up in the
  // F64 round-to-odd step below as the magnitude of a denormal F64 value is
  // much smaller than 2^(-24) (the smallest positive denormal F16 value).

  // It is also okay if bit 29 of a NaN F64 value is changed by the F64
  // round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
  // discarded or ignored by the conversion of a F32 NaN value to a F16.

  // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
  // NaN value as the result of the F64 round-to-odd step will have at least one
  // mantissa bit if f64 is a NaN value.

  // The F64 round-to-odd step will ensure that the F64 to F32 conversion is
  // exact if the magnitude of the rounded F64 value (using round-to-odd
  // rounding) is between 2^(-126) (the smallest normal F32 value) and
  // HighestValue<float>() (the largest finite F32 value)

  // It is okay if the F64 to F32 conversion is inexact for F64 values that have
  // a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
  // value is much smaller than 2^(-24) (the smallest positive denormal F16
  // value).

  // Clear the low 29 mantissa bits, and set bit 29 if any of them (or bit 29
  // itself) was set — i.e. "round to odd" via carry into bit 29.
  return F16FromF32(
      static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
          (BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
          ((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
           0x0000000020000000ULL)))));
#endif
}
   1668 
   1669 // More convenient to define outside float16_t because these may use
   1670 // F32FromF16, which is defined after the struct.
// Comparison operators: use the native F16 operators when available, else
// widen both operands to F32 and compare (the widening preserves ordering;
// NaN operands compare false either way, per IEEE-754 semantics of float).
HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native == rhs.native;
#else
  return F32FromF16(lhs) == F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native != rhs.native;
#else
  return F32FromF16(lhs) != F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native < rhs.native;
#else
  return F32FromF16(lhs) < F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native <= rhs.native;
#else
  return F32FromF16(lhs) <= F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native > rhs.native;
#else
  return F32FromF16(lhs) > F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native >= rhs.native;
#else
  return F32FromF16(lhs) >= F32FromF16(rhs);
#endif
}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
// C++20 three-way comparison. Returns std::partial_ordering because NaN
// operands are unordered.
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
    float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native <=> rhs.native;
#else
  return F32FromF16(lhs) <=> F32FromF16(rhs);
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
   1727 
//------------------------------------------------------------------------------
// BF16 lane type

// Compiler supports ACLE __bf16, not necessarily with operators.

// Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
// in GCC 13 and earlier that sometimes causes BF16 constant values to be
// incorrectly loaded on AArch64, and this GCC bug on AArch64 is
// described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.

#if HWY_ARCH_ARM_A64 && \
    (HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
#endif

// x86 compiler supports __bf16, not necessarily with operators.
// Disable in debug builds due to clang miscompiles as of 2025-07-22: casting
// bf16 <-> f32 in convert_test results in 0x2525 for 1.0 instead of 0x3f80.
// Reported at https://github.com/llvm/llvm-project/issues/151692.
// Guarded by #ifndef so the detection can be overridden at build time.
#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#if HWY_ARCH_X86 && defined(__SSE2__) &&                         \
    ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL &&     \
      (!HWY_IS_DEBUG_BUILD || HWY_COMPILER3_CLANG >= 220101)) || \
     HWY_COMPILER_GCC_ACTUAL >= 1300)
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
#endif
#endif  // HWY_SSE2_HAVE_SCALAR_BF16_TYPE

// Compiler supports __bf16, not necessarily with operators.
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#define HWY_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_HAVE_SCALAR_BF16_TYPE 0
#endif

#ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
// Recent enough compiler also has operators. aarch64 clang 18 hits internal
// compiler errors on bf16 ToString, hence only enable on GCC for now.
// GCC >= 13 will insert a function call to the __extendbfsf2 helper function
// for scalar conversions from __bf16 to float. This is prohibitively expensive,
// so refrain from using scalar BF16 operators on these compiler versions.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121853
#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1700)
#define HWY_HAVE_SCALAR_BF16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_BF16_OPERATORS 0
#endif
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS

// With native operators, plain constexpr suffices; otherwise constexpr-ness
// depends on BitCastScalar support (HWY_BITCASTSCALAR_CONSTEXPR).
#if HWY_HAVE_SCALAR_BF16_OPERATORS
#define HWY_BF16_CONSTEXPR constexpr
#else
#define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#endif
   1786 
// 16-bit brain-float lane type. Wraps the compiler's native __bf16 where
// available, else stores the raw bits in a uint16_t; both live in a union so
// either view can be accessed.
struct alignas(2) bfloat16_t {
#if HWY_HAVE_SCALAR_BF16_TYPE
  using Native = __bf16;
#elif HWY_IDE
  using Native = uint16_t;
#endif

  union {
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
    // Accessed via NativeLaneType, and used directly if
    // HWY_HAVE_SCALAR_BF16_OPERATORS.
    Native native;
#endif
    // Only accessed via NativeLaneType or U16LaneType.
    uint16_t bits;
  };

  // Default init and copying
  bfloat16_t() noexcept = default;
  constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
  constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
  bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
  bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;

// Only enable implicit conversions if we have a native type.
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
  MOZ_IMPLICIT constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
  constexpr operator Native() const noexcept { return native; }
#endif

// FromBits: reinterprets a raw 16-bit pattern as a bfloat16_t (no numeric
// conversion).
#if HWY_HAVE_SCALAR_BF16_TYPE
  static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
    return bfloat16_t(BitCastScalar<Native>(bits));
  }
#else

 private:
  // Tag type to disambiguate the bits-initializing constructor from numeric
  // constructors.
  struct BF16FromU16BitsTag {};
  constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
      : bits(u16_bits) {}

 public:
  static constexpr bfloat16_t FromBits(uint16_t bits) {
    return bfloat16_t(BF16FromU16BitsTag(), bits);
  }
#endif

  // When backed by a native type, ensure the wrapper behaves like the native
  // type by forwarding all operators. Unfortunately it seems difficult to reuse
  // this code in a base class, so we repeat it in float16_t.
#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
  // Implicit construction from anything implicitly convertible to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
                                      IsConvertible<T, Native>()>* = nullptr>
  constexpr bfloat16_t(T&& arg) noexcept(
      noexcept(static_cast<Native>(DeclVal<T>())))
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  // Explicit construction from anything only static_cast-able to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
                                      !IsConvertible<T, Native>() &&
                                      IsStaticCastable<T, Native>()>* = nullptr>
  explicit constexpr bfloat16_t(T&& arg) noexcept(
      noexcept(static_cast<Native>(DeclVal<T>())))
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}

  HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
    native = arg;
    return *this;
  }

  // pre-decrement operator (--x)
  HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
    native = static_cast<Native>(native - Native{1});
    return *this;
  }

  // post-decrement operator (x--)
  HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
    bfloat16_t result = *this;
    native = static_cast<Native>(native - Native{1});
    return result;
  }

  // pre-increment operator (++x)
  HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
    native = static_cast<Native>(native + Native{1});
    return *this;
  }

  // post-increment operator (x++)
  HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
    bfloat16_t result = *this;
    native = static_cast<Native>(native + Native{1});
    return result;
  }

  constexpr bfloat16_t operator-() const noexcept {
    return bfloat16_t(static_cast<Native>(-native));
  }
  constexpr bfloat16_t operator+() const noexcept { return *this; }

  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func)                     \
  constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept {       \
    return bfloat16_t(static_cast<Native>(native op rhs.native));            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_BF16(T),                                  \
            typename UnwrappedT =                                            \
                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
            typename RawResultT =                                            \
                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
            typename ResultT =                                               \
                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
  }                                                                          \
  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(                          \
      const hwy::bfloat16_t& rhs) noexcept {                                 \
    native = static_cast<Native>(native op rhs.native);                      \
    return *this;                                                            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_BF16(T),                                  \
            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
            HWY_IF_ASSIGNABLE(                                               \
                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept(   \
      noexcept(                                                              \
          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
    native = static_cast<Native>(native op rhs);                             \
    return *this;                                                            \
  }
  HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
  HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
  HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
  HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_BFLOAT16_BINARY_OP

#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS
};
static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
   1931 
   1932 #pragma pack(pop)
   1933 
#if HWY_HAVE_SCALAR_BF16_TYPE
namespace detail {

#if HWY_HAVE_SCALAR_BF16_OPERATORS
// Used by HWY_BFLOAT16_BINARY_OP above: unwraps a bfloat16_t operand of a
// mixed arithmetic expression to the native __bf16 type.
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
  using type = hwy::bfloat16_t::Native;
};
#endif

// Maps a native __bf16 arithmetic result back to the bfloat16_t wrapper.
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
  using type = hwy::bfloat16_t;
};

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_BF16_TYPE
   1951 
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Specialization used by BitCastScalar to read the source bits of a
// bfloat16_t: returns a reference to whichever union member is in use
// (the native __bf16 if present, else the raw uint16_t bits).
template <>
struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
#if HWY_HAVE_SCALAR_BF16_TYPE
  static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
      const hwy::bfloat16_t& val) {
    return val.native;
  }
#else
  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
      const hwy::bfloat16_t& val) {
    return val.bits;
  }
#endif
};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
   1972 
// Widens `bf` to float. The conversion is exact: a BF16 value is the upper
// 16 bits of the corresponding F32, so the bits are shifted into the upper
// half of a 32-bit word and reinterpreted.
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return static_cast<float>(bf);
#else
  return BitCastScalar<float>(static_cast<uint32_t>(
      static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
#endif
}
   1981 
   1982 namespace detail {
   1983 
   1984 // Returns the increment to add to the bits of a finite F32 value to round a
   1985 // finite F32 to the nearest BF16 value
   1986 static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
   1987    const uint32_t f32_bits) {
   1988  return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u)
   1989                                   ? (0x7FFFu + ((f32_bits >> 16) & 1u))
   1990                                   : 0u);
   1991 }
   1992 
   1993 // If f32_bits is the bit representation of a NaN F32 value, make sure that
   1994 // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
   1995 // values and to prevent NaN F32 values from being converted to an infinite
   1996 // BF16 value
   1997 static HWY_INLINE constexpr uint32_t BF16BitsIfSNAN(uint32_t f32_bits) {
   1998  return ((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) ? (uint32_t{1} << 6) : 0;
   1999 }
   2000 
   2001 // Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
   2002 // rounded to the nearest F16 value
   2003 static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
   2004    const uint32_t f32_bits) {
   2005  return static_cast<uint16_t>(
   2006      BF16BitsIfSNAN(f32_bits) |
   2007      ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16));
   2008 }
   2009 
   2010 }  // namespace detail
   2011 
// Converts `f` to the nearest bfloat16_t using round-to-nearest-even.
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
  // The rounding mode is not specified in the C++ standard, so ignore
  // `HWY_HAVE_SCALAR_BF16_OPERATORS` and only use our round to nearest.
  return bfloat16_t::FromBits(
      detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
}
   2018 
// Converts `f64` to the nearest bfloat16_t via a round-to-odd pre-rounding of
// the F64 mantissa followed by BF16FromF32, which makes the double rounding
// F64 -> F32 -> BF16 correct; the comments below justify each case.
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
  // The mantissa bits of f64 are first rounded using round-to-odd rounding
  // to the nearest f64 value that has the lower 38 bits zeroed out to
  // ensure that the result is correctly rounded to a BF16.

  // The F64 round-to-odd operation below will round a normal F64 value
  // (using round-to-odd rounding) to a F64 value that has 15 bits of precision.

  // It is okay if the magnitude of a denormal F64 value is rounded up in the
  // F64 round-to-odd step below as the magnitude of a denormal F64 value is
  // much smaller than 2^(-133) (the smallest positive denormal BF16 value).

  // It is also okay if bit 38 of a NaN F64 value is changed by the F64
  // round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
  // discarded or ignored by the conversion of a F32 NaN value to a BF16.

  // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
  // NaN value as the result of the F64 round-to-odd step will have at least one
  // mantissa bit if f64 is a NaN value.

  // The F64 round-to-odd step below will ensure that the F64 to F32 conversion
  // is exact if the magnitude of the rounded F64 value (using round-to-odd
  // rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
  // BF16 value) and HighestValue<float>() (the largest finite F32 value).

  // If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
  // F32 conversion is guaranteed to be less than or equal to 2^(-135), which
  // ensures that the F32 to BF16 conversion is correctly rounded, even if the
  // conversion of a rounded F64 value whose magnitude is less than 2^(-135)
  // to a F32 is inexact.

  // Clear the low 38 mantissa bits, and set bit 38 if any of them (or bit 38
  // itself) was set — i.e. "round to odd" via carry into bit 38.
  return BF16FromF32(
      static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
          (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
          ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
           0x0000004000000000ULL)))));
}
   2056 
   2057 // More convenient to define outside bfloat16_t because these may use
   2058 // F32FromBF16, which is defined after the struct.
   2059 
// Comparison operators: use the native __bf16 operators when available, else
// widen both operands to F32 (exact, since BF16 is a truncated F32) and
// compare; NaN operands compare false either way.
HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native == rhs.native;
#else
  return F32FromBF16(lhs) == F32FromBF16(rhs);
#endif
}

HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native != rhs.native;
#else
  return F32FromBF16(lhs) != F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
                                         bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native < rhs.native;
#else
  return F32FromBF16(lhs) < F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native <= rhs.native;
#else
  return F32FromBF16(lhs) <= F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
                                         bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native > rhs.native;
#else
  return F32FromBF16(lhs) > F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native >= rhs.native;
#else
  return F32FromBF16(lhs) >= F32FromBF16(rhs);
#endif
}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
// C++20 three-way comparison. Returns std::partial_ordering because NaN
// operands are unordered.
HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
    bfloat16_t lhs, bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native <=> rhs.native;
#else
  return F32FromBF16(lhs) <=> F32FromBF16(rhs);
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
   2119 
   2120 //------------------------------------------------------------------------------
   2121 // Type relations
   2122 
namespace detail {

// Relations<T> maps a lane type T to related types: Unsigned/Signed/Float of
// the same size, plus Wide (twice the size) and Narrow (half the size) where
// such a type exists. The enum flags feed TypeTag/IsFloatTag below.
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Wide = uint128_t;
  using Narrow = uint32_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int64_t> {
  // No Wide member: there is no signed 128-bit counterpart in this header.
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint128_t> {
  using Unsigned = uint128_t;
  using Narrow = uint64_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  // Wide of a floating type is the floating type of twice the size.
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};

// TypeFromSize<N> is the inverse mapping: the Unsigned/Signed/Float types
// whose sizeof is N bytes (members omitted where no such type exists).
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
template <>
struct TypeFromSize<16> {
  using Unsigned = uint128_t;
};

}  // namespace detail
   2264 
// Aliases for types of a different category, but the same size.
// (Ill-formed if the Relations specialization lacks the member, e.g.
// MakeFloat<uint8_t>.)
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
   2286 
   2287 // Avoid confusion with SizeTag where the parameter is a lane size.
   2288 using UnsignedTag = SizeTag<0>;
   2289 using SignedTag = SizeTag<0x100>;  // integer
   2290 using FloatTag = SizeTag<0x200>;
   2291 using SpecialTag = SizeTag<0x300>;
   2292 
   2293 template <typename T, class R = detail::Relations<T>>
   2294 constexpr auto TypeTag()
   2295    -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
   2296  return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>();
   2297 }
   2298 
   2299 // For when we only want to distinguish FloatTag from everything else.
   2300 using NonFloatTag = SizeTag<0x400>;
   2301 
   2302 template <typename T, class R = detail::Relations<T>>
   2303 constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
   2304  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
   2305 }
   2306 
   2307 //------------------------------------------------------------------------------
   2308 // Type traits
   2309 
   2310 template <typename T>
   2311 HWY_API constexpr bool IsFloat3264() {
   2312  return IsSameEither<RemoveCvRef<T>, float, double>();
   2313 }
   2314 
   2315 template <typename T>
   2316 HWY_API constexpr bool IsFloat() {
   2317  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
   2318  // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
   2319  return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
   2320 }
   2321 
   2322 template <typename T>
   2323 HWY_API constexpr bool IsSigned() {
   2324  return static_cast<T>(0) > static_cast<T>(-1);
   2325 }
   2326 template <>
   2327 constexpr bool IsSigned<float16_t>() {
   2328  return true;
   2329 }
   2330 template <>
   2331 constexpr bool IsSigned<bfloat16_t>() {
   2332  return true;
   2333 }
   2334 template <>
   2335 constexpr bool IsSigned<hwy::uint128_t>() {
   2336  return false;
   2337 }
   2338 template <>
   2339 constexpr bool IsSigned<hwy::K64V64>() {
   2340  return false;
   2341 }
   2342 template <>
   2343 constexpr bool IsSigned<hwy::K32V32>() {
   2344  return false;
   2345 }
   2346 
   2347 template <typename T>
   2348 HWY_API constexpr bool IsUnsigned() {
   2349  return IsInteger<T>() && !IsSigned<T>();
   2350 }
   2351 
// Maps integer types that are not already lane types (per IsIntegerLaneType)
// to the lane type of the same size and signedness; the identity otherwise.
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
struct MakeLaneTypeIfIntegerT {
 using type = T;
};

// Specialization for non-lane integer types: pick the sized lane type.
template <typename T>
struct MakeLaneTypeIfIntegerT<T, true> {
 using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
                      UnsignedFromSize<sizeof(T)>>;
};

template <typename T>
using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
   2365 
// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
 static_assert(IsInteger<T>(), "Only for integer types");
 using TU = UnsignedFromSize<sizeof(T)>;
 // All-ones bits; for signed types, shift out the sign bit.
 return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
                                     : static_cast<TU>(~TU(0)));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
 static_assert(IsInteger<T>(), "Only for integer types");
 // Two's complement: min == -max - 1; unsigned min is zero.
 return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
                      : static_cast<T>(0);
}
   2380 
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
 return LimitsMin<T>();
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
 return bfloat16_t::FromBits(uint16_t{0xFF7Fu});  // -1.1111111 x 2^127
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
 return float16_t::FromBits(uint16_t{0xFBFFu});  // -1.1111111111 x 2^15
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
 return -3.402823466e+38F;  // most negative finite float (-FLT_MAX)
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
 return -1.7976931348623158e+308;  // most negative finite double
}

template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
 return LimitsMax<T>();
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
 return bfloat16_t::FromBits(uint16_t{0x7F7Fu});  // 1.1111111 x 2^127
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
 return float16_t::FromBits(uint16_t{0x7BFFu});  // 1.1111111111 x 2^15
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
 return 3.402823466e+38F;  // largest finite float (FLT_MAX)
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
 return 1.7976931348623158e+308;  // largest finite double
}
   2425 
// Difference between 1.0 and the next representable value. Equal to
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
// Generic (integer) case: adjacent integers differ by 1.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
 return 1;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
 return bfloat16_t::FromBits(uint16_t{0x3C00u});  // 0.0078125 = 2^-7
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
 return float16_t::FromBits(uint16_t{0x1400u});  // 0.0009765625 = 2^-10
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
 return 1.192092896e-7f;  // 2^-23
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
 return 2.2204460492503131e-16;  // 2^-52
}
   2448 
// Returns width in bits of the mantissa field in IEEE binary16/32/64.
// The primary template rejects all other types at compile time.
template <typename T>
constexpr int MantissaBits() {
 static_assert(sizeof(T) == 0, "Only instantiate the specializations");
 return 0;
}
template <>
constexpr int MantissaBits<bfloat16_t>() {
 return 7;
}
template <>
constexpr int MantissaBits<float16_t>() {
 return 10;
}
template <>
constexpr int MantissaBits<float>() {
 return 23;
}
template <>
constexpr int MantissaBits<double>() {
 return 52;
}
   2471 
// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
// Equals the bit pattern (all-ones exponent, zero mantissa) shifted left by
// one, interpreted as a signed value; e.g. -(1 << 24) = 0xFF000000 for float.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
 return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}

// Returns bitmask of the sign bit in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
 return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}

// Returns bitmask of the exponent field in IEEE binary16/32/64.
// ~(1 << M) + 1 is the two's-complement negation of (1 << M), which sets all
// bits at and above the mantissa; clearing the sign bit leaves the exponent.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
 return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
        static_cast<MakeUnsigned<T>>(~SignMask<T>());
}

// Returns bitmask of the mantissa field in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
 return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
   2497 
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
// The primary template rejects all other types at compile time.
template <typename T>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
 static_assert(sizeof(T) == 0, "Only instantiate the specializations");
 return 0;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
 return bfloat16_t::FromBits(uint16_t{0x4300u});  // 1.0 x 2^7
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
 return float16_t::FromBits(uint16_t{0x6400u});  // 1.0 x 2^10
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
 return 8388608.0f;  // 1 << 23
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
 // floating point literal with p52 requires C++17.
 return 4503599627370496.0;  // 1 << 52
}
   2522 
// Returns width in bits of the exponent field in IEEE binary16/32/64.
template <typename T>
constexpr int ExponentBits() {
 // Exponent := remaining bits after deducting sign and mantissa.
 return 8 * sizeof(T) - 1 - MantissaBits<T>();
}

// Returns largest value of the biased exponent field in IEEE binary16/32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
 return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}
   2537 
namespace detail {

// Float: returns -infinity (sign bit plus all-ones exponent field).
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
NegativeInfOrLowestValue(hwy::FloatTag /* tag */) {
 return BitCastScalar<T>(
     static_cast<MakeUnsigned<T>>(SignMask<T>() | ExponentMask<T>()));
}

// Non-float: no infinity exists, so return the lowest finite value instead.
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
NegativeInfOrLowestValue(hwy::NonFloatTag /* tag */) {
 return LowestValue<T>();
}

// Float: returns +infinity (all-ones exponent field, zero mantissa).
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
PositiveInfOrHighestValue(hwy::FloatTag /* tag */) {
 return BitCastScalar<T>(ExponentMask<T>());
}

// Non-float: return the highest finite value instead.
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
PositiveInfOrHighestValue(hwy::NonFloatTag /* tag */) {
 return HighestValue<T>();
}

}  // namespace detail
   2566 
// Returns -infinity for float types, otherwise LowestValue<T>().
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T NegativeInfOrLowestValue() {
 return detail::NegativeInfOrLowestValue<T>(IsFloatTag<T>());
}

// Returns +infinity for float types, otherwise HighestValue<T>().
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T PositiveInfOrHighestValue() {
 return detail::PositiveInfOrHighestValue<T>(IsFloatTag<T>());
}
   2576 
   2577 //------------------------------------------------------------------------------
   2578 // Additional F16/BF16 operators
   2579 
   2580 #if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
   2581 
   2582 #define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2)                       \
   2583  template <                                                                  \
   2584      typename T1,                                                            \
   2585      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() ||                      \
   2586                    hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr,          \
   2587      typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
   2588      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
   2589      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
   2590  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
   2591    return static_cast<ResultT>(a op b.native);                               \
   2592  }
   2593 
   2594 #define HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(op, assign_op, T2)                 \
   2595  template <typename T1,                                                   \
   2596            hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() ||             \
   2597                          hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
   2598            typename ResultT =                                             \
   2599                decltype(DeclVal<T1&>() assign_op DeclVal<T2::Native>())>  \
   2600  static HWY_INLINE constexpr ResultT operator assign_op(T1& a,            \
   2601                                                         T2 b) noexcept {  \
   2602    return (a assign_op b.native);                                         \
   2603  }
   2604 
   2605 #define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1)         \
   2606  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1)                             \
   2607  template <                                                                  \
   2608      typename T2,                                                            \
   2609      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() ||                      \
   2610                    hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr,          \
   2611      typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
   2612      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
   2613      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
   2614  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
   2615    return static_cast<ResultT>(a.native op b);                               \
   2616  }
   2617 
   2618 #if HWY_HAVE_SCALAR_F16_OPERATORS
   2619 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
   2620 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
   2621 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
   2622 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
   2623 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, float16_t)
   2624 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, float16_t)
   2625 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, float16_t)
   2626 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, float16_t)
   2627 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
   2628 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
   2629 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
   2630 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
   2631 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
   2632 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
   2633 #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
   2634 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
   2635 #endif
   2636 #endif  // HWY_HAVE_SCALAR_F16_OPERATORS
   2637 
   2638 #if HWY_HAVE_SCALAR_BF16_OPERATORS
   2639 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
   2640 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
   2641 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
   2642 HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
   2643 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, bfloat16_t)
   2644 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, bfloat16_t)
   2645 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, bfloat16_t)
   2646 HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, bfloat16_t)
   2647 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
   2648 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
   2649 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
   2650 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
   2651 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
   2652 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
   2653 #if HWY_HAVE_CXX20_THREE_WAY_COMPARE
   2654 HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
   2655 #endif
   2656 #endif  // HWY_HAVE_SCALAR_BF16_OPERATORS
   2657 
   2658 #undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
   2659 #undef HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP
   2660 #undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
   2661 
   2662 #endif  // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
   2663 
   2664 //------------------------------------------------------------------------------
   2665 // Type conversions (after IsSpecialFloat)
   2666 
   2667 HWY_API float F32FromF16Mem(const void* ptr) {
   2668  float16_t f16;
   2669  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
   2670  return F32FromF16(f16);
   2671 }
   2672 
   2673 HWY_API float F32FromBF16Mem(const void* ptr) {
   2674  bfloat16_t bf;
   2675  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
   2676  return F32FromBF16(bf);
   2677 }
   2678 
   2679 #if HWY_HAVE_SCALAR_F16_OPERATORS
   2680 #define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
   2681 #else
   2682 #define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
   2683 #endif
   2684 
namespace detail {

// Second step of ConvertScalarTo: convert the (possibly widened) value to TTo.
// SizeTag<0>: TTo is not a special float, or equals TFrom; plain cast.
template <class TTo, class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TTo ConvertScalarToResult(
   hwy::SizeTag<0> /*conv_to_tag*/, TFrom in) {
 return static_cast<TTo>(static_cast<TFrom>(in));
}

// FloatTag (0x200): TTo is float16_t; narrow from float.
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, float in) {
 return F16FromF32(in);
}

// FloatTag (0x200): TTo is float16_t; narrow from double.
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
ConvertScalarToResult(hwy::FloatTag /*conv_to_tag*/, double in) {
 return F16FromF64(in);
}

// SpecialTag (0x300): TTo is bfloat16_t; narrow from float.
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, float in) {
 return BF16FromF32(in);
}

// SpecialTag (0x300): TTo is bfloat16_t; narrow from double.
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
ConvertScalarToResult(hwy::SpecialTag /*conv_to_tag*/, double in) {
 return BF16FromF64(in);
}

// First step of ConvertScalarTo: SpecialTag (0x300) means TFrom is a special
// float; widen it to float.
template <class TFrom, HWY_IF_BF16(TFrom)>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR float
ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
 return F32FromBF16(in);
}

template <class TFrom, HWY_IF_F16(TFrom)>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR float
ConvertScalarSpecialFloatToF32(hwy::SpecialTag /*conv_from_tag*/, TFrom in) {
 return F32FromF16(in);
}

// FloatTag (0x200): TFrom is not a special float (TTo is); convert to float,
// except double, which is passed through at full precision.
template <class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr auto
ConvertScalarSpecialFloatToF32(hwy::FloatTag /*conv_from_tag*/, TFrom in)
   -> hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float> {
 return static_cast<
     hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float>>(
     in);
}

// SizeTag<0>: no special float involved; pass through unchanged.
template <class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TFrom
ConvertScalarSpecialFloatToF32(hwy::SizeTag<0> /*conv_from_tag*/, TFrom in) {
 return static_cast<TFrom>(in);
}

}  // namespace detail
   2745 
// Converts `in` to TTo in up to two steps: first widen any special-float
// (f16/bf16) input to float (ConvertScalarSpecialFloatToF32), then convert to
// TTo, narrowing to f16/bf16 via the ConvertScalarToResult overloads when TTo
// is a special float. Same-type conversions take the SizeTag<0> pass-through.
template <typename TTo, typename TFrom>
HWY_API constexpr TTo ConvertScalarTo(TFrom in) {
 return detail::ConvertScalarToResult<TTo>(
     hwy::SizeTag<
         (!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
          hwy::IsSpecialFloat<TTo>())
             ? (hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>() ? 0x300
                                                                 : 0x200)
             : 0>(),
     detail::ConvertScalarSpecialFloatToF32(
         hwy::SizeTag<
             (!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
              (hwy::IsSpecialFloat<TFrom>() || hwy::IsSpecialFloat<TTo>()))
                 ? (hwy::IsSpecialFloat<TFrom>() ? 0x300 : 0x200)
                 : 0>(),
         static_cast<TFrom&&>(in)));
}
   2763 
   2764 //------------------------------------------------------------------------------
   2765 // Helper functions
   2766 
   2767 template <typename T1, typename T2>
   2768 constexpr inline T1 DivCeil(T1 a, T2 b) {
   2769 #if HWY_CXX_LANG >= 201703L
   2770  HWY_DASSERT(b != T2{0});
   2771 #endif
   2772  return (a + b - 1) / b;
   2773 }
   2774 
   2775 // Works for any non-zero `align`; if a power of two, compiler emits ADD+AND.
   2776 constexpr inline size_t RoundUpTo(size_t what, size_t align) {
   2777  return DivCeil(what, align) * align;
   2778 }
   2779 
   2780 // Works for any `align`; if a power of two, compiler emits AND.
   2781 constexpr inline size_t RoundDownTo(size_t what, size_t align) {
   2782  return what - (what % align);
   2783 }
   2784 
namespace detail {

// T is unsigned or T is signed and (val >> shift_amt) is an arithmetic right
// shift
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
                                       int shift_amt) {
 return static_cast<T>(val >> shift_amt);
}

// T is signed and (val >> shift_amt) is a non-arithmetic right shift
// Emulates an arithmetic shift: for negative values, shift the bitwise
// complement in the unsigned domain and complement again, so the vacated high
// bits are filled with ones.
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
                                       int shift_amt) {
 using TU = MakeUnsigned<MakeLaneTypeIfInteger<T>>;
 return static_cast<T>(
     (val < 0) ? static_cast<TU>(
                     ~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
               : static_cast<TU>(static_cast<TU>(val) >> shift_amt));
}

}  // namespace detail
   2807 
// If T is a signed integer type, ScalarShr is guaranteed to perform an
// arithmetic right shift.

// Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
// perform a logical right shift.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
 using NonCvRefT = RemoveCvRef<T>;
 // Dispatch to SignedTag (0x100) only if T is signed AND the built-in >> of
 // LimitsMin does not already sign-extend (i.e. is not arithmetic); otherwise
 // the plain shift in the UnsignedTag overload suffices.
 return detail::ScalarShr(
     hwy::SizeTag<((IsSigned<NonCvRefT>() &&
                    (LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
                        static_cast<NonCvRefT>(-1))
                       ? 0x100
                       : 0)>(),
     static_cast<NonCvRefT>(val), shift_amt);
}
   2824 
// Undefined results for x == 0.
// Returns the number of trailing zero bits, i.e. the index of the lowest set
// bit.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
 HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
 unsigned long index;  // NOLINT
 _BitScanForward(&index, x);
 return index;
#else   // HWY_COMPILER_MSVC
 return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}
   2836 
// Returns the number of trailing zero bits, i.e. the index of the lowest set
// bit. Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
 HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
 unsigned long index;  // NOLINT
 _BitScanForward64(&index, x);
 return index;
#else   // HWY_ARCH_X86_64
 // _BitScanForward64 not available; scan the lower half first, and only if it
 // is zero, scan the upper half (offset by 32).
 uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
 unsigned long index;  // NOLINT
 if (lsb == 0) {
   uint32_t msb = static_cast<uint32_t>(x >> 32u);
   _BitScanForward(&index, msb);
   return 32 + index;
 } else {
   _BitScanForward(&index, lsb);
   return index;
 }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
 return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}
   2861 
// Undefined results for x == 0.
// Returns the number of leading zero bits above the most-significant set bit.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
 HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
 unsigned long index;  // NOLINT
 _BitScanReverse(&index, x);
 return 31 - index;
#else   // HWY_COMPILER_MSVC
 return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}
   2873 
// Returns the number of leading zero bits above the most-significant set bit.
// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
 HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
 unsigned long index;  // NOLINT
 _BitScanReverse64(&index, x);
 return 63 - index;
#else   // HWY_ARCH_X86_64
 // _BitScanReverse64 not available; scan the upper half first, and only if it
 // is zero, scan the lower half.
 const uint32_t msb = static_cast<uint32_t>(x >> 32u);
 unsigned long index;  // NOLINT
 if (msb == 0) {
   const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
   _BitScanReverse(&index, lsb);
   return 63 - index;
 } else {
   _BitScanReverse(&index, msb);
   return 31 - index;
 }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
 return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}
   2898 
// Returns the number of bits set in x, for 1/2/4-byte integer types.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
         HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API size_t PopCount(T x) {
 // Cast to the same-size unsigned type first so that narrow signed values are
 // zero-extended rather than sign-extended.
 uint32_t u32_x = static_cast<uint32_t>(
     static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));

#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
 return static_cast<size_t>(__builtin_popcountl(u32_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
 return static_cast<size_t>(_mm_popcnt_u32(u32_x));
#else
 // Portable bit-parallel (SWAR) population count.
 u32_x -= ((u32_x >> 1) & 0x55555555u);
 u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
 u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
 u32_x += (u32_x >> 8);
 u32_x += (u32_x >> 16);
 return static_cast<size_t>(u32_x & 0x3Fu);
#endif
}
   2918 
// Returns the number of bits set in x, for 8-byte integer types.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
         HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
HWY_API size_t PopCount(T x) {
 // Cast to the same-size unsigned type first to avoid sign extension.
 uint64_t u64_x = static_cast<uint64_t>(
     static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));

#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
 return static_cast<size_t>(__builtin_popcountll(u64_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
 return _mm_popcnt_u64(u64_x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
 // 32-bit MSVC: combine popcounts of the two 32-bit halves.
 return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
        _mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
#else
 // Portable bit-parallel (SWAR) population count.
 u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
 u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
          (u64_x & 0x3333333333333333ULL));
 u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
 u64_x += (u64_x >> 8);
 u64_x += (u64_x >> 16);
 u64_x += (u64_x >> 32);
 return static_cast<size_t>(u64_x & 0x7Fu);
#endif
}
   2943 
   2944 // Skip HWY_API due to GCC "function not considered for inlining". Previously
   2945 // such errors were caused by underlying type mismatches, but it's not clear
   2946 // what is still mismatched despite all the casts.
   2947 template <typename TI>
   2948 /*HWY_API*/ constexpr size_t FloorLog2(TI x) {
   2949  return x == TI{1}
   2950             ? 0
   2951             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
   2952 }
   2953 
   2954 template <typename TI>
   2955 /*HWY_API*/ constexpr size_t CeilLog2(TI x) {
   2956  return x == TI{1}
   2957             ? 0
   2958             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
   2959 }
   2960 
// Float (non-special): plain addition; floating-point does not wrap around.
template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
 return t + static_cast<T>(increment);
}

// Special float (f16/bf16): add in float precision, then narrow the result.
template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
 return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
                           ConvertScalarTo<float>(increment));
}

// Integer: adds with well-defined modulo-2^N wraparound, avoiding the
// undefined behavior of signed overflow.
template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
 using TU = MakeUnsigned<T>;
 // Sub-int types would promote to int, not unsigned, which would trigger
 // warnings, so first promote to the largest unsigned type. Due to
 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
 // until fixed in 9.3, we use built-in types rather than uint64_t.
 return static_cast<T>(static_cast<TU>(
     static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
                                     static_cast<unsigned long long>(n)) &
     uint64_t{hwy::LimitsMax<TU>()}));
}
   2984 
   2985 #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
   2986 #pragma intrinsic(_mul128)
   2987 #pragma intrinsic(_umul128)
   2988 #endif
   2989 
// 64 x 64 = 128 bit multiplication
// Returns the low 64 bits of a * b and stores the high 64 bits in *upper.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
 __uint128_t product =
     static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
 *upper = static_cast<uint64_t>(product >> 64);
 return static_cast<uint64_t>(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
 return _umul128(a, b, upper);
#else
 // Portable schoolbook multiplication via four 32x32 -> 64 partial products.
 constexpr uint64_t kLo32 = 0xFFFFFFFFU;
 const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
 const uint64_t hi_lo = (a >> 32) * (b & kLo32);
 const uint64_t lo_hi = (a & kLo32) * (b >> 32);
 const uint64_t hi_hi = (a >> 32) * (b >> 32);
 // t accumulates the middle partial products plus the carry out of lo_lo.
 const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
 *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
 return (t << 32) | (lo_lo & kLo32);
#endif
}
   3010 
// Signed 64 x 64 = 128 bit multiplication; returns the low 64 bits of a * b
// and stores the high 64 bits in *upper.
HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
 __int128_t product = static_cast<__int128_t>(a) * static_cast<__int128_t>(b);
 *upper = static_cast<int64_t>(product >> 64);
 return static_cast<int64_t>(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
 return _mul128(a, b, upper);
#else
 // Compute the unsigned product, then fix up the high half: the low 64 bits
 // match the signed product, and the high half differs by b when a < 0 and by
 // a when b < 0; ScalarShr(v, 63) yields the all-ones/zero mask for v < 0.
 uint64_t unsigned_upper;
 const int64_t lower = static_cast<int64_t>(Mul128(
     static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
 *upper = static_cast<int64_t>(
     unsigned_upper -
     (static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
     (static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
 return lower;
#endif
}
   3029 
   3030 // Precomputation for fast n / divisor and n % divisor, where n is a variable
   3031 // and divisor is unchanging but unknown at compile-time.
   3032 class Divisor {
   3033 public:
   3034  explicit Divisor(uint32_t divisor) : divisor_(divisor) {
   3035    if (divisor <= 1) return;
   3036 
   3037    const uint32_t len =
   3038        static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
   3039    const uint64_t u_hi = (2ULL << len) - divisor;
   3040    const uint32_t q = Truncate((u_hi << 32) / divisor);
   3041 
   3042    mul_ = q + 1;
   3043    shift1_ = 1;
   3044    shift2_ = len;
   3045  }
   3046 
   3047  uint32_t GetDivisor() const { return divisor_; }
   3048 
   3049  // Returns n / divisor_.
   3050  uint32_t Divide(uint32_t n) const {
   3051    const uint64_t mul = mul_;
   3052    const uint32_t t = Truncate((mul * n) >> 32);
   3053    return (t + ((n - t) >> shift1_)) >> shift2_;
   3054  }
   3055 
   3056  // Returns n % divisor_.
   3057  uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
   3058 
   3059 private:
   3060  static uint32_t Truncate(uint64_t x) {
   3061    return static_cast<uint32_t>(x & 0xFFFFFFFFu);
   3062  }
   3063 
   3064  uint32_t divisor_;
   3065  uint32_t mul_ = 1;
   3066  uint32_t shift1_ = 0;
   3067  uint32_t shift2_ = 0;
   3068 };
   3069 
   3070 #ifndef HWY_HAVE_DIV128  // allow override
   3071 // Exclude clang-cl because it calls __divti3 from clang_rt.builtins-x86_64,
   3072 // which is not linked in.
   3073 #if (HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64) || \
   3074    (defined(__SIZEOF_INT128__) && !HWY_COMPILER_CLANGCL)
   3075 #define HWY_HAVE_DIV128 1
   3076 #else
   3077 #define HWY_HAVE_DIV128 0
   3078 #endif
   3079 #endif  // HWY_HAVE_DIV128
   3080 
   3081 // Divisor64 can precompute the multiplicative inverse.
   3082 #if HWY_HAVE_DIV128
   3083 
   3084 #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
   3085 #pragma intrinsic(_udiv128)
   3086 #pragma intrinsic(__umulh)
   3087 #endif
   3088 
   3089 // As above, but for 64-bit divisors: more expensive to compute and initialize.
   3090 class Divisor64 {
   3091 public:
   3092  explicit Divisor64(uint64_t divisor) : divisor_(divisor) {
   3093    if (divisor <= 1) return;
   3094 
   3095    const uint64_t len =
   3096        static_cast<uint64_t>(63 - Num0BitsAboveMS1Bit_Nonzero64(divisor - 1));
   3097    const uint64_t u_hi = (2ULL << len) - divisor;
   3098    const uint64_t q = Div128(u_hi, divisor);
   3099 
   3100    mul_ = q + 1;
   3101    shift1_ = 1;
   3102    shift2_ = len;
   3103  }
   3104 
   3105  uint64_t GetDivisor() const { return divisor_; }
   3106 
   3107  // Returns n / divisor_.
   3108  uint64_t Divide(uint64_t n) const {
   3109    const uint64_t t = MulHigh(mul_, n);
   3110    return (t + ((n - t) >> shift1_)) >> shift2_;
   3111  }
   3112 
   3113  // Returns n % divisor_.
   3114  uint64_t Remainder(uint64_t n) const { return n - (Divide(n) * divisor_); }
   3115 
   3116 private:
   3117  uint64_t divisor_;
   3118 
   3119  static uint64_t Div128(uint64_t hi, uint64_t div) {
   3120 #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
   3121    unsigned __int64 remainder;  // unused
   3122    return _udiv128(hi, uint64_t{0}, div, &remainder);
   3123 #else
   3124    using u128 = unsigned __int128;
   3125    const u128 hi128 = static_cast<u128>(hi) << 64;
   3126    return static_cast<uint64_t>(hi128 / static_cast<u128>(div));
   3127 #endif
   3128  }
   3129 
   3130  static uint64_t MulHigh(uint64_t a, uint64_t b) {
   3131 #if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
   3132    return __umulh(a, b);
   3133 #else
   3134    using u128 = unsigned __int128;
   3135    const u128 a128 = static_cast<u128>(a);
   3136    const u128 b128 = static_cast<u128>(b);
   3137    return static_cast<uint64_t>((a128 * b128) >> 64);
   3138 #endif
   3139  }
   3140 
   3141  uint64_t mul_ = 1;
   3142  uint64_t shift1_ = 0;
   3143  uint64_t shift2_ = 0;
   3144 };
   3145 #else
   3146 // No Div128 available, use built-in 64-bit division on each call.
   3147 class Divisor64 {
   3148 public:
   3149  explicit Divisor64(uint64_t divisor) : divisor_(divisor) {}
   3150 
   3151  uint64_t GetDivisor() const { return divisor_; }
   3152 
   3153  uint64_t Divide(uint64_t n) const { return n / divisor_; }
   3154  uint64_t Remainder(uint64_t n) const { return n % divisor_; }
   3155 
   3156 private:
   3157  uint64_t divisor_;
   3158 };
   3159 #endif  // HWY_HAVE_DIV128
   3160 
namespace detail {

// Floating-point |val|: clears the sign bit via the unsigned representation.
// This is branch-free and also preserves NaN payload bits (only the sign bit
// changes) and maps -0.0 to +0.0.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
                                                         T val) {
 using TU = MakeUnsigned<T>;
 return BitCastScalar<T>(
     static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
}

// "Special" float types (non-native float representations) reuse the same
// sign-bit-clearing path.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SpecialTag /*tag*/, T val) {
 return ScalarAbs(hwy::FloatTag(), val);
}

// Signed integer |val|: negate via unsigned arithmetic so that
// val == LimitsMin<T>() does not trigger signed-overflow UB.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SignedTag /*tag*/, T val) {
 using TU = MakeUnsigned<T>;
 return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
}

// Unsigned integers are their own absolute value.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {
 return val;
}

}  // namespace detail
   3191 
// Returns |val| for float, signed, or unsigned types; dispatches on the type
// category via TypeTag. The result type matches the input minus cv/ref.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
 using TVal = MakeLaneTypeIfInteger<
     detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
 return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
}
   3198 
// Returns true if val is NaN: after clearing the sign bit (via ScalarAbs),
// NaN is the only value whose bits exceed the all-ones exponent mask, i.e.
// all-ones exponent plus a nonzero mantissa.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
 using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
 using TU = MakeUnsigned<TF>;
 return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
}
   3205 
// Returns true if val is +infinity or -infinity: shifting left by one
// discards the sign bit, leaving exactly the all-ones exponent (times 2)
// with a zero mantissa.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
 using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
 using TU = MakeUnsigned<TF>;
 return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
        static_cast<TU>(MaxExponentTimes2<TF>());
}
   3213 
namespace detail {

// Floating-point: finite iff the magnitude's bits are below the all-ones
// exponent (which would indicate infinity or NaN). Clearing the sign bit
// first lets one unsigned comparison handle both signs.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
   hwy::FloatTag /*tag*/, T val) {
 using TU = MakeUnsigned<T>;
 return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
}

template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
   hwy::NonFloatTag /*tag*/, T /*val*/) {
 // Integer values are always finite
 return true;
}

}  // namespace detail
   3231 
// Returns true if val is neither infinity nor NaN; always true for integer
// types (dispatched via IsFloatTag).
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
 using TVal = MakeLaneTypeIfInteger<
     detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
 return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
                               static_cast<TVal>(val));
}
   3239 
// Returns a value with the magnitude bits of `magn` and the sign bit of
// `sign`. Purely bitwise, so NaN payloads in magn are preserved.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
                                                                  T sign) {
 using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
 using TU = MakeUnsigned<TF>;
 // OR the non-sign bits of magn with the isolated sign bit of sign.
 return BitCastScalar<TF>(static_cast<TU>(
     (BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
     (BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
}
   3249 
// Returns true if the sign bit of val is set (for floats this includes -0.0
// and negative NaN), analogous to std::signbit.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
 using TVal = MakeLaneTypeIfInteger<
     detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
 using TU = MakeUnsigned<TVal>;
 return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
}
   3257 
// Prevents the compiler from eliding the computations that led to "output".
// The empty asm with a "+" (read-write) constraint tells the compiler the
// value is consumed and possibly modified, so the computation must happen.
#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
   !defined(_SOFT_FLOAT)
// Workaround to avoid test failures on PPC if compiled with Clang
// f32: "f" places the operand in a floating-point register.
template <class T, HWY_IF_F32(T)>
HWY_API void PreventElision(T&& output) {
 asm volatile("" : "+f"(output)::"memory");
}
// f64: "d" constraint for double-precision.
template <class T, HWY_IF_F64(T)>
HWY_API void PreventElision(T&& output) {
 asm volatile("" : "+d"(output)::"memory");
}
// Everything else goes through a general-purpose register.
template <class T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API void PreventElision(T&& output) {
 asm volatile("" : "+r"(output)::"memory");
}
#else
template <class T>
HWY_API void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
 // MSVC does not support inline assembly anymore (and never supported GCC's
 // RTL constraints). Self-assignment with #pragma optimize("off") might be
 // expected to prevent elision, but it does not with MSVC 2015. Type-punning
 // with volatile pointers generates inefficient code on MSVC 2017.
 static std::atomic<RemoveCvRef<T>> sink;
 sink.store(output, std::memory_order_relaxed);
#else
 // Works by indicating to the compiler that "output" is being read and
 // modified. The +r constraint avoids unnecessary writes to memory, but only
 // works for built-in types (typically FuncOutput).
 asm volatile("" : "+r"(output) : : "memory");
#endif
}
#endif
   3292 
   3293 }  // namespace hwy
   3294 
   3295 #endif  // HIGHWAY_HWY_BASE_H_