tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

x86_128-inl.h (525219B)


      1 // Copyright 2019 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
     17 // operations when compiling for those targets.
     18 // External include guard in highway.h - see comment there.
     19 
     20 // Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
     21 #include "hwy/base.h"
     22 
     23 // Avoid uninitialized warnings in GCC's emmintrin.h - see
     24 // https://github.com/google/highway/issues/710 and pull/902
     25 HWY_DIAGNOSTICS(push)
     26 #if HWY_COMPILER_GCC_ACTUAL
     27 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
     28 HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
     29                    ignored "-Wmaybe-uninitialized")
     30 #endif
     31 
     32 #include <emmintrin.h>
     33 #include <stdio.h>
     34 #if HWY_TARGET == HWY_SSSE3
     35 #include <tmmintrin.h>  // SSSE3
     36 #elif HWY_TARGET <= HWY_SSE4
     37 #include <smmintrin.h>  // SSE4
     38 #ifndef HWY_DISABLE_PCLMUL_AES
     39 #include <wmmintrin.h>  // CLMUL
     40 #endif
     41 #endif
     42 
     43 #include "hwy/ops/shared-inl.h"
     44 
     45 HWY_BEFORE_NAMESPACE();
     46 namespace hwy {
     47 namespace HWY_NAMESPACE {
     48 namespace detail {
     49 
     50 // Enable generic functions for whichever of (f16, bf16) are not supported.
     51 #if !HWY_HAVE_FLOAT16
     52 #define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
     53 #else
     54 #define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
     55 #endif
     56 
     57 #undef HWY_AVX3_HAVE_F32_TO_BF16C
     58 #if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL &&           \
     59    (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \
     60    HWY_AVX3_ENABLE_AVX512BF16
     61 #define HWY_AVX3_HAVE_F32_TO_BF16C 1
     62 #else
     63 #define HWY_AVX3_HAVE_F32_TO_BF16C 0
     64 #endif
     65 
     66 #undef HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT
     67 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
     68 #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "v"
     69 #else
     70 #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "x"
     71 #endif
     72 
     73 #undef HWY_X86_HAVE_AVX10_2_OPS
     74 #if HWY_TARGET_IS_AVX10_2 &&            \
     75    (HWY_COMPILER_GCC_ACTUAL >= 1501 || \
     76     (HWY_COMPILER3_CLANG >= 200103 && HWY_COMPILER_CLANG != 2100))
     77 #define HWY_X86_HAVE_AVX10_2_OPS 1
     78 #else
     79 #define HWY_X86_HAVE_AVX10_2_OPS 0
     80 #endif
     81 
     82 template <typename T>
     83 struct Raw128 {
     84  using type = __m128i;
     85 };
     86 #if HWY_HAVE_FLOAT16
     87 template <>
     88 struct Raw128<float16_t> {
     89  using type = __m128h;
     90 };
     91 #endif  // HWY_HAVE_FLOAT16
     92 template <>
     93 struct Raw128<float> {
     94  using type = __m128;
     95 };
     96 template <>
     97 struct Raw128<double> {
     98  using type = __m128d;
     99 };
    100 
    101 }  // namespace detail
    102 
    103 template <typename T, size_t N = 16 / sizeof(T)>
    104 class Vec128 {
    105  using Raw = typename detail::Raw128<T>::type;
    106 
    107 public:
    108  using PrivateT = T;                     // only for DFromV
    109  static constexpr size_t kPrivateN = N;  // only for DFromV
    110 
    111  // Compound assignment. Only usable if there is a corresponding non-member
    112  // binary operator overload. For example, only f32 and f64 support division.
    113  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    114    return *this = (*this * other);
    115  }
    116  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    117    return *this = (*this / other);
    118  }
    119  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    120    return *this = (*this + other);
    121  }
    122  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    123    return *this = (*this - other);
    124  }
    125  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    126    return *this = (*this % other);
    127  }
    128  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    129    return *this = (*this & other);
    130  }
    131  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    132    return *this = (*this | other);
    133  }
    134  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    135    return *this = (*this ^ other);
    136  }
    137 
    138  Raw raw;
    139 };
    140 
    141 template <typename T>
    142 using Vec64 = Vec128<T, 8 / sizeof(T)>;
    143 
    144 template <typename T>
    145 using Vec32 = Vec128<T, 4 / sizeof(T)>;
    146 
    147 template <typename T>
    148 using Vec16 = Vec128<T, 2 / sizeof(T)>;
    149 
    150 namespace detail {
    151 
    152 #if HWY_TARGET <= HWY_AVX3
    153 
    154 // Template arg: sizeof(lane type)
    155 template <size_t size>
    156 struct RawMask128T {};
    157 template <>
    158 struct RawMask128T<1> {
    159  using type = __mmask16;
    160 };
    161 template <>
    162 struct RawMask128T<2> {
    163  using type = __mmask8;
    164 };
    165 template <>
    166 struct RawMask128T<4> {
    167  using type = __mmask8;
    168 };
    169 template <>
    170 struct RawMask128T<8> {
    171  using type = __mmask8;
    172 };
    173 
    174 template <typename T>
    175 using RawMask128 = typename RawMask128T<sizeof(T)>::type;
    176 
    177 #else  // AVX2 or earlier
    178 
    179 template <typename T>
    180 using RawMask128 = typename Raw128<T>::type;
    181 
    182 #endif  // HWY_TARGET <= HWY_AVX3
    183 
    184 }  // namespace detail
    185 
    186 template <typename T, size_t N = 16 / sizeof(T)>
    187 struct Mask128 {
    188  using Raw = typename detail::RawMask128<T>;
    189 
    190  using PrivateT = T;                     // only for DFromM
    191  static constexpr size_t kPrivateN = N;  // only for DFromM
    192 
    193 #if HWY_TARGET <= HWY_AVX3
    194  static Mask128<T, N> FromBits(uint64_t mask_bits) {
    195    return Mask128<T, N>{static_cast<Raw>(mask_bits)};
    196  }
    197 #else
    198 // Lanes are either FF..FF or 0.
    199 #endif
    200 
    201  Raw raw;
    202 };
    203 
    204 template <class V>
    205 using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
    206 
    207 template <class M>
    208 using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
    209 
    210 template <class V>
    211 using TFromV = typename V::PrivateT;
    212 
    213 // ------------------------------ Zero
    214 
    215 // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
    216 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
    217 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
    218  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
    219 }
    220 #if HWY_HAVE_FLOAT16
    221 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
    222 HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
    223  return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()};
    224 }
    225 #endif  // HWY_HAVE_FLOAT16
    226 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
    227 HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
    228  return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()};
    229 }
    230 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
    231 HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
    232  return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()};
    233 }
    234 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
    235 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
    236  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
    237 }
    238 
    239 // Using the existing Zero function instead of a dedicated function for
    240 // deduction avoids having to forward-declare Vec256 here.
    241 template <class D>
    242 using VFromD = decltype(Zero(D()));
    243 
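// Example (an illustrative sketch; ZeroRoundTripExample is not part of this
// file): VFromD maps a tag type to the vector type returned by Zero, so
// generic code can name its vector type without spelling out Vec128<T, N>.
template <class D>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<D> ZeroRoundTripExample(D d) {
 const VFromD<D> v = Zero(d);  // all lanes zero; type deduced from the tag
 return v;
}
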
    244 // ------------------------------ BitCast
    245 
    246 namespace detail {
    247 
    248 HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
    249 #if HWY_HAVE_FLOAT16
    250 HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); }
    251 #endif  // HWY_HAVE_FLOAT16
    252 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
    253 HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
    254 
    255 #if HWY_AVX3_HAVE_F32_TO_BF16C
    256 HWY_INLINE __m128i BitCastToInteger(__m128bh v) {
     257  // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to
     258  // bit cast a __m128bh to a __m128i, as there is currently no intrinsic
     259  // available (as of GCC 13 and Clang 17) that can bit cast a __m128bh
     260  // vector to a __m128i vector.
    261 
    262 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
    263  // On GCC or Clang, use reinterpret_cast to bit cast a __m128bh to a __m128i
    264  return reinterpret_cast<__m128i>(v);
    265 #else
    266  // On MSVC, use BitCastScalar to bit cast a __m128bh to a __m128i as MSVC does
    267  // not allow reinterpret_cast, static_cast, or a C-style cast to be used to
    268  // bit cast from one SSE/AVX vector type to a different SSE/AVX vector type
    269  return BitCastScalar<__m128i>(v);
    270 #endif  // HWY_COMPILER_GCC || HWY_COMPILER_CLANG
    271 }
    272 #endif  // HWY_AVX3_HAVE_F32_TO_BF16C
    273 
    274 template <typename T, size_t N>
    275 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
    276  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
    277 }
    278 
    279 // Cannot rely on function overloading because return types differ.
    280 template <typename T>
    281 struct BitCastFromInteger128 {
    282  HWY_INLINE __m128i operator()(__m128i v) { return v; }
    283 };
    284 #if HWY_HAVE_FLOAT16
    285 template <>
    286 struct BitCastFromInteger128<float16_t> {
    287  HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); }
    288 };
    289 #endif  // HWY_HAVE_FLOAT16
    290 template <>
    291 struct BitCastFromInteger128<float> {
    292  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
    293 };
    294 template <>
    295 struct BitCastFromInteger128<double> {
    296  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
    297 };
    298 
    299 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
    300 HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
    301                                     Vec128<uint8_t, D().MaxBytes()> v) {
    302  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
    303 }
    304 
    305 }  // namespace detail
    306 
    307 template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)>
    308 HWY_API VFromD<D> BitCast(D d,
    309                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
    310  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
    311 }
    312 
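// Example (a minimal sketch; F32ToBitsExample is illustrative and not part
// of this file): BitCast reinterprets lanes without changing any bits, e.g.
// viewing f32 lanes as their IEEE-754 u32 bit patterns.
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<RebindToUnsigned<DF>>
F32ToBitsExample(DF df, VFromD<DF> v) {
 const RebindToUnsigned<decltype(df)> du;  // same lane count, u32 lanes
 return BitCast(du, v);
}
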
    313 // ------------------------------ Set
    314 
    315 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
    316 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    317  return VFromD<D>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
    318 }
    319 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
    320 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    321  return VFromD<D>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
    322 }
    323 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
    324 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    325  return VFromD<D>{_mm_set1_epi32(static_cast<int>(t))};
    326 }
    327 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
    328 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    329  return VFromD<D>{_mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
    330 }
    331 #if HWY_HAVE_FLOAT16
    332 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
    333 HWY_API VFromD<D> Set(D /* tag */, float16_t t) {
    334  return VFromD<D>{_mm_set1_ph(t)};
    335 }
    336 #endif  // HWY_HAVE_FLOAT16
    337 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
    338 HWY_API VFromD<D> Set(D /* tag */, float t) {
    339  return VFromD<D>{_mm_set1_ps(t)};
    340 }
    341 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
    342 HWY_API VFromD<D> Set(D /* tag */, double t) {
    343  return VFromD<D>{_mm_set1_pd(t)};
    344 }
    345 
    346 // Generic for all vector lengths.
    347 template <class D, HWY_X86_IF_EMULATED_D(D)>
    348 HWY_API VFromD<D> Set(D df, TFromD<D> t) {
    349  const RebindToUnsigned<decltype(df)> du;
    350  static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
    351  uint16_t bits;
    352  CopyBytes<2>(&t, &bits);
    353  return BitCast(df, Set(du, bits));
    354 }
    355 
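// Example (illustrative; NegZeroExample is not part of this file): Set plus
// BitCast can materialize arbitrary lane bit patterns, here -0.0f.
template <class D, HWY_IF_F32_D(D)>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<D> NegZeroExample(D d) {
 const RebindToUnsigned<decltype(d)> du;
 return BitCast(d, Set(du, 0x80000000u));  // sign bit only, i.e. -0.0f
}
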
    356 // ------------------------------ Undefined
    357 
    358 HWY_DIAGNOSTICS(push)
    359 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
    360 
    361 // Returns a vector with uninitialized elements.
    362 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
    363 HWY_API VFromD<D> Undefined(D /* tag */) {
    364  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
    365  // generate an XOR instruction.
    366  return VFromD<D>{_mm_undefined_si128()};
    367 }
    368 #if HWY_HAVE_FLOAT16
    369 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
    370 HWY_API VFromD<D> Undefined(D /* tag */) {
    371  return VFromD<D>{_mm_undefined_ph()};
    372 }
    373 #endif  // HWY_HAVE_FLOAT16
    374 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
    375 HWY_API VFromD<D> Undefined(D /* tag */) {
    376  return VFromD<D>{_mm_undefined_ps()};
    377 }
    378 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
    379 HWY_API VFromD<D> Undefined(D /* tag */) {
    380  return VFromD<D>{_mm_undefined_pd()};
    381 }
    382 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
    383 HWY_API VFromD<D> Undefined(D /* tag */) {
    384  return VFromD<D>{_mm_undefined_si128()};
    385 }
    386 
    387 HWY_DIAGNOSTICS(pop)
    388 
    389 // ------------------------------ GetLane
    390 
    391 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
    392 HWY_API T GetLane(const Vec128<T, N> v) {
    393  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
    394 }
    395 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
    396 HWY_API T GetLane(const Vec128<T, N> v) {
    397  const DFromV<decltype(v)> d;
    398  const RebindToUnsigned<decltype(d)> du;
    399  const uint16_t bits =
    400      static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF);
    401  return BitCastScalar<T>(bits);
    402 }
    403 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
    404 HWY_API T GetLane(const Vec128<T, N> v) {
    405  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
    406 }
    407 template <size_t N>
    408 HWY_API float GetLane(const Vec128<float, N> v) {
    409  return _mm_cvtss_f32(v.raw);
    410 }
    411 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
    412 HWY_API T GetLane(const Vec128<T, N> v) {
    413 #if HWY_ARCH_X86_32
    414  const DFromV<decltype(v)> d;
    415  alignas(16) T lanes[2];
    416  Store(v, d, lanes);
    417  return lanes[0];
    418 #else
    419  return static_cast<T>(_mm_cvtsi128_si64(v.raw));
    420 #endif
    421 }
    422 template <size_t N>
    423 HWY_API double GetLane(const Vec128<double, N> v) {
    424  return _mm_cvtsd_f64(v.raw);
    425 }
    426 
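// Example (illustrative; GetLaneExample is not part of this file): GetLane
// extracts lane 0, so Set followed by GetLane returns the broadcast value.
static HWY_INLINE HWY_MAYBE_UNUSED float GetLaneExample() {
 const Simd<float, 4, 0> d;
 return GetLane(Set(d, 1.5f));  // 1.5f
}
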
    427 // ------------------------------ ResizeBitCast
    428 
    429 template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
    430          HWY_IF_V_SIZE_LE_D(D, 16)>
    431 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
    432  const Repartition<uint8_t, decltype(d)> du8;
    433  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
    434 }
    435 
    436 // ------------------------------ Dup128VecFromValues
    437 
    438 template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
    439 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    440                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    441                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
    442                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
    443                                      TFromD<D> t11, TFromD<D> t12,
    444                                      TFromD<D> t13, TFromD<D> t14,
    445                                      TFromD<D> t15) {
    446  return VFromD<D>{_mm_setr_epi8(
    447      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
    448      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
    449      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
    450      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
    451      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
    452      static_cast<char>(t15))};
    453 }
    454 
    455 template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
    456 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    457                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    458                                      TFromD<D> t5, TFromD<D> t6,
    459                                      TFromD<D> t7) {
    460  return VFromD<D>{
    461      _mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
    462                     static_cast<int16_t>(t2), static_cast<int16_t>(t3),
    463                     static_cast<int16_t>(t4), static_cast<int16_t>(t5),
    464                     static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
    465 }
    466 
    467 // Generic for all vector lengths
    468 template <class D, HWY_IF_BF16_D(D)>
    469 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
    470                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    471                                      TFromD<D> t5, TFromD<D> t6,
    472                                      TFromD<D> t7) {
    473  const RebindToSigned<decltype(d)> di;
    474  return BitCast(d,
    475                 Dup128VecFromValues(
    476                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
    477                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
    478                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
    479                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
    480 }
    481 
    482 #if HWY_HAVE_FLOAT16
    483 template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
    484 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    485                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    486                                      TFromD<D> t5, TFromD<D> t6,
    487                                      TFromD<D> t7) {
    488  return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)};
    489 }
    490 #else
    491 // Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true
    492 template <class D, HWY_IF_F16_D(D)>
    493 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
    494                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    495                                      TFromD<D> t5, TFromD<D> t6,
    496                                      TFromD<D> t7) {
    497  const RebindToSigned<decltype(d)> di;
    498  return BitCast(d,
    499                 Dup128VecFromValues(
    500                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
    501                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
    502                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
    503                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
    504 }
    505 #endif  // HWY_HAVE_FLOAT16
    506 
    507 template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
    508 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    509                                      TFromD<D> t2, TFromD<D> t3) {
    510  return VFromD<D>{
    511      _mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
    512                     static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
    513 }
    514 
    515 template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
    516 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    517                                      TFromD<D> t2, TFromD<D> t3) {
    518  return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)};
    519 }
    520 
    521 template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
    522 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
    523  // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic
    524  // available
    525  return VFromD<D>{
    526      _mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))};
    527 }
    528 
    529 template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
    530 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
    531  return VFromD<D>{_mm_setr_pd(t0, t1)};
    532 }
    533 
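// Example (illustrative; the name is not part of this file): building a
// 4-lane u32 vector from literal lane values.
static HWY_INLINE HWY_MAYBE_UNUSED Vec128<uint32_t, 4> U32FromValuesExample() {
 const Simd<uint32_t, 4, 0> d;
 return Dup128VecFromValues(d, 0u, 1u, 2u, 3u);  // lanes {0, 1, 2, 3}
}
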
    534 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
    535 namespace detail {
    536 
    537 template <class RawV>
    538 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
    539    hwy::SizeTag<1> /* num_of_lanes_tag*/, RawV v) {
    540  return __builtin_constant_p(v[0]);
    541 }
    542 
    543 template <class RawV>
    544 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
    545    hwy::SizeTag<2> /* num_of_lanes_tag*/, RawV v) {
    546  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
    547 }
    548 
    549 template <class RawV>
    550 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
    551    hwy::SizeTag<4> /* num_of_lanes_tag*/, RawV v) {
    552  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
    553         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
    554 }
    555 
    556 template <class RawV>
    557 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
    558    hwy::SizeTag<8> /* num_of_lanes_tag*/, RawV v) {
    559  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
    560         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
    561         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
    562         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
    563 }
    564 
    565 template <class RawV>
    566 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
    567    hwy::SizeTag<16> /* num_of_lanes_tag*/, RawV v) {
    568  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
    569         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
    570         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
    571         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
    572         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
    573         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
    574         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
    575         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
    576 }
    577 
    578 #if HWY_TARGET <= HWY_AVX2
    579 template <class RawV>
    580 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec(
    581    hwy::SizeTag<32> /* num_of_lanes_tag*/, RawV v) {
    582  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
    583         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
    584         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
    585         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
    586         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
    587         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
    588         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
    589         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]) &&
    590         __builtin_constant_p(v[16]) && __builtin_constant_p(v[17]) &&
    591         __builtin_constant_p(v[18]) && __builtin_constant_p(v[19]) &&
    592         __builtin_constant_p(v[20]) && __builtin_constant_p(v[21]) &&
    593         __builtin_constant_p(v[22]) && __builtin_constant_p(v[23]) &&
    594         __builtin_constant_p(v[24]) && __builtin_constant_p(v[25]) &&
    595         __builtin_constant_p(v[26]) && __builtin_constant_p(v[27]) &&
    596         __builtin_constant_p(v[28]) && __builtin_constant_p(v[29]) &&
    597         __builtin_constant_p(v[30]) && __builtin_constant_p(v[31]);
    598 }
    599 #endif
    600 
    601 template <size_t kNumOfLanes, class V>
    602 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86Vec(
    603    hwy::SizeTag<kNumOfLanes> num_of_lanes_tag, V v) {
    604  using T = TFromV<V>;
    605 #if HWY_HAVE_FLOAT16 && HWY_HAVE_SCALAR_F16_TYPE
    606  using F16VecLaneT = hwy::float16_t::Native;
    607 #else
    608  using F16VecLaneT = uint16_t;
    609 #endif
    610  using RawVecLaneT = If<hwy::IsSame<T, hwy::float16_t>(), F16VecLaneT,
    611                         If<hwy::IsSame<T, hwy::bfloat16_t>(), uint16_t, T>>;
    612 
    613  // Suppress the -Wignored-attributes warning that is emitted by
    614  // RemoveCvRef<decltype(v.raw)> with GCC
    615  HWY_DIAGNOSTICS(push)
    616  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
    617  typedef RawVecLaneT GccRawVec
    618      __attribute__((__vector_size__(sizeof(RemoveCvRef<decltype(v.raw)>))));
    619  HWY_DIAGNOSTICS(pop)
    620 
    621  return IsConstantRawX86Vec(num_of_lanes_tag,
    622                             reinterpret_cast<GccRawVec>(v.raw));
    623 }
    624 
    625 template <class TTo, class V>
    626 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86VecForF2IConv(V v) {
    627  constexpr size_t kNumOfLanesInRawSrcVec =
    628      HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TFromV<V>));
    629  constexpr size_t kNumOfLanesInRawResultVec =
    630      HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TTo));
    631  constexpr size_t kNumOfLanesToCheck =
    632      HWY_MIN(kNumOfLanesInRawSrcVec, kNumOfLanesInRawResultVec);
    633 
    634  return IsConstantX86Vec(hwy::SizeTag<kNumOfLanesToCheck>(), v);
    635 }
    636 
    637 }  // namespace detail
    638 #endif  // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
    639 
    640 // ================================================== LOGICAL
    641 
    642 // ------------------------------ And
    643 
    644 template <typename T, size_t N>
    645 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
    646  const DFromV<decltype(a)> d;  // for float16_t
    647  const RebindToUnsigned<decltype(d)> du;
    648  return BitCast(d, VFromD<decltype(du)>{
    649                        _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
    650 }
    651 template <size_t N>
    652 HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) {
    653  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
    654 }
    655 template <size_t N>
    656 HWY_API Vec128<double, N> And(Vec128<double, N> a, Vec128<double, N> b) {
    657  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
    658 }
    659 
    660 // ------------------------------ AndNot
    661 
    662 // Returns ~not_mask & mask.
    663 template <typename T, size_t N>
    664 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
    665  const DFromV<decltype(mask)> d;  // for float16_t
    666  const RebindToUnsigned<decltype(d)> du;
    667  return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128(
    668                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
    669 }
    670 template <size_t N>
    671 HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask,
    672                                Vec128<float, N> mask) {
    673  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
    674 }
    675 template <size_t N>
    676 HWY_API Vec128<double, N> AndNot(Vec128<double, N> not_mask,
    677                                 Vec128<double, N> mask) {
    678  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
    679 }
    680 
    681 // ------------------------------ Or
    682 
    683 template <typename T, size_t N>
    684 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
    685  const DFromV<decltype(a)> d;  // for float16_t
    686  const RebindToUnsigned<decltype(d)> du;
    687  return BitCast(d, VFromD<decltype(du)>{
    688                        _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
    689 }
    690 
    691 template <size_t N>
    692 HWY_API Vec128<float, N> Or(Vec128<float, N> a, Vec128<float, N> b) {
    693  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
    694 }
    695 template <size_t N>
    696 HWY_API Vec128<double, N> Or(Vec128<double, N> a, Vec128<double, N> b) {
    697  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
    698 }
    699 
    700 // ------------------------------ Xor
    701 
    702 template <typename T, size_t N>
    703 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
    704  const DFromV<decltype(a)> d;  // for float16_t
    705  const RebindToUnsigned<decltype(d)> du;
    706  return BitCast(d, VFromD<decltype(du)>{
    707                        _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
    708 }
    709 
    710 template <size_t N>
    711 HWY_API Vec128<float, N> Xor(Vec128<float, N> a, Vec128<float, N> b) {
    712  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
    713 }
    714 template <size_t N>
    715 HWY_API Vec128<double, N> Xor(Vec128<double, N> a, Vec128<double, N> b) {
    716  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
    717 }
    718 
    719 // ------------------------------ Not
    720 template <typename T, size_t N>
    721 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
    722  const DFromV<decltype(v)> d;
    723  const RebindToUnsigned<decltype(d)> du;
    724  using VU = VFromD<decltype(du)>;
    725 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
    726  const __m128i vu = BitCast(du, v).raw;
    727  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
    728 #else
    729  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
    730 #endif
    731 }
    732 
    733 // ------------------------------ Xor3
    734 template <typename T, size_t N>
    735 HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
    736 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
    737  const DFromV<decltype(x1)> d;
    738  const RebindToUnsigned<decltype(d)> du;
    739  using VU = VFromD<decltype(du)>;
    740  const __m128i ret = _mm_ternarylogic_epi64(
    741      BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
    742  return BitCast(d, VU{ret});
    743 #else
    744  return Xor(x1, Xor(x2, x3));
    745 #endif
    746 }
    747 
    748 // ------------------------------ Or3
    749 template <typename T, size_t N>
    750 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
    751 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
    752  const DFromV<decltype(o1)> d;
    753  const RebindToUnsigned<decltype(d)> du;
    754  using VU = VFromD<decltype(du)>;
    755  const __m128i ret = _mm_ternarylogic_epi64(
    756      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
    757  return BitCast(d, VU{ret});
    758 #else
    759  return Or(o1, Or(o2, o3));
    760 #endif
    761 }
    762 
    763 // ------------------------------ OrAnd
    764 template <typename T, size_t N>
    765 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
    766 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
    767  const DFromV<decltype(o)> d;
    768  const RebindToUnsigned<decltype(d)> du;
    769  using VU = VFromD<decltype(du)>;
    770  const __m128i ret = _mm_ternarylogic_epi64(
    771      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
    772  return BitCast(d, VU{ret});
    773 #else
    774  return Or(o, And(a1, a2));
    775 #endif
    776 }
    777 
    778 // ------------------------------ IfVecThenElse
    779 template <typename T, size_t N>
    780 HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
    781                                   Vec128<T, N> no) {
    782 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
    783  const DFromV<decltype(no)> d;
    784  const RebindToUnsigned<decltype(d)> du;
    785  using VU = VFromD<decltype(du)>;
    786  return BitCast(
    787      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
    788                                   BitCast(du, no).raw, 0xCA)});
    789 #else
    790  return IfThenElse(MaskFromVec(mask), yes, no);
    791 #endif
    792 }
    793 
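// Illustrative sketch (BitSelectExample is not part of this file): the 0xCA
// constant above is the truth table for the bitwise select
// (mask & yes) | (~mask & no), which can equivalently be written with the
// And/AndNot/Or ops defined earlier.
template <typename T, size_t N>
static HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> BitSelectExample(
   Vec128<T, N> mask, Vec128<T, N> yes, Vec128<T, N> no) {
 return Or(And(mask, yes), AndNot(mask, no));
}
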
    794 // ------------------------------ BitwiseIfThenElse
    795 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
    796 
    797 #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
    798 #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
    799 #else
    800 #define HWY_NATIVE_BITWISE_IF_THEN_ELSE
    801 #endif
    802 
    803 template <class V>
    804 HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
    805  return IfVecThenElse(mask, yes, no);
    806 }
    807 
    808 #endif
    809 
    810 // ------------------------------ Operator overloads (internal-only if float)
    811 
    812 template <typename T, size_t N>
    813 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
    814  return And(a, b);
    815 }
    816 
    817 template <typename T, size_t N>
    818 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
    819  return Or(a, b);
    820 }
    821 
    822 template <typename T, size_t N>
    823 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
    824  return Xor(a, b);
    825 }
    826 
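// Example (illustrative; XorAssignExample is not part of this file): with
// the non-member operators above, the compound assignments declared in
// Vec128 become usable.
template <typename T, size_t N>
static HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> XorAssignExample(
   Vec128<T, N> a, Vec128<T, N> b) {
 a ^= b;  // Vec128::operator^= forwards to operator^ above
 return a;
}
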
    827 // ------------------------------ PopulationCount
    828 
    829 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
    830 #if HWY_TARGET <= HWY_AVX3_DL
    831 
    832 #ifdef HWY_NATIVE_POPCNT
    833 #undef HWY_NATIVE_POPCNT
    834 #else
    835 #define HWY_NATIVE_POPCNT
    836 #endif
    837 
    838 namespace detail {
    839 
    840 template <typename T, size_t N>
    841 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
    842                                        Vec128<T, N> v) {
    843  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
    844 }
    845 template <typename T, size_t N>
    846 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
    847                                        Vec128<T, N> v) {
    848  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
    849 }
    850 template <typename T, size_t N>
    851 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
    852                                        Vec128<T, N> v) {
    853  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
    854 }
    855 template <typename T, size_t N>
    856 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
    857                                        Vec128<T, N> v) {
    858  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
    859 }
    860 
    861 }  // namespace detail
    862 
    863 template <typename T, size_t N>
    864 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
    865  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
    866 }
    867 
    868 #endif  // HWY_TARGET <= HWY_AVX3_DL
    869 
    870 // ================================================== SIGN
    871 
    872 // ------------------------------ Neg
    873 
    874 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
    875 namespace detail {
    876 
    877 template <typename T, size_t N>
    878 HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
    879  return Xor(v, SignBit(DFromV<decltype(v)>()));
    880 }
    881 
    882 template <typename T, size_t N>
    883 HWY_INLINE Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, const Vec128<T, N> v) {
    884  return Xor(v, SignBit(DFromV<decltype(v)>()));
    885 }
    886 
    887 template <typename T, size_t N>
    888 HWY_INLINE Vec128<T, N> Neg(hwy::SignedTag /*tag*/, const Vec128<T, N> v) {
    889  return Zero(DFromV<decltype(v)>()) - v;
    890 }
    891 
    892 }  // namespace detail
    893 
    894 template <typename T, size_t N>
    895 HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
    896  return detail::Neg(hwy::TypeTag<T>(), v);
    897 }
    898 
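// Example (illustrative; DoubleNegExample is not part of this file): for
// floats, Neg flips only the sign bit, so negating twice restores the input
// exactly, including NaN payloads.
template <size_t N>
static HWY_INLINE HWY_MAYBE_UNUSED Vec128<float, N> DoubleNegExample(
   Vec128<float, N> v) {
 return Neg(Neg(v));  // two sign-bit flips cancel
}
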
    899 // ------------------------------ Floating-point Abs
    900 // Generic for all vector lengths
    901 template <class V, HWY_IF_FLOAT(TFromV<V>)>
    902 HWY_API V Abs(V v) {
    903  const DFromV<decltype(v)> d;
    904  const RebindToSigned<decltype(d)> di;
    905  using TI = TFromD<decltype(di)>;
    906  return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
    907 }
    908 
    909 // ------------------------------ CopySign
    910 // Generic for all vector lengths.
    911 template <class V>
    912 HWY_API V CopySign(const V magn, const V sign) {
    913  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
    914 
    915  const DFromV<decltype(magn)> d;
    916  const auto msb = SignBit(d);
    917 
     918  // Truth table for msb, magn, sign | bitwise msb ? sign : magn
    919  //                  0    0     0   |  0
    920  //                  0    0     1   |  0
    921  //                  0    1     0   |  1
    922  //                  0    1     1   |  1
    923  //                  1    0     0   |  0
    924  //                  1    0     1   |  1
    925  //                  1    1     0   |  0
    926  //                  1    1     1   |  1
    927  return BitwiseIfThenElse(msb, sign, magn);
    928 }
    929 
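// Example (illustrative; CopySignExample is not part of this file): copying
// the sign of -2.0f onto 1.5f yields -1.5f.
template <class D, HWY_IF_F32_D(D)>
static HWY_INLINE HWY_MAYBE_UNUSED float CopySignExample(D d) {
 return GetLane(CopySign(Set(d, 1.5f), Set(d, -2.0f)));  // -1.5f
}
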
    930 // ------------------------------ CopySignToAbs
    931 // Generic for all vector lengths.
    932 template <class V>
    933 HWY_API V CopySignToAbs(const V abs, const V sign) {
    934  const DFromV<decltype(abs)> d;
    935  return OrAnd(abs, SignBit(d), sign);
    936 }
    937 
    938 // ================================================== MASK
    939 
    940 #if HWY_TARGET <= HWY_AVX3
    941 // ------------------------------ MaskFromVec
    942 
    943 namespace detail {
    944 
    945 template <typename T, size_t N>
    946 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
    947                                     const Vec128<T, N> v) {
    948  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
    949 }
    950 template <typename T, size_t N>
    951 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
    952                                     const Vec128<T, N> v) {
    953  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
    954 }
    955 template <typename T, size_t N>
    956 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
    957                                     const Vec128<T, N> v) {
    958  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
    959 }
    960 template <typename T, size_t N>
    961 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
    962                                     const Vec128<T, N> v) {
    963  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
    964 }
    965 
    966 }  // namespace detail
    967 
    968 template <typename T, size_t N>
    969 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
    970  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
    971 }
    972 // There do not seem to be native floating-point versions of these instructions.
    973 #if HWY_HAVE_FLOAT16
    974 template <size_t N>
    975 HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) {
    976  const RebindToSigned<DFromV<decltype(v)>> di;
    977  return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw};
    978 }
    979 #endif
    980 template <size_t N>
    981 HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
    982  const RebindToSigned<DFromV<decltype(v)>> di;
    983  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
    984 }
    985 template <size_t N>
    986 HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
    987  const RebindToSigned<DFromV<decltype(v)>> di;
    988  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
    989 }
    990 
    991 template <class D>
    992 using MFromD = decltype(MaskFromVec(VFromD<D>()));
    993 
    994 // ------------------------------ MaskFalse (MFromD)
    995 
    996 #ifdef HWY_NATIVE_MASK_FALSE
    997 #undef HWY_NATIVE_MASK_FALSE
    998 #else
    999 #define HWY_NATIVE_MASK_FALSE
   1000 #endif
   1001 
   1002 // Generic for all vector lengths
   1003 template <class D>
   1004 HWY_API MFromD<D> MaskFalse(D /*d*/) {
   1005  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
   1006 }
   1007 
   1008 // ------------------------------ SetMask
   1009 #ifdef HWY_NATIVE_SET_MASK
   1010 #undef HWY_NATIVE_SET_MASK
   1011 #else
   1012 #define HWY_NATIVE_SET_MASK
   1013 #endif
   1014 
   1015 template <class D>
   1016 HWY_API MFromD<D> SetMask(D /*d*/, bool val) {
   1017  constexpr uint64_t kMask = (HWY_MAX_LANES_D(D) < 64)
   1018                                 ? ((1ULL << (HWY_MAX_LANES_D(D) & 63)) - 1ULL)
   1019                                 : LimitsMax<uint64_t>();
   1020 
   1021  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(
   1022      static_cast<uint64_t>(-static_cast<int64_t>(val)) & kMask)};
   1023 }
   1024 
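// Example (illustrative; SetMaskExample is not part of this file): for a
// 4-lane u32 vector, kMask above is (1 << 4) - 1, so SetMask(d, true) sets
// exactly the low four bits of the __mmask8.
static HWY_INLINE HWY_MAYBE_UNUSED bool SetMaskExample() {
 const Simd<uint32_t, 4, 0> d;
 return SetMask(d, true).raw == 0xF;
}
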
   1025 // ------------------------------ IsNegative (MFromD)
   1026 #ifdef HWY_NATIVE_IS_NEGATIVE
   1027 #undef HWY_NATIVE_IS_NEGATIVE
   1028 #else
   1029 #define HWY_NATIVE_IS_NEGATIVE
   1030 #endif
   1031 
   1032 // Generic for all vector lengths
   1033 template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
   1034 HWY_API MFromD<DFromV<V>> IsNegative(V v) {
   1035  return MaskFromVec(v);
   1036 }
   1037 
   1038 // ------------------------------ PromoteMaskTo (MFromD)
   1039 
   1040 #ifdef HWY_NATIVE_PROMOTE_MASK_TO
   1041 #undef HWY_NATIVE_PROMOTE_MASK_TO
   1042 #else
   1043 #define HWY_NATIVE_PROMOTE_MASK_TO
   1044 #endif
   1045 
   1046 // AVX3 PromoteMaskTo is generic for all vector lengths
   1047 template <class DTo, class DFrom,
   1048          HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
   1049          class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
   1050          hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
   1051 HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
   1052                                  MFromD<DFrom> m) {
   1053  return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
   1054 }
   1055 
   1056 // ------------------------------ DemoteMaskTo (MFromD)
   1057 
   1058 #ifdef HWY_NATIVE_DEMOTE_MASK_TO
   1059 #undef HWY_NATIVE_DEMOTE_MASK_TO
   1060 #else
   1061 #define HWY_NATIVE_DEMOTE_MASK_TO
   1062 #endif
   1063 
   1064 // AVX3 DemoteMaskTo is generic for all vector lengths
   1065 template <class DTo, class DFrom,
   1066          HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
   1067          class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
   1068          hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
   1069 HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
   1070                                 MFromD<DFrom> m) {
   1071  return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
   1072 }
   1073 
   1074 // ------------------------------ CombineMasks (MFromD)
   1075 
   1076 #ifdef HWY_NATIVE_COMBINE_MASKS
   1077 #undef HWY_NATIVE_COMBINE_MASKS
   1078 #else
   1079 #define HWY_NATIVE_COMBINE_MASKS
   1080 #endif
   1081 
   1082 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
   1083 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
   1084 #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
   1085    HWY_COMPILER_CLANG >= 800
   1086 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
   1087 #else
   1088 #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
   1089 #endif
   1090 #endif  // HWY_COMPILER_HAS_MASK_INTRINSICS
   1091 
   1092 template <class D, HWY_IF_LANES_D(D, 2)>
   1093 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
   1094                               MFromD<Half<D>> lo) {
   1095 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1096  const __mmask8 combined_mask = _kor_mask8(
   1097      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1),
   1098      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1)));
   1099 #else
   1100  const auto combined_mask =
   1101      (static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1);
   1102 #endif
   1103 
   1104  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
   1105 }
   1106 
   1107 template <class D, HWY_IF_LANES_D(D, 4)>
   1108 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
   1109                               MFromD<Half<D>> lo) {
   1110 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1111  const __mmask8 combined_mask = _kor_mask8(
   1112      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2),
   1113      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3)));
   1114 #else
   1115  const auto combined_mask =
   1116      (static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3);
   1117 #endif
   1118 
   1119  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
   1120 }
   1121 
   1122 template <class D, HWY_IF_LANES_D(D, 8)>
   1123 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
   1124                               MFromD<Half<D>> lo) {
   1125 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1126  const __mmask8 combined_mask = _kor_mask8(
   1127      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4),
   1128      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15)));
   1129 #else
   1130  const auto combined_mask =
   1131      (static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u);
   1132 #endif
   1133 
   1134  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
   1135 }
   1136 
   1137 template <class D, HWY_IF_LANES_D(D, 16)>
   1138 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
   1139                               MFromD<Half<D>> lo) {
   1140 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1141  const __mmask16 combined_mask = _mm512_kunpackb(
   1142      static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw));
   1143 #else
   1144  const auto combined_mask =
   1145      ((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu));
   1146 #endif
   1147 
   1148  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
   1149 }
   1150 
   1151 // ------------------------------ LowerHalfOfMask (MFromD)
   1152 
   1153 #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
   1154 #undef HWY_NATIVE_LOWER_HALF_OF_MASK
   1155 #else
   1156 #define HWY_NATIVE_LOWER_HALF_OF_MASK
   1157 #endif
   1158 
   1159 // Generic for all vector lengths
   1160 template <class D>
   1161 HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
   1162  using RawM = decltype(MFromD<D>().raw);
   1163  constexpr size_t kN = MaxLanes(d);
   1164  constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8;
   1165 
   1166  MFromD<D> result_mask{static_cast<RawM>(m.raw)};
   1167 
   1168  if (kN < kNumOfBitsInRawMask) {
   1169    result_mask =
   1170        And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)});
   1171  }
   1172 
   1173  return result_mask;
   1174 }
   1175 
   1176 // ------------------------------ UpperHalfOfMask (MFromD)
   1177 
   1178 #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
   1179 #undef HWY_NATIVE_UPPER_HALF_OF_MASK
   1180 #else
   1181 #define HWY_NATIVE_UPPER_HALF_OF_MASK
   1182 #endif
   1183 
   1184 template <class D, HWY_IF_LANES_D(D, 1)>
   1185 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
   1186 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1187  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1);
   1188 #else
   1189  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1;
   1190 #endif
   1191 
   1192  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
   1193 }
   1194 
   1195 template <class D, HWY_IF_LANES_D(D, 2)>
   1196 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
   1197 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1198  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2);
   1199 #else
   1200  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2;
   1201 #endif
   1202 
   1203  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
   1204 }
   1205 
   1206 template <class D, HWY_IF_LANES_D(D, 4)>
   1207 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
   1208 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1209  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4);
   1210 #else
   1211  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4;
   1212 #endif
   1213 
   1214  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
   1215 }
   1216 
   1217 template <class D, HWY_IF_LANES_D(D, 8)>
   1218 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
   1219 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1220  const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8);
   1221 #else
   1222  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8;
   1223 #endif
   1224 
   1225  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
   1226 }
   1227 
   1228 // ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks)
   1229 
   1230 #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
   1231 #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
   1232 #else
   1233 #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
   1234 #endif
   1235 
   1236 // Generic for all vector lengths
   1237 template <class DTo, class DFrom,
   1238          HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
   1239          class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
   1240          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
   1241 HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
   1242                                          MFromD<DFrom> a, MFromD<DFrom> b) {
   1243  using MH = MFromD<Half<DTo>>;
   1244  using RawMH = decltype(MH().raw);
   1245 
   1246  return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)},
   1247                      MH{static_cast<RawMH>(a.raw)});
   1248 }
   1249 
   1250 // ------------------------------ Slide mask up/down
   1251 #ifdef HWY_NATIVE_SLIDE_MASK
   1252 #undef HWY_NATIVE_SLIDE_MASK
   1253 #else
   1254 #define HWY_NATIVE_SLIDE_MASK
   1255 #endif
   1256 
   1257 template <class D, HWY_IF_LANES_LE_D(D, 8)>
   1258 HWY_API MFromD<D> SlideMask1Up(D d, MFromD<D> m) {
   1259  using RawM = decltype(MFromD<D>().raw);
   1260  constexpr size_t kN = MaxLanes(d);
   1261  constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
   1262 
   1263 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1264  MFromD<D> result_mask{
   1265      static_cast<RawM>(_kshiftli_mask8(static_cast<__mmask8>(m.raw), 1))};
   1266 
   1267  if (kN < 8) {
   1268    result_mask =
   1269        And(result_mask, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
   1270  }
   1271 #else
   1272  MFromD<D> result_mask{
   1273      static_cast<RawM>((static_cast<unsigned>(m.raw) << 1) & kValidLanesMask)};
   1274 #endif
   1275 
   1276  return result_mask;
   1277 }
   1278 
   1279 template <class D, HWY_IF_LANES_D(D, 16)>
   1280 HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
   1281  using RawM = decltype(MFromD<D>().raw);
   1282 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1283  return MFromD<D>{
   1284      static_cast<RawM>(_kshiftli_mask16(static_cast<__mmask16>(m.raw), 1))};
   1285 #else
   1286  return MFromD<D>{static_cast<RawM>(static_cast<unsigned>(m.raw) << 1)};
   1287 #endif
   1288 }
   1289 
   1290 template <class D, HWY_IF_LANES_LE_D(D, 8)>
   1291 HWY_API MFromD<D> SlideMask1Down(D d, MFromD<D> m) {
   1292  using RawM = decltype(MFromD<D>().raw);
   1293  constexpr size_t kN = MaxLanes(d);
   1294  constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
   1295 
   1296 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1297  if (kN < 8) {
   1298    m = And(m, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
   1299  }
   1300 
   1301  return MFromD<D>{
   1302      static_cast<RawM>(_kshiftri_mask8(static_cast<__mmask8>(m.raw), 1))};
   1303 #else
   1304  return MFromD<D>{
   1305      static_cast<RawM>((static_cast<unsigned>(m.raw) & kValidLanesMask) >> 1)};
   1306 #endif
   1307 }
   1308 
   1309 template <class D, HWY_IF_LANES_D(D, 16)>
   1310 HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
   1311  using RawM = decltype(MFromD<D>().raw);
   1312 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1313  return MFromD<D>{
   1314      static_cast<RawM>(_kshiftri_mask16(static_cast<__mmask16>(m.raw), 1))};
   1315 #else
   1316  return MFromD<D>{
   1317      static_cast<RawM>((static_cast<unsigned>(m.raw) & 0xFFFFu) >> 1)};
   1318 #endif
   1319 }
   1320 
   1321 // Generic for all vector lengths
   1322 template <class D>
   1323 HWY_API MFromD<D> SlideMaskUpLanes(D d, MFromD<D> m, size_t amt) {
   1324  using RawM = decltype(MFromD<D>().raw);
   1325  constexpr size_t kN = MaxLanes(d);
   1326  constexpr uint64_t kValidLanesMask =
   1327      static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
   1328 
   1329  return MFromD<D>{static_cast<RawM>(
   1330      (static_cast<uint64_t>(m.raw) << (amt & 63)) & kValidLanesMask)};
   1331 }
   1332 
   1333 // Generic for all vector lengths
   1334 template <class D>
   1335 HWY_API MFromD<D> SlideMaskDownLanes(D d, MFromD<D> m, size_t amt) {
   1336  using RawM = decltype(MFromD<D>().raw);
   1337  constexpr size_t kN = MaxLanes(d);
   1338  constexpr uint64_t kValidLanesMask =
   1339      static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
   1340 
   1341  return MFromD<D>{static_cast<RawM>(
   1342      (static_cast<uint64_t>(m.raw) & kValidLanesMask) >> (amt & 63))};
   1343 }
   1344 
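// Example (illustrative; ClearLane0MaskExample is not part of this file):
// sliding the mask down one lane and back up clears lane 0's predicate bit
// while leaving all other lanes unchanged.
template <class D>
static HWY_INLINE HWY_MAYBE_UNUSED MFromD<D> ClearLane0MaskExample(
   D d, MFromD<D> m) {
 return SlideMaskUpLanes(d, SlideMaskDownLanes(d, m, 1), 1);
}
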
   1345 // ------------------------------ VecFromMask
   1346 
   1347 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   1348 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
   1349  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
   1350 }
   1351 
   1352 template <typename T, size_t N, HWY_IF_UI16(T)>
   1353 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
   1354  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
   1355 }
   1356 
   1357 template <typename T, size_t N, HWY_IF_UI32(T)>
   1358 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
   1359  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
   1360 }
   1361 
   1362 template <typename T, size_t N, HWY_IF_UI64(T)>
   1363 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
   1364  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
   1365 }
   1366 
   1367 #if HWY_HAVE_FLOAT16
   1368 template <size_t N>
   1369 HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
   1370  return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
   1371 }
   1372 #endif  // HWY_HAVE_FLOAT16
   1373 
   1374 template <size_t N>
   1375 HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
   1376  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
   1377 }
   1378 
   1379 template <size_t N>
   1380 HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
   1381  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
   1382 }
   1383 
   1384 // Generic for all vector lengths.
   1385 template <class D>
   1386 HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
   1387  return VecFromMask(v);
   1388 }
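
// Usage sketch (illustrative): under AVX3, a mask is a compressed bitmask in
// a k-register, so VecFromMask materializes it as all-ones/all-zeros lanes
// via vpmovm2b/w/d/q:
//   const Full128<int32_t> d;
//   const auto m = FirstN(d, 2);       // lanes 0 and 1 true
//   const auto v = VecFromMask(d, m);  // {0, 0, -1, -1}, lane 0 rightmost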
   1389 
   1390 // ------------------------------ RebindMask (MaskFromVec)
   1391 
   1392 template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
   1393 HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
   1394  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
   1395  return MFromD<DTo>{m.raw};
   1396 }
   1397 
   1398 // ------------------------------ IfThenElse
   1399 
   1400 namespace detail {
   1401 
   1402 template <typename T, size_t N>
   1403 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
   1404                                   Mask128<T, N> mask, Vec128<T, N> yes,
   1405                                   Vec128<T, N> no) {
   1406  return Vec128<T, N>{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)};
   1407 }
   1408 template <typename T, size_t N>
   1409 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
   1410                                   Mask128<T, N> mask, Vec128<T, N> yes,
   1411                                   Vec128<T, N> no) {
   1412  return Vec128<T, N>{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)};
   1413 }
   1414 template <typename T, size_t N>
   1415 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
   1416                                   Mask128<T, N> mask, Vec128<T, N> yes,
   1417                                   Vec128<T, N> no) {
   1418  return Vec128<T, N>{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)};
   1419 }
   1420 template <typename T, size_t N>
   1421 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
   1422                                   Mask128<T, N> mask, Vec128<T, N> yes,
   1423                                   Vec128<T, N> no) {
   1424  return Vec128<T, N>{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)};
   1425 }
   1426 
   1427 }  // namespace detail
   1428 
   1429 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
   1430 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
   1431                                Vec128<T, N> no) {
   1432  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
   1433 }
   1434 
   1435 #if HWY_HAVE_FLOAT16
   1436 template <size_t N>
   1437 HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask,
   1438                                        Vec128<float16_t, N> yes,
   1439                                        Vec128<float16_t, N> no) {
   1440  return Vec128<float16_t, N>{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)};
   1441 }
   1442 #endif  // HWY_HAVE_FLOAT16
   1443 
   1444 // Generic for all vector lengths.
   1445 template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)>
   1446 HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
   1447  const RebindToUnsigned<D> du;
   1448  return BitCast(
   1449      D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
   1450 }
   1451 
   1452 template <size_t N>
   1453 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
   1454                                    Vec128<float, N> yes, Vec128<float, N> no) {
   1455  return Vec128<float, N>{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)};
   1456 }
   1457 
   1458 template <size_t N>
   1459 HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
   1460                                     Vec128<double, N> yes,
   1461                                     Vec128<double, N> no) {
   1462  return Vec128<double, N>{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)};
   1463 }
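
// Usage sketch (illustrative): IfThenElse selects per lane and compiles to a
// single masked blend under AVX3, e.g. clamping negatives to zero for some
// Vec128<float> v:
//   const Full128<float> d;
//   const auto clamped = IfThenElse(Lt(v, Zero(d)), Zero(d), v);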
   1464 
   1465 namespace detail {
   1466 
   1467 template <typename T, size_t N>
   1468 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
   1469                                       Mask128<T, N> mask, Vec128<T, N> yes) {
   1470  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
   1471 }
   1472 template <typename T, size_t N>
   1473 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
   1474                                       Mask128<T, N> mask, Vec128<T, N> yes) {
   1475  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
   1476 }
   1477 template <typename T, size_t N>
   1478 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
   1479                                       Mask128<T, N> mask, Vec128<T, N> yes) {
   1480  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
   1481 }
   1482 template <typename T, size_t N>
   1483 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
   1484                                       Mask128<T, N> mask, Vec128<T, N> yes) {
   1485  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
   1486 }
   1487 
   1488 }  // namespace detail
   1489 
   1490 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
   1491 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   1492  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
   1493 }
   1494 
   1495 template <size_t N>
   1496 HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
   1497                                        Vec128<float, N> yes) {
   1498  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
   1499 }
   1500 
   1501 template <size_t N>
   1502 HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
   1503                                         Vec128<double, N> yes) {
   1504  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
   1505 }
   1506 
   1507 // Generic for all vector lengths.
   1508 template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
   1509 HWY_API V IfThenElseZero(MFromD<D> mask, V yes) {
   1510  const RebindToUnsigned<D> du;
   1511  return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
   1512 }
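
// Usage sketch (illustrative): for some Vec128<int32_t> v and mask m,
//   IfThenElseZero(m, v);  // lane i: m ? v : 0
//   IfThenZeroElse(m, v);  // lane i: m ? 0 : v
// The sub/xor forms below zero the selected lanes directly, avoiding a
// mask inversion before a zero-masked move.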
   1513 
   1514 namespace detail {
   1515 
   1516 template <typename T, size_t N>
   1517 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
   1518                                       Mask128<T, N> mask, Vec128<T, N> no) {
1519  // xor_epi8/16 are missing; masked sub (no - no = 0) is just as fast for u8/16.
   1520  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
   1521 }
   1522 template <typename T, size_t N>
   1523 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
   1524                                       Mask128<T, N> mask, Vec128<T, N> no) {
   1525  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
   1526 }
   1527 template <typename T, size_t N>
   1528 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
   1529                                       Mask128<T, N> mask, Vec128<T, N> no) {
   1530  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
   1531 }
   1532 template <typename T, size_t N>
   1533 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
   1534                                       Mask128<T, N> mask, Vec128<T, N> no) {
   1535  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
   1536 }
   1537 
   1538 }  // namespace detail
   1539 
   1540 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
   1541 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   1542  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
   1543 }
   1544 
   1545 template <size_t N>
   1546 HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
   1547                                        Vec128<float, N> no) {
   1548  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
   1549 }
   1550 
   1551 template <size_t N>
   1552 HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
   1553                                         Vec128<double, N> no) {
   1554  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
   1555 }
   1556 
   1557 // Generic for all vector lengths.
   1558 template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
   1559 HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {
   1560  const RebindToUnsigned<D> du;
   1561  return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
   1562 }
   1563 
   1564 // ------------------------------ Mask logical
   1565 
   1566 namespace detail {
   1567 
   1568 template <typename T, size_t N>
   1569 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
   1570                             const Mask128<T, N> b) {
   1571 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1572  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
   1573 #else
   1574  return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
   1575 #endif
   1576 }
   1577 template <typename T, size_t N>
   1578 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
   1579                             const Mask128<T, N> b) {
   1580 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1581  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
   1582 #else
   1583  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
   1584 #endif
   1585 }
   1586 template <typename T, size_t N>
   1587 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
   1588                             const Mask128<T, N> b) {
   1589 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1590  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
   1591 #else
   1592  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
   1593 #endif
   1594 }
   1595 template <typename T, size_t N>
   1596 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
   1597                             const Mask128<T, N> b) {
   1598 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1599  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
   1600 #else
   1601  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
   1602 #endif
   1603 }
   1604 
   1605 template <typename T, size_t N>
   1606 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
   1607                                const Mask128<T, N> b) {
   1608 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1609  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
   1610 #else
   1611  return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
   1612 #endif
   1613 }
   1614 template <typename T, size_t N>
   1615 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
   1616                                const Mask128<T, N> b) {
   1617 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1618  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
   1619 #else
   1620  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
   1621 #endif
   1622 }
   1623 template <typename T, size_t N>
   1624 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
   1625                                const Mask128<T, N> b) {
   1626 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1627  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
   1628 #else
   1629  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
   1630 #endif
   1631 }
   1632 template <typename T, size_t N>
   1633 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
   1634                                const Mask128<T, N> b) {
   1635 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1636  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
   1637 #else
   1638  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
   1639 #endif
   1640 }
   1641 
   1642 template <typename T, size_t N>
   1643 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
   1644                            const Mask128<T, N> b) {
   1645 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1646  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
   1647 #else
   1648  return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
   1649 #endif
   1650 }
   1651 template <typename T, size_t N>
   1652 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
   1653                            const Mask128<T, N> b) {
   1654 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1655  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
   1656 #else
   1657  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
   1658 #endif
   1659 }
   1660 template <typename T, size_t N>
   1661 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
   1662                            const Mask128<T, N> b) {
   1663 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1664  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
   1665 #else
   1666  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
   1667 #endif
   1668 }
   1669 template <typename T, size_t N>
   1670 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
   1671                            const Mask128<T, N> b) {
   1672 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1673  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
   1674 #else
   1675  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
   1676 #endif
   1677 }
   1678 
   1679 template <typename T, size_t N>
   1680 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
   1681                             const Mask128<T, N> b) {
   1682 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1683  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
   1684 #else
   1685  return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
   1686 #endif
   1687 }
   1688 template <typename T, size_t N>
   1689 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
   1690                             const Mask128<T, N> b) {
   1691 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1692  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
   1693 #else
   1694  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
   1695 #endif
   1696 }
   1697 template <typename T, size_t N>
   1698 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
   1699                             const Mask128<T, N> b) {
   1700 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1701  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
   1702 #else
   1703  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
   1704 #endif
   1705 }
   1706 template <typename T, size_t N>
   1707 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
   1708                             const Mask128<T, N> b) {
   1709 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1710  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
   1711 #else
   1712  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
   1713 #endif
   1714 }
   1715 
   1716 template <typename T, size_t N>
   1717 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
   1718                                          const Mask128<T, N> a,
   1719                                          const Mask128<T, N> b) {
   1720 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1721  return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
   1722 #else
   1723  return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
   1724 #endif
   1725 }
   1726 template <typename T, size_t N>
   1727 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
   1728                                          const Mask128<T, N> a,
   1729                                          const Mask128<T, N> b) {
   1730 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1731  return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
   1732 #else
   1733  return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
   1734 #endif
   1735 }
   1736 template <typename T, size_t N>
   1737 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
   1738                                          const Mask128<T, N> a,
   1739                                          const Mask128<T, N> b) {
   1740 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1741  return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
   1742 #else
   1743  return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
   1744 #endif
   1745 }
   1746 template <typename T, size_t N>
   1747 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
   1748                                          const Mask128<T, N> a,
   1749                                          const Mask128<T, N> b) {
   1750 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1751  return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
   1752 #else
   1753  return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
   1754 #endif
   1755 }
   1756 
   1757 // UnmaskedNot returns ~m.raw without zeroing out any invalid bits
   1758 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   1759 HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
   1760 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1761  return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))};
   1762 #else
   1763  return Mask128<T, N>{static_cast<__mmask16>(~m.raw)};
   1764 #endif
   1765 }
   1766 
   1767 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
   1768 HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
   1769 #if HWY_COMPILER_HAS_MASK_INTRINSICS
   1770  return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))};
   1771 #else
   1772  return Mask128<T, N>{static_cast<__mmask8>(~m.raw)};
   1773 #endif
   1774 }
   1775 
   1776 template <typename T>
   1777 HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
   1778  // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid
   1779  return UnmaskedNot(m);
   1780 }
   1781 template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)>
   1782 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) {
   1783  // sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there
   1784  // are fewer than 16 valid bits in m
   1785 
   1786  // Return (~m) & ((1ull << N) - 1)
   1787  return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
   1788 }
   1789 template <typename T>
   1790 HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
   1791  // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid
   1792  return UnmaskedNot(m);
   1793 }
   1794 template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)>
   1795 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) {
   1796  // sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there
   1797  // are fewer than 8 valid bits in m
   1798 
   1799  // Return (~m) & ((1ull << N) - 1)
   1800  return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
   1801 }
   1802 template <typename T, size_t N>
   1803 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) {
   1804  // sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most
   1805  // 4 valid bits in m
   1806 
   1807  // Return (~m) & ((1ull << N) - 1)
   1808  return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
   1809 }
   1810 template <typename T, size_t N>
   1811 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) {
   1812  // sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most
   1813  // 2 valid bits in m
   1814 
   1815  // Return (~m) & ((1ull << N) - 1)
   1816  return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
   1817 }
   1818 
   1819 }  // namespace detail
   1820 
   1821 template <typename T, size_t N>
   1822 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
   1823  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
   1824 }
   1825 
   1826 template <typename T, size_t N>
   1827 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
   1828  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
   1829 }
   1830 
   1831 template <typename T, size_t N>
   1832 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
   1833  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
   1834 }
   1835 
   1836 template <typename T, size_t N>
   1837 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
   1838  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
   1839 }
   1840 
   1841 template <typename T, size_t N>
   1842 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
   1843  // Flip only the valid bits
   1844  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
   1845 }
   1846 
   1847 template <typename T, size_t N>
   1848 HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
   1849  return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
   1850 }
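
// Usage sketch (illustrative): AVX3 mask logic operates directly on the
// compressed bitmask, e.g. for Full128<uint8_t> d (16 lanes):
//   const auto lo = FirstN(d, 4);        // bits 0x000F
//   const auto hi = Not(FirstN(d, 12));  // bits 0xF000
//   const auto none = And(lo, hi);       // bits 0x0000
//   const auto both = Or(lo, hi);        // bits 0xF00F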
   1851 
   1852 #else  // AVX2 or below
   1853 
   1854 // ------------------------------ Mask
   1855 
   1856 // Mask and Vec are the same (true = FF..FF).
   1857 template <typename T, size_t N>
   1858 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
   1859  return Mask128<T, N>{v.raw};
   1860 }
   1861 
   1862 template <class D>
   1863 using MFromD = decltype(MaskFromVec(VFromD<D>()));
   1864 
   1865 template <typename T, size_t N>
   1866 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
   1867  return Vec128<T, N>{v.raw};
   1868 }
   1869 
   1870 // Generic for all vector lengths.
   1871 template <class D>
   1872 HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
   1873  return VecFromMask(v);
   1874 }
   1875 
   1876 #if HWY_TARGET >= HWY_SSSE3
   1877 
   1878 // mask ? yes : no
   1879 template <typename T, size_t N>
   1880 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
   1881                                Vec128<T, N> no) {
   1882  const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
   1883  return Or(And(vmask, yes), AndNot(vmask, no));
   1884 }
   1885 
   1886 #else  // HWY_TARGET < HWY_SSSE3
   1887 
   1888 // mask ? yes : no
   1889 template <typename T, size_t N>
   1890 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
   1891                                Vec128<T, N> no) {
   1892  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
   1893 }
   1894 template <size_t N>
   1895 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
   1896                                    Vec128<float, N> yes, Vec128<float, N> no) {
   1897  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
   1898 }
   1899 template <size_t N>
   1900 HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
   1901                                     Vec128<double, N> yes,
   1902                                     Vec128<double, N> no) {
   1903  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
   1904 }
   1905 
   1906 #endif  // HWY_TARGET >= HWY_SSSE3
   1907 
   1908 // mask ? yes : 0
   1909 template <typename T, size_t N>
   1910 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   1911  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
   1912 }
   1913 
   1914 // mask ? 0 : no
   1915 template <typename T, size_t N>
   1916 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   1917  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
   1918 }
   1919 
   1920 // ------------------------------ Mask logical
   1921 
   1922 template <typename T, size_t N>
   1923 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
   1924  const Simd<T, N, 0> d;
   1925  return MaskFromVec(Not(VecFromMask(d, m)));
   1926 }
   1927 
   1928 template <typename T, size_t N>
   1929 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
   1930  const Simd<T, N, 0> d;
   1931  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
   1932 }
   1933 
   1934 template <typename T, size_t N>
   1935 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
   1936  const Simd<T, N, 0> d;
   1937  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
   1938 }
   1939 
   1940 template <typename T, size_t N>
   1941 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
   1942  const Simd<T, N, 0> d;
   1943  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
   1944 }
   1945 
   1946 template <typename T, size_t N>
   1947 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
   1948  const Simd<T, N, 0> d;
   1949  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
   1950 }
   1951 
   1952 template <typename T, size_t N>
   1953 HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
   1954  const Simd<T, N, 0> d;
   1955  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
   1956 }
   1957 
   1958 #endif  // HWY_TARGET <= HWY_AVX3
   1959 
   1960 // ------------------------------ ShiftLeft
   1961 
   1962 template <int kBits, size_t N>
   1963 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
   1964  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
   1965 }
   1966 
   1967 template <int kBits, size_t N>
   1968 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
   1969  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
   1970 }
   1971 
   1972 template <int kBits, size_t N>
   1973 HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
   1974  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
   1975 }
   1976 
   1977 template <int kBits, size_t N>
   1978 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
   1979  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
   1980 }
   1981 template <int kBits, size_t N>
   1982 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
   1983  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
   1984 }
   1985 template <int kBits, size_t N>
   1986 HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
   1987  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
   1988 }
   1989 
   1990 #if HWY_TARGET <= HWY_AVX3_DL
   1991 
   1992 namespace detail {
   1993 template <typename T, size_t N>
   1994 HWY_API Vec128<T, N> GaloisAffine(
   1995    Vec128<T, N> v, VFromD<Repartition<uint64_t, Simd<T, N, 0>>> matrix) {
   1996  return Vec128<T, N>{_mm_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
   1997 }
   1998 }  // namespace detail
   1999 
   2000 #else  // HWY_TARGET > HWY_AVX3_DL
   2001 
   2002 template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   2003 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
   2004  const DFromV<decltype(v)> d8;
   2005  // Use raw instead of BitCast to support N=1.
   2006  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
   2007  return kBits == 1
   2008             ? (v + v)
   2009             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
   2010 }
   2011 
   2012 #endif  // HWY_TARGET > HWY_AVX3_DL
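
// Worked example for the 8-bit ShiftLeft above (illustrative): x86 lacks an
// 8-bit shift, so bytes are shifted within 16-bit lanes. Each byte's low
// kBits bits may then contain bits leaked in from its lower neighbor; the
// constant (0xFF << kBits) & 0xFF clears exactly those. E.g. kBits=3 turns
// lane 0xAB into (0xAB << 3) & 0xFF = 0x58 after masking with 0xF8.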
   2013 
   2014 // ------------------------------ ShiftRight
   2015 
   2016 template <int kBits, size_t N>
   2017 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
   2018  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
   2019 }
   2020 template <int kBits, size_t N>
   2021 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
   2022  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
   2023 }
   2024 template <int kBits, size_t N>
   2025 HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
   2026  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
   2027 }
   2028 
   2029 template <int kBits, size_t N>
   2030 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
   2031  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
   2032 }
   2033 template <int kBits, size_t N>
   2034 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
   2035  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
   2036 }
   2037 
   2038 #if HWY_TARGET > HWY_AVX3_DL
   2039 
   2040 template <int kBits, size_t N>
   2041 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
   2042  const DFromV<decltype(v)> d8;
   2043  // Use raw instead of BitCast to support N=1.
   2044  const Vec128<uint8_t, N> shifted{
   2045      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
   2046  return shifted & Set(d8, 0xFF >> kBits);
   2047 }
   2048 
   2049 template <int kBits, size_t N>
   2050 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
   2051  const DFromV<decltype(v)> di;
   2052  const RebindToUnsigned<decltype(di)> du;
   2053  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
   2054  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
   2055  return (shifted ^ shifted_sign) - shifted_sign;
   2056 }
   2057 
   2058 #endif  // HWY_TARGET > HWY_AVX3_DL
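
// Worked example for the signed 8-bit ShiftRight above (illustrative): the
// logical shift is sign-extended via (shifted ^ sign) - sign with
// sign = 0x80 >> kBits. E.g. kBits=2, lane 0x80 (-128): the logical shift
// gives 0x20, and (0x20 ^ 0x20) - 0x20 = 0xE0 = -32, the arithmetic result.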
   2059 
   2060 // i64 is implemented after BroadcastSignBit.
   2061 
   2062 // ================================================== MEMORY (1)
   2063 
   2064 // Clang static analysis claims the memory immediately after a partial vector
   2065 // store is uninitialized, and also flags the input to partial loads (at least
2066 // for loadl_pd) as "garbage". Since 2025-07, MSAN also raises errors. We
   2067 // work around this by using CopyBytes instead of intrinsics, but only for MSAN
   2068 // and static analyzer builds to avoid potentially bad code generation.
   2069 // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
   2070 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
   2071 #if HWY_IS_MSAN || (defined(__clang_analyzer__) || \
   2072                    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700))
   2073 #define HWY_SAFE_PARTIAL_LOAD_STORE 1
   2074 #else
   2075 #define HWY_SAFE_PARTIAL_LOAD_STORE 0
   2076 #endif
   2077 #endif  // HWY_SAFE_PARTIAL_LOAD_STORE
   2078 
   2079 // ------------------------------ Load
   2080 
   2081 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
   2082 HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
   2083  return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
   2084 }
   2085 #if HWY_HAVE_FLOAT16
   2086 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
   2087 HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) {
   2088  return Vec128<float16_t>{_mm_load_ph(aligned)};
   2089 }
   2090 #endif  // HWY_HAVE_FLOAT16
   2091 // Generic for all vector lengths greater than or equal to 16 bytes.
   2092 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
   2093 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
   2094  const RebindToUnsigned<decltype(d)> du;
   2095  return BitCast(d, Load(du, detail::U16LanePointer(aligned)));
   2096 }
   2097 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   2098 HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
   2099  return Vec128<float>{_mm_load_ps(aligned)};
   2100 }
   2101 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   2102 HWY_API Vec128<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) {
   2103  return Vec128<double>{_mm_load_pd(aligned)};
   2104 }
   2105 
   2106 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
   2107 HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
   2108  return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
   2109 }
   2110 #if HWY_HAVE_FLOAT16
   2111 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
   2112 HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) {
   2113  return Vec128<float16_t>{_mm_loadu_ph(p)};
   2114 }
   2115 #endif  // HWY_HAVE_FLOAT16
   2116 // Generic for all vector lengths greater than or equal to 16 bytes.
   2117 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
   2118 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
   2119  const RebindToUnsigned<decltype(d)> du;
   2120  return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
   2121 }
   2122 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   2123 HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
   2124  return Vec128<float>{_mm_loadu_ps(p)};
   2125 }
   2126 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   2127 HWY_API Vec128<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
   2128  return Vec128<double>{_mm_loadu_pd(p)};
   2129 }
   2130 
   2131 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
   2132 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
   2133  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   2134 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2135  __m128i v = _mm_setzero_si128();
   2136  CopyBytes<8>(p, &v);  // not same size
   2137 #else
   2138  const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p));
   2139 #endif
   2140  return BitCast(d, VFromD<decltype(du)>{v});
   2141 }
   2142 
   2143 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
   2144 HWY_API Vec64<float> Load(D /* tag */, const float* HWY_RESTRICT p) {
   2145 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2146  __m128 v = _mm_setzero_ps();
   2147  CopyBytes<8>(p, &v);  // not same size
   2148  return Vec64<float>{v};
   2149 #else
   2150  const __m128 hi = _mm_setzero_ps();
   2151  return Vec64<float>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
   2152 #endif
   2153 }
   2154 
   2155 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
   2156 HWY_API Vec64<double> Load(D /* tag */, const double* HWY_RESTRICT p) {
   2157 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2158  __m128d v = _mm_setzero_pd();
   2159  CopyBytes<8>(p, &v);  // not same size
   2160  return Vec64<double>{v};
   2161 #else
   2162  return Vec64<double>{_mm_load_sd(p)};
   2163 #endif
   2164 }
   2165 
   2166 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
   2167 HWY_API Vec32<float> Load(D /* tag */, const float* HWY_RESTRICT p) {
   2168 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2169  __m128 v = _mm_setzero_ps();
   2170  CopyBytes<4>(p, &v);  // not same size
   2171  return Vec32<float>{v};
   2172 #else
   2173  return Vec32<float>{_mm_load_ss(p)};
   2174 #endif
   2175 }
   2176 
   2177 // Any <= 32 bit except <float, 1>
   2178 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
   2179 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
   2180  const RebindToUnsigned<decltype(d)> du;  // for float16_t
2181  // Clang's ArgumentPromotionPass seems to break this code: even if we unpoison
2182  // before SetTableIndices -> LoadU -> Load, the memory is poisoned again.
   2183  detail::MaybeUnpoison(p, Lanes(d));
   2184 
   2185 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2186  __m128i v = Zero(Full128<TFromD<decltype(du)>>()).raw;
   2187  CopyBytes<d.MaxBytes()>(p, &v);  // not same size as VFromD
   2188 #else
   2189  int32_t bits = 0;
   2190  CopyBytes<d.MaxBytes()>(p, &bits);  // not same size as VFromD
   2191  const __m128i v = _mm_cvtsi32_si128(bits);
   2192 #endif
   2193  return BitCast(d, VFromD<decltype(du)>{v});
   2194 }
   2195 
   2196 // For < 128 bit, LoadU == Load.
   2197 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   2198 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
   2199  return Load(d, p);
   2200 }
   2201 
   2202 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
   2203 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   2204 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
   2205  return LoadU(d, p);
   2206 }
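
// Usage sketch (illustrative): Load requires alignment to the vector size,
// LoadU does not, and partial vectors read only their MaxBytes():
//   alignas(16) float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   const auto v4 = Load(Full128<float>(), in);      // 4 lanes, aligned
//   const auto v2 = LoadU(Full64<float>(), in + 1);  // 2 lanes, unaligned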
   2207 
   2208 // ------------------------------ Store
   2209 
   2210 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
   2211 HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
   2212  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
   2213 }
   2214 #if HWY_HAVE_FLOAT16
   2215 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
   2216 HWY_API void Store(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT aligned) {
   2217  _mm_store_ph(aligned, v.raw);
   2218 }
   2219 #endif  // HWY_HAVE_FLOAT16
   2220 // Generic for all vector lengths greater than or equal to 16 bytes.
   2221 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
   2222 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
   2223  const RebindToUnsigned<decltype(d)> du;
   2224  Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
   2225 }
   2226 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   2227 HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
   2228  _mm_store_ps(aligned, v.raw);
   2229 }
   2230 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   2231 HWY_API void Store(Vec128<double> v, D /* tag */,
   2232                   double* HWY_RESTRICT aligned) {
   2233  _mm_store_pd(aligned, v.raw);
   2234 }
   2235 
   2236 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
   2237 HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
   2238  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
   2239 }
   2240 #if HWY_HAVE_FLOAT16
   2241 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
   2242 HWY_API void StoreU(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT p) {
   2243  _mm_storeu_ph(p, v.raw);
   2244 }
   2245 #endif  // HWY_HAVE_FLOAT16
   2246 // Generic for all vector lengths greater than or equal to 16 bytes.
   2247 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
   2248 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   2249  const RebindToUnsigned<decltype(d)> du;
   2250  StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
   2251 }
   2252 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   2253 HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) {
   2254  _mm_storeu_ps(p, v.raw);
   2255 }
   2256 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   2257 HWY_API void StoreU(Vec128<double> v, D /* tag */, double* HWY_RESTRICT p) {
   2258  _mm_storeu_pd(p, v.raw);
   2259 }
   2260 
   2261 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
   2262 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   2263 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2264  (void)d;
   2265  CopyBytes<8>(&v, p);  // not same size
   2266 #else
   2267  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   2268  _mm_storel_epi64(reinterpret_cast<__m128i*>(p), BitCast(du, v).raw);
   2269 #endif
   2270 }
   2271 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
   2272 HWY_API void Store(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
   2273 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2274  CopyBytes<8>(&v, p);  // not same size
   2275 #else
   2276  _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
   2277 #endif
   2278 }
   2279 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
   2280 HWY_API void Store(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) {
   2281 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2282  CopyBytes<8>(&v, p);  // not same size
   2283 #else
   2284  _mm_storel_pd(p, v.raw);
   2285 #endif
   2286 }
   2287 
   2288 // Any <= 32 bit except <float, 1>
   2289 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)>
   2290 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   2291  CopyBytes<d.MaxBytes()>(&v, p);  // not same size
   2292 }
   2293 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
   2294 HWY_API void Store(Vec32<float> v, D /* tag */, float* HWY_RESTRICT p) {
   2295 #if HWY_SAFE_PARTIAL_LOAD_STORE
   2296  CopyBytes<4>(&v, p);  // not same size
   2297 #else
   2298  _mm_store_ss(p, v.raw);
   2299 #endif
   2300 }
   2301 
   2302 // For < 128 bit, StoreU == Store.
   2303 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   2304 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   2305  Store(v, d, p);
   2306 }
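
// Usage sketch (illustrative): partial stores mirror partial loads and write
// only MaxBytes(), so storing some Vec32<float> v1 touches exactly 4 bytes:
//   float out;
//   Store(v1, Full32<float>(), &out);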
   2307 
   2308 // ================================================== SWIZZLE (1)
   2309 
   2310 // ------------------------------ TableLookupBytes
   2311 template <typename T, size_t N, typename TI, size_t NI>
   2312 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
   2313                                        const Vec128<TI, NI> from) {
   2314  const DFromV<decltype(from)> d;
   2315  const Repartition<uint8_t, decltype(d)> du8;
   2316 
   2317  const DFromV<decltype(bytes)> d_bytes;
   2318  const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
   2319 #if HWY_TARGET == HWY_SSE2
   2320 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
   2321  typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
   2322  (void)d;
   2323  (void)du8;
   2324  (void)d_bytes;
   2325  (void)du8_bytes;
   2326  return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
   2327      __builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw),
   2328                        reinterpret_cast<GccU8RawVectType>(from.raw)))};
   2329 #else
   2330  const Full128<uint8_t> du8_full;
   2331 
   2332  alignas(16) uint8_t result_bytes[16];
   2333  alignas(16) uint8_t u8_bytes[16];
   2334  alignas(16) uint8_t from_bytes[16];
   2335 
   2336  Store(Vec128<uint8_t>{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes);
   2337  Store(Vec128<uint8_t>{BitCast(du8, from).raw}, du8_full, from_bytes);
   2338 
   2339  for (int i = 0; i < 16; i++) {
   2340    result_bytes[i] = u8_bytes[from_bytes[i] & 15];
   2341  }
   2342 
   2343  return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw});
   2344 #endif
   2345 #else  // SSSE3 or newer
   2346  return BitCast(
   2347      d, VFromD<decltype(du8)>{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw,
   2348                                                BitCast(du8, from).raw)});
   2349 #endif
   2350 }
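
// Usage sketch (illustrative): per output byte i, the result is
// bytes[from[i] & 15] (pshufb semantics; the SSE2 path above loops instead),
// e.g. reversing the 16 bytes of some Vec128<uint8_t> table:
//   const Full128<uint8_t> d;
//   const auto rev = TableLookupBytes(table, Reverse(d, Iota(d, 0)));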
   2351 
   2352 // ------------------------------ TableLookupBytesOr0
2353 // For all vector widths; x86 pshufb zeroes output bytes with index >= 0x80 (SSSE3+).
   2354 template <class V, class VI>
   2355 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
   2356 #if HWY_TARGET == HWY_SSE2
   2357  const DFromV<decltype(from)> d;
   2358  const Repartition<int8_t, decltype(d)> di8;
   2359 
   2360  const auto di8_from = BitCast(di8, from);
   2361  return BitCast(d, IfThenZeroElse(di8_from < Zero(di8),
   2362                                   TableLookupBytes(bytes, di8_from)));
   2363 #else
   2364  return TableLookupBytes(bytes, from);
   2365 #endif
   2366 }
   2367 
   2368 // ------------------------------ Shuffles (ShiftRight, TableLookupBytes)
   2369 
   2370 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
   2371 // Shuffle0321 rotates one lane to the right (the previous least-significant
   2372 // lane is now most-significant). These could also be implemented via
   2373 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
   2374 
   2375 // Swap 32-bit halves in 64-bit halves.
   2376 template <typename T, size_t N>
   2377 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
   2378  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
   2379  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
   2380  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
   2381 }
   2382 template <size_t N>
   2383 HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
   2384  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
   2385  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
   2386 }
   2387 
   2388 // These are used by generic_ops-inl to implement LoadInterleaved3. As with
   2389 // Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
   2390 // comes from the first argument.
   2391 namespace detail {
   2392 
   2393 template <typename T, HWY_IF_T_SIZE(T, 1)>
   2394 HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) {
   2395  const DFromV<decltype(a)> d;
   2396  const Twice<decltype(d)> d2;
   2397  const auto ba = Combine(d2, b, a);
   2398 #if HWY_TARGET == HWY_SSE2
   2399  Vec32<uint16_t> ba_shuffled{
   2400      _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
   2401  return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled)));
   2402 #else
   2403  const RebindToUnsigned<decltype(d2)> d2_u;
   2404  const auto shuffle_idx =
   2405      BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0,
   2406                                      0, 0, 0, 0));
   2407  return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
   2408 #endif
   2409 }
   2410 template <typename T, HWY_IF_T_SIZE(T, 2)>
   2411 HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) {
   2412  const DFromV<decltype(a)> d;
   2413  const Twice<decltype(d)> d2;
   2414  const auto ba = Combine(d2, b, a);
   2415 #if HWY_TARGET == HWY_SSE2
   2416  Vec64<uint32_t> ba_shuffled{
   2417      _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
   2418  return Vec64<T>{
   2419      _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))};
   2420 #else
   2421  const RebindToUnsigned<decltype(d2)> d2_u;
   2422  const auto shuffle_idx = BitCast(
   2423      d2,
   2424      Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0));
   2425  return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
   2426 #endif
   2427 }
   2428 template <typename T, HWY_IF_T_SIZE(T, 4)>
   2429 HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) {
   2430  const DFromV<decltype(a)> d;
   2431  const RebindToFloat<decltype(d)> df;
   2432  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
   2433  return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
   2434                                                 BitCast(df, b).raw, m)});
   2435 }
   2436 
   2437 template <typename T, HWY_IF_T_SIZE(T, 1)>
   2438 HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) {
   2439  const DFromV<decltype(a)> d;
   2440 #if HWY_TARGET == HWY_SSE2
   2441  const auto zero = Zero(d);
   2442  const Rebind<int16_t, decltype(d)> di16;
   2443  const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
   2444      _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
   2445  const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
   2446      _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
   2447  const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled);
   2448  return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
   2449 #else
   2450  const Twice<decltype(d)> d2;
   2451  const auto ba = Combine(d2, b, a);
   2452  const RebindToUnsigned<decltype(d2)> d2_u;
   2453  const auto shuffle_idx =
   2454      BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0,
   2455                                      0, 0, 0, 0));
   2456  return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
   2457 #endif
   2458 }
   2459 template <typename T, HWY_IF_T_SIZE(T, 2)>
   2460 HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) {
   2461  const DFromV<decltype(a)> d;
   2462 #if HWY_TARGET == HWY_SSE2
   2463  const Vec32<T> a_shuffled{
   2464      _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))};
   2465  const Vec32<T> b_shuffled{
   2466      _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))};
   2467  return Combine(d, b_shuffled, a_shuffled);
   2468 #else
   2469  const Twice<decltype(d)> d2;
   2470  const auto ba = Combine(d2, b, a);
   2471  const RebindToUnsigned<decltype(d2)> d2_u;
   2472  const auto shuffle_idx = BitCast(
   2473      d2,
   2474      Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0));
   2475  return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
   2476 #endif
   2477 }
   2478 template <typename T, HWY_IF_T_SIZE(T, 4)>
   2479 HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) {
   2480  const DFromV<decltype(a)> d;
   2481  const RebindToFloat<decltype(d)> df;
   2482  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
   2483  return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
   2484                                                 BitCast(df, b).raw, m)});
   2485 }
   2486 
   2487 template <typename T, HWY_IF_T_SIZE(T, 1)>
   2488 HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) {
   2489  const DFromV<decltype(a)> d;
   2490 #if HWY_TARGET == HWY_SSE2
   2491  const auto zero = Zero(d);
   2492  const Rebind<int16_t, decltype(d)> di16;
   2493  const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16(
   2494      _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))};
   2495  const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16(
   2496      _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))};
   2497  const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled);
   2498  return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)};
   2499 #else
   2500  const Twice<decltype(d)> d2;
   2501  const auto ba = Combine(d2, b, a);
   2502  const RebindToUnsigned<decltype(d2)> d2_u;
   2503  const auto shuffle_idx =
   2504      BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0,
   2505                                      0, 0, 0, 0));
   2506  return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
   2507 #endif
   2508 }
   2509 template <typename T, HWY_IF_T_SIZE(T, 2)>
   2510 HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) {
   2511  const DFromV<decltype(a)> d;
   2512 #if HWY_TARGET == HWY_SSE2
   2513  const Vec32<T> a_shuffled{
   2514      _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))};
   2515  const Vec32<T> b_shuffled{
   2516      _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))};
   2517  return Combine(d, b_shuffled, a_shuffled);
   2518 #else
   2519  const Twice<decltype(d)> d2;
   2520  const auto ba = Combine(d2, b, a);
   2521  const RebindToUnsigned<decltype(d2)> d2_u;
   2522  const auto shuffle_idx = BitCast(
   2523      d2,
   2524      Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0));
   2525  return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
   2526 #endif
   2527 }
   2528 template <typename T, HWY_IF_T_SIZE(T, 4)>
   2529 HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) {
   2530  const DFromV<decltype(a)> d;
   2531  const RebindToFloat<decltype(d)> df;
   2532  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
   2533  return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw,
   2534                                                 BitCast(df, b).raw, m)});
   2535 }
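
// Example (illustrative): with u32 lanes a = {a3,a2,a1,a0} and
// b = {b3,b2,b1,b0} (lane 0 rightmost), ShuffleTwo2301(a, b) yields
// {b2,b3,a0,a1}: the lower half comes from a, the upper half from b, each
// with adjacent lanes swapped.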
   2536 
   2537 }  // namespace detail
   2538 
   2539 // Swap 64-bit halves
   2540 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
   2541  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
   2542 }
   2543 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
   2544  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
   2545 }
   2546 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
   2547  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
   2548 }
   2549 HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
   2550  return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
   2551 }
   2552 HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
   2553  return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
   2554 }
   2555 HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
   2556  return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
   2557 }
   2558 
   2559 // Rotate right 32 bits
   2560 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
   2561  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
   2562 }
   2563 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
   2564  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
   2565 }
   2566 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
   2567  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
   2568 }
   2569 // Rotate left 32 bits
   2570 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
   2571  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
   2572 }
   2573 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
   2574  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
   2575 }
   2576 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
   2577  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
   2578 }
   2579 
   2580 // Reverse
   2581 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
   2582  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
   2583 }
   2584 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
   2585  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
   2586 }
   2587 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
   2588  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
   2589 }
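
// Example of the notation (illustrative): for v with u32 lanes {3,2,1,0},
//   Shuffle1032(v) -> {1,0,3,2}  (swap 64-bit halves)
//   Shuffle0321(v) -> {0,3,2,1}  (rotate right)
//   Shuffle2103(v) -> {2,1,0,3}  (rotate left)
//   Shuffle0123(v) -> {0,1,2,3}  (reverse)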
   2590 
   2591 // ================================================== COMPARE
   2592 
   2593 #if HWY_TARGET <= HWY_AVX3
   2594 
   2595 // Comparisons set a mask bit to 1 if the condition is true, else 0.
   2596 
   2597 // ------------------------------ TestBit
   2598 
   2599 namespace detail {
   2600 
   2601 template <typename T, size_t N>
   2602 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
   2603                                 const Vec128<T, N> bit) {
   2604  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
   2605 }
   2606 template <typename T, size_t N>
   2607 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
   2608                                 const Vec128<T, N> bit) {
   2609  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
   2610 }
   2611 template <typename T, size_t N>
   2612 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
   2613                                 const Vec128<T, N> bit) {
   2614  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
   2615 }
   2616 template <typename T, size_t N>
   2617 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
   2618                                 const Vec128<T, N> bit) {
   2619  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
   2620 }
   2621 
   2622 }  // namespace detail
   2623 
   2624 template <typename T, size_t N>
   2625 HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
   2626  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
   2627  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
   2628 }
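
// Usage sketch (illustrative only): each lane of `bit` must have exactly one
// bit set; the resulting mask is true where v contains that bit:
//   const Full128<uint32_t> d;
//   TestBit(Iota(d, 0), Set(d, 2u))  // lanes {0, 1, 2, 3} -> {0, 0, 1, 1}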
   2629 
   2630 // ------------------------------ Equality
   2631 
   2632 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   2633 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
   2634  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
   2635 }
   2636 
   2637 template <typename T, size_t N, HWY_IF_UI16(T)>
   2638 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
   2639  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
   2640 }
   2641 
   2642 template <typename T, size_t N, HWY_IF_UI32(T)>
   2643 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
   2644  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
   2645 }
   2646 
   2647 template <typename T, size_t N, HWY_IF_UI64(T)>
   2648 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
   2649  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
   2650 }
   2651 
   2652 #if HWY_HAVE_FLOAT16
   2653 template <size_t N>
   2654 HWY_API Mask128<float16_t, N> operator==(Vec128<float16_t, N> a,
   2655                                         Vec128<float16_t, N> b) {
   2656  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
   2657  HWY_DIAGNOSTICS(push)
   2658  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   2659  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
   2660  HWY_DIAGNOSTICS(pop)
   2661 }
   2662 #endif  // HWY_HAVE_FLOAT16
   2663 template <size_t N>
   2664 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
   2665  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
   2666 }
   2667 
   2668 template <size_t N>
   2669 HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
   2670                                      Vec128<double, N> b) {
   2671  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
   2672 }
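
// Usage sketch (illustrative only): on AVX3 these comparisons return compact
// mask registers, which compose cheaply with mask ops such as CountTrue:
//   const Full128<int32_t> d;
//   const size_t num_zero = CountTrue(d, v == Zero(d));  // v: any vector of d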
   2673 
   2674 // ------------------------------ Inequality
   2675 
   2676 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   2677 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
   2678  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
   2679 }
   2680 
   2681 template <typename T, size_t N, HWY_IF_UI16(T)>
   2682 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
   2683  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
   2684 }
   2685 
   2686 template <typename T, size_t N, HWY_IF_UI32(T)>
   2687 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
   2688  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
   2689 }
   2690 
   2691 template <typename T, size_t N, HWY_IF_UI64(T)>
   2692 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
   2693  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
   2694 }
   2695 
   2696 #if HWY_HAVE_FLOAT16
   2697 template <size_t N>
   2698 HWY_API Mask128<float16_t, N> operator!=(Vec128<float16_t, N> a,
   2699                                         Vec128<float16_t, N> b) {
   2700  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
   2701  HWY_DIAGNOSTICS(push)
   2702  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   2703  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
   2704  HWY_DIAGNOSTICS(pop)
   2705 }
   2706 #endif  // HWY_HAVE_FLOAT16
   2707 template <size_t N>
   2708 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
   2709  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
   2710 }
   2711 
   2712 template <size_t N>
   2713 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
   2714                                      Vec128<double, N> b) {
   2715  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
   2716 }
   2717 
   2718 // ------------------------------ Strict inequality
   2719 
   2720 // Signed/float <
   2721 template <size_t N>
   2722 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
   2723  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
   2724 }
   2725 template <size_t N>
   2726 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
   2727                                      Vec128<int16_t, N> b) {
   2728  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
   2729 }
   2730 template <size_t N>
   2731 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
   2732                                      Vec128<int32_t, N> b) {
   2733  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
   2734 }
   2735 template <size_t N>
   2736 HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
   2737                                      Vec128<int64_t, N> b) {
   2738  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
   2739 }
   2740 
   2741 template <size_t N>
   2742 HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
   2743                                      Vec128<uint8_t, N> b) {
   2744  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
   2745 }
   2746 template <size_t N>
   2747 HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
   2748                                       Vec128<uint16_t, N> b) {
   2749  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
   2750 }
   2751 template <size_t N>
   2752 HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
   2753                                       Vec128<uint32_t, N> b) {
   2754  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
   2755 }
   2756 template <size_t N>
   2757 HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
   2758                                       Vec128<uint64_t, N> b) {
   2759  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
   2760 }
   2761 
   2762 #if HWY_HAVE_FLOAT16
   2763 template <size_t N>
   2764 HWY_API Mask128<float16_t, N> operator>(Vec128<float16_t, N> a,
   2765                                        Vec128<float16_t, N> b) {
   2766  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
   2767  HWY_DIAGNOSTICS(push)
   2768  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   2769  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
   2770  HWY_DIAGNOSTICS(pop)
   2771 }
   2772 #endif  // HWY_HAVE_FLOAT16
   2773 template <size_t N>
   2774 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
   2775  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
   2776 }
   2777 template <size_t N>
   2778 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
   2779  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
   2780 }
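
// Note: the _CMP_*_OQ predicates are "ordered, quiet": if either operand is
// NaN, the result is false (matching the scalar <, >, == operators), and
// quiet NaNs do not raise a floating-point exception.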
   2781 
   2782 // ------------------------------ Weak inequality
   2783 
   2784 #if HWY_HAVE_FLOAT16
   2785 template <size_t N>
   2786 HWY_API Mask128<float16_t, N> operator>=(Vec128<float16_t, N> a,
   2787                                         Vec128<float16_t, N> b) {
   2788  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
   2789  HWY_DIAGNOSTICS(push)
   2790  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   2791  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
   2792  HWY_DIAGNOSTICS(pop)
   2793 }
   2794 #endif  // HWY_HAVE_FLOAT16
   2795 template <size_t N>
   2796 HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
   2797  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
   2798 }
   2799 template <size_t N>
   2800 HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
   2801                                      Vec128<double, N> b) {
   2802  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
   2803 }
   2804 
   2805 template <size_t N>
   2806 HWY_API Mask128<int8_t, N> operator>=(Vec128<int8_t, N> a,
   2807                                      Vec128<int8_t, N> b) {
   2808  return Mask128<int8_t, N>{_mm_cmpge_epi8_mask(a.raw, b.raw)};
   2809 }
   2810 template <size_t N>
   2811 HWY_API Mask128<int16_t, N> operator>=(Vec128<int16_t, N> a,
   2812                                       Vec128<int16_t, N> b) {
   2813  return Mask128<int16_t, N>{_mm_cmpge_epi16_mask(a.raw, b.raw)};
   2814 }
   2815 template <size_t N>
   2816 HWY_API Mask128<int32_t, N> operator>=(Vec128<int32_t, N> a,
   2817                                       Vec128<int32_t, N> b) {
   2818  return Mask128<int32_t, N>{_mm_cmpge_epi32_mask(a.raw, b.raw)};
   2819 }
   2820 template <size_t N>
   2821 HWY_API Mask128<int64_t, N> operator>=(Vec128<int64_t, N> a,
   2822                                       Vec128<int64_t, N> b) {
   2823  return Mask128<int64_t, N>{_mm_cmpge_epi64_mask(a.raw, b.raw)};
   2824 }
   2825 
   2826 template <size_t N>
   2827 HWY_API Mask128<uint8_t, N> operator>=(Vec128<uint8_t, N> a,
   2828                                       Vec128<uint8_t, N> b) {
   2829  return Mask128<uint8_t, N>{_mm_cmpge_epu8_mask(a.raw, b.raw)};
   2830 }
   2831 template <size_t N>
   2832 HWY_API Mask128<uint16_t, N> operator>=(Vec128<uint16_t, N> a,
   2833                                        Vec128<uint16_t, N> b) {
   2834  return Mask128<uint16_t, N>{_mm_cmpge_epu16_mask(a.raw, b.raw)};
   2835 }
   2836 template <size_t N>
   2837 HWY_API Mask128<uint32_t, N> operator>=(Vec128<uint32_t, N> a,
   2838                                        Vec128<uint32_t, N> b) {
   2839  return Mask128<uint32_t, N>{_mm_cmpge_epu32_mask(a.raw, b.raw)};
   2840 }
   2841 template <size_t N>
   2842 HWY_API Mask128<uint64_t, N> operator>=(Vec128<uint64_t, N> a,
   2843                                        Vec128<uint64_t, N> b) {
   2844  return Mask128<uint64_t, N>{_mm_cmpge_epu64_mask(a.raw, b.raw)};
   2845 }
   2846 
   2847 #else  // AVX2 or below
   2848 
   2849 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
   2850 
   2851 template <class DTo, typename TFrom, size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)>
   2852 HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) {
   2853  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
   2854  const Simd<TFrom, NFrom, 0> d;
   2855  return MaskFromVec(BitCast(dto, VecFromMask(d, m)));
   2856 }
   2857 
   2858 template <typename T, size_t N>
   2859 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
   2860  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
   2861  return (v & bit) == bit;
   2862 }
   2863 
   2864 // ------------------------------ Equality
   2865 
   2866 // Unsigned
   2867 template <size_t N>
   2868 HWY_API Mask128<uint8_t, N> operator==(Vec128<uint8_t, N> a,
   2869                                       Vec128<uint8_t, N> b) {
   2870  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
   2871 }
   2872 template <size_t N>
   2873 HWY_API Mask128<uint16_t, N> operator==(Vec128<uint16_t, N> a,
   2874                                        Vec128<uint16_t, N> b) {
   2875  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
   2876 }
   2877 template <size_t N>
   2878 HWY_API Mask128<uint32_t, N> operator==(Vec128<uint32_t, N> a,
   2879                                        Vec128<uint32_t, N> b) {
   2880  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
   2881 }
   2882 template <size_t N>
   2883 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
   2884                                        const Vec128<uint64_t, N> b) {
   2885 #if HWY_TARGET >= HWY_SSSE3
   2886  const DFromV<decltype(a)> d64;
   2887  const RepartitionToNarrow<decltype(d64)> d32;
   2888  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
   2889  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
   2890  return MaskFromVec(BitCast(d64, cmp64));
   2891 #else
   2892  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
   2893 #endif
   2894 }
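
// The SSSE3 path above uses the fact that a 64-bit lane is equal iff both of
// its 32-bit halves are equal: cmp32 is all-ones in each matching half, and
// ANDing with its half-swapped copy (Shuffle2301) leaves all-ones only where
// both halves matched.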
   2895 
   2896 // Signed
   2897 template <size_t N>
   2898 HWY_API Mask128<int8_t, N> operator==(Vec128<int8_t, N> a,
   2899                                      Vec128<int8_t, N> b) {
   2900  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
   2901 }
   2902 template <size_t N>
   2903 HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
   2904                                       Vec128<int16_t, N> b) {
   2905  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
   2906 }
   2907 template <size_t N>
   2908 HWY_API Mask128<int32_t, N> operator==(Vec128<int32_t, N> a,
   2909                                       Vec128<int32_t, N> b) {
   2910  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
   2911 }
   2912 template <size_t N>
   2913 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
   2914                                       const Vec128<int64_t, N> b) {
    2915  // Same as unsigned ==; reuse it to avoid duplicating the SSSE3 version.
    2916  const DFromV<decltype(a)> d;
    2917  const RebindToUnsigned<decltype(d)> du;
   2918  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
   2919 }
   2920 
   2921 // Float
   2922 template <size_t N>
   2923 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
   2924  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
   2925 }
   2926 template <size_t N>
   2927 HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
   2928                                      Vec128<double, N> b) {
   2929  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
   2930 }
   2931 
   2932 // ------------------------------ Inequality
   2933 
   2934 // This cannot have T as a template argument, otherwise it is not more
   2935 // specialized than rewritten operator== in C++20, leading to compile
   2936 // errors: https://gcc.godbolt.org/z/xsrPhPvPT.
   2937 template <size_t N>
   2938 HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
   2939                                       Vec128<uint8_t, N> b) {
   2940  return Not(a == b);
   2941 }
   2942 template <size_t N>
   2943 HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
   2944                                        Vec128<uint16_t, N> b) {
   2945  return Not(a == b);
   2946 }
   2947 template <size_t N>
   2948 HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
   2949                                        Vec128<uint32_t, N> b) {
   2950  return Not(a == b);
   2951 }
   2952 template <size_t N>
   2953 HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
   2954                                        Vec128<uint64_t, N> b) {
   2955  return Not(a == b);
   2956 }
   2957 template <size_t N>
   2958 HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
   2959                                      Vec128<int8_t, N> b) {
   2960  return Not(a == b);
   2961 }
   2962 template <size_t N>
   2963 HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
   2964                                       Vec128<int16_t, N> b) {
   2965  return Not(a == b);
   2966 }
   2967 template <size_t N>
   2968 HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
   2969                                       Vec128<int32_t, N> b) {
   2970  return Not(a == b);
   2971 }
   2972 template <size_t N>
   2973 HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
   2974                                       Vec128<int64_t, N> b) {
   2975  return Not(a == b);
   2976 }
   2977 
   2978 template <size_t N>
   2979 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
   2980  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
   2981 }
   2982 template <size_t N>
   2983 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
   2984                                      Vec128<double, N> b) {
   2985  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
   2986 }
   2987 
   2988 // ------------------------------ Strict inequality
   2989 
   2990 namespace detail {
   2991 
   2992 template <size_t N>
   2993 HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
   2994                                 Vec128<int8_t, N> b) {
   2995  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
   2996 }
   2997 template <size_t N>
   2998 HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
   2999                                  Vec128<int16_t, N> b) {
   3000  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
   3001 }
   3002 template <size_t N>
   3003 HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
   3004                                  Vec128<int32_t, N> b) {
   3005  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
   3006 }
   3007 
   3008 template <size_t N>
   3009 HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
   3010                                  const Vec128<int64_t, N> a,
   3011                                  const Vec128<int64_t, N> b) {
   3012 #if HWY_TARGET >= HWY_SSSE3
   3013  // See https://stackoverflow.com/questions/65166174/:
   3014  const DFromV<decltype(a)> d;
   3015  const RepartitionToNarrow<decltype(d)> d32;
   3016  const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
   3017  const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
   3018  // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
   3019  // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
   3020  const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
   3021  // Duplicate upper to lower half.
   3022  return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
   3023 #else
   3024  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};  // SSE4.2
   3025 #endif
   3026 }
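
// Worked example of the SSSE3 path above (illustrative): a = 5, b = 3. The
// upper halves are equal, so upper := upper half of b - a = -2, which is
// all-ones; duplicating it into both halves yields an all-true lane. Whenever
// the upper halves are equal, the upper half of b - a is the sign-extended
// borrow of the low-half subtraction (0 or -1), so each lane is always
// all-zeros or all-ones as required of a mask.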
   3027 
   3028 template <typename T, size_t N>
   3029 HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
   3030                            Vec128<T, N> b) {
   3031  const DFromV<decltype(a)> du;
   3032  const RebindToSigned<decltype(du)> di;
   3033  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
   3034  const auto sa = BitCast(di, Xor(a, msb));
   3035  const auto sb = BitCast(di, Xor(b, msb));
   3036  return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
   3037 }
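
// The Xor with the most-significant bit above maps unsigned order onto signed
// order (e.g. for uint8_t: 0 -> -128, 255 -> 127), so signed Gt on the biased
// values yields the unsigned comparison result.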
   3038 
   3039 template <size_t N>
   3040 HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
   3041                                Vec128<float, N> b) {
   3042  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
   3043 }
   3044 template <size_t N>
   3045 HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
   3046                                 Vec128<double, N> b) {
   3047  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
   3048 }
   3049 
   3050 }  // namespace detail
   3051 
   3052 template <typename T, size_t N>
   3053 HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
   3054  return detail::Gt(hwy::TypeTag<T>(), a, b);
   3055 }
   3056 
   3057 // ------------------------------ Weak inequality
   3058 
   3059 namespace detail {
   3060 template <typename T, size_t N>
   3061 HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a,
   3062                            Vec128<T, N> b) {
   3063  return Not(Gt(tag, b, a));
   3064 }
   3065 
   3066 template <typename T, size_t N>
   3067 HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a,
   3068                            Vec128<T, N> b) {
   3069  return Not(Gt(tag, b, a));
   3070 }
   3071 
   3072 template <size_t N>
   3073 HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a,
   3074                                Vec128<float, N> b) {
   3075  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
   3076 }
   3077 template <size_t N>
   3078 HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a,
   3079                                 Vec128<double, N> b) {
   3080  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
   3081 }
   3082 
   3083 }  // namespace detail
   3084 
   3085 template <typename T, size_t N>
   3086 HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
   3087  return detail::Ge(hwy::TypeTag<T>(), a, b);
   3088 }
   3089 
   3090 #endif  // HWY_TARGET <= HWY_AVX3
   3091 
   3092 // ------------------------------ Reversed comparisons
   3093 
   3094 template <typename T, size_t N>
   3095 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
   3096  return b > a;
   3097 }
   3098 
   3099 template <typename T, size_t N>
   3100 HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
   3101  return b >= a;
   3102 }
   3103 
   3104 // ------------------------------ Iota (Load)
   3105 
   3106 namespace detail {
   3107 
   3108 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   3109 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
   3110  return VFromD<D>{_mm_set_epi8(
   3111      static_cast<char>(15), static_cast<char>(14), static_cast<char>(13),
   3112      static_cast<char>(12), static_cast<char>(11), static_cast<char>(10),
   3113      static_cast<char>(9), static_cast<char>(8), static_cast<char>(7),
   3114      static_cast<char>(6), static_cast<char>(5), static_cast<char>(4),
   3115      static_cast<char>(3), static_cast<char>(2), static_cast<char>(1),
   3116      static_cast<char>(0))};
   3117 }
   3118 
   3119 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
   3120 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
   3121  return VFromD<D>{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4},
   3122                                 int16_t{3}, int16_t{2}, int16_t{1},
   3123                                 int16_t{0})};
   3124 }
   3125 
   3126 #if HWY_HAVE_FLOAT16
   3127 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
   3128 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
   3129  return VFromD<D>{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5},
   3130                              float16_t{4}, float16_t{3}, float16_t{2},
   3131                              float16_t{1}, float16_t{0})};
   3132 }
   3133 #endif  // HWY_HAVE_FLOAT16
   3134 
   3135 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
   3136 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
   3137  return VFromD<D>{
   3138      _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})};
   3139 }
   3140 
   3141 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
   3142 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
   3143  return VFromD<D>{_mm_set_epi64x(int64_t{1}, int64_t{0})};
   3144 }
   3145 
   3146 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   3147 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
   3148  return VFromD<D>{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)};
   3149 }
   3150 
   3151 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   3152 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
   3153  return VFromD<D>{_mm_set_pd(1.0, 0.0)};
   3154 }
   3155 
   3156 #if HWY_COMPILER_MSVC
   3157 template <class V, HWY_IF_V_SIZE_V(V, 1)>
   3158 static HWY_INLINE V MaskOutVec128Iota(V v) {
   3159  const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)};
   3160  return v & mask_out_mask;
   3161 }
   3162 template <class V, HWY_IF_V_SIZE_V(V, 2)>
   3163 static HWY_INLINE V MaskOutVec128Iota(V v) {
   3164 #if HWY_TARGET <= HWY_SSE4
   3165  return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)};
   3166 #else
   3167  const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)};
   3168  return v & mask_out_mask;
   3169 #endif
   3170 }
   3171 template <class V, HWY_IF_V_SIZE_V(V, 4)>
   3172 static HWY_INLINE V MaskOutVec128Iota(V v) {
   3173  const DFromV<decltype(v)> d;
   3174  const Repartition<float, decltype(d)> df;
   3175  using VF = VFromD<decltype(df)>;
   3176  return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)});
   3177 }
   3178 template <class V, HWY_IF_V_SIZE_V(V, 8)>
   3179 static HWY_INLINE V MaskOutVec128Iota(V v) {
   3180  const DFromV<decltype(v)> d;
   3181  const RebindToUnsigned<decltype(d)> du;
   3182  using VU = VFromD<decltype(du)>;
   3183  return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)});
   3184 }
   3185 template <class V, HWY_IF_V_SIZE_GT_V(V, 8)>
   3186 static HWY_INLINE V MaskOutVec128Iota(V v) {
   3187  return v;
   3188 }
   3189 #endif
   3190 
   3191 }  // namespace detail
   3192 
   3193 template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)>
   3194 HWY_API VFromD<D> Iota(D d, const T2 first) {
   3195  const auto result_iota =
   3196      detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
   3197 #if HWY_COMPILER_MSVC
   3198  return detail::MaskOutVec128Iota(result_iota);
   3199 #else
   3200  return result_iota;
   3201 #endif
   3202 }
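
// Usage sketch (illustrative only):
//   const Full128<int32_t> d;
//   const auto v = Iota(d, 10);  // lanes are {10, 11, 12, 13}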
   3203 
   3204 // ------------------------------ FirstN (Iota, Lt)
   3205 
   3206 template <class D, class M = MFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
   3207 HWY_API M FirstN(D d, size_t num) {
   3208  constexpr size_t kN = MaxLanes(d);
   3209  // For AVX3, this ensures `num` <= 255 as required by bzhi, which only looks
   3210  // at the lower 8 bits; for AVX2 and below, this ensures `num` fits in TI.
   3211  num = HWY_MIN(num, kN);
   3212 #if HWY_TARGET <= HWY_AVX3
   3213 #if HWY_ARCH_X86_64
   3214  const uint64_t all = (1ull << kN) - 1;
   3215  return M::FromBits(_bzhi_u64(all, num));
   3216 #else
   3217  const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1);
   3218  return M::FromBits(_bzhi_u32(all, static_cast<uint32_t>(num)));
   3219 #endif  // HWY_ARCH_X86_64
   3220 #else   // HWY_TARGET > HWY_AVX3
   3221  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
   3222  using TI = TFromD<decltype(di)>;
   3223  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
   3224 #endif  // HWY_TARGET <= HWY_AVX3
   3225 }
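
// Usage sketch (illustrative only): FirstN is the usual way to build a mask
// for a loop remainder of `remaining` elements:
//   const MFromD<decltype(d)> m = FirstN(d, remaining);
//   const VFromD<decltype(d)> v = MaskedLoad(m, d, p);  // see MEMORY (2)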
   3226 
   3227 // ------------------------------ InterleaveLower
   3228 
   3229 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
   3230 // the least-significant lane) and "b". To concatenate two half-width integers
   3231 // into one, use ZipLower/Upper instead (also works with scalar).
   3232 
   3233 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   3234 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   3235  return Vec128<T, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
   3236 }
   3237 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   3238 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   3239  const DFromV<decltype(a)> d;
   3240  const RebindToUnsigned<decltype(d)> du;
   3241  using VU = VFromD<decltype(du)>;  // for float16_t
   3242  return BitCast(
   3243      d, VU{_mm_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
   3244 }
   3245 template <typename T, size_t N, HWY_IF_UI32(T)>
   3246 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   3247  return Vec128<T, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
   3248 }
   3249 template <typename T, size_t N, HWY_IF_UI64(T)>
   3250 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   3251  return Vec128<T, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
   3252 }
   3253 
   3254 template <size_t N>
   3255 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
   3256                                         Vec128<float, N> b) {
   3257  return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
   3258 }
   3259 template <size_t N>
   3260 HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
   3261                                          Vec128<double, N> b) {
   3262  return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
   3263 }
   3264 
   3265 // Generic for all vector lengths.
   3266 template <class D>
   3267 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
   3268  return InterleaveLower(a, b);
   3269 }
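
// Example (illustrative only): for u32 lanes a = {a0, a1, a2, a3} and
// b = {b0, b1, b2, b3}, InterleaveLower(a, b) = {a0, b0, a1, b1}.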
   3270 
   3271 // ================================================== MEMORY (2)
   3272 
   3273 // ------------------------------ MaskedLoad
   3274 
   3275 #if HWY_TARGET <= HWY_AVX3
   3276 
   3277 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   3278 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
   3279                             const TFromD<D>* HWY_RESTRICT p) {
   3280  return VFromD<D>{_mm_maskz_loadu_epi8(m.raw, p)};
   3281 }
   3282 
   3283 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   3284 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
   3285                             const TFromD<D>* HWY_RESTRICT p) {
   3286  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   3287  return BitCast(d, VFromD<decltype(du)>{_mm_maskz_loadu_epi16(m.raw, p)});
   3288 }
   3289 
   3290 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
   3291 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
   3292                             const TFromD<D>* HWY_RESTRICT p) {
   3293  return VFromD<D>{_mm_maskz_loadu_epi32(m.raw, p)};
   3294 }
   3295 
   3296 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
   3297 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
   3298                             const TFromD<D>* HWY_RESTRICT p) {
   3299  return VFromD<D>{_mm_maskz_loadu_epi64(m.raw, p)};
   3300 }
   3301 
   3302 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   3303 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
   3304                             const float* HWY_RESTRICT p) {
   3305  return VFromD<D>{_mm_maskz_loadu_ps(m.raw, p)};
   3306 }
   3307 
   3308 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   3309 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
   3310                             const double* HWY_RESTRICT p) {
   3311  return VFromD<D>{_mm_maskz_loadu_pd(m.raw, p)};
   3312 }
   3313 
   3314 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   3315 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
   3316                               const TFromD<D>* HWY_RESTRICT p) {
   3317  return VFromD<D>{_mm_mask_loadu_epi8(v.raw, m.raw, p)};
   3318 }
   3319 
   3320 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   3321 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
   3322                               const TFromD<D>* HWY_RESTRICT p) {
   3323  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   3324  return BitCast(d, VFromD<decltype(du)>{
   3325                        _mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
   3326 }
   3327 
   3328 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
   3329 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
   3330                               const TFromD<D>* HWY_RESTRICT p) {
   3331  return VFromD<D>{_mm_mask_loadu_epi32(v.raw, m.raw, p)};
   3332 }
   3333 
   3334 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
   3335 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
   3336                               const TFromD<D>* HWY_RESTRICT p) {
   3337  return VFromD<D>{_mm_mask_loadu_epi64(v.raw, m.raw, p)};
   3338 }
   3339 
   3340 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   3341 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
   3342                               const float* HWY_RESTRICT p) {
   3343  return VFromD<D>{_mm_mask_loadu_ps(v.raw, m.raw, p)};
   3344 }
   3345 
   3346 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   3347 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
   3348                               const double* HWY_RESTRICT p) {
   3349  return VFromD<D>{_mm_mask_loadu_pd(v.raw, m.raw, p)};
   3350 }
   3351 
   3352 #elif HWY_TARGET == HWY_AVX2
   3353 
   3354 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
   3355 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
   3356                             const TFromD<D>* HWY_RESTRICT p) {
   3357  auto p_p = reinterpret_cast<const int*>(p);  // NOLINT
   3358  return VFromD<D>{_mm_maskload_epi32(p_p, m.raw)};
   3359 }
   3360 
   3361 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
   3362 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */,
   3363                             const TFromD<D>* HWY_RESTRICT p) {
   3364  auto p_p = reinterpret_cast<const long long*>(p);  // NOLINT
   3365  return VFromD<D>{_mm_maskload_epi64(p_p, m.raw)};
   3366 }
   3367 
   3368 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   3369 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const float* HWY_RESTRICT p) {
   3370  const RebindToSigned<decltype(d)> di;
   3371  return VFromD<D>{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)};
   3372 }
   3373 
   3374 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   3375 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const double* HWY_RESTRICT p) {
   3376  const RebindToSigned<decltype(d)> di;
   3377  return VFromD<D>{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)};
   3378 }
   3379 
   3380 // There is no maskload_epi8/16, so blend instead.
   3381 template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
   3382          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
   3383 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
   3384                             const TFromD<D>* HWY_RESTRICT p) {
   3385  return IfThenElseZero(m, LoadU(d, p));
   3386 }
   3387 
   3388 #else  // <= SSE4
   3389 
   3390 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
   3391 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   3392 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
   3393                             const TFromD<D>* HWY_RESTRICT p) {
   3394  return IfThenElseZero(m, LoadU(d, p));
   3395 }
   3396 
   3397 #endif
   3398 
   3399 // ------------------------------ MaskedLoadOr
   3400 
   3401 #if HWY_TARGET > HWY_AVX3  // else: native
   3402 
   3403 // Generic for all vector lengths.
   3404 template <class D>
   3405 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
   3406                               const TFromD<D>* HWY_RESTRICT p) {
   3407  return IfThenElse(m, LoadU(d, p), v);
   3408 }
   3409 
   3410 #endif  // HWY_TARGET > HWY_AVX3
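
// Usage sketch (illustrative only): MaskedLoadOr fills masked-off lanes with
// the lanes of `v` instead of zero, e.g. padding a tail with a sentinel:
//   const auto tail = MaskedLoadOr(Set(d, sentinel), FirstN(d, n), d, p);
// where `sentinel` is a caller-chosen scalar of type TFromD<D>.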
   3411 
   3412 // ------------------------------ LoadN (InterleaveLower)
   3413 
   3414 #if HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT
   3415 
   3416 #ifdef HWY_NATIVE_LOAD_N
   3417 #undef HWY_NATIVE_LOAD_N
   3418 #else
   3419 #define HWY_NATIVE_LOAD_N
   3420 #endif
   3421 
   3422 // Generic for all vector lengths.
   3423 template <class D, HWY_IF_T_SIZE_ONE_OF_D(
   3424                       D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
   3425                              (1 << 4) | (1 << 8))>
   3426 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   3427                        size_t num_lanes) {
   3428  const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
   3429      d_full;
   3430  return ResizeBitCast(d, MaskedLoad(FirstN(d_full, num_lanes), d_full, p));
   3431 }
   3432 
   3433 // Generic for all vector lengths.
   3434 template <class D, HWY_IF_T_SIZE_ONE_OF_D(
   3435                       D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
   3436                              (1 << 4) | (1 << 8))>
   3437 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   3438                          size_t num_lanes) {
   3439  const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
   3440      d_full;
   3441  return ResizeBitCast(d, MaskedLoadOr(ResizeBitCast(d_full, no),
   3442                                       FirstN(d_full, num_lanes), d_full, p));
   3443 }
   3444 
   3445 #if HWY_TARGET > HWY_AVX3
   3446 namespace detail {
   3447 
   3448 // 'Leading' means the part that fits in 32-bit lanes. With 2-byte vectors,
   3449 // there are none, so return the remainder (v_trailing).
   3450 template <class D, HWY_IF_V_SIZE_LE_D(D, 2)>
   3451 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(
   3452    VFromD<D> /*load_mask*/, D /*d*/, const TFromD<D>* HWY_RESTRICT /*p*/,
   3453    VFromD<D> v_trailing) {
   3454  return v_trailing;
   3455 }
   3456 
   3457 template <class D, HWY_IF_V_SIZE_LE_D(D, 2)>
   3458 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(
   3459    VFromD<D> /*no*/, VFromD<D> /*load_mask*/, D /*d*/,
   3460    const TFromD<D>* HWY_RESTRICT /*p*/, VFromD<D> v_trailing) {
   3461  return v_trailing;
   3462 }
   3463 
   3464 template <class D, HWY_IF_V_SIZE_GT_D(D, 2)>
   3465 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(VFromD<D> load_mask, D d,
   3466                                              const TFromD<D>* HWY_RESTRICT p,
   3467                                              VFromD<D> v_trailing) {
   3468  using DI32 = Repartition<int32_t, D>;
   3469  const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full;
   3470 
    3471  // ResizeBitCast of load_mask to di32_full below is okay even if
    3472  // d.MaxBytes() < di32_full.MaxBytes(): any lanes of load_mask.raw past the
    3473  // first (lowest-index) lanes have already been zeroed out by FirstN.
   3475  return ResizeBitCast(
   3476      d, IfNegativeThenElse(
   3477             ResizeBitCast(di32_full, load_mask),
   3478             MaskedLoad(MaskFromVec(ResizeBitCast(di32_full, load_mask)),
   3479                        di32_full, reinterpret_cast<const int32_t*>(p)),
   3480             ResizeBitCast(di32_full, v_trailing)));
   3481 }
   3482 
   3483 template <class D, HWY_IF_V_SIZE_GT_D(D, 2)>
   3484 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(VFromD<D> no,
   3485                                                VFromD<D> load_mask, D d,
   3486                                                const TFromD<D>* HWY_RESTRICT p,
   3487                                                VFromD<D> v_trailing) {
   3488  using DI32 = Repartition<int32_t, D>;
   3489  const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full;
   3490 
    3491  // ResizeBitCast of load_mask to di32_full below is okay even if
    3492  // d.MaxBytes() < di32_full.MaxBytes(): any lanes of load_mask.raw past the
    3493  // first (lowest-index) lanes have already been zeroed out by FirstN.
   3495  return ResizeBitCast(
   3496      d, IfNegativeThenElse(
   3497             ResizeBitCast(di32_full, load_mask),
   3498             MaskedLoadOr(ResizeBitCast(di32_full, no),
   3499                          MaskFromVec(ResizeBitCast(di32_full, load_mask)),
   3500                          di32_full, reinterpret_cast<const int32_t*>(p)),
   3501             ResizeBitCast(di32_full, v_trailing)));
   3502 }
   3503 
   3504 // Single lane: load or default value.
   3505 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
   3506          HWY_IF_LANES_D(D, 1)>
   3507 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d,
   3508                                               const TFromD<D>* HWY_RESTRICT p,
   3509                                               size_t num_lanes) {
   3510  return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
   3511 }
   3512 
   3513 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
   3514          HWY_IF_LANES_D(D, 1)>
   3515 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
   3516    VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p,
   3517    size_t num_lanes) {
   3518  return (num_lanes > 0) ? LoadU(d, p) : no;
   3519 }
   3520 
   3521 // Two lanes: load 1, 2, or default.
   3522 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
   3523 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d,
   3524                                               const TFromD<D>* HWY_RESTRICT p,
   3525                                               size_t num_lanes) {
   3526  if (num_lanes > 1) {
   3527    return LoadU(d, p);
   3528  } else {
   3529    const FixedTag<TFromD<D>, 1> d1;
   3530    return (num_lanes == 1) ? ResizeBitCast(d, LoadU(d1, p)) : Zero(d);
   3531  }
   3532 }
   3533 
   3534 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)>
   3535 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
   3536    VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p,
   3537    size_t num_lanes) {
   3538  if (num_lanes > 1) {
   3539    return LoadU(d, p);
   3540  } else {
   3541    if (num_lanes == 0) return no;
   3542    // Load one, upper lane is default.
   3543    const FixedTag<TFromD<D>, 1> d1;
   3544    return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
   3545  }
   3546 }
   3547 
   3548 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
   3549 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d,
   3550                                               const TFromD<D>* HWY_RESTRICT p,
   3551                                               size_t num_lanes) {
   3552  const size_t trailing_n = num_lanes & 3;
   3553  if (trailing_n == 0) return Zero(d);
   3554 
   3555  VFromD<D> v_trailing = And(load_mask, Set(d, p[num_lanes - 1]));
   3556 
   3557  if ((trailing_n & 2) != 0) {
   3558    const Repartition<int16_t, decltype(d)> di16;
   3559    int16_t i16_bits;
   3560    CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits);
   3561    v_trailing = BitCast(
   3562        d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits),
   3563                              BitCast(di16, v_trailing)));
   3564  }
   3565 
   3566  return v_trailing;
   3567 }
   3568 
   3569 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)>
   3570 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
   3571    VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p,
   3572    size_t num_lanes) {
   3573  const size_t trailing_n = num_lanes & 3;
   3574  if (trailing_n == 0) return no;
   3575 
   3576  VFromD<D> v_trailing = IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no);
   3577 
   3578  if ((trailing_n & 2) != 0) {
   3579    const Repartition<int16_t, decltype(d)> di16;
   3580    int16_t i16_bits;
   3581    CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits);
   3582    v_trailing = BitCast(
   3583        d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits),
   3584                              BitCast(di16, v_trailing)));
   3585  }
   3586 
   3587  return v_trailing;
   3588 }
   3589 
   3590 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
   3591 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d,
   3592                                               const TFromD<D>* HWY_RESTRICT p,
   3593                                               size_t num_lanes) {
   3594  if ((num_lanes & 1) != 0) {
   3595    return And(load_mask, Set(d, p[num_lanes - 1]));
   3596  } else {
   3597    return Zero(d);
   3598  }
   3599 }
   3600 
   3601 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
   3602 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr(
   3603    VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p,
   3604    size_t num_lanes) {
   3605  if ((num_lanes & 1) != 0) {
   3606    return IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no);
   3607  } else {
   3608    return no;
   3609  }
   3610 }
   3611 
   3612 }  // namespace detail
   3613 
   3614 // Generic for all vector lengths.
   3615 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
   3616 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, size_t N) {
   3617  const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
   3618      d_full;
   3619 
   3620  const VFromD<D> load_mask =
   3621      ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
   3622  const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
   3623  const VFromD<D> v_trailing =
   3624      detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_lanes);
   3625 
   3626 #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
   3627  if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
   3628      num_lanes < (4 / sizeof(TFromD<D>))) {
   3629    return v_trailing;
   3630  }
   3631 #endif
   3632 
   3633  return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing);
   3634 }
   3635 
   3636 // Generic for all vector lengths.
   3637 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
   3638 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   3639                          size_t N) {
   3640  const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
   3641      d_full;
   3642 
   3643  const VFromD<D> load_mask =
   3644      ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N)));
   3645  const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D));
   3646  const VFromD<D> v_trailing =
   3647      detail::AVX2UIF8Or16LoadTrailingNOr(no, load_mask, d, p, num_lanes);
   3648 
   3649 #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
   3650  if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) &&
   3651      num_lanes < (4 / sizeof(TFromD<D>))) {
   3652    return v_trailing;
   3653  }
   3654 #endif
   3655 
   3656  return detail::AVX2UIF8Or16LoadLeadingNOr(no, load_mask, d, p, v_trailing);
   3657 }
   3658 
   3659 #endif  // HWY_TARGET > HWY_AVX3
   3660 #endif  // HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT
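
// Usage sketch (illustrative only): LoadN takes a lane count directly; the
// first `num_lanes` lanes are loaded and the remainder are zero, or copies of
// `no` for LoadNOr:
//   const auto tail = LoadN(d, p + i, count - i);
//   const auto pad  = LoadNOr(Set(d, pad_value), d, p + i, count - i);
// where `pad_value` is a caller-chosen scalar.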
   3661 
   3662 // ------------------------------ BlendedStore
   3663 
   3664 namespace detail {
   3665 
   3666 // There is no maskload_epi8/16 with which we could safely implement
   3667 // BlendedStore. Manual blending is also unsafe because loading a full vector
   3668 // that crosses the array end causes asan faults. Resort to scalar code; the
   3669 // caller should instead use memcpy, assuming m is FirstN(d, n).
   3670 template <class D>
   3671 HWY_API void ScalarMaskedStore(VFromD<D> v, MFromD<D> m, D d,
   3672                               TFromD<D>* HWY_RESTRICT p) {
   3673  const RebindToSigned<decltype(d)> di;  // for testing mask if T=bfloat16_t.
   3674  using TI = TFromD<decltype(di)>;
   3675  alignas(16) TI buf[MaxLanes(d)];
   3676  alignas(16) TI mask[MaxLanes(d)];
   3677  Store(BitCast(di, v), di, buf);
   3678  Store(BitCast(di, VecFromMask(d, m)), di, mask);
   3679  for (size_t i = 0; i < MaxLanes(d); ++i) {
   3680    if (mask[i]) {
   3681      CopySameSize(buf + i, p + i);
   3682    }
   3683  }
   3684 }
   3685 }  // namespace detail
   3686 
   3687 #if HWY_TARGET <= HWY_AVX3
   3688 
   3689 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   3690 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
   3691                          TFromD<D>* HWY_RESTRICT p) {
   3692  _mm_mask_storeu_epi8(p, m.raw, v.raw);
   3693 }
   3694 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   3695 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
   3696                          TFromD<D>* HWY_RESTRICT p) {
   3697  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   3698  _mm_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p), RebindMask(du, m).raw,
   3699                        BitCast(du, v).raw);
   3700 }
   3701 
   3702 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
   3703 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
   3704                          TFromD<D>* HWY_RESTRICT p) {
   3705  auto pi = reinterpret_cast<int*>(p);  // NOLINT
   3706  _mm_mask_storeu_epi32(pi, m.raw, v.raw);
   3707 }
   3708 
   3709 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
   3710 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */,
   3711                          TFromD<D>* HWY_RESTRICT p) {
   3712  auto pi = reinterpret_cast<long long*>(p);  // NOLINT
   3713  _mm_mask_storeu_epi64(pi, m.raw, v.raw);
   3714 }
   3715 
   3716 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   3717 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, float* HWY_RESTRICT p) {
   3718  _mm_mask_storeu_ps(p, m.raw, v.raw);
   3719 }
   3720 
   3721 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   3722 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, double* HWY_RESTRICT p) {
   3723  _mm_mask_storeu_pd(p, m.raw, v.raw);
   3724 }
   3725 
   3726 #elif HWY_TARGET == HWY_AVX2
   3727 
   3728 template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
   3729          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
   3730 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
   3731                          TFromD<D>* HWY_RESTRICT p) {
   3732  detail::ScalarMaskedStore(v, m, d, p);
   3733 }
   3734 
   3735 namespace detail {
   3736 
   3737 template <class D, class V, class M, HWY_IF_UI32_D(D)>
   3738 HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) {
   3739  auto pi = reinterpret_cast<int*>(p);  // NOLINT
   3740  _mm_maskstore_epi32(pi, m.raw, v.raw);
   3741 }
   3742 
   3743 template <class D, class V, class M, HWY_IF_UI64_D(D)>
   3744 HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) {
   3745  auto pi = reinterpret_cast<long long*>(p);  // NOLINT
   3746  _mm_maskstore_epi64(pi, m.raw, v.raw);
   3747 }
   3748 
   3749 template <class D, class V, class M, HWY_IF_F32_D(D)>
   3750 HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) {
   3751  _mm_maskstore_ps(p, m.raw, v.raw);
   3752 }
   3753 
   3754 template <class D, class V, class M, HWY_IF_F64_D(D)>
   3755 HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) {
   3756  _mm_maskstore_pd(p, m.raw, v.raw);
   3757 }
   3758 
   3759 }  // namespace detail
   3760 
   3761 template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
   3762          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
   3763 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
   3764                          TFromD<D>* HWY_RESTRICT p) {
   3765  const RebindToSigned<decltype(d)> di;
   3766  // For partial vectors, avoid writing other lanes by zeroing their mask.
   3767  if (d.MaxBytes() < 16) {
   3768    const Full128<TFromD<D>> dfull;
   3769    const Mask128<TFromD<D>> mfull{m.raw};
   3770    m = MFromD<D>{And(mfull, FirstN(dfull, MaxLanes(d))).raw};
   3771  }
   3772 
   3773  // Float/double require, and unsigned ints tolerate, signed int masks.
   3774  detail::NativeBlendedStore<D>(v, RebindMask(di, m), p);
   3775 }
   3776 
   3777 #else  // <= SSE4
   3778 
   3779 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   3780 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
   3781                          TFromD<D>* HWY_RESTRICT p) {
   3782  // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
   3783  detail::ScalarMaskedStore(v, m, d, p);
   3784 }
   3785 
   3786 #endif  // SSE4
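
// Usage sketch (illustrative only): BlendedStore is the store counterpart of
// the masked loads above; a common pattern writes only a loop remainder:
//   BlendedStore(v, FirstN(d, remaining), d, p + i);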
   3787 
   3788 // ================================================== ARITHMETIC
   3789 
   3790 // ------------------------------ Addition
   3791 
   3792 // Unsigned
   3793 template <size_t N>
   3794 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
   3795                                     const Vec128<uint8_t, N> b) {
   3796  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
   3797 }
   3798 template <size_t N>
   3799 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
   3800                                      const Vec128<uint16_t, N> b) {
   3801  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
   3802 }
   3803 template <size_t N>
   3804 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
   3805                                      const Vec128<uint32_t, N> b) {
   3806  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
   3807 }
   3808 template <size_t N>
   3809 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
   3810                                      const Vec128<uint64_t, N> b) {
   3811  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
   3812 }
   3813 
   3814 // Signed
   3815 template <size_t N>
   3816 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
   3817                                    const Vec128<int8_t, N> b) {
   3818  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
   3819 }
   3820 template <size_t N>
   3821 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
   3822                                     const Vec128<int16_t, N> b) {
   3823  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
   3824 }
   3825 template <size_t N>
   3826 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
   3827                                     const Vec128<int32_t, N> b) {
   3828  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
   3829 }
   3830 template <size_t N>
   3831 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
   3832                                     const Vec128<int64_t, N> b) {
   3833  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
   3834 }
   3835 
   3836 // Float
   3837 #if HWY_HAVE_FLOAT16
   3838 template <size_t N>
   3839 HWY_API Vec128<float16_t, N> operator+(const Vec128<float16_t, N> a,
   3840                                       const Vec128<float16_t, N> b) {
   3841  return Vec128<float16_t, N>{_mm_add_ph(a.raw, b.raw)};
   3842 }
   3843 #endif  // HWY_HAVE_FLOAT16
   3844 template <size_t N>
   3845 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
   3846                                   const Vec128<float, N> b) {
   3847  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
   3848 }
   3849 template <size_t N>
   3850 HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
   3851                                    const Vec128<double, N> b) {
   3852  return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
   3853 }
   3854 
   3855 // ------------------------------ Subtraction
   3856 
   3857 // Unsigned
   3858 template <size_t N>
   3859 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
   3860                                     const Vec128<uint8_t, N> b) {
   3861  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
   3862 }
   3863 template <size_t N>
   3864 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
   3865                                      Vec128<uint16_t, N> b) {
   3866  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
   3867 }
   3868 template <size_t N>
   3869 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
   3870                                      const Vec128<uint32_t, N> b) {
   3871  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
   3872 }
   3873 template <size_t N>
   3874 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
   3875                                      const Vec128<uint64_t, N> b) {
   3876  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
   3877 }
   3878 
   3879 // Signed
   3880 template <size_t N>
   3881 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
   3882                                    const Vec128<int8_t, N> b) {
   3883  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
   3884 }
   3885 template <size_t N>
   3886 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
   3887                                     const Vec128<int16_t, N> b) {
   3888  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
   3889 }
   3890 template <size_t N>
   3891 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
   3892                                     const Vec128<int32_t, N> b) {
   3893  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
   3894 }
   3895 template <size_t N>
   3896 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
   3897                                     const Vec128<int64_t, N> b) {
   3898  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
   3899 }
   3900 
   3901 // Float
   3902 #if HWY_HAVE_FLOAT16
   3903 template <size_t N>
   3904 HWY_API Vec128<float16_t, N> operator-(const Vec128<float16_t, N> a,
   3905                                       const Vec128<float16_t, N> b) {
   3906  return Vec128<float16_t, N>{_mm_sub_ph(a.raw, b.raw)};
   3907 }
   3908 #endif  // HWY_HAVE_FLOAT16
   3909 template <size_t N>
   3910 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
   3911                                   const Vec128<float, N> b) {
   3912  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
   3913 }
   3914 template <size_t N>
   3915 HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
   3916                                    const Vec128<double, N> b) {
   3917  return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
   3918 }
   3919 
   3920 // ------------------------------ AddSub
   3921 
   3922 #if HWY_TARGET <= HWY_SSSE3
   3923 
   3924 #undef HWY_IF_ADDSUB_V
   3925 #define HWY_IF_ADDSUB_V(V) \
   3926  HWY_IF_V_SIZE_GT_V(      \
   3927      V, ((hwy::IsFloat3264<TFromV<V>>()) ? 32 : sizeof(TFromV<V>)))
   3928 
   3929 template <size_t N, HWY_IF_LANES_GT(N, 1)>
   3930 HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) {
   3931  return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)};
   3932 }
   3933 HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) {
   3934  return Vec128<double>{_mm_addsub_pd(a.raw, b.raw)};
   3935 }
   3936 #endif  // HWY_TARGET <= HWY_SSSE3
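// A note on semantics (matching the SSE3 ADDSUBPS/ADDSUBPD instructions used
// above): AddSub(a, b) returns a[i] - b[i] in even lanes and a[i] + b[i] in
// odd lanes. E.g. for f32, AddSub({1, 2, 3, 4}, {10, 20, 30, 40}) yields
// {-9, 22, -27, 44}.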
   3937 
   3938 // ------------------------------ PairwiseAdd128/PairwiseSub128
   3939 
   3940 // Need to use the default implementation of PairwiseAdd128/PairwiseSub128 in
   3941 // generic_ops-inl.h for U8/I8/F16/I64/U64 vectors and 64-byte vectors.
   3942 
   3943 #if HWY_TARGET <= HWY_SSSE3
   3944 
   3945 #undef HWY_IF_PAIRWISE_ADD_128_D
   3946 #undef HWY_IF_PAIRWISE_SUB_128_D
   3947 #define HWY_IF_PAIRWISE_ADD_128_D(D)                                       \
   3948  hwy::EnableIf<(                                                          \
   3949      HWY_MAX_LANES_D(D) > (32 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) || \
   3950      (HWY_MAX_LANES_D(D) > (8 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) && \
   3951       !(hwy::IsSameEither<hwy::HWY_NAMESPACE::TFromD<D>, int16_t,         \
   3952                           uint16_t>() ||                                  \
   3953         sizeof(hwy::HWY_NAMESPACE::TFromD<D>) == 4 ||                     \
   3954         hwy::IsSame<hwy::HWY_NAMESPACE::TFromD<D>, double>())))>* = nullptr
   3955 #define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_PAIRWISE_ADD_128_D(D)
   3956 
   3957 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)>
   3958 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3959  return VFromD<D>{_mm_hadd_epi16(a.raw, b.raw)};
   3960 }
   3961 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)>
   3962 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3963  const DFromV<decltype(a)> d;
   3964  const RebindToSigned<decltype(d)> di;
   3965  return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi16(a.raw, b.raw)})));
   3966 }
   3967 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
   3968 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3969  return VFromD<D>{_mm_hadd_epi32(a.raw, b.raw)};
   3970 }
   3971 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
   3972 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3973  const DFromV<decltype(a)> d;
   3974  const RebindToSigned<decltype(d)> di;
   3975  return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi32(a.raw, b.raw)})));
   3976 }
   3977 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   3978 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3979  return VFromD<D>{_mm_hadd_ps(a.raw, b.raw)};
   3980 }
   3981 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   3982 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3983  return Neg(VFromD<D>{_mm_hsub_ps(a.raw, b.raw)});
   3984 }
   3985 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   3986 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3987  return VFromD<D>{_mm_hadd_pd(a.raw, b.raw)};
   3988 }
   3989 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   3990 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3991  return Neg(VFromD<D>{_mm_hsub_pd(a.raw, b.raw)});
   3992 }
   3993 
   3994 #endif  // HWY_TARGET <= HWY_SSSE3
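// For reference: with f32 vectors, PairwiseAdd128(d, a, b) yields
// {a0+a1, a2+a3, b0+b1, b2+b3}, and PairwiseSub128 returns the odd-minus-even
// differences, which is why the implementations above negate the hsub result
// (the instruction itself computes even-minus-odd).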
   3995 
   3996 // ------------------------------ SumsOf8
   3997 template <size_t N>
   3998 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
   3999  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
   4000 }
   4001 
   4002 // Generic for all vector lengths
   4003 template <class V, HWY_IF_I8_D(DFromV<V>)>
   4004 HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
   4005  const DFromV<decltype(v)> d;
   4006  const RebindToUnsigned<decltype(d)> du;
   4007  const Repartition<int64_t, decltype(d)> di64;
   4008 
   4009  // Adjust the values of v to be in the 0..255 range by adding 128 to each
   4010  // lane of v (which is the same as a bitwise XOR of each i8 lane with 128)
   4011  // and then bitcasting the Xor result to a u8 vector.
   4012  const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
   4013 
   4014  // Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj)
   4015  // operation to account for the adjustment made above.
   4016  return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024});
   4017 }
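// Worked example of the adjustment above: for one group of eight i8 lanes
// {-128, 127, 0, 0, 0, 0, 0, 0}, the true sum is -1. XOR with 0x80 gives the
// u8 lanes {0, 255, 128, 128, 128, 128, 128, 128}, whose unsigned sum is
// 1023 = (-1) + 8 * 128, so subtracting 1024 recovers -1.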
   4018 
   4019 #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
   4020 #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
   4021 #else
   4022 #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
   4023 #endif
   4024 
   4025 template <size_t N>
   4026 HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a,
   4027                                               const Vec128<uint8_t, N> b) {
   4028  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)};
   4029 }
   4030 
   4031 // Generic for all vector lengths
   4032 template <class V, HWY_IF_I8_D(DFromV<V>)>
   4033 HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
   4034  const DFromV<V> d;
   4035  const RebindToUnsigned<decltype(d)> du;
   4036  const RepartitionToWideX3<decltype(d)> di64;
   4037 
   4038  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
   4039  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
   4040  // with 128) and then bitcasting the results of the Xor operations to u8
   4041  // vectors.
   4042  const auto i8_msb = SignBit(d);
   4043  const auto a_adj = BitCast(du, Xor(a, i8_msb));
   4044  const auto b_adj = BitCast(du, Xor(b, i8_msb));
   4045 
   4046  // The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcast to an
   4047  // i64 vector, as |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]|.
   4048  return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj));
   4049 }
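// For example, with a[i] = -100 and b[i] = 27: a_adj[i] = 28, b_adj[i] = 155,
// and |28 - 155| = 127 == |-100 - 27|, so unlike SumsOf8 above, no correction
// term is needed.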
   4050 
   4051 // ------------------------------ SumsOf4
   4052 #if HWY_TARGET <= HWY_AVX3
   4053 namespace detail {
   4054 
   4055 template <size_t N>
   4056 HWY_INLINE Vec128<uint32_t, (N + 3) / 4> SumsOf4(
   4057    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/,
   4058    Vec128<uint8_t, N> v) {
   4059  const DFromV<decltype(v)> d;
   4060 
   4061  // _mm_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
   4062  // zeroed out and the sums of the 4 consecutive lanes are already in the
   4063  // even uint16_t lanes of the _mm_maskz_dbsad_epu8 result.
   4064  return Vec128<uint32_t, (N + 3) / 4>{
   4065      _mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)};
   4066 }
   4067 
   4068 // detail::SumsOf4 for Vec128<int8_t, N> on AVX3 is implemented in x86_512-inl.h
   4069 
   4070 }  // namespace detail
   4071 #endif  // HWY_TARGET <= HWY_AVX3
   4072 
   4073 // ------------------------------ SumsOfAdjQuadAbsDiff
   4074 
   4075 #if HWY_TARGET <= HWY_SSE4
   4076 #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
   4077 #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
   4078 #else
   4079 #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
   4080 #endif
   4081 
   4082 template <int kAOffset, int kBOffset, size_t N>
   4083 HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfAdjQuadAbsDiff(
   4084    Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
   4085  static_assert(0 <= kAOffset && kAOffset <= 1,
   4086                "kAOffset must be between 0 and 1");
   4087  static_assert(0 <= kBOffset && kBOffset <= 3,
   4088                "kBOffset must be between 0 and 3");
   4089  return Vec128<uint16_t, (N + 1) / 2>{
   4090      _mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)};
   4091 }
   4092 
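// Per the SSE4 MPSADBW semantics, each u16 lane i of the result above is
//   result[i] = |a[kAOffset*4 + i + 0] - b[kBOffset*4 + 0]| + ... +
//               |a[kAOffset*4 + i + 3] - b[kBOffset*4 + 3]|,
// i.e. a sliding 4-byte window of a is compared against one fixed 4-byte
// block of b.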
   4093 // Generic for all vector lengths
   4094 template <int kAOffset, int kBOffset, class V, HWY_IF_I8_D(DFromV<V>)>
   4095 HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfAdjQuadAbsDiff(V a, V b) {
   4096  const DFromV<decltype(a)> d;
   4097  const RebindToUnsigned<decltype(d)> du;
   4098  const RepartitionToWide<decltype(d)> dw;
   4099 
   4100  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
   4101  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
   4102  // with 128) and then bitcasting the results of the Xor operations to u8
   4103  // vectors.
   4104  const auto i8_msb = SignBit(d);
   4105  const auto a_adj = BitCast(du, Xor(a, i8_msb));
   4106  const auto b_adj = BitCast(du, Xor(b, i8_msb));
   4107 
   4108  // The result of SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj) can
   4109  // simply be bitcast to an i16 vector, as
   4110  // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]|.
   4111  return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj));
   4112 }
   4113 #endif
   4114 
   4115 // ------------------------------ SumsOfShuffledQuadAbsDiff
   4116 
   4117 #if HWY_TARGET <= HWY_AVX3
   4118 #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
   4119 #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
   4120 #else
   4121 #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
   4122 #endif
   4123 
   4124 template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, size_t N>
   4125 HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfShuffledQuadAbsDiff(
   4126    Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
   4127  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
   4128  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
   4129  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
   4130  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
   4131  return Vec128<uint16_t, (N + 1) / 2>{
   4132      _mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
   4133 }
   4134 
   4135 // Generic for all vector lengths
   4136 template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V,
   4137          HWY_IF_I8_D(DFromV<V>)>
   4138 HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfShuffledQuadAbsDiff(V a,
   4139                                                                       V b) {
   4140  const DFromV<decltype(a)> d;
   4141  const RebindToUnsigned<decltype(d)> du;
   4142  const RepartitionToWide<decltype(d)> dw;
   4143 
   4144  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
   4145  // each lane of a and b (which is the same as a bitwise XOR of each i8 lane
   4146  // with 128) and then bitcasting the results of the Xor operations to u8
   4147  // vectors.
   4148  const auto i8_msb = SignBit(d);
   4149  const auto a_adj = BitCast(du, Xor(a, i8_msb));
   4150  const auto b_adj = BitCast(du, Xor(b, i8_msb));
   4151 
   4152  // The result of
   4153  // SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj) can
   4154  // simply be bitcast to an i16 vector, as
   4155  // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]|.
   4156  return BitCast(
   4157      dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj));
   4158 }
   4159 #endif
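// Note the operand order in the wrapper above: a is passed as the second
// argument of _mm_dbsad_epu8, so kIdx3..kIdx0 select which 32-bit blocks of a
// are shuffled before the 4-byte absolute-difference sums against b are
// formed.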
   4160 
   4161 // ------------------------------ SaturatedAdd
   4162 
   4163 // Returns a + b clamped to the destination range.
   4164 
   4165 // Unsigned
   4166 template <size_t N>
   4167 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
   4168                                        const Vec128<uint8_t, N> b) {
   4169  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
   4170 }
   4171 template <size_t N>
   4172 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
   4173                                         const Vec128<uint16_t, N> b) {
   4174  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
   4175 }
   4176 
   4177 // Signed
   4178 template <size_t N>
   4179 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
   4180                                       const Vec128<int8_t, N> b) {
   4181  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
   4182 }
   4183 template <size_t N>
   4184 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
   4185                                        const Vec128<int16_t, N> b) {
   4186  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
   4187 }
   4188 
   4189 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   4190 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
   4191 #undef HWY_NATIVE_I32_SATURATED_ADDSUB
   4192 #else
   4193 #define HWY_NATIVE_I32_SATURATED_ADDSUB
   4194 #endif
   4195 
   4196 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
   4197 #undef HWY_NATIVE_I64_SATURATED_ADDSUB
   4198 #else
   4199 #define HWY_NATIVE_I64_SATURATED_ADDSUB
   4200 #endif
   4201 
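// In the two overloads below, the ternary-logic constant 0x42 computes
// (a ^ sum) & ~(a ^ b) per bit, so the MSB of a lane is set exactly when a
// and b share a sign that differs from the sum's sign, i.e. on signed
// overflow. The constant 0x55 is ~C, which (under the write mask) yields
// LimitsMin for lanes where a is negative and LimitsMax otherwise. Quick
// sketch: SaturatedAdd(Set(d, LimitsMax<int32_t>()), Set(d, 1)) returns
// LimitsMax<int32_t>() in every lane.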
   4202 template <size_t N>
   4203 HWY_API Vec128<int32_t, N> SaturatedAdd(Vec128<int32_t, N> a,
   4204                                        Vec128<int32_t, N> b) {
   4205  const DFromV<decltype(a)> d;
   4206  const auto sum = a + b;
   4207  const auto overflow_mask = MaskFromVec(
   4208      Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)});
   4209  const auto i32_max = Set(d, LimitsMax<int32_t>());
   4210  const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32(
   4211      i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
   4212  return IfThenElse(overflow_mask, overflow_result, sum);
   4213 }
   4214 
   4215 template <size_t N>
   4216 HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a,
   4217                                        Vec128<int64_t, N> b) {
   4218  const DFromV<decltype(a)> d;
   4219  const auto sum = a + b;
   4220  const auto overflow_mask = MaskFromVec(
   4221      Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)});
   4222  const auto i64_max = Set(d, LimitsMax<int64_t>());
   4223  const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64(
   4224      i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
   4225  return IfThenElse(overflow_mask, overflow_result, sum);
   4226 }
   4227 #endif  // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   4228 
   4229 // ------------------------------ SaturatedSub
   4230 
   4231 // Returns a - b clamped to the destination range.
   4232 
   4233 // Unsigned
   4234 template <size_t N>
   4235 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
   4236                                        const Vec128<uint8_t, N> b) {
   4237  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
   4238 }
   4239 template <size_t N>
   4240 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
   4241                                         const Vec128<uint16_t, N> b) {
   4242  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
   4243 }
   4244 
   4245 // Signed
   4246 template <size_t N>
   4247 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
   4248                                       const Vec128<int8_t, N> b) {
   4249  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
   4250 }
   4251 template <size_t N>
   4252 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
   4253                                        const Vec128<int16_t, N> b) {
   4254  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
   4255 }
   4256 
   4257 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
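// Here the ternary-logic constant 0x18 computes (a ^ b) & (a ^ diff): the MSB
// of a lane is set exactly when a and b have different signs and the
// difference's sign differs from a's, i.e. on signed subtraction overflow.
// Saturation then proceeds as in SaturatedAdd above.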
   4258 template <size_t N>
   4259 HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a,
   4260                                        Vec128<int32_t, N> b) {
   4261  const DFromV<decltype(a)> d;
   4262  const auto diff = a - b;
   4263  const auto overflow_mask = MaskFromVec(
   4264      Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)});
   4265  const auto i32_max = Set(d, LimitsMax<int32_t>());
   4266  const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32(
   4267      i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
   4268  return IfThenElse(overflow_mask, overflow_result, diff);
   4269 }
   4270 
   4271 template <size_t N>
   4272 HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a,
   4273                                        Vec128<int64_t, N> b) {
   4274  const DFromV<decltype(a)> d;
   4275  const auto diff = a - b;
   4276  const auto overflow_mask = MaskFromVec(
   4277      Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)});
   4278  const auto i64_max = Set(d, LimitsMax<int64_t>());
   4279  const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64(
   4280      i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
   4281  return IfThenElse(overflow_mask, overflow_result, diff);
   4282 }
   4283 #endif  // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN
   4284 
   4285 // ------------------------------ AverageRound
   4286 
   4287 // Returns (a + b + 1) / 2
   4288 
   4289 // Unsigned
   4290 template <size_t N>
   4291 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
   4292                                        const Vec128<uint8_t, N> b) {
   4293  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
   4294 }
   4295 template <size_t N>
   4296 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
   4297                                         const Vec128<uint16_t, N> b) {
   4298  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
   4299 }
   4300 
   4301 // I8/I16 AverageRound is generic for all vector lengths
   4302 template <class V, HWY_IF_SIGNED_V(V),
   4303          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
   4304 HWY_API V AverageRound(V a, V b) {
   4305  const DFromV<decltype(a)> d;
   4306  const RebindToUnsigned<decltype(d)> du;
   4307  const V sign_bit = SignBit(d);
   4308  return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)),
   4309                                     BitCast(du, Xor(b, sign_bit)))),
   4310             sign_bit);
   4311 }
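// For example, AverageRound(i8{-1}, i8{0}): biasing by 128 gives the u8
// inputs 127 and 128, whose rounded unsigned average is 128; XOR-ing with the
// sign bit again yields 0 == (-1 + 0 + 1) / 2.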
   4312 
   4313 // ------------------------------ Integer multiplication
   4314 
   4315 template <size_t N>
   4316 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
   4317                                      const Vec128<uint16_t, N> b) {
   4318  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
   4319 }
   4320 template <size_t N>
   4321 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
   4322                                     const Vec128<int16_t, N> b) {
   4323  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
   4324 }
   4325 
   4326 // Returns the upper sizeof(T)*8 bits of a * b in each lane.
   4327 template <size_t N>
   4328 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
   4329                                    const Vec128<uint16_t, N> b) {
   4330  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
   4331 }
   4332 template <size_t N>
   4333 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
   4334                                   const Vec128<int16_t, N> b) {
   4335  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
   4336 }
   4337 
   4338 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
   4339          HWY_IF_LANES_D(DFromV<V>, 1)>
   4340 HWY_API V MulHigh(V a, V b) {
   4341  const DFromV<decltype(a)> d;
   4342  const Full128<TFromD<decltype(d)>> d_full;
   4343  return ResizeBitCast(
   4344      d, Slide1Down(d_full, ResizeBitCast(d_full, MulEven(a, b))));
   4345 }
   4346 
   4347 // I8/U8/I32/U32 MulHigh is generic for all vector lengths >= 2 lanes
   4348 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)),
   4349          HWY_IF_LANES_GT_D(DFromV<V>, 1)>
   4350 HWY_API V MulHigh(V a, V b) {
   4351  const DFromV<decltype(a)> d;
   4352 
   4353  const auto p_even = BitCast(d, MulEven(a, b));
   4354  const auto p_odd = BitCast(d, MulOdd(a, b));
   4355  return InterleaveOdd(d, p_even, p_odd);
   4356 }
   4357 
   4358 // Multiplies even lanes (0, 2, ...) and places the double-wide result into
   4359 // the even lane, with its upper half in the odd neighbor lane.
   4360 template <class V, HWY_IF_U8_D(DFromV<V>)>
   4361 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
   4362  const DFromV<decltype(a)> d;
   4363  const RepartitionToWide<decltype(d)> dw;
   4364  const auto lo8_mask = Set(dw, uint16_t{0x00FF});
   4365  return And(ResizeBitCast(dw, a), lo8_mask) *
   4366         And(ResizeBitCast(dw, b), lo8_mask);
   4367 }
   4368 
   4369 template <class V, HWY_IF_I8_D(DFromV<V>)>
   4370 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
   4371  const DFromV<decltype(a)> d;
   4372  const RepartitionToWide<decltype(d)> dw;
   4373  return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) *
   4374         ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b)));
   4375 }
   4376 
   4377 template <class V, HWY_IF_UI16_D(DFromV<V>)>
   4378 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) {
   4379  const DFromV<decltype(a)> d;
   4380  const RepartitionToWide<decltype(d)> dw;
   4381  const RepartitionToNarrow<decltype(dw)> dw_as_d16;
   4382 
   4383  const auto lo = ResizeBitCast(dw, a * b);
   4384  const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b)));
   4385  return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
   4386 }
   4387 
   4388 template <size_t N>
   4389 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
   4390                                              const Vec128<uint32_t, N> b) {
   4391  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
   4392 }
   4393 
   4394 template <size_t N>
   4395 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
   4396                                             const Vec128<int32_t, N> b) {
   4397 #if HWY_TARGET >= HWY_SSSE3
   4398  const DFromV<decltype(a)> d;
   4399  const RepartitionToWide<decltype(d)> dw;
   4400  const RebindToUnsigned<decltype(d)> du;
   4401 
   4402  // p[i] = (((a[i] >> 31) * (b[i] >> 31)) << 64) +
   4403  //        (((a[i] >> 31) * b[i]) << 32) +
   4404  //        (((b[i] >> 31) * a[i]) << 32) +
   4405  //        ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF}))
   4406 
   4407  // ((a[i] >> 31) * (b[i] >> 31)) << 64 does not need to be computed as the
   4408  // lower 64 bits of ((a[i] >> 31) * (b[i] >> 31)) << 64 are zero.
   4409 
   4410  // (((a[i] >> 31) * b[i]) << 32) + (((b[i] >> 31) * a[i]) << 32) ==
   4411  // -((((a[i] >> 31) & b[i]) + ((b[i] >> 31) & a[i])) << 32)
   4412 
   4413  // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) can be
   4414  // computed using MulEven(BitCast(du, a), BitCast(du, b))
   4415 
   4416  const auto neg_p_hi = ShiftLeft<32>(
   4417      ResizeBitCast(dw, And(ShiftRight<31>(a), b) + And(ShiftRight<31>(b), a)));
   4418  const auto p_lo = BitCast(dw, MulEven(BitCast(du, a), BitCast(du, b)));
   4419  return p_lo - neg_p_hi;
   4420 #else
   4421  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
   4422 #endif
   4423 }
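// Sanity check of the SSSE3 path with a = -2, b = 3: the unsigned product is
// (2^32 - 2) * 3 = 3 * 2^32 - 6, and neg_p_hi is
// ((-1 & 3) + (0 & -2)) << 32 = 3 * 2^32, so p_lo - neg_p_hi = -6 as
// expected.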
   4424 
   4425 template <class V, HWY_IF_T_SIZE_V(V, 1)>
   4426 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
   4427  const DFromV<decltype(a)> d;
   4428  const RepartitionToWide<decltype(d)> dw;
   4429  return ShiftRight<8>(ResizeBitCast(dw, a)) *
   4430         ShiftRight<8>(ResizeBitCast(dw, b));
   4431 }
   4432 
   4433 template <class V, HWY_IF_UI16_D(DFromV<V>)>
   4434 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
   4435  const DFromV<decltype(a)> d;
   4436  const RepartitionToWide<decltype(d)> dw;
   4437  const RebindToUnsigned<decltype(dw)> dw_u;
   4438  const RepartitionToNarrow<decltype(dw)> dw_as_d16;
   4439 
   4440  const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b)));
   4441  const auto hi = ResizeBitCast(dw, MulHigh(a, b));
   4442  return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo)));
   4443 }
   4444 
   4445 template <class V, HWY_IF_UI32_D(DFromV<V>)>
   4446 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) {
   4447  return MulEven(DupOdd(a), DupOdd(b));
   4448 }
   4449 
   4450 template <size_t N>
   4451 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
   4452                                      const Vec128<uint32_t, N> b) {
   4453 #if HWY_TARGET >= HWY_SSSE3
   4454  // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
   4455  // 64-bit right shift would also work but also needs port 5, so no benefit.
   4456  // Notation: x=don't care, z=0.
   4457  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
   4458  const auto mullo_x2x0 = MulEven(a, b);
   4459  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
   4460  const auto mullo_x3x1 =
   4461      MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
   4462  // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
   4463  // the latter requires one more instruction or a constant.
   4464  const __m128i mul_20 =
   4465      _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
   4466  const __m128i mul_31 =
   4467      _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
   4468  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
   4469 #else
   4470  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
   4471 #endif
   4472 }
   4473 
   4474 template <size_t N>
   4475 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
   4476                                     const Vec128<int32_t, N> b) {
   4477  // Same as unsigned; avoid duplicating the SSSE3 code.
   4478  const DFromV<decltype(a)> d;
   4479  const RebindToUnsigned<decltype(d)> du;
   4480  return BitCast(d, BitCast(du, a) * BitCast(du, b));
   4481 }
   4482 
   4483 #if HWY_TARGET <= HWY_AVX3
   4484 // Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*.
   4485 #ifdef HWY_NATIVE_MUL_64
   4486 #undef HWY_NATIVE_MUL_64
   4487 #else
   4488 #define HWY_NATIVE_MUL_64
   4489 #endif
   4490 
   4491 template <size_t N>
   4492 HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a,
   4493                                      Vec128<uint64_t, N> b) {
   4494  return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
   4495 }
   4496 template <size_t N>
   4497 HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a,
   4498                                     Vec128<int64_t, N> b) {
   4499  return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)};
   4500 }
   4501 #endif
   4502 
   4503 // ------------------------------ RotateRight (ShiftRight, Or)
   4504 
   4505 // The U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h,
   4506 // because U8 RotateRight uses detail::GaloisAffine on AVX3_DL.
   4507 
   4508 #if HWY_TARGET > HWY_AVX3_DL
   4509 template <int kBits, size_t N>
   4510 HWY_API Vec128<uint8_t, N> RotateRight(const Vec128<uint8_t, N> v) {
   4511  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
   4512  if (kBits == 0) return v;
   4513  // AVX3 does not support 8-bit.
   4514  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
   4515 }
   4516 #endif
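// For example, RotateRight<3>(u8{0b10010110}) = 0b11010010: the three bits
// shifted out on the right reappear as the top bits of the result.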
   4517 
   4518 template <int kBits, size_t N>
   4519 HWY_API Vec128<uint16_t, N> RotateRight(const Vec128<uint16_t, N> v) {
   4520  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
   4521  if (kBits == 0) return v;
   4522 #if HWY_TARGET <= HWY_AVX3_DL
   4523  return Vec128<uint16_t, N>{_mm_shrdi_epi16(v.raw, v.raw, kBits)};
   4524 #else
   4525  // AVX3 does not support 16-bit.
   4526  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
   4527 #endif
   4528 }
   4529 
   4530 template <int kBits, size_t N>
   4531 HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
   4532  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
   4533 #if HWY_TARGET <= HWY_AVX3
   4534  return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
   4535 #else
   4536  if (kBits == 0) return v;
   4537  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
   4538 #endif
   4539 }
   4540 
   4541 template <int kBits, size_t N>
   4542 HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
   4543  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
   4544 #if HWY_TARGET <= HWY_AVX3
   4545  return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
   4546 #else
   4547  if (kBits == 0) return v;
   4548  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
   4549 #endif
   4550 }
   4551 
   4552 // I8/I16/I32/I64 RotateRight is generic for all vector lengths
   4553 template <int kBits, class V, HWY_IF_SIGNED_V(V)>
   4554 HWY_API V RotateRight(V v) {
   4555  const DFromV<decltype(v)> d;
   4556  const RebindToUnsigned<decltype(d)> du;
   4557  return BitCast(d, RotateRight<kBits>(BitCast(du, v)));
   4558 }
   4559 
   4560 // ------------------------------ Rol/Ror
   4561 #if HWY_TARGET <= HWY_AVX3_DL
   4562 #ifdef HWY_NATIVE_ROL_ROR_16
   4563 #undef HWY_NATIVE_ROL_ROR_16
   4564 #else
   4565 #define HWY_NATIVE_ROL_ROR_16
   4566 #endif
   4567 
   4568 template <class T, size_t N, HWY_IF_UI16(T)>
   4569 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
   4570  return Vec128<T, N>{_mm_shrdv_epi16(a.raw, a.raw, b.raw)};
   4571 }
   4572 
   4573 // U16/I16 Rol is generic for all vector lengths on AVX3_DL
   4574 template <class V, HWY_IF_UI16(TFromV<V>)>
   4575 HWY_API V Rol(V a, V b) {
   4576  const DFromV<decltype(a)> d;
   4577  const RebindToSigned<decltype(d)> di;
   4578  return Ror(a, BitCast(d, Neg(BitCast(di, b))));
   4579 }
   4580 
   4581 #endif  // HWY_TARGET <= HWY_AVX3_DL
   4582 
   4583 #if HWY_TARGET <= HWY_AVX3
   4584 
   4585 #ifdef HWY_NATIVE_ROL_ROR_32_64
   4586 #undef HWY_NATIVE_ROL_ROR_32_64
   4587 #else
   4588 #define HWY_NATIVE_ROL_ROR_32_64
   4589 #endif
   4590 
   4591 template <class T, size_t N, HWY_IF_UI32(T)>
   4592 HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
   4593  return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)};
   4594 }
   4595 
   4596 template <class T, size_t N, HWY_IF_UI32(T)>
   4597 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
   4598  return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)};
   4599 }
   4600 
   4601 template <class T, size_t N, HWY_IF_UI64(T)>
   4602 HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
   4603  return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)};
   4604 }
   4605 
   4606 template <class T, size_t N, HWY_IF_UI64(T)>
   4607 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
   4608  return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)};
   4609 }
   4610 
   4611 #endif
   4612 
   4613 // ------------------------------ RotateLeftSame/RotateRightSame
   4614 
   4615 #if HWY_TARGET <= HWY_AVX3_DL
   4616 
   4617 #ifdef HWY_NATIVE_ROL_ROR_SAME_16
   4618 #undef HWY_NATIVE_ROL_ROR_SAME_16
   4619 #else
   4620 #define HWY_NATIVE_ROL_ROR_SAME_16
   4621 #endif
   4622 
   4623 // Generic for all vector lengths
   4624 template <class V, HWY_IF_UI16(TFromV<V>)>
   4625 HWY_API V RotateLeftSame(V v, int bits) {
   4626  const DFromV<decltype(v)> d;
   4627  return Ror(v,
   4628             Set(d, static_cast<TFromV<V>>(0u - static_cast<unsigned>(bits))));
   4629 }
   4630 
   4631 template <class V, HWY_IF_UI16(TFromV<V>)>
   4632 HWY_API V RotateRightSame(V v, int bits) {
   4633  const DFromV<decltype(v)> d;
   4634  return Ror(v, Set(d, static_cast<TFromV<V>>(bits)));
   4635 }
   4636 #endif  // HWY_TARGET <= HWY_AVX3_DL
   4637 
   4638 #if HWY_TARGET <= HWY_AVX3
   4639 
   4640 #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
   4641 #undef HWY_NATIVE_ROL_ROR_SAME_32_64
   4642 #else
   4643 #define HWY_NATIVE_ROL_ROR_SAME_32_64
   4644 #endif
   4645 
   4646 // Generic for all vector lengths
   4647 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
   4648          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
   4649 HWY_API V RotateLeftSame(V v, int bits) {
   4650  const DFromV<decltype(v)> d;
   4651  return Rol(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
   4652 }
   4653 
   4654 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
   4655          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
   4656 HWY_API V RotateRightSame(V v, int bits) {
   4657  const DFromV<decltype(v)> d;
   4658  return Ror(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
   4659 }
   4660 #endif  // HWY_TARGET <= HWY_AVX3
   4661 
   4662 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
   4663 
   4664 template <size_t N>
   4665 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
   4666  const DFromV<decltype(v)> d;
   4667  return VecFromMask(v < Zero(d));
   4668 }
   4669 
   4670 template <size_t N>
   4671 HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
   4672  return ShiftRight<15>(v);
   4673 }
   4674 
   4675 template <size_t N>
   4676 HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
   4677  return ShiftRight<31>(v);
   4678 }
   4679 
   4680 template <size_t N>
   4681 HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
   4682  const DFromV<decltype(v)> d;
   4683 #if HWY_TARGET <= HWY_AVX3
   4684  (void)d;
   4685  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
   4686 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
   4687  return VecFromMask(v < Zero(d));
   4688 #else
   4689  // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
   4690  // avoids generating a zero.
   4691  const RepartitionToNarrow<decltype(d)> d32;
   4692  const auto sign = ShiftRight<31>(BitCast(d32, v));
   4693  return Vec128<int64_t, N>{
   4694      _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
   4695 #endif
   4696 }
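// In the SSE2/SSSE3 path above, _MM_SHUFFLE(3, 3, 1, 1) duplicates the
// sign-filled upper word of each 64-bit lane (32-bit lanes 1 and 3) into both
// halves of that lane, which is exactly the broadcast sign bit.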
   4697 
   4698 // ------------------------------ Integer Abs
   4699 
   4700 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
   4701 template <size_t N>
   4702 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
   4703 #if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2
   4704  const DFromV<decltype(v)> d;
   4705  const RebindToUnsigned<decltype(d)> du;
   4706  const auto zero = Zero(du);
   4707  const auto v_as_u8 = BitCast(du, v);
   4708  return BitCast(d, Min(v_as_u8, zero - v_as_u8));
   4709 #else
   4710  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
   4711 #endif
   4712 }
   4713 
   4714 template <size_t N>
   4715 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
   4716 #if HWY_TARGET == HWY_SSE2
   4717  const auto zero = Zero(DFromV<decltype(v)>());
   4718  return Max(v, zero - v);
   4719 #else
   4720  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
   4721 #endif
   4722 }
   4723 
   4724 template <size_t N>
   4725 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
   4726 #if HWY_TARGET <= HWY_SSSE3
   4727  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
   4728 #else
   4729  const auto zero = Zero(DFromV<decltype(v)>());
   4730  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
   4731 #endif
   4732 }
   4733 
   4734 #if HWY_TARGET <= HWY_AVX3
   4735 template <size_t N>
   4736 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
   4737  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
   4738 }
   4739 #else
   4740 // I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
   4741 template <class V, HWY_IF_I64(TFromV<V>)>
   4742 HWY_API V Abs(V v) {
   4743  const auto zero = Zero(DFromV<decltype(v)>());
   4744  return IfNegativeThenElse(v, zero - v, v);
   4745 }
   4746 #endif
   4747 
   4748 #ifdef HWY_NATIVE_SATURATED_ABS
   4749 #undef HWY_NATIVE_SATURATED_ABS
   4750 #else
   4751 #define HWY_NATIVE_SATURATED_ABS
   4752 #endif
   4753 
   4754 // Generic for all vector lengths
   4755 template <class V, HWY_IF_I8(TFromV<V>)>
   4756 HWY_API V SaturatedAbs(V v) {
   4757  const DFromV<decltype(v)> d;
   4758  const RebindToUnsigned<decltype(d)> du;
   4759  return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v))));
   4760 }
   4761 
   4762 // Generic for all vector lengths
   4763 template <class V, HWY_IF_I16(TFromV<V>)>
   4764 HWY_API V SaturatedAbs(V v) {
   4765  return Max(v, SaturatedSub(Zero(DFromV<V>()), v));
   4766 }
   4767 
   4768 // Generic for all vector lengths
   4769 template <class V, HWY_IF_I32(TFromV<V>)>
   4770 HWY_API V SaturatedAbs(V v) {
   4771  const auto abs_v = Abs(v);
   4772 
   4773 #if HWY_TARGET <= HWY_SSE4
   4774  const DFromV<decltype(v)> d;
   4775  const RebindToUnsigned<decltype(d)> du;
   4776  return BitCast(d, Min(BitCast(du, abs_v),
   4777                        Set(du, static_cast<uint32_t>(LimitsMax<int32_t>()))));
   4778 #else
   4779  return Add(abs_v, BroadcastSignBit(abs_v));
   4780 #endif
   4781 }
   4782 
   4783 // Generic for all vector lengths
   4784 template <class V, HWY_IF_I64(TFromV<V>)>
   4785 HWY_API V SaturatedAbs(V v) {
   4786  const auto abs_v = Abs(v);
   4787  return Add(abs_v, BroadcastSignBit(abs_v));
   4788 }
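// After Abs, only LimitsMin is still negative (it wraps to itself), and its
// broadcast sign is -1, so the Add maps LimitsMin to LimitsMax while leaving
// all other values unchanged (+0).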
   4789 
   4790 // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL
   4791 // srli_epi64: the count should be unsigned int. Note that this is not the same
   4792 // as the Shift3264Count in x86_512-inl.h (GCC also requires int).
   4793 #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
   4794    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400)
   4795 using Shift64Count = int;
   4796 #else
   4797 // Assume documented behavior. Clang 12, GCC 14 and MSVC 14.28.29910 match this.
   4798 using Shift64Count = unsigned int;
   4799 #endif
   4800 
   4801 template <int kBits, size_t N>
   4802 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
   4803 #if HWY_TARGET <= HWY_AVX3
   4804  return Vec128<int64_t, N>{
   4805      _mm_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))};
   4806 #else
   4807  const DFromV<decltype(v)> di;
   4808  const RebindToUnsigned<decltype(di)> du;
   4809  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
   4810  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
   4811  return right | sign;
   4812 #endif
   4813 }
   4814 
   4815 // ------------------------------ IfNegativeThenElse
   4816 template <size_t N>
   4817 HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
   4818                                             const Vec128<int8_t, N> yes,
   4819                                             const Vec128<int8_t, N> no) {
   4820 // int8: IfThenElse only looks at the MSB on SSE4 or newer
   4821 #if HWY_TARGET <= HWY_SSE4
   4822  const auto mask = MaskFromVec(v);
   4823 #else
   4824  const DFromV<decltype(v)> d;
   4825  const RebindToSigned<decltype(d)> di;
   4826  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
   4827 #endif
   4828 
   4829  return IfThenElse(mask, yes, no);
   4830 }
   4831 
   4832 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   4833 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   4834                                        Vec128<T, N> no) {
   4835  static_assert(IsSigned<T>(), "Only works for signed/float");
   4836 
   4837 // 16-bit: no native blendv on AVX2 or earlier, so copy sign to lower byte's
   4838 // MSB.
   4839 #if HWY_TARGET <= HWY_AVX3
   4840  const auto mask = MaskFromVec(v);
   4841 #else
   4842  const DFromV<decltype(v)> d;
   4843  const RebindToSigned<decltype(d)> di;
   4844  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
   4845 #endif
   4846 
   4847  return IfThenElse(mask, yes, no);
   4848 }
   4849 
   4850 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
   4851 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   4852                                        Vec128<T, N> no) {
   4853  static_assert(IsSigned<T>(), "Only works for signed/float");
   4854  const DFromV<decltype(v)> d;
   4855 
   4856 #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
   4857  // 32/64-bit: use float IfThenElse on SSE4/AVX2, which only looks at the MSB
   4858  // on SSE4 or later.
   4859  const RebindToFloat<decltype(d)> df;
   4860  const auto mask = MaskFromVec(BitCast(df, v));
   4861  return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no)));
   4862 #else  // SSE2, SSSE3, or AVX3
   4863 
   4864 #if HWY_TARGET <= HWY_AVX3
   4865  // No need to cast to float or broadcast the sign bit on AVX3, as IfThenElse
   4866  // only looks at the MSB on AVX3.
   4867  (void)d;
   4868  const auto mask = MaskFromVec(v);
   4869 #else
   4870  const RebindToSigned<decltype(d)> di;
   4871  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
   4872 #endif
   4873 
   4874  return IfThenElse(mask, yes, no);
   4875 #endif
   4876 }
   4877 
   4878 #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
   4879 
   4880 #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
   4881 #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
   4882 #else
   4883 #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
   4884 #endif
   4885 
   4886 #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
   4887 #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
   4888 #else
   4889 #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
   4890 #endif
   4891 
   4892 // SSE4/AVX2 IfNegativeThenElseZero/IfNegativeThenZeroElse is generic for all
   4893 // vector lengths
   4894 template <class V, HWY_IF_NOT_UNSIGNED_V(V),
   4895          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
   4896 HWY_API V IfNegativeThenElseZero(V v, V yes) {
   4897  const DFromV<decltype(v)> d;
   4898  return IfNegativeThenElse(v, yes, Zero(d));
   4899 }
   4900 
   4901 template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
   4902 HWY_API V IfNegativeThenElseZero(V v, V yes) {
   4903  return IfThenElseZero(IsNegative(v), yes);
   4904 }
   4905 
   4906 template <class V, HWY_IF_NOT_UNSIGNED_V(V),
   4907          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))>
   4908 HWY_API V IfNegativeThenZeroElse(V v, V no) {
   4909  const DFromV<decltype(v)> d;
   4910  return IfNegativeThenElse(v, Zero(d), no);
   4911 }
   4912 
   4913 template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)>
   4914 HWY_API V IfNegativeThenZeroElse(V v, V no) {
   4915  return IfThenZeroElse(IsNegative(v), no);
   4916 }
   4917 
   4918 #endif  // HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
   4919 
   4920 // ------------------------------ IfNegativeThenNegOrUndefIfZero
   4921 
   4922 #if HWY_TARGET <= HWY_SSSE3
   4923 
   4924 #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
   4925 #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
   4926 #else
   4927 #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
   4928 #endif
   4929 
   4930 template <size_t N>
   4931 HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask,
   4932                                                         Vec128<int8_t, N> v) {
   4933  return Vec128<int8_t, N>{_mm_sign_epi8(v.raw, mask.raw)};
   4934 }
   4935 
   4936 template <size_t N>
   4937 HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero(
   4938    Vec128<int16_t, N> mask, Vec128<int16_t, N> v) {
   4939  return Vec128<int16_t, N>{_mm_sign_epi16(v.raw, mask.raw)};
   4940 }
   4941 
   4942 template <size_t N>
   4943 HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero(
   4944    Vec128<int32_t, N> mask, Vec128<int32_t, N> v) {
   4945  return Vec128<int32_t, N>{_mm_sign_epi32(v.raw, mask.raw)};
   4946 }
   4947 
   4948 // Generic for all vector lengths
   4949 template <class V, HWY_IF_I64_D(DFromV<V>)>
   4950 HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
   4951 #if HWY_TARGET <= HWY_AVX3
   4952  // MaskedSubOr is more efficient than IfNegativeThenElse on AVX3
   4953  const DFromV<decltype(v)> d;
   4954  return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
   4955 #else
   4956  // IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2
   4957  return IfNegativeThenElse(mask, Neg(v), v);
   4958 #endif
   4959 }
   4960 
   4961 #endif  // HWY_TARGET <= HWY_SSSE3
   4962 
   4963 // ------------------------------ ShiftLeftSame
   4964 
   4965 template <size_t N>
   4966 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
   4967                                          const int bits) {
   4968 #if HWY_COMPILER_GCC
   4969  if (__builtin_constant_p(bits)) {
   4970    return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, bits)};
   4971  }
   4972 #endif
   4973  return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
   4974 }
   4975 template <size_t N>
   4976 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
   4977                                          const int bits) {
   4978 #if HWY_COMPILER_GCC
   4979  if (__builtin_constant_p(bits)) {
   4980    return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, bits)};
   4981  }
   4982 #endif
   4983  return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
   4984 }
   4985 template <size_t N>
   4986 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
   4987                                          const int bits) {
   4988 #if HWY_COMPILER_GCC
   4989  if (__builtin_constant_p(bits)) {
   4990    return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, bits)};
   4991  }
   4992 #endif
   4993  return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
   4994 }
   4995 
   4996 template <size_t N>
   4997 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
   4998                                         const int bits) {
   4999 #if HWY_COMPILER_GCC
   5000  if (__builtin_constant_p(bits)) {
   5001    return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, bits)};
   5002  }
   5003 #endif
   5004  return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
   5005 }
   5006 
   5007 template <size_t N>
   5008 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
   5009                                         const int bits) {
   5010 #if HWY_COMPILER_GCC
   5011  if (__builtin_constant_p(bits)) {
   5012    return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, bits)};
   5013  }
   5014 #endif
   5015  return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
   5016 }
   5017 
   5018 template <size_t N>
   5019 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
   5020                                         const int bits) {
   5021 #if HWY_COMPILER_GCC
   5022  if (__builtin_constant_p(bits)) {
   5023    return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, bits)};
   5024  }
   5025 #endif
   5026  return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
   5027 }
   5028 
   5029 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   5030 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
   5031  const DFromV<decltype(v)> d8;
   5032  // Use raw instead of BitCast to support N=1.
   5033  const Vec128<T, N> shifted{
   5034      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
   5035  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
   5036 }
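// The 8-bit shift is emulated with a 16-bit shift; the mask
// (0xFF << bits) & 0xFF then clears the low bits that received data from the
// neighboring byte, e.g. 0xF8 for bits = 3.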
   5037 
   5038 // ------------------------------ ShiftRightSame (BroadcastSignBit)
   5039 
   5040 template <size_t N>
   5041 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
   5042                                           const int bits) {
   5043 #if HWY_COMPILER_GCC
   5044  if (__builtin_constant_p(bits)) {
   5045    return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, bits)};
   5046  }
   5047 #endif
   5048  return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
   5049 }
   5050 template <size_t N>
   5051 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
   5052                                           const int bits) {
   5053 #if HWY_COMPILER_GCC
   5054  if (__builtin_constant_p(bits)) {
   5055    return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, bits)};
   5056  }
   5057 #endif
   5058  return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
   5059 }
   5060 template <size_t N>
   5061 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
   5062                                           const int bits) {
   5063 #if HWY_COMPILER_GCC
   5064  if (__builtin_constant_p(bits)) {
   5065    return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, bits)};
   5066  }
   5067 #endif
   5068  return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
   5069 }
   5070 
   5071 template <size_t N>
   5072 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
   5073                                          const int bits) {
   5074  const DFromV<decltype(v)> d8;
   5075  // Use raw instead of BitCast to support N=1.
   5076  const Vec128<uint8_t, N> shifted{
   5077      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
   5078  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
   5079 }
   5080 
   5081 template <size_t N>
   5082 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
   5083                                          const int bits) {
   5084 #if HWY_COMPILER_GCC
   5085  if (__builtin_constant_p(bits)) {
   5086    return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, bits)};
   5087  }
   5088 #endif
   5089  return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
   5090 }
   5091 
   5092 template <size_t N>
   5093 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
   5094                                          const int bits) {
   5095 #if HWY_COMPILER_GCC
   5096  if (__builtin_constant_p(bits)) {
   5097    return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, bits)};
   5098  }
   5099 #endif
   5100  return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
   5101 }
   5102 template <size_t N>
   5103 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
   5104                                          const int bits) {
   5105 #if HWY_TARGET <= HWY_AVX3
   5106 #if HWY_COMPILER_GCC
   5107  if (__builtin_constant_p(bits)) {
   5108    return Vec128<int64_t, N>{
   5109        _mm_srai_epi64(v.raw, static_cast<Shift64Count>(bits))};
   5110  }
   5111 #endif
   5112  return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
   5113 #else
   5114  const DFromV<decltype(v)> di;
   5115  const RebindToUnsigned<decltype(di)> du;
   5116  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
   5117  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
   5118  return right | sign;
   5119 #endif
   5120 }
   5121 
   5122 template <size_t N>
   5123 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
   5124  const DFromV<decltype(v)> di;
   5125  const RebindToUnsigned<decltype(di)> du;
   5126  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
   5127  const auto shifted_sign =
   5128      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
   5129  return (shifted ^ shifted_sign) - shifted_sign;
   5130 }
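// The XOR/subtract pair sign-extends the logically-shifted value, with
// shifted_sign marking the new position of the sign bit. E.g. for v = -1 and
// bits = 3: shifted = 0x1F, shifted_sign = 0x10, and
// (0x1F ^ 0x10) - 0x10 = 0x0F - 0x10 = -1, the correct arithmetic shift.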
   5131 
   5132 // ------------------------------ Floating-point mul / div
   5133 
   5134 #if HWY_HAVE_FLOAT16
   5135 template <size_t N>
   5136 HWY_API Vec128<float16_t, N> operator*(Vec128<float16_t, N> a,
   5137                                       Vec128<float16_t, N> b) {
   5138  return Vec128<float16_t, N>{_mm_mul_ph(a.raw, b.raw)};
   5139 }
   5140 #endif  // HWY_HAVE_FLOAT16
   5141 template <size_t N>
   5142 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
   5143  return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
   5144 }
   5145 HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
   5146                                   const Vec128<float, 1> b) {
   5147  return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
   5148 }
   5149 template <size_t N>
   5150 HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
   5151                                    const Vec128<double, N> b) {
   5152  return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
   5153 }
   5154 HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
   5155  return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
   5156 }
   5157 
   5158 #if HWY_TARGET <= HWY_AVX3
   5159 
   5160 #ifdef HWY_NATIVE_MUL_BY_POW2
   5161 #undef HWY_NATIVE_MUL_BY_POW2
   5162 #else
   5163 #define HWY_NATIVE_MUL_BY_POW2
   5164 #endif
   5165 
   5166 #if HWY_HAVE_FLOAT16
   5167 template <size_t N>
   5168 HWY_API Vec128<float16_t, N> MulByFloorPow2(Vec128<float16_t, N> a,
   5169                                            Vec128<float16_t, N> b) {
   5170  return Vec128<float16_t, N>{_mm_scalef_ph(a.raw, b.raw)};
   5171 }
   5172 #endif
   5173 
   5174 template <size_t N>
   5175 HWY_API Vec128<float, N> MulByFloorPow2(Vec128<float, N> a,
   5176                                        Vec128<float, N> b) {
   5177  return Vec128<float, N>{_mm_scalef_ps(a.raw, b.raw)};
   5178 }
   5179 
   5180 template <size_t N>
   5181 HWY_API Vec128<double, N> MulByFloorPow2(Vec128<double, N> a,
   5182                                         Vec128<double, N> b) {
   5183  return Vec128<double, N>{_mm_scalef_pd(a.raw, b.raw)};
   5184 }
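// The SCALEF instructions used above compute a * 2^Floor(b) per lane, e.g.
// MulByFloorPow2(Set(d, 3.0), Set(d, 1.5)) yields 6.0 in every lane.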
   5185 
   5186 // MulByPow2 is generic for all vector lengths on AVX3
   5187 template <class V, HWY_IF_FLOAT_V(V)>
   5188 HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) {
   5189  const DFromV<decltype(v)> d;
   5190  return MulByFloorPow2(v, ConvertTo(d, exp));
   5191 }
   5192 
   5193 #endif  // HWY_TARGET <= HWY_AVX3
   5194 
   5195 #if HWY_HAVE_FLOAT16
   5196 template <size_t N>
   5197 HWY_API Vec128<float16_t, N> operator/(const Vec128<float16_t, N> a,
   5198                                       const Vec128<float16_t, N> b) {
   5199  return Vec128<float16_t, N>{_mm_div_ph(a.raw, b.raw)};
   5200 }
   5201 #endif  // HWY_HAVE_FLOAT16
   5202 template <size_t N>
   5203 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
   5204                                   const Vec128<float, N> b) {
   5205  return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
   5206 }
   5207 HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
   5208                                   const Vec128<float, 1> b) {
   5209  return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
   5210 }
   5211 template <size_t N>
   5212 HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
   5213                                    const Vec128<double, N> b) {
   5214  return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
   5215 }
   5216 HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) {
   5217  return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
   5218 }
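// The overloads above route * and / for float/double (and float16_t where
// supported) straight to the corresponding intrinsics. Minimal usage sketch
// (the alias, tag and pointers are placeholders):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;  // 4 x f32
//   const auto q = hn::Load(d, pa) / hn::Load(d, pb);  // lane-wise divide
//   hn::Store(q, d, pq);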
   5219 
   5220 // Approximate reciprocal
   5221 #if HWY_HAVE_FLOAT16
   5222 template <size_t N>
   5223 HWY_API Vec128<float16_t, N> ApproximateReciprocal(
   5224    const Vec128<float16_t, N> v) {
   5225  return Vec128<float16_t, N>{_mm_rcp_ph(v.raw)};
   5226 }
   5227 #endif  // HWY_HAVE_FLOAT16
   5228 template <size_t N>
   5229 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
   5230  return Vec128<float, N>{_mm_rcp_ps(v.raw)};
   5231 }
   5232 HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
   5233  return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
   5234 }
   5235 
   5236 #if HWY_TARGET <= HWY_AVX3
   5237 #ifdef HWY_NATIVE_F64_APPROX_RECIP
   5238 #undef HWY_NATIVE_F64_APPROX_RECIP
   5239 #else
   5240 #define HWY_NATIVE_F64_APPROX_RECIP
   5241 #endif
   5242 
   5243 HWY_API Vec128<double> ApproximateReciprocal(Vec128<double> v) {
   5244  return Vec128<double>{_mm_rcp14_pd(v.raw)};
   5245 }
   5246 HWY_API Vec64<double> ApproximateReciprocal(Vec64<double> v) {
   5247  return Vec64<double>{_mm_rcp14_sd(v.raw, v.raw)};
   5248 }
   5249 #endif
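// The rcp estimates trade accuracy (roughly 12 bits; ~14 for the AVX-512
// rcp14 forms) for latency. Callers needing more precision typically add a
// Newton-Raphson step, sketched below (assumes no zero/inf lanes in x):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;
//   auto r = hn::ApproximateReciprocal(x);
//   r = hn::Mul(r, hn::NegMulAdd(x, r, hn::Set(d, 2.0f)));  // r*(2 - x*r)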
   5250 
   5251 // Generic for all vector lengths.
   5252 template <class V, HWY_IF_FLOAT_V(V)>
   5253 HWY_API V AbsDiff(V a, V b) {
   5254  return Abs(a - b);
   5255 }
   5256 
   5257 // ------------------------------ GetExponent
   5258 
   5259 #if HWY_TARGET <= HWY_AVX3
   5260 
   5261 #ifdef HWY_NATIVE_GET_EXPONENT
   5262 #undef HWY_NATIVE_GET_EXPONENT
   5263 #else
   5264 #define HWY_NATIVE_GET_EXPONENT
   5265 #endif
   5266 
   5267 #if HWY_HAVE_FLOAT16
   5268 template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
   5269 HWY_API V GetExponent(V v) {
   5270  return V{_mm_getexp_ph(v.raw)};
   5271 }
   5272 #endif
   5273 template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
   5274 HWY_API V GetExponent(V v) {
   5275  return V{_mm_getexp_ps(v.raw)};
   5276 }
   5277 template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)>
   5278 HWY_API V GetExponent(V v) {
   5279  return V{_mm_getexp_pd(v.raw)};
   5280 }
   5281 
   5282 #endif
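// GetExponent returns the unbiased exponent floor(log2(|v|)) as a value of
// the same float type, e.g. GetExponent(Set(d, 24.0f)) is 4.0f in every lane
// (24 = 1.5 * 2^4).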
   5283 
   5284 // ------------------------------ MaskedMinOr
   5285 
   5286 #if HWY_TARGET <= HWY_AVX3
   5287 
   5288 #ifdef HWY_NATIVE_MASKED_ARITH
   5289 #undef HWY_NATIVE_MASKED_ARITH
   5290 #else
   5291 #define HWY_NATIVE_MASKED_ARITH
   5292 #endif
   5293 
   5294 template <typename T, size_t N, HWY_IF_U8(T)>
   5295 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5296                                 Vec128<T, N> a, Vec128<T, N> b) {
   5297  return Vec128<T, N>{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
   5298 }
   5299 template <typename T, size_t N, HWY_IF_I8(T)>
   5300 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5301                                 Vec128<T, N> a, Vec128<T, N> b) {
   5302  return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
   5303 }
   5304 
   5305 template <typename T, size_t N, HWY_IF_U16(T)>
   5306 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5307                                 Vec128<T, N> a, Vec128<T, N> b) {
   5308  return Vec128<T, N>{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
   5309 }
   5310 template <typename T, size_t N, HWY_IF_I16(T)>
   5311 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5312                                 Vec128<T, N> a, Vec128<T, N> b) {
   5313  return Vec128<T, N>{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
   5314 }
   5315 
   5316 template <typename T, size_t N, HWY_IF_U32(T)>
   5317 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5318                                 Vec128<T, N> a, Vec128<T, N> b) {
   5319  return Vec128<T, N>{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
   5320 }
   5321 template <typename T, size_t N, HWY_IF_I32(T)>
   5322 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5323                                 Vec128<T, N> a, Vec128<T, N> b) {
   5324  return Vec128<T, N>{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
   5325 }
   5326 
   5327 template <typename T, size_t N, HWY_IF_U64(T)>
   5328 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5329                                 Vec128<T, N> a, Vec128<T, N> b) {
   5330  return Vec128<T, N>{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
   5331 }
   5332 template <typename T, size_t N, HWY_IF_I64(T)>
   5333 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5334                                 Vec128<T, N> a, Vec128<T, N> b) {
   5335  return Vec128<T, N>{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
   5336 }
   5337 
   5338 template <typename T, size_t N, HWY_IF_F32(T)>
   5339 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5340                                 Vec128<T, N> a, Vec128<T, N> b) {
   5341  return Vec128<T, N>{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
   5342 }
   5343 
   5344 template <typename T, size_t N, HWY_IF_F64(T)>
   5345 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5346                                 Vec128<T, N> a, Vec128<T, N> b) {
   5347  return Vec128<T, N>{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
   5348 }
   5349 
   5350 #if HWY_HAVE_FLOAT16
   5351 template <typename T, size_t N, HWY_IF_F16(T)>
   5352 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
   5353                                 Vec128<T, N> a, Vec128<T, N> b) {
   5354  return Vec128<T, N>{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
   5355 }
   5356 #endif  // HWY_HAVE_FLOAT16
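// All Masked*Or ops in this section follow one pattern: lanes where m is set
// receive the operation's result, the remaining lanes are copied from `no`.
// Sketch (alias, tag and inputs are placeholders):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<int32_t> d;
//   const auto m = hn::FirstN(d, 2);              // lanes 0,1 active
//   const auto r = hn::MaskedMinOr(no, m, a, b);  // {Min,Min,no,no}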
   5357 
   5358 // ------------------------------ MaskedMaxOr
   5359 
   5360 template <typename T, size_t N, HWY_IF_U8(T)>
   5361 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5362                                 Vec128<T, N> a, Vec128<T, N> b) {
   5363  return Vec128<T, N>{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
   5364 }
   5365 template <typename T, size_t N, HWY_IF_I8(T)>
   5366 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5367                                 Vec128<T, N> a, Vec128<T, N> b) {
   5368  return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
   5369 }
   5370 
   5371 template <typename T, size_t N, HWY_IF_U16(T)>
   5372 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5373                                 Vec128<T, N> a, Vec128<T, N> b) {
   5374  return Vec128<T, N>{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
   5375 }
   5376 template <typename T, size_t N, HWY_IF_I16(T)>
   5377 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5378                                 Vec128<T, N> a, Vec128<T, N> b) {
   5379  return Vec128<T, N>{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
   5380 }
   5381 
   5382 template <typename T, size_t N, HWY_IF_U32(T)>
   5383 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5384                                 Vec128<T, N> a, Vec128<T, N> b) {
   5385  return Vec128<T, N>{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
   5386 }
   5387 template <typename T, size_t N, HWY_IF_I32(T)>
   5388 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5389                                 Vec128<T, N> a, Vec128<T, N> b) {
   5390  return Vec128<T, N>{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
   5391 }
   5392 
   5393 template <typename T, size_t N, HWY_IF_U64(T)>
   5394 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5395                                 Vec128<T, N> a, Vec128<T, N> b) {
   5396  return Vec128<T, N>{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
   5397 }
   5398 template <typename T, size_t N, HWY_IF_I64(T)>
   5399 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5400                                 Vec128<T, N> a, Vec128<T, N> b) {
   5401  return Vec128<T, N>{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
   5402 }
   5403 
   5404 template <typename T, size_t N, HWY_IF_F32(T)>
   5405 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5406                                 Vec128<T, N> a, Vec128<T, N> b) {
   5407  return Vec128<T, N>{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
   5408 }
   5409 
   5410 template <typename T, size_t N, HWY_IF_F64(T)>
   5411 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5412                                 Vec128<T, N> a, Vec128<T, N> b) {
   5413  return Vec128<T, N>{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
   5414 }
   5415 
   5416 #if HWY_HAVE_FLOAT16
   5417 template <typename T, size_t N, HWY_IF_F16(T)>
   5418 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
   5419                                 Vec128<T, N> a, Vec128<T, N> b) {
   5420  return Vec128<T, N>{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
   5421 }
   5422 #endif  // HWY_HAVE_FLOAT16
   5423 
   5424 // ------------------------------ MaskedAddOr
   5425 
   5426 template <typename T, size_t N, HWY_IF_UI8(T)>
   5427 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5428                                 Vec128<T, N> a, Vec128<T, N> b) {
   5429  return Vec128<T, N>{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
   5430 }
   5431 
   5432 template <typename T, size_t N, HWY_IF_UI16(T)>
   5433 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5434                                 Vec128<T, N> a, Vec128<T, N> b) {
   5435  return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
   5436 }
   5437 
   5438 template <typename T, size_t N, HWY_IF_UI32(T)>
   5439 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5440                                 Vec128<T, N> a, Vec128<T, N> b) {
   5441  return Vec128<T, N>{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
   5442 }
   5443 
   5444 template <typename T, size_t N, HWY_IF_UI64(T)>
   5445 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5446                                 Vec128<T, N> a, Vec128<T, N> b) {
   5447  return Vec128<T, N>{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
   5448 }
   5449 
   5450 template <typename T, size_t N, HWY_IF_F32(T)>
   5451 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5452                                 Vec128<T, N> a, Vec128<T, N> b) {
   5453  return Vec128<T, N>{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
   5454 }
   5455 
   5456 template <typename T, size_t N, HWY_IF_F64(T)>
   5457 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5458                                 Vec128<T, N> a, Vec128<T, N> b) {
   5459  return Vec128<T, N>{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
   5460 }
   5461 
   5462 #if HWY_HAVE_FLOAT16
   5463 template <typename T, size_t N, HWY_IF_F16(T)>
   5464 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5465                                 Vec128<T, N> a, Vec128<T, N> b) {
   5466  return Vec128<T, N>{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
   5467 }
   5468 #endif  // HWY_HAVE_FLOAT16
   5469 
   5470 // ------------------------------ MaskedSubOr
   5471 
   5472 template <typename T, size_t N, HWY_IF_UI8(T)>
   5473 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5474                                 Vec128<T, N> a, Vec128<T, N> b) {
   5475  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
   5476 }
   5477 
   5478 template <typename T, size_t N, HWY_IF_UI16(T)>
   5479 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5480                                 Vec128<T, N> a, Vec128<T, N> b) {
   5481  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
   5482 }
   5483 
   5484 template <typename T, size_t N, HWY_IF_UI32(T)>
   5485 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5486                                 Vec128<T, N> a, Vec128<T, N> b) {
   5487  return Vec128<T, N>{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
   5488 }
   5489 
   5490 template <typename T, size_t N, HWY_IF_UI64(T)>
   5491 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5492                                 Vec128<T, N> a, Vec128<T, N> b) {
   5493  return Vec128<T, N>{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
   5494 }
   5495 
   5496 template <typename T, size_t N, HWY_IF_F32(T)>
   5497 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5498                                 Vec128<T, N> a, Vec128<T, N> b) {
   5499  return Vec128<T, N>{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
   5500 }
   5501 
   5502 template <typename T, size_t N, HWY_IF_F64(T)>
   5503 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5504                                 Vec128<T, N> a, Vec128<T, N> b) {
   5505  return Vec128<T, N>{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
   5506 }
   5507 
   5508 #if HWY_HAVE_FLOAT16
   5509 template <typename T, size_t N, HWY_IF_F16(T)>
   5510 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5511                                 Vec128<T, N> a, Vec128<T, N> b) {
   5512  return Vec128<T, N>{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
   5513 }
   5514 #endif  // HWY_HAVE_FLOAT16
   5515 
   5516 // ------------------------------ MaskedMulOr
   5517 
   5518 // There are no elementwise integer mask_mul intrinsics. Generic for all vector lengths.
   5519 template <class V, class M>
   5520 HWY_API V MaskedMulOr(V no, M m, V a, V b) {
   5521  return IfThenElse(m, a * b, no);
   5522 }
   5523 
   5524 template <size_t N>
   5525 HWY_API Vec128<float, N> MaskedMulOr(Vec128<float, N> no, Mask128<float, N> m,
   5526                                     Vec128<float, N> a, Vec128<float, N> b) {
   5527  return Vec128<float, N>{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
   5528 }
   5529 
   5530 template <size_t N>
   5531 HWY_API Vec128<double, N> MaskedMulOr(Vec128<double, N> no,
   5532                                      Mask128<double, N> m, Vec128<double, N> a,
   5533                                      Vec128<double, N> b) {
   5534  return Vec128<double, N>{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
   5535 }
   5536 
   5537 #if HWY_HAVE_FLOAT16
   5538 template <size_t N>
   5539 HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no,
   5540                                         Mask128<float16_t, N> m,
   5541                                         Vec128<float16_t, N> a,
   5542                                         Vec128<float16_t, N> b) {
   5543  return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
   5544 }
   5545 #endif  // HWY_HAVE_FLOAT16
   5546 
   5547 // ------------------------------ MaskedDivOr
   5548 
   5549 template <size_t N>
   5550 HWY_API Vec128<float, N> MaskedDivOr(Vec128<float, N> no, Mask128<float, N> m,
   5551                                     Vec128<float, N> a, Vec128<float, N> b) {
   5552  return Vec128<float, N>{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
   5553 }
   5554 
   5555 template <size_t N>
   5556 HWY_API Vec128<double, N> MaskedDivOr(Vec128<double, N> no,
   5557                                      Mask128<double, N> m, Vec128<double, N> a,
   5558                                      Vec128<double, N> b) {
   5559  return Vec128<double, N>{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
   5560 }
   5561 
   5562 #if HWY_HAVE_FLOAT16
   5563 template <size_t N>
   5564 HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no,
   5565                                         Mask128<float16_t, N> m,
   5566                                         Vec128<float16_t, N> a,
   5567                                         Vec128<float16_t, N> b) {
   5568  return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
   5569 }
   5570 #endif  // HWY_HAVE_FLOAT16
   5571 
   5572 // Generic for all vector lengths
   5573 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   5574 HWY_API V MaskedDivOr(V no, MFromD<DFromV<V>> m, V a, V b) {
   5575  return IfThenElse(m, Div(a, b), no);
   5576 }
   5577 
   5578 // ------------------------------ MaskedModOr
   5579 // Generic for all vector lengths
   5580 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   5581 HWY_API V MaskedModOr(V no, MFromD<DFromV<V>> m, V a, V b) {
   5582  return IfThenElse(m, Mod(a, b), no);
   5583 }
   5584 
   5585 // ------------------------------ MaskedSatAddOr
   5586 
   5587 template <typename T, size_t N, HWY_IF_I8(T)>
   5588 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5589                                    Vec128<T, N> a, Vec128<T, N> b) {
   5590  return Vec128<T, N>{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
   5591 }
   5592 
   5593 template <typename T, size_t N, HWY_IF_U8(T)>
   5594 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5595                                    Vec128<T, N> a, Vec128<T, N> b) {
   5596  return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
   5597 }
   5598 
   5599 template <typename T, size_t N, HWY_IF_I16(T)>
   5600 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5601                                    Vec128<T, N> a, Vec128<T, N> b) {
   5602  return Vec128<T, N>{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
   5603 }
   5604 
   5605 template <typename T, size_t N, HWY_IF_U16(T)>
   5606 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
   5607                                    Vec128<T, N> a, Vec128<T, N> b) {
   5608  return Vec128<T, N>{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
   5609 }
   5610 
   5611 // ------------------------------ MaskedSatSubOr
   5612 
   5613 template <typename T, size_t N, HWY_IF_I8(T)>
   5614 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5615                                    Vec128<T, N> a, Vec128<T, N> b) {
   5616  return Vec128<T, N>{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
   5617 }
   5618 
   5619 template <typename T, size_t N, HWY_IF_U8(T)>
   5620 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5621                                    Vec128<T, N> a, Vec128<T, N> b) {
   5622  return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
   5623 }
   5624 
   5625 template <typename T, size_t N, HWY_IF_I16(T)>
   5626 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5627                                    Vec128<T, N> a, Vec128<T, N> b) {
   5628  return Vec128<T, N>{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
   5629 }
   5630 
   5631 template <typename T, size_t N, HWY_IF_U16(T)>
   5632 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
   5633                                    Vec128<T, N> a, Vec128<T, N> b) {
   5634  return Vec128<T, N>{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
   5635 }
   5636 
   5637 #endif  // HWY_TARGET <= HWY_AVX3
   5638 
   5639 // ------------------------------ Floating-point multiply-add variants
   5640 
   5641 #if HWY_HAVE_FLOAT16
   5642 template <size_t N>
   5643 HWY_API Vec128<float16_t, N> MulAdd(Vec128<float16_t, N> mul,
   5644                                    Vec128<float16_t, N> x,
   5645                                    Vec128<float16_t, N> add) {
   5646  return Vec128<float16_t, N>{_mm_fmadd_ph(mul.raw, x.raw, add.raw)};
   5647 }
   5648 
   5649 template <size_t N>
   5650 HWY_API Vec128<float16_t, N> NegMulAdd(Vec128<float16_t, N> mul,
   5651                                       Vec128<float16_t, N> x,
   5652                                       Vec128<float16_t, N> add) {
   5653  return Vec128<float16_t, N>{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)};
   5654 }
   5655 
   5656 template <size_t N>
   5657 HWY_API Vec128<float16_t, N> MulSub(Vec128<float16_t, N> mul,
   5658                                    Vec128<float16_t, N> x,
   5659                                    Vec128<float16_t, N> sub) {
   5660  return Vec128<float16_t, N>{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)};
   5661 }
   5662 
   5663 template <size_t N>
   5664 HWY_API Vec128<float16_t, N> NegMulSub(Vec128<float16_t, N> mul,
   5665                                       Vec128<float16_t, N> x,
   5666                                       Vec128<float16_t, N> sub) {
   5667  return Vec128<float16_t, N>{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)};
   5668 }
   5669 
   5670 #endif  // HWY_HAVE_FLOAT16
   5671 template <size_t N>
   5672 HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
   5673                                Vec128<float, N> add) {
   5674 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5675  return mul * x + add;
   5676 #else
   5677  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
   5678 #endif
   5679 }
   5680 template <size_t N>
   5681 HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
   5682                                 Vec128<double, N> add) {
   5683 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5684  return mul * x + add;
   5685 #else
   5686  return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
   5687 #endif
   5688 }
   5689 
   5690 // Returns add - mul * x
   5691 template <size_t N>
   5692 HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
   5693                                   Vec128<float, N> add) {
   5694 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5695  return add - mul * x;
   5696 #else
   5697  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
   5698 #endif
   5699 }
   5700 template <size_t N>
   5701 HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
   5702                                    Vec128<double, N> add) {
   5703 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5704  return add - mul * x;
   5705 #else
   5706  return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
   5707 #endif
   5708 }
   5709 
   5710 // Returns mul * x - sub
   5711 template <size_t N>
   5712 HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
   5713                                Vec128<float, N> sub) {
   5714 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5715  return mul * x - sub;
   5716 #else
   5717  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
   5718 #endif
   5719 }
   5720 template <size_t N>
   5721 HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x,
   5722                                 Vec128<double, N> sub) {
   5723 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5724  return mul * x - sub;
   5725 #else
   5726  return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
   5727 #endif
   5728 }
   5729 
   5730 // Returns -mul * x - sub
   5731 template <size_t N>
   5732 HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
   5733                                   Vec128<float, N> sub) {
   5734 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5735  return Neg(mul) * x - sub;
   5736 #else
   5737  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
   5738 #endif
   5739 }
   5740 template <size_t N>
   5741 HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
   5742                                    Vec128<double, N> sub) {
   5743 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5744  return Neg(mul) * x - sub;
   5745 #else
   5746  return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
   5747 #endif
   5748 }
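// MulAdd/MulSub return mul*x plus/minus the third operand, fused into a
// single FMA instruction where available. A common use is Horner-form
// polynomial evaluation; sketch with placeholder coefficients and input x:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;
//   const auto c0 = hn::Set(d, 1.0f), c1 = hn::Set(d, 0.5f),
//              c2 = hn::Set(d, 0.25f);
//   // c2*x^2 + c1*x + c0, evaluated as (c2*x + c1)*x + c0:
//   const auto poly = hn::MulAdd(hn::MulAdd(c2, x, c1), x, c0);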
   5749 
   5750 #if HWY_TARGET <= HWY_SSSE3
   5751 
   5752 #undef HWY_IF_MULADDSUB_V
   5753 #define HWY_IF_MULADDSUB_V(V)                        \
   5754  HWY_IF_LANES_GT_D(DFromV<V>, 1),                   \
   5755      HWY_IF_T_SIZE_ONE_OF_V(                        \
   5756          V, (1 << 1) | ((hwy::IsFloat<TFromV<V>>()) \
   5757                             ? 0                     \
   5758                             : ((1 << 2) | (1 << 4) | (1 << 8))))
   5759 
   5760 #if HWY_HAVE_FLOAT16
   5761 template <size_t N, HWY_IF_LANES_GT(N, 1)>
   5762 HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
   5763                                       Vec128<float16_t, N> x,
   5764                                       Vec128<float16_t, N> sub_or_add) {
   5765  return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
   5766 }
   5767 #endif  // HWY_HAVE_FLOAT16
   5768 
   5769 template <size_t N, HWY_IF_LANES_GT(N, 1)>
   5770 HWY_API Vec128<float, N> MulAddSub(Vec128<float, N> mul, Vec128<float, N> x,
   5771                                   Vec128<float, N> sub_or_add) {
   5772 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5773  return AddSub(mul * x, sub_or_add);
   5774 #else
   5775  return Vec128<float, N>{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
   5776 #endif
   5777 }
   5778 
   5779 HWY_API Vec128<double> MulAddSub(Vec128<double> mul, Vec128<double> x,
   5780                                 Vec128<double> sub_or_add) {
   5781 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   5782  return AddSub(mul * x, sub_or_add);
   5783 #else
   5784  return Vec128<double>{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
   5785 #endif
   5786 }
   5787 
   5788 #endif  // HWY_TARGET <= HWY_SSSE3
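// MulAddSub computes mul*x - sub_or_add in even lanes and mul*x + sub_or_add
// in odd lanes, matching _mm_fmaddsub_*. This alternating pattern is the
// usual building block for interleaved (re,im) complex multiplication.
// Lane-wise (sketch; operands are placeholders):
//
//   // r = {m0*x0 - s0, m1*x1 + s1, m2*x2 - s2, m3*x3 + s3}
//   const auto r = hn::MulAddSub(m, x, s);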
   5789 
   5790 // ------------------------------ Floating-point square root
   5791 
   5792 // Full precision square root
   5793 #if HWY_HAVE_FLOAT16
   5794 template <size_t N>
   5795 HWY_API Vec128<float16_t, N> Sqrt(Vec128<float16_t, N> v) {
   5796  return Vec128<float16_t, N>{_mm_sqrt_ph(v.raw)};
   5797 }
   5798 #endif  // HWY_HAVE_FLOAT16
   5799 template <size_t N>
   5800 HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) {
   5801  return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
   5802 }
   5803 HWY_API Vec128<float, 1> Sqrt(Vec128<float, 1> v) {
   5804  return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
   5805 }
   5806 template <size_t N>
   5807 HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) {
   5808  return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
   5809 }
   5810 HWY_API Vec64<double> Sqrt(Vec64<double> v) {
   5811  return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
   5812 }
   5813 
   5814 // Approximate reciprocal square root
   5815 #if HWY_HAVE_FLOAT16
   5816 template <size_t N>
   5817 HWY_API Vec128<float16_t, N> ApproximateReciprocalSqrt(Vec128<float16_t, N> v) {
   5818  return Vec128<float16_t, N>{_mm_rsqrt_ph(v.raw)};
   5819 }
   5820 #endif  // HWY_HAVE_FLOAT16
   5821 template <size_t N>
   5822 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
   5823  return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
   5824 }
   5825 HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(Vec128<float, 1> v) {
   5826  return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
   5827 }
   5828 
   5829 #if HWY_TARGET <= HWY_AVX3
   5830 #ifdef HWY_NATIVE_F64_APPROX_RSQRT
   5831 #undef HWY_NATIVE_F64_APPROX_RSQRT
   5832 #else
   5833 #define HWY_NATIVE_F64_APPROX_RSQRT
   5834 #endif
   5835 
   5836 HWY_API Vec64<double> ApproximateReciprocalSqrt(Vec64<double> v) {
   5837  return Vec64<double>{_mm_rsqrt14_sd(v.raw, v.raw)};
   5838 }
   5839 HWY_API Vec128<double> ApproximateReciprocalSqrt(Vec128<double> v) {
   5840 #if HWY_COMPILER_MSVC
   5841  const DFromV<decltype(v)> d;
   5842  return Vec128<double>{_mm_mask_rsqrt14_pd(
   5843      Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)};
   5844 #else
   5845  return Vec128<double>{_mm_rsqrt14_pd(v.raw)};
   5846 #endif
   5847 }
   5848 #endif
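// As with ApproximateReciprocal, one Newton-Raphson step roughly doubles the
// accurate bits of the rsqrt estimate. Sketch (assumes all lanes of x > 0):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;
//   const auto r0 = hn::ApproximateReciprocalSqrt(x);
//   const auto hxr0 = hn::Mul(hn::Mul(x, hn::Set(d, 0.5f)), r0);  // 0.5*x*r0
//   // r1 = r0 * (1.5 - 0.5*x*r0*r0)
//   const auto r1 = hn::Mul(r0, hn::NegMulAdd(hxr0, r0, hn::Set(d, 1.5f)));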
   5849 
   5850 // ------------------------------ Min (Gt, IfThenElse)
   5851 
   5852 namespace detail {
   5853 
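// Emulates unsigned Min on targets without unsigned compares: XOR-ing the
// sign bit maps unsigned order onto signed order, so a signed Gt on the
// flipped values selects the smaller unsigned operand.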
   5854 template <typename T, size_t N>
   5855 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a,
   5856                                              const Vec128<T, N> b) {
   5857  const DFromV<decltype(a)> d;
   5858  const RebindToUnsigned<decltype(d)> du;
   5859  const RebindToSigned<decltype(d)> di;
   5860  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
   5861  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
   5862  return IfThenElse(gt, b, a);
   5863 }
   5864 
   5865 }  // namespace detail
   5866 
   5867 // Unsigned
   5868 template <size_t N>
   5869 HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
   5870  return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
   5871 }
   5872 template <size_t N>
   5873 HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
   5874 #if HWY_TARGET >= HWY_SSSE3
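  // SSE2/SSSE3 lack _mm_min_epu16 (SSE4.1); instead use the identity
  // a - _mm_subs_epu16(a, b) == a - max(a - b, 0) == min(a, b).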
   5875  return Vec128<uint16_t, N>{
   5876      _mm_sub_epi16(a.raw, _mm_subs_epu16(a.raw, b.raw))};
   5877 #else
   5878  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
   5879 #endif
   5880 }
   5881 template <size_t N>
   5882 HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
   5883 #if HWY_TARGET >= HWY_SSSE3
   5884  return detail::MinU(a, b);
   5885 #else
   5886  return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
   5887 #endif
   5888 }
   5889 template <size_t N>
   5890 HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
   5891 #if HWY_TARGET <= HWY_AVX3
   5892  return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
   5893 #else
   5894  return detail::MinU(a, b);
   5895 #endif
   5896 }
   5897 
   5898 // Signed
   5899 template <size_t N>
   5900 HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
   5901 #if HWY_TARGET >= HWY_SSSE3
   5902  return IfThenElse(a < b, a, b);
   5903 #else
   5904  return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
   5905 #endif
   5906 }
   5907 template <size_t N>
   5908 HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
   5909  return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
   5910 }
   5911 template <size_t N>
   5912 HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
   5913 #if HWY_TARGET >= HWY_SSSE3
   5914  return IfThenElse(a < b, a, b);
   5915 #else
   5916  return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
   5917 #endif
   5918 }
   5919 template <size_t N>
   5920 HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
   5921 #if HWY_TARGET <= HWY_AVX3
   5922  return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
   5923 #else
   5924  return IfThenElse(a < b, a, b);
   5925 #endif
   5926 }
   5927 
   5928 // Float
   5929 #if HWY_HAVE_FLOAT16
   5930 template <size_t N>
   5931 HWY_API Vec128<float16_t, N> Min(Vec128<float16_t, N> a,
   5932                                 Vec128<float16_t, N> b) {
   5933  return Vec128<float16_t, N>{_mm_min_ph(a.raw, b.raw)};
   5934 }
   5935 #endif  // HWY_HAVE_FLOAT16
   5936 template <size_t N>
   5937 HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
   5938  return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
   5939 }
   5940 template <size_t N>
   5941 HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) {
   5942  return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
   5943 }
   5944 
   5945 // ------------------------------ Max (Gt, IfThenElse)
   5946 
   5947 namespace detail {
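// Same sign-bit-flip trick as detail::MinU above, but selecting the larger
// unsigned operand.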
   5948 template <typename T, size_t N>
   5949 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a,
   5950                                              const Vec128<T, N> b) {
   5951  const DFromV<decltype(a)> d;
   5952  const RebindToUnsigned<decltype(d)> du;
   5953  const RebindToSigned<decltype(d)> di;
   5954  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
   5955  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
   5956  return IfThenElse(gt, a, b);
   5957 }
   5958 
   5959 }  // namespace detail
   5960 
   5961 // Unsigned
   5962 template <size_t N>
   5963 HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
   5964  return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
   5965 }
   5966 template <size_t N>
   5967 HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
   5968 #if HWY_TARGET >= HWY_SSSE3
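  // Counterpart of the Min identity:
  // a + _mm_subs_epu16(b, a) == a + max(b - a, 0) == max(a, b).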
   5969  return Vec128<uint16_t, N>{
   5970      _mm_add_epi16(a.raw, _mm_subs_epu16(b.raw, a.raw))};
   5971 #else
   5972  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
   5973 #endif
   5974 }
   5975 template <size_t N>
   5976 HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
   5977 #if HWY_TARGET >= HWY_SSSE3
   5978  return detail::MaxU(a, b);
   5979 #else
   5980  return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
   5981 #endif
   5982 }
   5983 template <size_t N>
   5984 HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
   5985 #if HWY_TARGET <= HWY_AVX3
   5986  return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
   5987 #else
   5988  return detail::MaxU(a, b);
   5989 #endif
   5990 }
   5991 
   5992 // Signed
   5993 template <size_t N>
   5994 HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
   5995 #if HWY_TARGET >= HWY_SSSE3
   5996  return IfThenElse(a < b, b, a);
   5997 #else
   5998  return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
   5999 #endif
   6000 }
   6001 template <size_t N>
   6002 HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
   6003  return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
   6004 }
   6005 template <size_t N>
   6006 HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
   6007 #if HWY_TARGET >= HWY_SSSE3
   6008  return IfThenElse(a < b, b, a);
   6009 #else
   6010  return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
   6011 #endif
   6012 }
   6013 template <size_t N>
   6014 HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
   6015 #if HWY_TARGET <= HWY_AVX3
   6016  return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
   6017 #else
   6018  return IfThenElse(a < b, b, a);
   6019 #endif
   6020 }
   6021 
   6022 // Float
   6023 #if HWY_HAVE_FLOAT16
   6024 template <size_t N>
   6025 HWY_API Vec128<float16_t, N> Max(Vec128<float16_t, N> a,
   6026                                 Vec128<float16_t, N> b) {
   6027  return Vec128<float16_t, N>{_mm_max_ph(a.raw, b.raw)};
   6028 }
   6029 #endif  // HWY_HAVE_FLOAT16
   6030 template <size_t N>
   6031 HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
   6032  return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
   6033 }
   6034 template <size_t N>
   6035 HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
   6036  return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
   6037 }
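// NaN note: like the underlying _mm_min_* / _mm_max_*, these return the
// second operand whenever either input is NaN (the compare is false), so
// Min/Max here are not commutative with respect to NaN inputs.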
   6038 
   6039 // ------------------------------ MinNumber and MaxNumber
   6040 
   6041 #ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
   6042 #undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
   6043 #else
   6044 #define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
   6045 #endif
   6046 
   6047 #if HWY_X86_HAVE_AVX10_2_OPS
   6048 
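// The immediate for _mm_minmax_* selects the operation; judging by the usage
// below, 0x14/0x15 are min/max "number" (prefer the non-NaN operand) and
// 0x16/0x17 are min/max by magnitude.
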
   6049 #if HWY_HAVE_FLOAT16
   6050 template <size_t N>
   6051 HWY_API Vec128<float16_t, N> MinNumber(Vec128<float16_t, N> a,
   6052                                       Vec128<float16_t, N> b) {
   6053  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x14)};
   6054 }
   6055 #endif
   6056 template <size_t N>
   6057 HWY_API Vec128<float, N> MinNumber(Vec128<float, N> a, Vec128<float, N> b) {
   6058  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x14)};
   6059 }
   6060 template <size_t N>
   6061 HWY_API Vec128<double, N> MinNumber(Vec128<double, N> a, Vec128<double, N> b) {
   6062  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x14)};
   6063 }
   6064 
   6065 #if HWY_HAVE_FLOAT16
   6066 template <size_t N>
   6067 HWY_API Vec128<float16_t, N> MaxNumber(Vec128<float16_t, N> a,
   6068                                       Vec128<float16_t, N> b) {
   6069  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x15)};
   6070 }
   6071 #endif
   6072 template <size_t N>
   6073 HWY_API Vec128<float, N> MaxNumber(Vec128<float, N> a, Vec128<float, N> b) {
   6074  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x15)};
   6075 }
   6076 template <size_t N>
   6077 HWY_API Vec128<double, N> MaxNumber(Vec128<double, N> a, Vec128<double, N> b) {
   6078  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x15)};
   6079 }
   6080 
   6081 #else
   6082 
   6083 // MinNumber/MaxNumber are generic for all vector lengths on targets other
   6084 // than AVX10.2
   6085 template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
   6086 HWY_API V MinNumber(V a, V b) {
   6087  return Min(a, IfThenElse(IsNaN(b), a, b));
   6088 }
   6089 
   6090 template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
   6091 HWY_API V MaxNumber(V a, V b) {
   6092  return Max(a, IfThenElse(IsNaN(b), a, b));
   6093 }
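// Example of the "number" semantics (sketch): if some lane of b is NaN,
// IfThenElse substitutes a there first, so MinNumber(Set(d, 3.0f), nan)
// yields 3.0f rather than NaN in that lane.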
   6094 
   6095 #endif
   6096 
   6097 // ------------------------------ MinMagnitude and MaxMagnitude
   6098 
   6099 #if HWY_X86_HAVE_AVX10_2_OPS
   6100 
   6101 #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
   6102 #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
   6103 #else
   6104 #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
   6105 #endif
   6106 
   6107 #if HWY_HAVE_FLOAT16
   6108 template <size_t N>
   6109 HWY_API Vec128<float16_t, N> MinMagnitude(Vec128<float16_t, N> a,
   6110                                          Vec128<float16_t, N> b) {
   6111  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x16)};
   6112 }
   6113 #endif
   6114 template <size_t N>
   6115 HWY_API Vec128<float, N> MinMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
   6116  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x16)};
   6117 }
   6118 template <size_t N>
   6119 HWY_API Vec128<double, N> MinMagnitude(Vec128<double, N> a,
   6120                                       Vec128<double, N> b) {
   6121  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x16)};
   6122 }
   6123 
   6124 #if HWY_HAVE_FLOAT16
   6125 template <size_t N>
   6126 HWY_API Vec128<float16_t, N> MaxMagnitude(Vec128<float16_t, N> a,
   6127                                          Vec128<float16_t, N> b) {
   6128  return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x17)};
   6129 }
   6130 #endif
   6131 template <size_t N>
   6132 HWY_API Vec128<float, N> MaxMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
   6133  return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x17)};
   6134 }
   6135 template <size_t N>
   6136 HWY_API Vec128<double, N> MaxMagnitude(Vec128<double, N> a,
   6137                                       Vec128<double, N> b) {
   6138  return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x17)};
   6139 }
   6140 
   6141 #endif
   6142 
   6143 // ================================================== MEMORY (3)
   6144 
   6145 // ------------------------------ Non-temporal stores
   6146 
   6147 // On clang6, we see incorrect code generated for _mm_stream_pi, so
   6148 // round even partial vectors up to 16 bytes.
   6149 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
   6150 HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
   6151  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   6152  _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), BitCast(du, v).raw);
   6153 }
   6154 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   6155 HWY_API void Stream(VFromD<D> v, D /* tag */, float* HWY_RESTRICT aligned) {
   6156  _mm_stream_ps(aligned, v.raw);
   6157 }
   6158 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   6159 HWY_API void Stream(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) {
   6160  _mm_stream_pd(aligned, v.raw);
   6161 }
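// Stream is a non-temporal (cache-bypassing) store to a vector-aligned
// destination. Typical use, sketched with placeholder names; the fence is an
// assumption about the caller publishing data to other threads, not a
// requirement of Stream itself:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;
//   alignas(16) static float out[4];
//   hn::Stream(v, d, out);
//   _mm_sfence();  // order the NT store before making `out` visible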
   6162 
   6163 // ------------------------------ Scatter
   6164 
   6165 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
   6166 HWY_DIAGNOSTICS(push)
   6167 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   6168 
   6169 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
   6170 using GatherIndex64 = long long int;  // NOLINT(runtime/int)
   6171 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
   6172 
   6173 #if HWY_TARGET <= HWY_AVX3
   6174 
   6175 #ifdef HWY_NATIVE_SCATTER
   6176 #undef HWY_NATIVE_SCATTER
   6177 #else
   6178 #define HWY_NATIVE_SCATTER
   6179 #endif
   6180 
   6181 namespace detail {
   6182 
   6183 template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
   6184 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
   6185                                 VI index) {
   6186  if (d.MaxBytes() == 16) {
   6187    _mm_i32scatter_epi32(base, index.raw, v.raw, kScale);
   6188  } else {
   6189    const __mmask8 mask = (1u << MaxLanes(d)) - 1;
   6190    _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale);
   6191  }
   6192 }
   6193 
   6194 template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
   6195 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
   6196                                 VI index) {
   6197  if (d.MaxBytes() == 16) {
   6198    _mm_i64scatter_epi64(base, index.raw, v.raw, kScale);
   6199  } else {
   6200    const __mmask8 mask = (1u << MaxLanes(d)) - 1;
   6201    _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale);
   6202  }
   6203 }
   6204 
   6205 template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
   6206 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, float* HWY_RESTRICT base,
   6207                                 VI index) {
   6208  if (d.MaxBytes() == 16) {
   6209    _mm_i32scatter_ps(base, index.raw, v.raw, kScale);
   6210  } else {
   6211    const __mmask8 mask = (1u << MaxLanes(d)) - 1;
   6212    _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale);
   6213  }
   6214 }
   6215 
   6216 template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
   6217 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, double* HWY_RESTRICT base,
   6218                                 VI index) {
   6219  if (d.MaxBytes() == 16) {
   6220    _mm_i64scatter_pd(base, index.raw, v.raw, kScale);
   6221  } else {
   6222    const __mmask8 mask = (1u << MaxLanes(d)) - 1;
   6223    _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale);
   6224  }
   6225 }
   6226 
   6227 template <int kScale, class D, class VI, HWY_IF_UI32_D(D)>
   6228 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
   6229                                       TFromD<D>* HWY_RESTRICT base, VI index) {
   6230  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
   6231  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
   6232  _mm_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, kScale);
   6233 }
   6234 
   6235 template <int kScale, class D, class VI, HWY_IF_UI64_D(D)>
   6236 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
   6237                                       TFromD<D>* HWY_RESTRICT base, VI index) {
   6238  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
   6239  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
   6240  _mm_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, kScale);
   6241 }
   6242 
   6243 template <int kScale, class D, class VI, HWY_IF_F32_D(D)>
   6244 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
   6245                                       float* HWY_RESTRICT base, VI index) {
   6246  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
   6247  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
   6248  _mm_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, kScale);
   6249 }
   6250 
   6251 template <int kScale, class D, class VI, HWY_IF_F64_D(D)>
   6252 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d,
   6253                                       double* HWY_RESTRICT base, VI index) {
   6254  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
   6255  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
   6256  _mm_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, kScale);
   6257 }
   6258 
   6259 }  // namespace detail
   6260 
   6261 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   6262 HWY_API void ScatterOffset(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
   6263                           VFromD<RebindToSigned<D>> offset) {
   6264  return detail::NativeScatter128<1>(v, d, base, offset);
   6265 }
   6266 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   6267 HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
   6268                          VFromD<RebindToSigned<D>> index) {
   6269  return detail::NativeScatter128<sizeof(TFromD<D>)>(v, d, base, index);
   6270 }
   6271 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   6272 HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
   6273                                TFromD<D>* HWY_RESTRICT base,
   6274                                VFromD<RebindToSigned<D>> index) {
   6275  return detail::NativeMaskedScatter128<sizeof(TFromD<D>)>(v, m, d, base,
   6276                                                           index);
   6277 }
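// ScatterIndex stores lane i to base[index[i]] (ScatterOffset instead takes
// byte offsets). Sketch with placeholder names:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<int32_t> d;
//   const hn::RebindToSigned<decltype(d)> di;
//   hn::ScatterIndex(v, d, base, hn::Load(di, idx));  // base[idx[i]] = v[i]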
   6278 
   6279 #endif  // HWY_TARGET <= HWY_AVX3
   6280 
   6281 // ------------------------------ Gather (Load/Store)
   6282 
   6283 #if HWY_TARGET <= HWY_AVX2
   6284 
   6285 #ifdef HWY_NATIVE_GATHER
   6286 #undef HWY_NATIVE_GATHER
   6287 #else
   6288 #define HWY_NATIVE_GATHER
   6289 #endif
   6290 
   6291 namespace detail {
   6292 
   6293 template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
   6294 HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
   6295                                        Vec128<int32_t, N> indices) {
   6296  return Vec128<T, N>{_mm_i32gather_epi32(
   6297      reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
   6298 }
   6299 
   6300 template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
   6301 HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
   6302                                        Vec128<int64_t, N> indices) {
   6303  return Vec128<T, N>{_mm_i64gather_epi64(
   6304      reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
   6305 }
   6306 
   6307 template <int kScale, size_t N>
   6308 HWY_INLINE Vec128<float, N> NativeGather128(const float* HWY_RESTRICT base,
   6309                                            Vec128<int32_t, N> indices) {
   6310  return Vec128<float, N>{_mm_i32gather_ps(base, indices.raw, kScale)};
   6311 }
   6312 
   6313 template <int kScale, size_t N>
   6314 HWY_INLINE Vec128<double, N> NativeGather128(const double* HWY_RESTRICT base,
   6315                                             Vec128<int64_t, N> indices) {
   6316  return Vec128<double, N>{_mm_i64gather_pd(base, indices.raw, kScale)};
   6317 }
   6318 
   6319 template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
   6320 HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
   6321                                                Mask128<T, N> m,
   6322                                                const T* HWY_RESTRICT base,
   6323                                                Vec128<int32_t, N> indices) {
   6324 #if HWY_TARGET <= HWY_AVX3
   6325  return Vec128<T, N>{_mm_mmask_i32gather_epi32(
   6326      no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
   6327      kScale)};
   6328 #else
   6329  return Vec128<T, N>{
   6330      _mm_mask_i32gather_epi32(no.raw, reinterpret_cast<const int32_t*>(base),
   6331                               indices.raw, m.raw, kScale)};
   6332 #endif
   6333 }
   6334 
   6335 template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
   6336 HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
   6337                                                Mask128<T, N> m,
   6338                                                const T* HWY_RESTRICT base,
   6339                                                Vec128<int64_t, N> indices) {
   6340 #if HWY_TARGET <= HWY_AVX3
   6341  return Vec128<T, N>{_mm_mmask_i64gather_epi64(
   6342      no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
   6343      kScale)};
   6344 #else
   6345  return Vec128<T, N>{_mm_mask_i64gather_epi64(
   6346      no.raw, reinterpret_cast<const GatherIndex64*>(base), indices.raw, m.raw,
   6347      kScale)};
   6348 #endif
   6349 }
   6350 
   6351 template <int kScale, size_t N>
   6352 HWY_INLINE Vec128<float, N> NativeMaskedGatherOr128(
   6353    Vec128<float, N> no, Mask128<float, N> m, const float* HWY_RESTRICT base,
   6354    Vec128<int32_t, N> indices) {
   6355 #if HWY_TARGET <= HWY_AVX3
   6356  return Vec128<float, N>{
   6357      _mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
   6358 #else
   6359  return Vec128<float, N>{
   6360      _mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
   6361 #endif
   6362 }
   6363 
   6364 template <int kScale, size_t N>
   6365 HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128(
   6366    Vec128<double, N> no, Mask128<double, N> m, const double* HWY_RESTRICT base,
   6367    Vec128<int64_t, N> indices) {
   6368 #if HWY_TARGET <= HWY_AVX3
   6369  return Vec128<double, N>{
   6370      _mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
   6371 #else
   6372  return Vec128<double, N>{
   6373      _mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
   6374 #endif
   6375 }
   6376 
   6377 }  // namespace detail
   6378 
   6379 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   6380 HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
   6381                               VFromD<RebindToSigned<D>> offsets) {
   6382  return detail::NativeGather128<1>(base, offsets);
   6383 }
   6384 
   6385 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
   6386 HWY_API VFromD<D> GatherIndex(D /*d*/, const T* HWY_RESTRICT base,
   6387                              VFromD<RebindToSigned<D>> indices) {
   6388  return detail::NativeGather128<sizeof(T)>(base, indices);
   6389 }
   6390 
   6391 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
   6392 HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
   6393                                      const T* HWY_RESTRICT base,
   6394                                      VFromD<RebindToSigned<D>> indices) {
   6395  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
   6396  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
   6397 
   6398  return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
   6399 }
   6400 
   6401 // Generic for all vector lengths.
   6402 template <class D>
   6403 HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
   6404                                    const TFromD<D>* HWY_RESTRICT base,
   6405                                    VFromD<RebindToSigned<D>> indices) {
   6406  return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
   6407 }
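// GatherIndex loads base[index[i]] into lane i; the masked variants return
// `no` (or zero) in inactive lanes and do not fault on them. Sketch with
// placeholder names:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;
//   const hn::RebindToSigned<decltype(d)> di;
//   const auto idxv = hn::Load(di, idx);
//   const auto v = hn::GatherIndex(d, base, idxv);
//   // lanes 0,1 loaded; lanes 2,3 zero:
//   const auto w = hn::MaskedGatherIndex(hn::FirstN(d, 2), d, base, idxv);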
   6408 
   6409 #endif  // HWY_TARGET <= HWY_AVX2
   6410 
   6411 HWY_DIAGNOSTICS(pop)
   6412 
   6413 // ================================================== SWIZZLE (2)
   6414 
   6415 // ------------------------------ LowerHalf
   6416 
   6417 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   6418 HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
   6419  return VFromD<D>{v.raw};
   6420 }
   6421 template <typename T, size_t N>
   6422 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
   6423  return Vec128<T, N / 2>{v.raw};
   6424 }
   6425 
   6426 // ------------------------------ ShiftLeftBytes
   6427 
   6428 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   6429 HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
   6430  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
   6431  const RebindToUnsigned<decltype(d)> du;
   6432  return BitCast(
   6433      d, VFromD<decltype(du)>{_mm_slli_si128(BitCast(du, v).raw, kBytes)});
   6434 }
   6435 
   6436 // Generic for all vector lengths.
   6437 template <int kBytes, class V>
   6438 HWY_API V ShiftLeftBytes(const V v) {
   6439  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
   6440 }
   6441 
   6442 // ------------------------------ ShiftLeftLanes
   6443 
   6444 // Generic for all vector lengths.
   6445 template <int kLanes, class D>
   6446 HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) {
   6447  const Repartition<uint8_t, decltype(d)> d8;
   6448  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
   6449 }
   6450 
   6451 // Generic for all vector lengths.
   6452 template <int kLanes, class V>
   6453 HWY_API V ShiftLeftLanes(const V v) {
   6454  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
   6455 }
   6456 
   6457 // ------------------------------ ShiftRightBytes
   6458 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   6459 HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
   6460  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
   6461  const RebindToUnsigned<decltype(d)> du;
   6462  // For partial vectors, clear upper lanes so we shift in zeros.
   6463  if (d.MaxBytes() != 16) {
   6464    const Full128<TFromD<D>> dfull;
   6465    const VFromD<decltype(dfull)> vfull{v.raw};
   6466    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
   6467  }
   6468  return BitCast(
   6469      d, VFromD<decltype(du)>{_mm_srli_si128(BitCast(du, v).raw, kBytes)});
   6470 }
   6471 
   6472 // ------------------------------ ShiftRightLanes
   6473 // Generic for all vector lengths.
   6474 template <int kLanes, class D>
   6475 HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) {
   6476  const Repartition<uint8_t, decltype(d)> d8;
   6477  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
   6478  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
   6479 }
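// ShiftLeftBytes moves bytes toward higher lane indices (zeroing the low
// bytes); ShiftRightBytes moves them toward lane 0 and shifts in zeros, also
// for partial vectors. Sketch (v is a placeholder):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<uint8_t> d;
//   // v = {1,2,...,16}  ->  r = {0,0,1,2,...,14}
//   const auto r = hn::ShiftLeftBytes<2>(d, v);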
   6480 
   6481 // ------------------------------ UpperHalf (ShiftRightBytes)
   6482 
   6483 // Full input: copy hi into lo (smaller instruction encoding than shifts).
   6484 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)>
   6485 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   6486  const Twice<RebindToUnsigned<decltype(d)>> dut;
   6487  using VUT = VFromD<decltype(dut)>;  // for float16_t
   6488  const VUT vut = BitCast(dut, v);
   6489  return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)}));
   6490 }
   6491 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
   6492 HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
   6493  return Vec64<float>{_mm_movehl_ps(v.raw, v.raw)};
   6494 }
   6495 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
   6496 HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
   6497  return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
   6498 }
   6499 
   6500 // Partial
   6501 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
   6502 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   6503  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
   6504 }
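// Splitting a vector into its halves (sketch; `v` is a placeholder):
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::Full128<float> d;
//   const hn::Half<decltype(d)> dh;
//   const auto lo = hn::LowerHalf(dh, v);  // lanes 0,1
//   const auto hi = hn::UpperHalf(dh, v);  // lanes 2,3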
   6505 
   6506 // ------------------------------ ExtractLane (UpperHalf)
   6507 
   6508 namespace detail {
   6509 
   6510 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   6511 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   6512  static_assert(kLane < N, "Lane index out of bounds");
   6513 #if HWY_TARGET >= HWY_SSSE3
   6514  const int pair = _mm_extract_epi16(v.raw, kLane / 2);
   6515  constexpr int kShift = kLane & 1 ? 8 : 0;
   6516  return static_cast<T>((pair >> kShift) & 0xFF);
   6517 #else
   6518  return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
   6519 #endif
   6520 }
   6521 
   6522 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   6523 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   6524  static_assert(kLane < N, "Lane index out of bounds");
   6525  const DFromV<decltype(v)> d;
   6526  const RebindToUnsigned<decltype(d)> du;
   6527  const uint16_t lane = static_cast<uint16_t>(
   6528      _mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF);
   6529  return BitCastScalar<T>(lane);
   6530 }
   6531 
   6532 template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
   6533 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   6534  static_assert(kLane < N, "Lane index out of bounds");
   6535 #if HWY_TARGET >= HWY_SSSE3
   6536  return static_cast<T>(_mm_cvtsi128_si32(
   6537      (kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane)));
   6538 #else
   6539  return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
   6540 #endif
   6541 }
   6542 
   6543 template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)>
   6544 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   6545  static_assert(kLane < N, "Lane index out of bounds");
   6546 #if HWY_ARCH_X86_32
   6547  alignas(16) T lanes[2];
   6548  Store(v, DFromV<decltype(v)>(), lanes);
   6549  return lanes[kLane];
   6550 #elif HWY_TARGET >= HWY_SSSE3
   6551  return static_cast<T>(
   6552      _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE)));
   6553 #else
   6554  return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
   6555 #endif
   6556 }
   6557 
   6558 template <size_t kLane, size_t N>
   6559 HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
   6560  static_assert(kLane < N, "Lane index out of bounds");
   6561 #if HWY_TARGET >= HWY_SSSE3
   6562  return _mm_cvtss_f32((kLane == 0) ? v.raw
   6563                                    : _mm_shuffle_ps(v.raw, v.raw, kLane));
   6564 #else
   6565  // Bug in the intrinsic, returns int but should be float.
   6566  const int32_t bits = _mm_extract_ps(v.raw, kLane);
   6567  return BitCastScalar<float>(bits);
   6568 #endif
   6569 }
   6570 
   6571 // There is no extract_pd; two overloads because there is no UpperHalf for N=1.
   6572 template <size_t kLane>
   6573 HWY_INLINE double ExtractLane(const Vec64<double> v) {
   6574  static_assert(kLane == 0, "Lane index out of bounds");
   6575  return GetLane(v);
   6576 }
   6577 
   6578 template <size_t kLane>
   6579 HWY_INLINE double ExtractLane(const Vec128<double> v) {
   6580  static_assert(kLane < 2, "Lane index out of bounds");
   6581  const Half<DFromV<decltype(v)>> dh;
   6582  return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v));
   6583 }
   6584 
   6585 }  // namespace detail
   6586 
   6587 // Requires one overload per vector length because ExtractLane<3> may be a
   6588 // compile error if it calls _mm_extract_epi64.
   6589 template <typename T>
   6590 HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
   6591  HWY_DASSERT(i == 0);
   6592  (void)i;
   6593  return GetLane(v);
   6594 }
   6595 
   6596 template <typename T>
   6597 HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
   6598 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6599  if (__builtin_constant_p(i)) {
   6600    switch (i) {
   6601      case 0:
   6602        return detail::ExtractLane<0>(v);
   6603      case 1:
   6604        return detail::ExtractLane<1>(v);
   6605    }
   6606  }
   6607 #endif
   6608  alignas(16) T lanes[2];
   6609  Store(v, DFromV<decltype(v)>(), lanes);
   6610  return lanes[i];
   6611 }
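         // Usage sketch (illustrative, not from the original file): when i is a
         // compile-time constant, the switch above lets GCC/Clang emit a single
         // extract instruction; otherwise the lane is read back via the stack.
         //   const Full128<uint64_t> d;                      // 2 lanes
         //   const uint64_t x = ExtractLane(Iota(d, 1), 1);  // x == 2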
   6612 
   6613 template <typename T>
   6614 HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
   6615 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6616  if (__builtin_constant_p(i)) {
   6617    switch (i) {
   6618      case 0:
   6619        return detail::ExtractLane<0>(v);
   6620      case 1:
   6621        return detail::ExtractLane<1>(v);
   6622      case 2:
   6623        return detail::ExtractLane<2>(v);
   6624      case 3:
   6625        return detail::ExtractLane<3>(v);
   6626    }
   6627  }
   6628 #endif
   6629  alignas(16) T lanes[4];
   6630  Store(v, DFromV<decltype(v)>(), lanes);
   6631  return lanes[i];
   6632 }
   6633 
   6634 template <typename T>
   6635 HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
   6636 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6637  if (__builtin_constant_p(i)) {
   6638    switch (i) {
   6639      case 0:
   6640        return detail::ExtractLane<0>(v);
   6641      case 1:
   6642        return detail::ExtractLane<1>(v);
   6643      case 2:
   6644        return detail::ExtractLane<2>(v);
   6645      case 3:
   6646        return detail::ExtractLane<3>(v);
   6647      case 4:
   6648        return detail::ExtractLane<4>(v);
   6649      case 5:
   6650        return detail::ExtractLane<5>(v);
   6651      case 6:
   6652        return detail::ExtractLane<6>(v);
   6653      case 7:
   6654        return detail::ExtractLane<7>(v);
   6655    }
   6656  }
   6657 #endif
   6658  alignas(16) T lanes[8];
   6659  Store(v, DFromV<decltype(v)>(), lanes);
   6660  return lanes[i];
   6661 }
   6662 
   6663 template <typename T>
   6664 HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
   6665 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6666  if (__builtin_constant_p(i)) {
   6667    switch (i) {
   6668      case 0:
   6669        return detail::ExtractLane<0>(v);
   6670      case 1:
   6671        return detail::ExtractLane<1>(v);
   6672      case 2:
   6673        return detail::ExtractLane<2>(v);
   6674      case 3:
   6675        return detail::ExtractLane<3>(v);
   6676      case 4:
   6677        return detail::ExtractLane<4>(v);
   6678      case 5:
   6679        return detail::ExtractLane<5>(v);
   6680      case 6:
   6681        return detail::ExtractLane<6>(v);
   6682      case 7:
   6683        return detail::ExtractLane<7>(v);
   6684      case 8:
   6685        return detail::ExtractLane<8>(v);
   6686      case 9:
   6687        return detail::ExtractLane<9>(v);
   6688      case 10:
   6689        return detail::ExtractLane<10>(v);
   6690      case 11:
   6691        return detail::ExtractLane<11>(v);
   6692      case 12:
   6693        return detail::ExtractLane<12>(v);
   6694      case 13:
   6695        return detail::ExtractLane<13>(v);
   6696      case 14:
   6697        return detail::ExtractLane<14>(v);
   6698      case 15:
   6699        return detail::ExtractLane<15>(v);
   6700    }
   6701  }
   6702 #endif
   6703  alignas(16) T lanes[16];
   6704  Store(v, DFromV<decltype(v)>(), lanes);
   6705  return lanes[i];
   6706 }
   6707 
   6708 // ------------------------------ InsertLane (UpperHalf)
   6709 
   6710 namespace detail {
   6711 
   6712 template <class V>
   6713 HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) {
   6714  const DFromV<decltype(v)> d;
   6715 
   6716 #if HWY_TARGET <= HWY_AVX3
   6717  using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw);
   6718  const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)};
   6719 #else
   6720  const RebindToUnsigned<decltype(d)> du;
   6721  using TU = TFromD<decltype(du)>;
   6722  const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i)));
   6723 #endif
   6724 
   6725  return IfThenElse(mask, Set(d, t), v);
   6726 }
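         // Worked example (illustrative): inserting at i=3 on AVX3 builds the
         // bitmask 0b1000, so IfThenElse writes Set(d, t) only to lane 3. On
         // older targets the same mask comes from a per-lane compare:
         // Iota = {0,1,2,3,...} == Set(3) -> true only in lane 3.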
   6727 
   6728 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   6729 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   6730  static_assert(kLane < N, "Lane index out of bounds");
   6731 #if HWY_TARGET >= HWY_SSSE3
   6732  return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
   6733 #else
   6734  return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
   6735 #endif
   6736 }
   6737 
   6738 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   6739 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   6740  static_assert(kLane < N, "Lane index out of bounds");
   6741  const DFromV<decltype(v)> d;
   6742  const RebindToUnsigned<decltype(d)> du;
   6743  const uint16_t bits = BitCastScalar<uint16_t>(t);
   6744  return BitCast(d, VFromD<decltype(du)>{
   6745                        _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)});
   6746 }
   6747 
   6748 template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
   6749 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   6750  static_assert(kLane < N, "Lane index out of bounds");
   6751 #if HWY_TARGET >= HWY_SSSE3
   6752  return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
   6753 #else
   6754  const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
   6755  return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
   6756 #endif
   6757 }
   6758 
   6759 template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)>
   6760 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   6761  static_assert(kLane < N, "Lane index out of bounds");
   6762 #if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32
   6763  const DFromV<decltype(v)> d;
   6764  const RebindToFloat<decltype(d)> df;
   6765  const auto vt = BitCast(df, Set(d, t));
   6766  if (kLane == 0) {
   6767    return BitCast(
   6768        d, Vec128<double, N>{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)});
   6769  }
   6770  return BitCast(
   6771      d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)});
   6772 #else
   6773  const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
   6774  return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
   6775 #endif
   6776 }
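         // Control-byte note (illustrative): _mm_shuffle_pd takes its low
         // result lane from the first operand and its high lane from the
         // second. Control 2 (0b10) above therefore yields {vt[0], v[1]}
         // (insert at lane 0) and control 0 yields {v[0], vt[0]} (lane 1).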
   6777 
   6778 template <size_t kLane, size_t N>
   6779 HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
   6780  static_assert(kLane < N, "Lane index out of bounds");
   6781 #if HWY_TARGET >= HWY_SSSE3
   6782  return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
   6783 #else
   6784  return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
   6785 #endif
   6786 }
   6787 
   6788 // There is no insert_pd; two overloads because there is no UpperHalf for N=1.
   6789 template <size_t kLane>
   6790 HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) {
   6791  static_assert(kLane == 0, "Lane index out of bounds");
   6792  return Set(DFromV<decltype(v)>(), t);
   6793 }
   6794 
   6795 template <size_t kLane>
   6796 HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
   6797  static_assert(kLane < 2, "Lane index out of bounds");
   6798  const DFromV<decltype(v)> d;
   6799  const Vec128<double> vt = Set(d, t);
   6800  if (kLane == 0) {
   6801    return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)};
   6802  }
   6803  return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)};
   6804 }
   6805 
   6806 }  // namespace detail
   6807 
   6808 // Requires one overload per vector length because InsertLane<3> may be a
   6809 // compile error if it calls _mm_insert_epi64.
   6810 
   6811 template <typename T>
   6812 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
   6813  HWY_DASSERT(i == 0);
   6814  (void)i;
   6815  return Set(DFromV<decltype(v)>(), t);
   6816 }
   6817 
   6818 template <typename T>
   6819 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
   6820 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6821  if (__builtin_constant_p(i)) {
   6822    switch (i) {
   6823      case 0:
   6824        return detail::InsertLane<0>(v, t);
   6825      case 1:
   6826        return detail::InsertLane<1>(v, t);
   6827    }
   6828  }
   6829 #endif
   6830  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   6831 }
   6832 
   6833 template <typename T>
   6834 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
   6835 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6836  if (__builtin_constant_p(i)) {
   6837    switch (i) {
   6838      case 0:
   6839        return detail::InsertLane<0>(v, t);
   6840      case 1:
   6841        return detail::InsertLane<1>(v, t);
   6842      case 2:
   6843        return detail::InsertLane<2>(v, t);
   6844      case 3:
   6845        return detail::InsertLane<3>(v, t);
   6846    }
   6847  }
   6848 #endif
   6849  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   6850 }
   6851 
   6852 template <typename T>
   6853 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
   6854 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6855  if (__builtin_constant_p(i)) {
   6856    switch (i) {
   6857      case 0:
   6858        return detail::InsertLane<0>(v, t);
   6859      case 1:
   6860        return detail::InsertLane<1>(v, t);
   6861      case 2:
   6862        return detail::InsertLane<2>(v, t);
   6863      case 3:
   6864        return detail::InsertLane<3>(v, t);
   6865      case 4:
   6866        return detail::InsertLane<4>(v, t);
   6867      case 5:
   6868        return detail::InsertLane<5>(v, t);
   6869      case 6:
   6870        return detail::InsertLane<6>(v, t);
   6871      case 7:
   6872        return detail::InsertLane<7>(v, t);
   6873    }
   6874  }
   6875 #endif
   6876  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   6877 }
   6878 
   6879 template <typename T>
   6880 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
   6881 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   6882  if (__builtin_constant_p(i)) {
   6883    switch (i) {
   6884      case 0:
   6885        return detail::InsertLane<0>(v, t);
   6886      case 1:
   6887        return detail::InsertLane<1>(v, t);
   6888      case 2:
   6889        return detail::InsertLane<2>(v, t);
   6890      case 3:
   6891        return detail::InsertLane<3>(v, t);
   6892      case 4:
   6893        return detail::InsertLane<4>(v, t);
   6894      case 5:
   6895        return detail::InsertLane<5>(v, t);
   6896      case 6:
   6897        return detail::InsertLane<6>(v, t);
   6898      case 7:
   6899        return detail::InsertLane<7>(v, t);
   6900      case 8:
   6901        return detail::InsertLane<8>(v, t);
   6902      case 9:
   6903        return detail::InsertLane<9>(v, t);
   6904      case 10:
   6905        return detail::InsertLane<10>(v, t);
   6906      case 11:
   6907        return detail::InsertLane<11>(v, t);
   6908      case 12:
   6909        return detail::InsertLane<12>(v, t);
   6910      case 13:
   6911        return detail::InsertLane<13>(v, t);
   6912      case 14:
   6913        return detail::InsertLane<14>(v, t);
   6914      case 15:
   6915        return detail::InsertLane<15>(v, t);
   6916    }
   6917  }
   6918 #endif
   6919  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   6920 }
   6921 
   6922 // ------------------------------ CombineShiftRightBytes
   6923 
   6924 #if HWY_TARGET == HWY_SSE2
   6925 template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
   6926 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
   6927  static_assert(0 < kBytes && kBytes < 16, "kBytes invalid");
   6928  return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
   6929 }
   6930 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   6931 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
   6932  constexpr size_t kSize = d.MaxBytes();
   6933  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
   6934 
   6935  const Twice<decltype(d)> dt;
   6936  return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw};
   6937 }
   6938 #else
   6939 template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
   6940 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
   6941  const Repartition<uint8_t, decltype(d)> d8;
   6942  return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
   6943                        BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
   6944 }
   6945 
   6946 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   6947 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
   6948  constexpr size_t kSize = d.MaxBytes();
   6949  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
   6950  const Repartition<uint8_t, decltype(d)> d8;
   6951  using V8 = Vec128<uint8_t>;
   6952  const DFromV<V8> dfull8;
   6953  const Repartition<TFromD<D>, decltype(dfull8)> dfull;
   6954  const V8 hi8{BitCast(d8, hi).raw};
   6955  // Move into most-significant bytes
   6956  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
   6957  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
   6958  return VFromD<D>{BitCast(dfull, r).raw};
   6959 }
   6960 #endif
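         // Worked example (illustrative, not part of the original source): for
         // full u8 vectors lo = {0..15} and hi = {16..31},
         // CombineShiftRightBytes<4> treats hi:lo as one 32-byte value and
         // shifts right by 4 bytes, returning {4..19}. SSSE3+ maps this to
         // _mm_alignr_epi8; the SSE2 path synthesizes it from two shifts + Or.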
   6961 
   6962 // ------------------------------ Broadcast/splat any lane
   6963 
   6964 template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   6965 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
   6966  const DFromV<decltype(v)> d;
   6967  const RebindToUnsigned<decltype(d)> du;
   6968  using VU = VFromD<decltype(du)>;
   6969  const VU vu = BitCast(du, v);  // for float16_t
   6970  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   6971  if (kLane < 4) {
   6972    const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF);
   6973    return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)});
   6974  } else {
   6975    const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF);
   6976    return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)});
   6977  }
   6978 }
   6979 
   6980 template <int kLane, typename T, size_t N, HWY_IF_UI32(T)>
   6981 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
   6982  static_assert(0 <= kLane && kLane < N, "Invalid lane");
    6983  HWY_IF_CONSTEXPR(N == 1) {
    6984    return v;  // Workaround for an MSVC bug on single-lane integer broadcast.
    6985  } else {
    6986    return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
    6987  }
   6988 }
   6989 
   6990 template <int kLane, typename T, size_t N, HWY_IF_UI64(T)>
   6991 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
   6992  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   6993  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
   6994 }
   6995 
   6996 template <int kLane, size_t N>
   6997 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
   6998  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   6999  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
   7000 }
   7001 
   7002 template <int kLane, size_t N>
   7003 HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
   7004  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   7005  return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
   7006 }
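         // Usage sketch (illustrative): Broadcast<kLane> splats one existing
         // lane, e.g. Broadcast<2>(Iota(Full128<float>(), 0.0f)) == {2,2,2,2}.
         // 0x55 * kLane replicates the 2-bit lane index into all four fields
         // of the shuffle control byte.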
   7007 
   7008 // ------------------------------ TableLookupLanes (Shuffle01)
   7009 
   7010 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
   7011 template <typename T, size_t N = 16 / sizeof(T)>
   7012 struct Indices128 {
   7013  __m128i raw;
   7014 };
   7015 
   7016 template <class D, typename T = TFromD<D>, typename TI, size_t kN,
   7017          HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)>
   7018 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
   7019  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
   7020 #if HWY_IS_DEBUG_BUILD
   7021  const Rebind<TI, decltype(d)> di;
   7022  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
   7023              AllTrue(di, Lt(vec, Set(di, kN * 2))));
   7024 #endif
   7025 
   7026  // No change as byte indices are always used for 8-bit lane types
   7027  (void)d;
   7028  return Indices128<T, kN>{vec.raw};
   7029 }
   7030 
   7031 template <class D, typename T = TFromD<D>, typename TI, size_t kN,
   7032          HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)>
   7033 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
   7034  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
   7035 #if HWY_IS_DEBUG_BUILD
   7036  const Rebind<TI, decltype(d)> di;
   7037  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
   7038              AllTrue(di, Lt(vec, Set(di, kN * 2))));
   7039 #endif
   7040 
   7041 #if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2
   7042  (void)d;
   7043  return Indices128<T, kN>{vec.raw};
   7044 #else   // SSSE3, SSE4, or AVX2
   7045  const Repartition<uint8_t, decltype(d)> d8;
   7046  using V8 = VFromD<decltype(d8)>;
   7047  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
   7048      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
   7049 
    7050  // Broadcast each lane index to both bytes of T
   7051  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
   7052      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
   7053  const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
   7054 
   7055  // Shift to bytes
   7056  const Repartition<uint16_t, decltype(d)> d16;
   7057  const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices)));
   7058 
   7059  return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
   7060 #endif  // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2
   7061 }
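         // Worked example (illustrative): for u16 lanes with vec = {3,0,...},
         // the table lookup above broadcasts each index byte pair-wise to
         // {3,3,0,0,...}; ShiftLeft<1> doubles these to byte offsets
         // {6,6,0,0,...}; adding kByteOffsets {0,1,0,1,...} gives
         // {6,7,0,1,...}, i.e. the two bytes of lane 3, then those of lane 0.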
   7062 
   7063 template <class D, typename T = TFromD<D>, typename TI, size_t kN,
   7064          HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)>
   7065 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
   7066  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
   7067 #if HWY_IS_DEBUG_BUILD
   7068  const Rebind<TI, decltype(d)> di;
   7069  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
   7070              AllTrue(di, Lt(vec, Set(di, kN * 2))));
   7071 #endif
   7072 
   7073 #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
   7074  (void)d;
   7075  return Indices128<T, kN>{vec.raw};
   7076 #else
   7077  const Repartition<uint8_t, decltype(d)> d8;
   7078  using V8 = VFromD<decltype(d8)>;
   7079  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
   7080      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
   7081 
   7082  // Broadcast each lane index to all 4 bytes of T
   7083  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
   7084      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
   7085  const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
   7086 
   7087  // Shift to bytes
   7088  const Repartition<uint16_t, decltype(d)> d16;
   7089  const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
   7090 
   7091  return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
   7092 #endif
   7093 }
   7094 
   7095 template <class D, typename T = TFromD<D>, typename TI, size_t kN,
   7096          HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)>
   7097 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) {
   7098  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
   7099 #if HWY_IS_DEBUG_BUILD
   7100  const Rebind<TI, decltype(d)> di;
   7101  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
   7102              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2)))));
   7103 #else
   7104  (void)d;
   7105 #endif
   7106 
   7107  // No change - even without AVX3, we can shuffle+blend.
   7108  return Indices128<T, kN>{vec.raw};
   7109 }
   7110 
   7111 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
   7112 HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
   7113    D d, const TI* idx) {
   7114  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
   7115  const Rebind<TI, decltype(d)> di;
   7116  return IndicesFromVec(d, LoadU(di, idx));
   7117 }
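         // Usage sketch (illustrative, not from the original file): a runtime
         // permutation of four i32 lanes:
         //   const Full128<int32_t> d;
         //   const int32_t order[4] = {3, 2, 1, 0};
         //   const auto idx = SetTableIndices(d, order);
         //   const auto rev = TableLookupLanes(Iota(d, 0), idx);  // {3,2,1,0}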
   7118 
   7119 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   7120 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
   7121  return TableLookupBytes(v, Vec128<T, N>{idx.raw});
   7122 }
   7123 
   7124 template <typename T, size_t N, HWY_IF_UI16(T)>
   7125 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
   7126 #if HWY_TARGET <= HWY_AVX3
   7127  return {_mm_permutexvar_epi16(idx.raw, v.raw)};
   7128 #elif HWY_TARGET == HWY_SSE2
   7129 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
   7130  typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16)));
   7131  return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
   7132      __builtin_shuffle(reinterpret_cast<GccU16RawVectType>(v.raw),
   7133                        reinterpret_cast<GccU16RawVectType>(idx.raw)))};
   7134 #else
   7135  const Full128<T> d_full;
   7136  alignas(16) T src_lanes[8];
   7137  alignas(16) uint16_t indices[8];
   7138  alignas(16) T result_lanes[8];
   7139 
   7140  Store(Vec128<T>{v.raw}, d_full, src_lanes);
   7141  _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw);
   7142 
   7143  for (int i = 0; i < 8; i++) {
   7144    result_lanes[i] = src_lanes[indices[i] & 7u];
   7145  }
   7146 
   7147  return Vec128<T, N>{Load(d_full, result_lanes).raw};
   7148 #endif  // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
   7149 #else
   7150  return TableLookupBytes(v, Vec128<T, N>{idx.raw});
   7151 #endif
   7152 }
   7153 
   7154 #if HWY_HAVE_FLOAT16
   7155 template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 2)>
   7156 HWY_API Vec128<float16_t, N> TableLookupLanes(Vec128<float16_t, N> v,
   7157                                              Indices128<float16_t, N> idx) {
   7158  return {_mm_permutexvar_ph(idx.raw, v.raw)};
   7159 }
   7160 #endif  // HWY_HAVE_FLOAT16
   7161 
   7162 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
   7163 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
   7164  const DFromV<decltype(v)> d;
   7165  const Full128<T> d_full;
   7166  const Vec128<T> v_full = ZeroExtendResizeBitCast(d_full, d, v);
   7167 
   7168  const RebindToSigned<decltype(d)> di;
   7169  const Full128<MakeSigned<T>> di_full;
   7170  const VFromD<decltype(di_full)> vidx =
   7171      ZeroExtendResizeBitCast(di_full, di, VFromD<decltype(di)>{idx.raw});
   7172 
   7173 #if HWY_TARGET <= HWY_AVX2
   7174  // There is no permutevar for non-float; _mm256_permutevar8x32_epi32 is for
   7175  // 256-bit vectors, hence cast to float.
   7176  const Full128<float> df_full;
   7177  // Workaround for MSAN false positive.
   7178  HWY_IF_CONSTEXPR(HWY_IS_MSAN) PreventElision(GetLane(vidx));
   7179  const Vec128<float> perm{
   7180      _mm_permutevar_ps(BitCast(df_full, v_full).raw, vidx.raw)};
   7181  return ResizeBitCast(d, perm);
   7182 #elif HWY_TARGET == HWY_SSE2
   7183 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
   7184  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
   7185  return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>(
   7186      __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v_full.raw),
   7187                        reinterpret_cast<GccU32RawVectType>(vidx.raw)))};
   7188 #else
   7189  alignas(16) T src_lanes[4];
   7190  alignas(16) int32_t indices[4];
   7191  alignas(16) T result_lanes[4];
   7192 
   7193  Store(v_full, d_full, src_lanes);
   7194  Store(vidx, di_full, indices);
   7195 
   7196  for (size_t i = 0; i < N; i++) {
   7197    result_lanes[i] = src_lanes[static_cast<size_t>(indices[i] & 3)];
   7198  }
   7199  return Load(d, result_lanes);
   7200 #endif  // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
   7201 #else   // SSSE3 or SSE4
   7202  return ResizeBitCast(d, TableLookupBytes(BitCast(di_full, v_full), vidx));
   7203 #endif
   7204 }
   7205 
   7206 // Single lane: no change
   7207 template <typename T>
   7208 HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
   7209                                      Indices128<T, 1> /* idx */) {
   7210  return v;
   7211 }
   7212 
   7213 template <typename T, HWY_IF_T_SIZE(T, 8)>
   7214 HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
   7215  const DFromV<decltype(v)> d;
   7216  // No need for ZeroExtendResizeBitCast, we have full vectors.
   7217  Vec128<int64_t> vidx{idx.raw};
   7218 
   7219  // Disable in MSAN builds due to false positive. Note that this affects
   7220  // CompressNot, which assumes upper index bits will be ignored.
   7221 #if HWY_TARGET <= HWY_AVX2 && !HWY_IS_MSAN
   7222  // There is no _mm_permute[x]var_epi64.
   7223  vidx += vidx;  // bit1 is the decider (unusual)
   7224  const RebindToFloat<decltype(d)> df;
   7225  return BitCast(
   7226      d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
   7227 #else
   7228  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
   7229  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
   7230  // to obtain an all-zero or all-one mask.
   7231  const RebindToSigned<decltype(d)> di;
   7232  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
   7233  return BitCast(
   7234      d, IfVecThenElse(same, BitCast(di, v), Shuffle01(BitCast(di, v))));
   7235 #endif
   7236 }
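         // Note (illustrative): _mm_permutevar_pd selects based on bit 1 of
         // each 64-bit index lane rather than bit 0, hence the vidx += vidx
         // doubling above: index 1 becomes 2 (bit 1 set), which picks the
         // upper source lane as the intrinsic expects.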
   7237 
   7238 // ------------------------------ ReverseBlocks
   7239 
   7240 // Single block: no change
   7241 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   7242 HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
   7243  return v;
   7244 }
   7245 
   7246 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
   7247 
   7248 // Single lane: no change
   7249 template <class D, HWY_IF_LANES_D(D, 1)>
   7250 HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
   7251  return v;
   7252 }
   7253 
   7254 // 32-bit x2: shuffle
   7255 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
   7256 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
   7257  return VFromD<D>{Shuffle2301(Vec128<TFromD<D>>{v.raw}).raw};
   7258 }
   7259 
   7260 // 64-bit x2: shuffle
   7261 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   7262 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
   7263  return Shuffle01(v);
   7264 }
   7265 
   7266 // 32-bit x4: shuffle
   7267 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   7268 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
   7269  return Shuffle0123(v);
   7270 }
   7271 
   7272 // 16-bit
   7273 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2),
   7274          HWY_IF_LANES_GT_D(D, 1)>
   7275 HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
   7276  const RebindToUnsigned<decltype(d)> du;
   7277  using VU = VFromD<decltype(du)>;
   7278  const VU vu = BitCast(du, v);  // for float16_t
   7279  constexpr size_t kN = MaxLanes(d);
   7280  if (kN == 1) return v;
   7281  if (kN == 2) {
   7282    return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))});
   7283  }
   7284  if (kN == 4) {
   7285    return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
   7286  }
   7287 
   7288 #if HWY_TARGET == HWY_SSE2
   7289  const VU rev4{
   7290      _mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
   7291                          _MM_SHUFFLE(0, 1, 2, 3))};
   7292  return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))});
   7293 #else
   7294  const RebindToSigned<decltype(d)> di;
   7295  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
   7296      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
   7297  return BitCast(d, TableLookupBytes(v, shuffle));
   7298 #endif
   7299 }
   7300 
   7301 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1),
   7302          HWY_IF_LANES_GT_D(D, 1)>
   7303 HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
   7304  constexpr int kN = static_cast<int>(MaxLanes(d));
   7305  if (kN == 1) return v;
   7306 #if HWY_TARGET <= HWY_SSSE3
   7307  // NOTE: Lanes with negative shuffle control mask values are set to zero.
   7308  alignas(16) static constexpr int8_t kReverse[16] = {
   7309      kN - 1, kN - 2,  kN - 3,  kN - 4,  kN - 5,  kN - 6,  kN - 7,  kN - 8,
   7310      kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16};
   7311  const RebindToSigned<decltype(d)> di;
   7312  const VFromD<decltype(di)> idx = Load(di, kReverse);
   7313  return VFromD<D>{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)};
   7314 #else
   7315  const RepartitionToWide<decltype(d)> d16;
   7316  return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v))));
   7317 #endif
   7318 }
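         // Usage sketch (illustrative):
         //   const Full128<uint8_t> d;
         //   const auto r = Reverse(d, Iota(d, 0));  // {15,14,...,1,0}
         // On SSSE3+ this is a single pshufb with the kReverse table above.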
   7319 
   7320 // ------------------------------ Reverse2
   7321 
   7322 // Single lane: no change
   7323 template <class D, HWY_IF_LANES_D(D, 1)>
   7324 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
   7325  return v;
   7326 }
   7327 
   7328 // Generic for all vector lengths (128-bit sufficient if SSE2).
   7329 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)>
   7330 HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
   7331 #if HWY_TARGET <= HWY_AVX3
   7332  const Repartition<uint32_t, decltype(d)> du32;
   7333  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
   7334 #elif HWY_TARGET == HWY_SSE2
   7335  const RebindToUnsigned<decltype(d)> du;
   7336  using VU = VFromD<decltype(du)>;
   7337  const VU vu = BitCast(du, v);  // for float16_t
   7338  constexpr size_t kN = MaxLanes(d);
   7339  __m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1));
   7340  if (kN > 4) {
   7341    shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1));
   7342  }
   7343  return BitCast(d, VU{shuf_result});
   7344 #else
   7345  const RebindToSigned<decltype(d)> di;
   7346  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
   7347      di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C);
   7348  return BitCast(d, TableLookupBytes(v, shuffle));
   7349 #endif
   7350 }
   7351 
   7352 // Generic for all vector lengths.
   7353 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)>
   7354 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
   7355  return Shuffle2301(v);
   7356 }
   7357 
   7358 // Generic for all vector lengths.
   7359 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
   7360 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
   7361  return Shuffle01(v);
   7362 }
   7363 
   7364 // ------------------------------ Reverse4
   7365 
   7366 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   7367 HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
   7368  const RebindToUnsigned<decltype(d)> du;
   7369  using VU = VFromD<decltype(du)>;
   7370  const VU vu = BitCast(du, v);  // for float16_t
   7371  // 4x 16-bit: a single shufflelo suffices.
   7372  constexpr size_t kN = MaxLanes(d);
   7373  if (kN <= 4) {
   7374    return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))});
   7375  }
   7376 
   7377 #if HWY_TARGET == HWY_SSE2
   7378  return BitCast(d, VU{_mm_shufflehi_epi16(
   7379                        _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)),
   7380                        _MM_SHUFFLE(0, 1, 2, 3))});
   7381 #else
   7382  const RebindToSigned<decltype(d)> di;
   7383  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
   7384      di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
   7385  return BitCast(d, TableLookupBytes(v, shuffle));
   7386 #endif
   7387 }
   7388 
   7389 // Generic for all vector lengths.
   7390 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   7391 HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
   7392  return Shuffle0123(v);
   7393 }
   7394 
   7395 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   7396 HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
   7397  HWY_ASSERT(0);  // don't have 4 u64 lanes
   7398 }
   7399 
   7400 // ------------------------------ Reverse8
   7401 
   7402 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   7403 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
   7404 #if HWY_TARGET == HWY_SSE2
   7405  const RepartitionToWide<decltype(d)> dw;
   7406  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
   7407 #else
   7408  const RebindToSigned<decltype(d)> di;
   7409  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
   7410      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
   7411  return BitCast(d, TableLookupBytes(v, shuffle));
   7412 #endif
   7413 }
   7414 
   7415 template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
   7416          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
   7417 HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
   7418  HWY_ASSERT(0);  // don't have 8 lanes if larger than 16-bit
   7419 }
   7420 
   7421 // ------------------------------ ReverseBits in x86_512
   7422 
   7423 // ------------------------------ InterleaveUpper (UpperHalf)
   7424 
   7425 // Full
   7426 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   7427 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   7428  return VFromD<D>{_mm_unpackhi_epi8(a.raw, b.raw)};
   7429 }
   7430 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   7431 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   7432  const DFromV<decltype(a)> d;
   7433  const RebindToUnsigned<decltype(d)> du;
   7434  using VU = VFromD<decltype(du)>;  // for float16_t
   7435  return BitCast(
   7436      d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)});
   7437 }
   7438 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
   7439 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   7440  return VFromD<D>{_mm_unpackhi_epi32(a.raw, b.raw)};
   7441 }
   7442 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
   7443 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   7444  return VFromD<D>{_mm_unpackhi_epi64(a.raw, b.raw)};
   7445 }
   7446 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   7447 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   7448  return VFromD<D>{_mm_unpackhi_ps(a.raw, b.raw)};
   7449 }
   7450 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   7451 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   7452  return VFromD<D>{_mm_unpackhi_pd(a.raw, b.raw)};
   7453 }
   7454 
   7455 // Partial
   7456 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   7457 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
   7458  const Half<decltype(d)> d2;
   7459  return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
   7460                         VFromD<D>{UpperHalf(d2, b).raw});
   7461 }
   7462 
   7463 // -------------------------- I8/U8 Broadcast (InterleaveLower, InterleaveUpper)
   7464 
   7465 template <int kLane, class T, size_t N, HWY_IF_T_SIZE(T, 1)>
   7466 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
   7467  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   7468  const DFromV<decltype(v)> d;
   7469 
   7470 #if HWY_TARGET == HWY_SSE2
   7471  const Full128<T> d_full;
   7472  const Vec128<T> v_full{v.raw};
   7473  const auto v_interleaved = (kLane < 8)
   7474                                 ? InterleaveLower(d_full, v_full, v_full)
   7475                                 : InterleaveUpper(d_full, v_full, v_full);
   7476  return ResizeBitCast(
   7477      d, Broadcast<kLane & 7>(BitCast(Full128<uint16_t>(), v_interleaved)));
   7478 #else
   7479  return TableLookupBytes(v, Set(d, static_cast<T>(kLane)));
   7480 #endif
   7481 }
   7482 
   7483 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
   7484 
   7485 // Same as Interleave*, except that the return lanes are double-width integers;
   7486 // this is necessary because the single-lane scalar cannot return two values.
   7487 // Generic for all vector lengths.
   7488 template <class V, class DW = RepartitionToWide<DFromV<V>>>
   7489 HWY_API VFromD<DW> ZipLower(V a, V b) {
   7490  return BitCast(DW(), InterleaveLower(a, b));
   7491 }
   7492 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   7493 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
   7494  return BitCast(dw, InterleaveLower(D(), a, b));
   7495 }
   7496 
   7497 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   7498 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
   7499  return BitCast(dw, InterleaveUpper(D(), a, b));
   7500 }
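         // Usage sketch (illustrative, not part of the original source):
         //   const Full128<uint8_t> d8;
         //   const auto a = Iota(d8, 0);    // {0,1,2,...}
         //   const auto b = Iota(d8, 128);  // {128,129,...}
         //   ZipLower(a, b);  // u16 lanes {0x8000, 0x8101, 0x8202, ...}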
   7501 
   7502 // ================================================== CONVERT (1)
   7503 
   7504 // ------------------------------ PromoteTo unsigned (TableLookupBytesOr0)
   7505 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
   7506 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
   7507 #if HWY_TARGET >= HWY_SSSE3
   7508  const __m128i zero = _mm_setzero_si128();
   7509  return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)};
   7510 #else
   7511  return VFromD<D>{_mm_cvtepu8_epi16(v.raw)};
   7512 #endif
   7513 }
   7514 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
   7515 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
   7516 #if HWY_TARGET >= HWY_SSSE3
   7517  return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
   7518 #else
   7519  return VFromD<D>{_mm_cvtepu16_epi32(v.raw)};
   7520 #endif
   7521 }
   7522 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
   7523 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
   7524 #if HWY_TARGET >= HWY_SSSE3
   7525  return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
   7526 #else
   7527  return VFromD<D>{_mm_cvtepu32_epi64(v.raw)};
   7528 #endif
   7529 }
   7530 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
   7531 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
   7532 #if HWY_TARGET >= HWY_SSSE3
   7533  const __m128i zero = _mm_setzero_si128();
   7534  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
   7535  return VFromD<D>{_mm_unpacklo_epi16(u16, zero)};
   7536 #else
   7537  return VFromD<D>{_mm_cvtepu8_epi32(v.raw)};
   7538 #endif
   7539 }
   7540 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
   7541 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
   7542 #if HWY_TARGET > HWY_SSSE3
   7543  const Rebind<uint32_t, decltype(d)> du32;
   7544  return PromoteTo(d, PromoteTo(du32, v));
   7545 #elif HWY_TARGET == HWY_SSSE3
   7546  alignas(16) static constexpr int8_t kShuffle[16] = {
   7547      0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1};
   7548  const Repartition<int8_t, decltype(d)> di8;
   7549  return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
   7550 #else
   7551  (void)d;
   7552  return VFromD<D>{_mm_cvtepu8_epi64(v.raw)};
   7553 #endif
   7554 }
   7555 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
   7556 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
   7557 #if HWY_TARGET > HWY_SSSE3
   7558  const Rebind<uint32_t, decltype(d)> du32;
   7559  return PromoteTo(d, PromoteTo(du32, v));
   7560 #elif HWY_TARGET == HWY_SSSE3
   7561  alignas(16) static constexpr int8_t kShuffle[16] = {
   7562      0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1};
   7563  const Repartition<int8_t, decltype(d)> di8;
   7564  return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle)));
   7565 #else
   7566  (void)d;
   7567  return VFromD<D>{_mm_cvtepu16_epi64(v.raw)};
   7568 #endif
   7569 }
   7570 
   7571 // Unsigned to signed: same plus cast.
   7572 template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
   7573          HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
   7574          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
   7575 HWY_API VFromD<D> PromoteTo(D di, V v) {
   7576  const RebindToUnsigned<decltype(di)> du;
   7577  return BitCast(di, PromoteTo(du, v));
   7578 }
   7579 
   7580 // ------------------------------ PromoteTo signed (ShiftRight, ZipLower)
   7581 
   7582 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
   7583 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
   7584 #if HWY_TARGET >= HWY_SSSE3
   7585  return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)});
   7586 #else
   7587  return VFromD<D>{_mm_cvtepi8_epi16(v.raw)};
   7588 #endif
   7589 }
   7590 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
   7591 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
   7592 #if HWY_TARGET >= HWY_SSSE3
   7593  return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)});
   7594 #else
   7595  return VFromD<D>{_mm_cvtepi16_epi32(v.raw)};
   7596 #endif
   7597 }
   7598 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   7599 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
   7600 #if HWY_TARGET >= HWY_SSSE3
   7601  return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)});
   7602 #else
   7603  return VFromD<D>{_mm_cvtepi32_epi64(v.raw)};
   7604 #endif
   7605 }
   7606 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
   7607 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
   7608 #if HWY_TARGET >= HWY_SSSE3
   7609  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
   7610  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
   7611  return ShiftRight<24>(VFromD<D>{x4});
   7612 #else
   7613  return VFromD<D>{_mm_cvtepi8_epi32(v.raw)};
   7614 #endif
   7615 }
   7616 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   7617 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
   7618 #if HWY_TARGET >= HWY_SSSE3
   7619  const Repartition<int32_t, decltype(d)> di32;
   7620  const Half<decltype(di32)> dh_i32;
   7621  const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw};
   7622  const VFromD<decltype(di32)> s4{
   7623      _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))};
   7624  return ZipLower(d, x4, s4);
   7625 #else
   7626  (void)d;
   7627  return VFromD<D>{_mm_cvtepi8_epi64(v.raw)};
   7628 #endif
   7629 }
   7630 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   7631 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
   7632 #if HWY_TARGET >= HWY_SSSE3
   7633  const Repartition<int32_t, decltype(d)> di32;
   7634  const Half<decltype(di32)> dh_i32;
   7635  const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw};
   7636  const VFromD<decltype(di32)> s2{
   7637      _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))};
   7638  return ZipLower(d, x2, s2);
   7639 #else
   7640  (void)d;
   7641  return VFromD<D>{_mm_cvtepi16_epi64(v.raw)};
   7642 #endif
   7643 }
   7644 
   7645 // -------------------- PromoteTo float (ShiftLeft, IfNegativeThenElse)
   7646 #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
   7647 
   7648 // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
   7649 #ifdef HWY_NATIVE_F16C
   7650 #undef HWY_NATIVE_F16C
   7651 #else
   7652 #define HWY_NATIVE_F16C
   7653 #endif
   7654 
   7655 // Workaround for origin tracking bug in Clang msan prior to 11.0
   7656 // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
   7657 #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
   7658 #define HWY_INLINE_F16 HWY_NOINLINE
   7659 #else
   7660 #define HWY_INLINE_F16 HWY_INLINE
   7661 #endif
   7662 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   7663 HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
   7664 #if HWY_HAVE_FLOAT16
   7665  const RebindToUnsigned<DFromV<decltype(v)>> du16;
   7666  return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
   7667 #else
   7668  return VFromD<D>{_mm_cvtph_ps(v.raw)};
   7669 #endif
   7670 }
   7671 
   7672 #endif  // HWY_NATIVE_F16C
   7673 
   7674 #if HWY_HAVE_FLOAT16
   7675 
   7676 #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
   7677 #undef HWY_NATIVE_PROMOTE_F16_TO_F64
   7678 #else
   7679 #define HWY_NATIVE_PROMOTE_F16_TO_F64
   7680 #endif
   7681 
   7682 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   7683 HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
   7684  return VFromD<D>{_mm_cvtph_pd(v.raw)};
   7685 }
   7686 
   7687 #endif  // HWY_HAVE_FLOAT16
   7688 
   7689 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   7690 HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
   7691  const Rebind<uint16_t, decltype(df32)> du16;
   7692  const RebindToSigned<decltype(df32)> di32;
   7693  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
   7694 }
   7695 
   7696 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   7697 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
   7698  return VFromD<D>{_mm_cvtps_pd(v.raw)};
   7699 }
   7700 
   7701 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   7702 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
   7703  return VFromD<D>{_mm_cvtepi32_pd(v.raw)};
   7704 }
   7705 
   7706 #if HWY_TARGET <= HWY_AVX3
   7707 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   7708 HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) {
   7709  return VFromD<D>{_mm_cvtepu32_pd(v.raw)};
   7710 }
   7711 #else
   7712 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
   7713 template <class D, HWY_IF_F64_D(D)>
   7714 HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
   7715  const Rebind<int32_t, decltype(df64)> di32;
   7716  const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
   7717  return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
   7718                                                Set(df64, 4294967296.0),
   7719                                                Zero(df64));
   7720 }
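         // Worked example (illustrative): promoting the u32 value 4294967295
         // (0xFFFFFFFF) first converts it as the i32 -1, giving -1.0; since
         // that is negative, 2^32 = 4294967296.0 is added, restoring
         // 4294967295.0. Inputs below 2^31 convert correctly and receive +0.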
   7721 #endif  // HWY_TARGET <= HWY_AVX3
   7722 
   7723 // ------------------------------ Per4LaneBlockShuffle
   7724 namespace detail {
   7725 
   7726 #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   7727 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   7728 #else
   7729 #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   7730 #endif
   7731 
   7732 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   7733 HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
   7734                                                const uint32_t x2,
   7735                                                const uint32_t x1,
   7736                                                const uint32_t x0) {
   7737  return ResizeBitCast(
   7738      d, Vec128<uint32_t>{_mm_set_epi32(
   7739             static_cast<int32_t>(x3), static_cast<int32_t>(x2),
   7740             static_cast<int32_t>(x1), static_cast<int32_t>(x0))});
   7741 }
   7742 
   7743 template <size_t kIdx3210, class V>
   7744 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
   7745                                  hwy::SizeTag<2> /*lane_size_tag*/,
   7746                                  hwy::SizeTag<8> /*vect_size_tag*/, V v) {
   7747  const DFromV<decltype(v)> d;
   7748  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   7749  return BitCast(d,
   7750                 VFromD<decltype(du)>{_mm_shufflelo_epi16(
   7751                     BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
   7752 }
   7753 
   7754 #if HWY_TARGET == HWY_SSE2
   7755 template <size_t kIdx3210, class V>
   7756 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
   7757                                  hwy::SizeTag<2> /*lane_size_tag*/,
   7758                                  hwy::SizeTag<16> /*vect_size_tag*/, V v) {
   7759  const DFromV<decltype(v)> d;
   7760  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   7761  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
   7762  return BitCast(
   7763      d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
   7764             _mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)});
   7765 }
   7766 
   7767 template <size_t kIdx3210, size_t kVectSize, class V,
   7768          hwy::EnableIf<(kVectSize == 4 || kVectSize == 8)>* = nullptr>
   7769 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag,
   7770                                  hwy::SizeTag<1> /*lane_size_tag*/,
   7771                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
   7772                                  V v) {
   7773  const DFromV<decltype(v)> d;
   7774  const RebindToUnsigned<decltype(d)> du;
   7775  const Rebind<uint16_t, decltype(d)> du16;
   7776  const RebindToSigned<decltype(du16)> di16;
   7777 
   7778  const auto vu16 = PromoteTo(du16, BitCast(du, v));
   7779  const auto shuf16_result = Per4LaneBlockShuffle(
   7780      idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<kVectSize * 2>(), vu16);
   7781  return BitCast(d, DemoteTo(du, BitCast(di16, shuf16_result)));
   7782 }
   7783 
   7784 template <size_t kIdx3210, size_t kVectSize, class V>
   7785 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag,
   7786                                  hwy::SizeTag<1> /*lane_size_tag*/,
   7787                                  hwy::SizeTag<16> /*vect_size_tag*/, V v) {
   7788  const DFromV<decltype(v)> d;
   7789  const RebindToUnsigned<decltype(d)> du;
   7790  const Repartition<uint16_t, decltype(d)> du16;
   7791  const RebindToSigned<decltype(du16)> di16;
   7792 
   7793  const auto zero = Zero(d);
   7794  const auto v_lo16 = BitCast(du16, InterleaveLower(d, v, zero));
   7795  const auto v_hi16 = BitCast(du16, InterleaveUpper(d, v, zero));
   7796 
   7797  const auto lo_shuf_result = Per4LaneBlockShuffle(
   7798      idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_lo16);
   7799  const auto hi_shuf_result = Per4LaneBlockShuffle(
   7800      idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_hi16);
   7801 
   7802  return BitCast(d, OrderedDemote2To(du, BitCast(di16, lo_shuf_result),
   7803                                     BitCast(di16, hi_shuf_result)));
   7804 }
   7805 #endif
   7806 
   7807 template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
   7808 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
   7809                                  hwy::SizeTag<4> /*lane_size_tag*/,
   7810                                  hwy::SizeTag<16> /*vect_size_tag*/, V v) {
   7811  return V{_mm_shuffle_epi32(v.raw, static_cast<int>(kIdx3210 & 0xFF))};
   7812 }
   7813 
   7814 template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
   7815 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
   7816                                  hwy::SizeTag<4> /*lane_size_tag*/,
   7817                                  hwy::SizeTag<16> /*vect_size_tag*/, V v) {
   7818  return V{_mm_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))};
   7819 }
   7820 
   7821 }  // namespace detail
   7822 
   7823 // ------------------------------ SlideUpLanes
   7824 
   7825 namespace detail {
   7826 
   7827 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
   7828 HWY_INLINE V SlideUpLanes(V v, size_t amt) {
   7829  const DFromV<decltype(v)> d;
   7830  const Full64<uint64_t> du64;
   7831  const auto vu64 = ResizeBitCast(du64, v);
   7832  return ResizeBitCast(
   7833      d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
   7834 }
   7835 
   7836 #if HWY_TARGET <= HWY_SSSE3
   7837 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   7838 HWY_INLINE V SlideUpLanes(V v, size_t amt) {
   7839  const DFromV<decltype(v)> d;
   7840  const Repartition<uint8_t, decltype(d)> du8;
   7841  const auto idx =
   7842      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
   7843  return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
   7844 }
   7845 #else
   7846 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   7847 HWY_INLINE V SlideUpLanes(V v, size_t amt) {
   7848  const DFromV<decltype(v)> d;
   7849  const Repartition<int32_t, decltype(d)> di32;
   7850  const Repartition<uint64_t, decltype(d)> du64;
   7851  constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
   7852 
   7853  const auto vu64 = BitCast(du64, v);
   7854  const auto v_hi = IfVecThenElse(
   7855      BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
   7856      BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64);
   7857  const auto v_lo = ShiftLeftBytes<8>(du64, v_hi);
   7858 
   7859  const int shl_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
   7860  return BitCast(
   7861      d, Or(ShiftLeftSame(v_hi, shl_amt), ShiftRightSame(v_lo, 64 - shl_amt)));
   7862 }
   7863 #endif
   7864 
   7865 }  // namespace detail
   7866 
   7867 template <class D, HWY_IF_LANES_D(D, 1)>
   7868 HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
   7869  return v;
   7870 }
   7871 
   7872 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
   7873 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   7874 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   7875  if (__builtin_constant_p(amt)) {
   7876    switch (amt) {
   7877      case 0:
   7878        return v;
   7879      case 1:
   7880        return ShiftLeftLanes<1>(d, v);
   7881    }
   7882  }
   7883 #else
   7884  (void)d;
   7885 #endif
   7886 
   7887  return detail::SlideUpLanes(v, amt);
   7888 }
   7889 
   7890 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
   7891 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   7892 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   7893  if (__builtin_constant_p(amt)) {
   7894    switch (amt) {
   7895      case 0:
   7896        return v;
   7897      case 1:
   7898        return ShiftLeftLanes<1>(d, v);
   7899      case 2:
   7900        return ShiftLeftLanes<2>(d, v);
   7901      case 3:
   7902        return ShiftLeftLanes<3>(d, v);
   7903    }
   7904  }
   7905 #else
   7906  (void)d;
   7907 #endif
   7908 
   7909  return detail::SlideUpLanes(v, amt);
   7910 }
   7911 
   7912 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
   7913 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   7914 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   7915  if (__builtin_constant_p(amt)) {
   7916    switch (amt) {
   7917      case 0:
   7918        return v;
   7919      case 1:
   7920        return ShiftLeftLanes<1>(d, v);
   7921      case 2:
   7922        return ShiftLeftLanes<2>(d, v);
   7923      case 3:
   7924        return ShiftLeftLanes<3>(d, v);
   7925      case 4:
   7926        return ShiftLeftLanes<4>(d, v);
   7927      case 5:
   7928        return ShiftLeftLanes<5>(d, v);
   7929      case 6:
   7930        return ShiftLeftLanes<6>(d, v);
   7931      case 7:
   7932        return ShiftLeftLanes<7>(d, v);
   7933    }
   7934  }
   7935 #else
   7936  (void)d;
   7937 #endif
   7938 
   7939  return detail::SlideUpLanes(v, amt);
   7940 }
   7941 
   7942 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
   7943 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   7944 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   7945  if (__builtin_constant_p(amt)) {
   7946    switch (amt) {
   7947      case 0:
   7948        return v;
   7949      case 1:
   7950        return ShiftLeftLanes<1>(d, v);
   7951      case 2:
   7952        return ShiftLeftLanes<2>(d, v);
   7953      case 3:
   7954        return ShiftLeftLanes<3>(d, v);
   7955      case 4:
   7956        return ShiftLeftLanes<4>(d, v);
   7957      case 5:
   7958        return ShiftLeftLanes<5>(d, v);
   7959      case 6:
   7960        return ShiftLeftLanes<6>(d, v);
   7961      case 7:
   7962        return ShiftLeftLanes<7>(d, v);
   7963      case 8:
   7964        return ShiftLeftLanes<8>(d, v);
   7965      case 9:
   7966        return ShiftLeftLanes<9>(d, v);
   7967      case 10:
   7968        return ShiftLeftLanes<10>(d, v);
   7969      case 11:
   7970        return ShiftLeftLanes<11>(d, v);
   7971      case 12:
   7972        return ShiftLeftLanes<12>(d, v);
   7973      case 13:
   7974        return ShiftLeftLanes<13>(d, v);
   7975      case 14:
   7976        return ShiftLeftLanes<14>(d, v);
   7977      case 15:
   7978        return ShiftLeftLanes<15>(d, v);
   7979    }
   7980  }
   7981 #else
   7982  (void)d;
   7983 #endif
   7984 
   7985  return detail::SlideUpLanes(v, amt);
   7986 }
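
        // Illustrative usage (a sketch, not from the original source):
        // SlideUpLanes shifts lanes toward higher indices and zero-fills the
        // vacated low lanes; amt need not be a compile-time constant.
        //   const Full128<uint32_t> d;
        //   const auto v = Iota(d, 1);             // {1, 2, 3, 4}
        //   const auto r = SlideUpLanes(d, v, 1);  // {0, 1, 2, 3}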
   7987 
   7988 // ------------------------------ SlideDownLanes
   7989 
   7990 namespace detail {
   7991 
   7992 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
   7993 HWY_INLINE V SlideDownLanes(V v, size_t amt) {
   7994  const DFromV<decltype(v)> d;
   7995  const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv;
   7996  return BitCast(d,
   7997                 ShiftRightSame(BitCast(dv, v),
   7998                                static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
   7999 }
   8000 
   8001 #if HWY_TARGET <= HWY_SSSE3
   8002 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   8003 HWY_INLINE V SlideDownLanes(V v, size_t amt) {
   8004  const DFromV<decltype(v)> d;
   8005  const Repartition<int8_t, decltype(d)> di8;
   8006  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
   8007  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
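         // Out-of-range indices now have their sign bit set, so
         // TableLookupBytesOr0 (pshufb) zeroes those output bytes.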
   8008  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
   8009 }
   8010 #else
   8011 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   8012 HWY_INLINE V SlideDownLanes(V v, size_t amt) {
   8013  const DFromV<decltype(v)> d;
   8014  const Repartition<int32_t, decltype(d)> di32;
   8015  const Repartition<uint64_t, decltype(d)> du64;
   8016  constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>);
   8017 
   8018  const auto vu64 = BitCast(du64, v);
   8019  const auto v_lo = IfVecThenElse(
   8020      BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))),
   8021      BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64);
   8022  const auto v_hi = ShiftRightBytes<8>(du64, v_lo);
   8023 
   8024  const int shr_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63);
   8025  return BitCast(
   8026      d, Or(ShiftRightSame(v_lo, shr_amt), ShiftLeftSame(v_hi, 64 - shr_amt)));
   8027 }
   8028 #endif
   8029 
   8030 }  // namespace detail
   8031 
   8032 template <class D, HWY_IF_LANES_D(D, 1)>
   8033 HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
   8034  return v;
   8035 }
   8036 
   8037 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
   8038 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   8039 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   8040  if (__builtin_constant_p(amt)) {
   8041    switch (amt) {
   8042      case 0:
   8043        return v;
   8044      case 1:
   8045        return ShiftRightLanes<1>(d, v);
   8046    }
   8047  }
   8048 #else
   8049  (void)d;
   8050 #endif
   8051 
   8052  return detail::SlideDownLanes(v, amt);
   8053 }
   8054 
   8055 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
   8056 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   8057 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   8058  if (__builtin_constant_p(amt)) {
   8059    switch (amt) {
   8060      case 0:
   8061        return v;
   8062      case 1:
   8063        return ShiftRightLanes<1>(d, v);
   8064      case 2:
   8065        return ShiftRightLanes<2>(d, v);
   8066      case 3:
   8067        return ShiftRightLanes<3>(d, v);
   8068    }
   8069  }
   8070 #else
   8071  (void)d;
   8072 #endif
   8073 
   8074  return detail::SlideDownLanes(v, amt);
   8075 }
   8076 
   8077 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
   8078 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   8079 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   8080  if (__builtin_constant_p(amt)) {
   8081    switch (amt) {
   8082      case 0:
   8083        return v;
   8084      case 1:
   8085        return ShiftRightLanes<1>(d, v);
   8086      case 2:
   8087        return ShiftRightLanes<2>(d, v);
   8088      case 3:
   8089        return ShiftRightLanes<3>(d, v);
   8090      case 4:
   8091        return ShiftRightLanes<4>(d, v);
   8092      case 5:
   8093        return ShiftRightLanes<5>(d, v);
   8094      case 6:
   8095        return ShiftRightLanes<6>(d, v);
   8096      case 7:
   8097        return ShiftRightLanes<7>(d, v);
   8098    }
   8099  }
   8100 #else
   8101  (void)d;
   8102 #endif
   8103 
   8104  return detail::SlideDownLanes(v, amt);
   8105 }
   8106 
   8107 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
   8108 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   8109 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   8110  if (__builtin_constant_p(amt)) {
   8111    switch (amt) {
   8112      case 0:
   8113        return v;
   8114      case 1:
   8115        return ShiftRightLanes<1>(d, v);
   8116      case 2:
   8117        return ShiftRightLanes<2>(d, v);
   8118      case 3:
   8119        return ShiftRightLanes<3>(d, v);
   8120      case 4:
   8121        return ShiftRightLanes<4>(d, v);
   8122      case 5:
   8123        return ShiftRightLanes<5>(d, v);
   8124      case 6:
   8125        return ShiftRightLanes<6>(d, v);
   8126      case 7:
   8127        return ShiftRightLanes<7>(d, v);
   8128      case 8:
   8129        return ShiftRightLanes<8>(d, v);
   8130      case 9:
   8131        return ShiftRightLanes<9>(d, v);
   8132      case 10:
   8133        return ShiftRightLanes<10>(d, v);
   8134      case 11:
   8135        return ShiftRightLanes<11>(d, v);
   8136      case 12:
   8137        return ShiftRightLanes<12>(d, v);
   8138      case 13:
   8139        return ShiftRightLanes<13>(d, v);
   8140      case 14:
   8141        return ShiftRightLanes<14>(d, v);
   8142      case 15:
   8143        return ShiftRightLanes<15>(d, v);
   8144    }
   8145  }
   8146 #else
   8147  (void)d;
   8148 #endif
   8149 
   8150  return detail::SlideDownLanes(v, amt);
   8151 }
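
        // Illustrative usage (a sketch, not from the original source):
        // SlideDownLanes shifts lanes toward lower indices and zero-fills the
        // vacated high lanes:
        //   const Full128<uint32_t> d;
        //   const auto v = Iota(d, 1);               // {1, 2, 3, 4}
        //   const auto r = SlideDownLanes(d, v, 1);  // {2, 3, 4, 0}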
   8152 
   8153 // ================================================== MEMORY (4)
   8154 
   8155 // ------------------------------ StoreN (ExtractLane)
   8156 
   8157 #if HWY_TARGET <= HWY_AVX2
   8158 
   8159 #ifdef HWY_NATIVE_STORE_N
   8160 #undef HWY_NATIVE_STORE_N
   8161 #else
   8162 #define HWY_NATIVE_STORE_N
   8163 #endif
   8164 
   8165 template <class D, HWY_IF_T_SIZE_ONE_OF_D(
   8166                       D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) |
   8167                              (1 << 4) | (1 << 8))>
   8168 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
   8169                    size_t max_lanes_to_store) {
   8170  const size_t num_lanes_to_store =
   8171      HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
   8172 
   8173 #if HWY_COMPILER_MSVC
   8174  // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore
   8175  HWY_FENCE;
   8176 #endif
   8177 
   8178  BlendedStore(v, FirstN(d, num_lanes_to_store), d, p);
   8179 
   8180 #if HWY_COMPILER_MSVC
   8181  // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
   8182  HWY_FENCE;
   8183 #endif
   8184 
   8185  detail::MaybeUnpoison(p, num_lanes_to_store);
   8186 }
   8187 
   8188 #if HWY_TARGET > HWY_AVX3
   8189 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
   8190          HWY_IF_LANES_D(D, 1)>
   8191 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
   8192                    size_t max_lanes_to_store) {
   8193  if (max_lanes_to_store > 0) {
   8194    StoreU(v, d, p);
   8195  }
   8196 }
   8197 
   8198 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
   8199          HWY_IF_LANES_D(D, 2)>
   8200 HWY_API void StoreN(VFromD<D> v, D /*d*/, TFromD<D>* HWY_RESTRICT p,
   8201                    size_t max_lanes_to_store) {
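         // Branch-free tail store: if only one lane is stored, lane 1 is first
         // written to p[0] and immediately overwritten by lane 0; if two, the
         // lanes land at p[1] and p[0].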
   8202  if (max_lanes_to_store >= 1) {
   8203    p[static_cast<size_t>(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v);
   8204    p[0] = GetLane(v);
   8205  }
   8206 }
   8207 
   8208 namespace detail {
   8209 
   8210 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   8211 HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
   8212                                        TFromD<D>* HWY_RESTRICT p,
   8213                                        size_t num_lanes_to_store) {
   8214  // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector
   8215  // when (num_lanes_to_store & 3) != 0.
   8216  const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing);
   8217  if ((num_lanes_to_store & 2) != 0) {
   8218    const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128));
   8219    p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128);
   8220    CopyBytes<sizeof(uint16_t)>(&u16_bits,
   8221                                p + (num_lanes_to_store & ~size_t{3}));
   8222  } else {
   8223    p[num_lanes_to_store - 1] = GetLane(v_full128);
   8224  }
   8225 }
   8226 
   8227 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   8228 HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
   8229                                        TFromD<D>* p,
   8230                                        size_t num_lanes_to_store) {
   8231  // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16
   8232  // vector when (num_lanes_to_store & 1) == 1.
   8233  p[num_lanes_to_store - 1] = GetLane(v_trailing);
   8234 }
   8235 
   8236 }  // namespace detail
   8237 
   8238 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
   8239          HWY_IF_LANES_GT_D(D, 2)>
   8240 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* p, size_t max_lanes_to_store) {
   8241  const size_t num_lanes_to_store =
   8242      HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));
   8243 
   8244  const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
   8245      d_full;
   8246  const RebindToUnsigned<decltype(d_full)> du_full;
   8247  const Repartition<int32_t, decltype(d_full)> di32_full;
   8248 
   8249  const auto i32_store_mask = BitCast(
   8250      di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store)));
   8251  const auto vi32 = ResizeBitCast(di32_full, v);
   8252 
   8253 #if HWY_COMPILER_MSVC
   8254  // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore
   8255  HWY_FENCE;
   8256 #endif
   8257 
   8258  BlendedStore(vi32, MaskFromVec(i32_store_mask), di32_full,
   8259               reinterpret_cast<int32_t*>(p));
   8260 
   8261  constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>);
   8262  constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1;
   8263  const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask);
   8264 
   8265  if (trailing_n != 0) {
   8266    const VFromD<D> v_trailing = ResizeBitCast(
   8267        d, SlideDownLanes(di32_full, vi32,
   8268                          num_lanes_to_store / kNumOfLanesPerI32));
   8269    detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store);
   8270  }
   8271 
   8272 #if HWY_COMPILER_MSVC
   8273  // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
   8274  HWY_FENCE;
   8275 #endif
   8276 
   8277  detail::MaybeUnpoison(p, num_lanes_to_store);
   8278 }
   8279 #endif  // HWY_TARGET > HWY_AVX3
   8280 #endif  // HWY_TARGET <= HWY_AVX2
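
        // Illustrative usage (a sketch, not from the original source): StoreN
        // writes the first HWY_MIN(max_lanes_to_store, Lanes(d)) lanes of v to
        // p and leaves memory past them untouched:
        //   const Full128<uint32_t> d;
        //   uint32_t out[4] = {9, 9, 9, 9};
        //   StoreN(Iota(d, 0), d, out, 3);  // out == {0, 1, 2, 9}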
   8281 
   8282 // ================================================== COMBINE
   8283 
   8284 // ------------------------------ Combine (InterleaveLower)
   8285 
   8286 // N = N/2 + N/2 (upper half undefined)
   8287 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
   8288 HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
   8289  const Half<decltype(d)> dh;
   8290  const RebindToUnsigned<decltype(dh)> duh;
   8291  // Treat half-width input as one lane, and expand to two lanes.
   8292  using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
   8293  const VU lo{BitCast(duh, lo_half).raw};
   8294  const VU hi{BitCast(duh, hi_half).raw};
   8295  return BitCast(d, InterleaveLower(lo, hi));
   8296 }
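
        // Illustrative usage (a sketch, not from the original source): Combine
        // concatenates two half-width vectors, lo_half in the lower lanes:
        //   const Full128<uint32_t> d;
        //   const Half<decltype(d)> dh;
        //   // Combine(d, Iota(dh, 2), Iota(dh, 0)) == {0, 1, 2, 3}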
   8297 
   8298 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
   8299 
   8300 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
   8301 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   8302  const RebindToUnsigned<decltype(d)> du;
   8303  const Half<decltype(du)> duh;
   8304  return BitCast(d, VFromD<decltype(du)>{_mm_move_epi64(BitCast(duh, lo).raw)});
   8305 }
   8306 
   8307 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
   8308 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   8309  const Half<D> dh;
   8310  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
   8311 }
   8312 
   8313 #if HWY_HAVE_FLOAT16
   8314 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
   8315 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   8316  const RebindToUnsigned<decltype(d)> du;
   8317  const Half<decltype(du)> duh;
   8318  return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
   8319 }
   8320 #endif
   8321 
   8322 // Generic for all vector lengths.
   8323 template <class D, HWY_X86_IF_EMULATED_D(D)>
   8324 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   8325  const RebindToUnsigned<decltype(d)> du;
   8326  const Half<decltype(du)> duh;
   8327  return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
   8328 }
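
        // Illustrative usage (a sketch, not from the original source):
        // ZeroExtendVector places lo in the lower half and zeros the upper
        // half, whereas Combine takes the upper half from a second vector:
        //   const Full128<uint32_t> d;
        //   // ZeroExtendVector(d, Iota(Half<decltype(d)>(), 1)) == {1, 2, 0, 0}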
   8329 
   8330 // ------------------------------ Concat full (InterleaveLower)
   8331 
   8332 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
   8333 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   8334 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
   8335  const Repartition<uint64_t, decltype(d)> d64;
   8336  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
   8337 }
   8338 
   8339 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
   8340 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   8341 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
   8342  const Repartition<uint64_t, decltype(d)> d64;
   8343  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
   8344 }
   8345 
   8346 // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
   8347 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   8348 HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
   8349  return CombineShiftRightBytes<8>(d, hi, lo);
   8350 }
   8351 
   8352 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
   8353 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
   8354 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
   8355  const Repartition<double, decltype(d)> dd;
   8356 #if HWY_TARGET >= HWY_SSSE3
   8357  return BitCast(
   8358      d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw,
   8359                                       _MM_SHUFFLE2(1, 0))});
   8360 #else
   8361  // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle.
   8362  return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
   8363                                                BitCast(dd, lo).raw, 1)});
   8364 #endif
   8365 }
   8366 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   8367 HWY_API Vec128<float> ConcatUpperLower(D d, Vec128<float> hi,
   8368                                       Vec128<float> lo) {
   8369 #if HWY_TARGET >= HWY_SSSE3
   8370  (void)d;
   8371  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
   8372 #else
   8373  // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
   8374  const RepartitionToWide<decltype(d)> dd;
   8375  return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
   8376                                                BitCast(dd, lo).raw, 1)});
   8377 #endif
   8378 }
   8379 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
   8380 HWY_API Vec128<double> ConcatUpperLower(D /* tag */, Vec128<double> hi,
   8381                                        Vec128<double> lo) {
   8382 #if HWY_TARGET >= HWY_SSSE3
   8383  return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
   8384 #else
   8385  // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
   8386  return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)};
   8387 #endif
   8388 }
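
        // Illustrative summary (not from the original source): writing vectors
        // as {lower, upper} halves, with hi = {hL, hH} and lo = {lL, lH}:
        //   ConcatLowerLower(d, hi, lo) == {lL, hL}  (lower halves)
        //   ConcatUpperUpper(d, hi, lo) == {lH, hH}  (upper halves)
        //   ConcatLowerUpper(d, hi, lo) == {lH, hL}  (inner halves)
        //   ConcatUpperLower(d, hi, lo) == {lL, hH}  (outer halves)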
   8389 
   8390 // ------------------------------ Concat partial (Combine, LowerHalf)
   8391 
   8392 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   8393 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
   8394  const Half<decltype(d)> d2;
   8395  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
   8396 }
   8397 
   8398 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   8399 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
   8400  const Half<decltype(d)> d2;
   8401  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
   8402 }
   8403 
   8404 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   8405 HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi,
   8406                                   const VFromD<D> lo) {
   8407  const Half<decltype(d)> d2;
   8408  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
   8409 }
   8410 
   8411 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   8412 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
   8413  const Half<decltype(d)> d2;
   8414  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
   8415 }
   8416 
   8417 // ------------------------------ ConcatOdd
   8418 
   8419 // 8-bit full
   8420 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   8421 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   8422  const Repartition<uint16_t, decltype(d)> dw;
   8423  // Right-shift 8 bits per u16 so we can pack.
   8424  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
   8425  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
   8426  return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};
   8427 }
   8428 
   8429 // 8-bit x8
   8430 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
   8431 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   8432 #if HWY_TARGET == HWY_SSE2
   8433  const Repartition<uint16_t, decltype(d)> dw;
   8434  // Right-shift 8 bits per u16 so we can pack.
   8435  const Vec64<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
   8436  const Vec64<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
   8437  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
   8438                                     _MM_SHUFFLE(2, 0, 2, 0))};
   8439 #else
   8440  const Repartition<uint32_t, decltype(d)> du32;
   8441  // Don't care about upper half, no need to zero.
   8442  alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
   8443  const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8));
   8444  const VFromD<D> L = TableLookupBytes(lo, shuf);
   8445  const VFromD<D> H = TableLookupBytes(hi, shuf);
   8446  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
   8447 #endif
   8448 }
   8449 
   8450 // 8-bit x4
   8451 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
   8452 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   8453 #if HWY_TARGET == HWY_SSE2
   8454  const Repartition<uint16_t, decltype(d)> dw;
   8455  const Twice<decltype(dw)> dw_2;
   8456  // Right-shift 8 bits per u16 so we can pack.
   8457  const Vec32<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
   8458  const Vec32<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
   8459  const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
   8460  return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
   8461 #else
   8462  const Repartition<uint16_t, decltype(d)> du16;
   8463  // Don't care about upper half, no need to zero.
   8464  alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
   8465  const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8));
   8466  const VFromD<D> L = TableLookupBytes(lo, shuf);
   8467  const VFromD<D> H = TableLookupBytes(hi, shuf);
   8468  return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
   8469 #endif
   8470 }
   8471 
   8472 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   8473 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   8474  // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
   8475  // 0xFFFF8000, which correctly saturates to 0x8000.
   8476  const RebindToUnsigned<decltype(d)> du;
   8477  const Repartition<int32_t, decltype(d)> dw;
   8478  const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
   8479  const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
   8480  return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)});
   8481 }
   8482 
   8483 // 16-bit x4
   8484 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
   8485 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   8486 #if HWY_TARGET == HWY_SSE2
   8487  // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
   8488  // 0xFFFF8000, which correctly saturates to 0x8000.
   8489  const Repartition<int32_t, decltype(d)> dw;
   8490  const Vec64<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
   8491  const Vec64<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
   8492  return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw),
   8493                                     _MM_SHUFFLE(2, 0, 2, 0))};
   8494 #else
   8495  const Repartition<uint32_t, decltype(d)> du32;
   8496  // Don't care about upper half, no need to zero.
   8497  alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
   8498  const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16));
   8499  const VFromD<D> L = TableLookupBytes(lo, shuf);
   8500  const VFromD<D> H = TableLookupBytes(hi, shuf);
   8501  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
   8502 #endif
   8503 }
   8504 
   8505 // 32-bit full
   8506 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   8507 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   8508  const RebindToFloat<decltype(d)> df;
   8509  return BitCast(
   8510      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
   8511                                      _MM_SHUFFLE(3, 1, 3, 1))});
   8512 }
   8513 
   8514 // Any type x2
   8515 template <class D, HWY_IF_LANES_D(D, 2)>
   8516 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   8517  return InterleaveUpper(d, lo, hi);
   8518 }
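
        // Illustrative usage (a sketch, not from the original source):
        // ConcatOdd keeps the odd-indexed lanes of each input, lo's in the
        // lower half. For u32x4 with hi = {h0,h1,h2,h3}, lo = {l0,l1,l2,l3}:
        //   ConcatOdd(d, hi, lo) == {l1, l3, h1, h3}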
   8519 
   8520 // ------------------------------ ConcatEven (InterleaveLower)
   8521 
   8522 // 8-bit full
   8523 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   8524 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   8525  const Repartition<uint16_t, decltype(d)> dw;
   8526  // Isolate lower 8 bits per u16 so we can pack.
   8527  const Vec128<uint16_t> mask = Set(dw, 0x00FF);
   8528  const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
   8529  const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
   8530  return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)};
   8531 }
   8532 
   8533 // 8-bit x8
   8534 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
   8535 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   8536 #if HWY_TARGET == HWY_SSE2
   8537  const Repartition<uint16_t, decltype(d)> dw;
   8538  // Isolate lower 8 bits per u16 so we can pack.
   8539  const Vec64<uint16_t> mask = Set(dw, 0x00FF);
   8540  const Vec64<uint16_t> uH = And(BitCast(dw, hi), mask);
   8541  const Vec64<uint16_t> uL = And(BitCast(dw, lo), mask);
   8542  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw),
   8543                                     _MM_SHUFFLE(2, 0, 2, 0))};
   8544 #else
   8545  const Repartition<uint32_t, decltype(d)> du32;
   8546  // Don't care about upper half, no need to zero.
   8547  alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
   8548  const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8));
   8549  const VFromD<D> L = TableLookupBytes(lo, shuf);
   8550  const VFromD<D> H = TableLookupBytes(hi, shuf);
   8551  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
   8552 #endif
   8553 }
   8554 
   8555 // 8-bit x4
   8556 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
   8557 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   8558 #if HWY_TARGET == HWY_SSE2
   8559  const Repartition<uint16_t, decltype(d)> dw;
   8560  const Twice<decltype(dw)> dw_2;
   8561  // Isolate lower 8 bits per u16 so we can pack.
   8562  const Vec32<uint16_t> mask = Set(dw, 0x00FF);
   8563  const Vec32<uint16_t> uH = And(BitCast(dw, hi), mask);
   8564  const Vec32<uint16_t> uL = And(BitCast(dw, lo), mask);
   8565  const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL);
   8566  return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)};
   8567 #else
   8568  const Repartition<uint16_t, decltype(d)> du16;
   8569  // Don't care about upper half, no need to zero.
   8570  alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
   8571  const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8));
   8572  const VFromD<D> L = TableLookupBytes(lo, shuf);
   8573  const VFromD<D> H = TableLookupBytes(hi, shuf);
   8574  return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
   8575 #endif
   8576 }
   8577 
   8578 // 16-bit full
   8579 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   8580 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   8581 #if HWY_TARGET <= HWY_SSE4
   8582  // Isolate lower 16 bits per u32 so we can pack.
   8583  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   8584  const Repartition<uint32_t, decltype(d)> dw;
   8585  const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
   8586  const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
   8587  const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
   8588  return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)});
   8589 #elif HWY_TARGET == HWY_SSE2
   8590  const Repartition<uint32_t, decltype(d)> dw;
   8591  return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
   8592                   BitCast(d, ShiftLeft<16>(BitCast(dw, lo))));
   8593 #else
   8594  const RebindToUnsigned<decltype(d)> du;
   8595  // packs_epi32 saturates 0x8000 to 0x7FFF. Instead, ConcatEven within each
   8596  // of the two inputs, then concatenate the results.
   8597  alignas(16)
   8598      const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
   8599  const VFromD<D> shuf = BitCast(d, Load(du, kCompactEvenU16));
   8600  const VFromD<D> L = TableLookupBytes(lo, shuf);
   8601  const VFromD<D> H = TableLookupBytes(hi, shuf);
   8602  return ConcatLowerLower(d, H, L);
   8603 #endif
   8604 }
   8605 
   8606 // 16-bit x4
   8607 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
   8608 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   8609 #if HWY_TARGET == HWY_SSE2
   8610  const Repartition<uint32_t, decltype(d)> dw;
   8611  return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
   8612                   BitCast(d, ShiftLeft<16>(BitCast(dw, lo))));
   8613 #else
   8614  const Repartition<uint32_t, decltype(d)> du32;
   8615  // Don't care about upper half, no need to zero.
   8616  alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
   8617  const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16));
   8618  const VFromD<D> L = TableLookupBytes(lo, shuf);
   8619  const VFromD<D> H = TableLookupBytes(hi, shuf);
   8620  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
   8621 #endif
   8622 }
   8623 
   8624 // 32-bit full
   8625 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
   8626 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   8627  const RebindToFloat<decltype(d)> df;
   8628  return BitCast(
   8629      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
   8630                                      _MM_SHUFFLE(2, 0, 2, 0))});
   8631 }
   8632 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
   8633 HWY_API VFromD<D> ConcatEven(D /* d */, VFromD<D> hi, VFromD<D> lo) {
   8634  return VFromD<D>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
   8635 }
   8636 
   8637 // Any T x2
   8638 template <class D, HWY_IF_LANES_D(D, 2)>
   8639 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   8640  return InterleaveLower(d, lo, hi);
   8641 }
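
        // Illustrative usage (a sketch, not from the original source):
        // ConcatEven is the even-lane counterpart of ConcatOdd. For u32x4:
        //   ConcatEven(d, hi, lo) == {l0, l2, h0, h2}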
   8642 
   8643 // ------------------------------ DupEven (InterleaveLower)
   8644 
   8645 template <typename T>
   8646 HWY_API Vec128<T, 1> DupEven(const Vec128<T, 1> v) {
   8647  return v;
   8648 }
   8649 
   8650 template <typename T>
   8651 HWY_API Vec128<T, 2> DupEven(const Vec128<T, 2> v) {
   8652  return InterleaveLower(DFromV<decltype(v)>(), v, v);
   8653 }
   8654 
   8655 template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 2)>
   8656 HWY_API V DupEven(V v) {
   8657  const DFromV<decltype(v)> d;
   8658 
   8659 #if HWY_TARGET <= HWY_SSSE3
   8660  const RebindToUnsigned<decltype(d)> du;
   8661  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
   8662      du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
   8663  return TableLookupBytes(v, BitCast(d, shuffle));
   8664 #else
   8665  const Repartition<uint16_t, decltype(d)> du16;
   8666  return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})),
   8667                       BitCast(d, ShiftLeft<8>(BitCast(du16, v))), v);
   8668 #endif
   8669 }
   8670 
   8671 template <typename T, HWY_IF_T_SIZE(T, 2)>
   8672 HWY_API Vec64<T> DupEven(const Vec64<T> v) {
   8673  const DFromV<decltype(v)> d;
   8674  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   8675  return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
   8676                        BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))});
   8677 }
   8678 
   8679 // Generic for all vector lengths.
   8680 template <class V, HWY_IF_T_SIZE_V(V, 2)>
   8681 HWY_API V DupEven(const V v) {
   8682  const DFromV<decltype(v)> d;
   8683  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   8684 #if HWY_TARGET <= HWY_SSSE3
   8685  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
   8686      du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c);
   8687  return TableLookupBytes(v, BitCast(d, shuffle));
   8688 #else
   8689  return BitCast(
   8690      d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
   8691             _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)),
   8692             _MM_SHUFFLE(2, 2, 0, 0))});
   8693 #endif
   8694 }
   8695 
   8696 template <typename T, HWY_IF_UI32(T)>
   8697 HWY_API Vec128<T> DupEven(Vec128<T> v) {
   8698  return Vec128<T>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
   8699 }
   8700 
   8701 HWY_API Vec128<float> DupEven(Vec128<float> v) {
   8702  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
   8703 }
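
        // Illustrative usage (a sketch, not from the original source): DupEven
        // broadcasts each even lane into the odd lane above it, and DupOdd
        // (below) does the opposite. For u32x4:
        //   DupEven({v0, v1, v2, v3}) == {v0, v0, v2, v2}
        //   DupOdd ({v0, v1, v2, v3}) == {v1, v1, v3, v3}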
   8704 
   8705 // ------------------------------ DupOdd (InterleaveUpper)
   8706 
   8707 template <typename T, HWY_IF_T_SIZE(T, 1)>
   8708 HWY_API Vec128<T, 1> DupOdd(Vec128<T, 1> v) {
   8709  return v;
   8710 }
   8711 
   8712 template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 1)>
   8713 HWY_API V DupOdd(V v) {
   8714  const DFromV<decltype(v)> d;
   8715 
   8716 #if HWY_TARGET <= HWY_SSSE3
   8717  const RebindToUnsigned<decltype(d)> du;
   8718  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
   8719      du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
   8720  return TableLookupBytes(v, BitCast(d, shuffle));
   8721 #else
   8722  const Repartition<uint16_t, decltype(d)> du16;
   8723  return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})),
   8724                       BitCast(d, ShiftRight<8>(BitCast(du16, v))), v);
   8725 #endif
   8726 }
   8727 
   8728 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
   8729 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
   8730  const DFromV<decltype(v)> d;
   8731  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   8732  return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
   8733                        BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))});
   8734 }
   8735 
   8736 // Generic for all vector lengths.
   8737 template <typename V, HWY_IF_T_SIZE_V(V, 2), HWY_IF_V_SIZE_GT_V(V, 8)>
   8738 HWY_API V DupOdd(V v) {
   8739  const DFromV<decltype(v)> d;
   8740  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   8741 #if HWY_TARGET <= HWY_SSSE3
   8742  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
   8743      du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e);
   8744  return TableLookupBytes(v, BitCast(d, shuffle));
   8745 #else
   8746  return BitCast(
   8747      d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
   8748             _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)),
   8749             _MM_SHUFFLE(3, 3, 1, 1))});
   8750 #endif
   8751 }
   8752 
   8753 template <typename T, size_t N, HWY_IF_UI32(T)>
   8754 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
   8755  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
   8756 }
   8757 template <size_t N>
   8758 HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) {
   8759  return Vec128<float, N>{
   8760      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
   8761 }
   8762 
   8763 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   8764 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
   8765  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
   8766 }
   8767 
   8768 // ------------------------------ TwoTablesLookupLanes (DupEven)
   8769 
   8770 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
   8771 HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
   8772                                          Indices128<T, N> idx) {
   8773  const DFromV<decltype(a)> d;
   8774  const Twice<decltype(d)> dt;
   8775 // TableLookupLanes currently requires table and index vectors to be the same
   8776 // size, though a half-length index vector would be sufficient here.
   8777 #if HWY_IS_MSAN
   8778  const Vec128<T, N> idx_vec{idx.raw};
   8779  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
   8780 #else
   8781  // We only keep the LowerHalf of the result, for which idx is valid.
   8782  const Indices128<T, N * 2> idx2{idx.raw};
   8783 #endif
   8784  return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
   8785 }
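
        // Illustrative usage (a sketch, not from the original source): indices
        // select from the concatenation of both tables: idx[i] < N picks
        // a[idx[i]], and N <= idx[i] < 2*N picks b[idx[i] - N]. For u32x4,
        // idx = {0, 4, 2, 7} yields {a0, b0, a2, b3}.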
   8786 
   8787 template <typename T, HWY_IF_T_SIZE(T, 1)>
   8788 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
   8789                                       Indices128<T> idx) {
   8790 #if HWY_TARGET <= HWY_AVX3_DL
   8791  return Vec128<T>{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)};
   8792 #else  // AVX3 or below
   8793  const DFromV<decltype(a)> d;
   8794  const Vec128<T> idx_vec{idx.raw};
   8795 
   8796 #if HWY_TARGET <= HWY_SSE4
   8797  const Repartition<uint16_t, decltype(d)> du16;
   8798  const auto sel_hi_mask =
   8799      MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec))));
   8800 #else
   8801  const RebindToSigned<decltype(d)> di;
   8802  const auto sel_hi_mask =
   8803      RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15}));
   8804 #endif
   8805 
   8806  const auto lo_lookup_result = TableLookupBytes(a, idx_vec);
   8807 #if HWY_TARGET <= HWY_AVX3
   8808  const Vec128<T> lookup_result{_mm_mask_shuffle_epi8(
   8809      lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)};
   8810  return lookup_result;
   8811 #else
   8812  const auto hi_lookup_result = TableLookupBytes(b, idx_vec);
   8813  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
   8814 #endif  // HWY_TARGET <= HWY_AVX3
   8815 #endif  // HWY_TARGET <= HWY_AVX3_DL
   8816 }
   8817 
   8818 template <typename T, HWY_IF_T_SIZE(T, 2)>
   8819 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
   8820                                       Indices128<T> idx) {
   8821 #if HWY_TARGET <= HWY_AVX3
   8822  return Vec128<T>{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)};
   8823 #elif HWY_TARGET == HWY_SSE2
   8824  const DFromV<decltype(a)> d;
   8825  const RebindToSigned<decltype(d)> di;
   8826  const Vec128<T> idx_vec{idx.raw};
   8827  const auto sel_hi_mask =
   8828      RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7}));
   8829  const auto lo_lookup_result = TableLookupLanes(a, idx);
   8830  const auto hi_lookup_result = TableLookupLanes(b, idx);
   8831  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
   8832 #else
   8833  const DFromV<decltype(a)> d;
   8834  const Repartition<uint8_t, decltype(d)> du8;
   8835  return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
   8836                                         Indices128<uint8_t>{idx.raw}));
   8837 #endif
   8838 }
   8839 
   8840 template <typename T, HWY_IF_UI32(T)>
   8841 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
   8842                                       Indices128<T> idx) {
   8843 #if HWY_TARGET <= HWY_AVX3
   8844  return Vec128<T>{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)};
   8845 #else  // AVX2 or below
   8846  const DFromV<decltype(a)> d;
   8847 
   8848 #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
   8849  const Vec128<T> idx_vec{idx.raw};
   8850 
   8851 #if HWY_TARGET <= HWY_AVX2
   8852  const RebindToFloat<decltype(d)> d_sel;
   8853  const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec)));
   8854 #else
   8855  const RebindToSigned<decltype(d)> d_sel;
   8856  const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3});
   8857 #endif
   8858 
   8859  const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx));
   8860  const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx));
   8861  return BitCast(d,
   8862                 IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
   8863 #else   // SSSE3 or SSE4
   8864  const Repartition<uint8_t, decltype(d)> du8;
   8865  return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
   8866                                         Indices128<uint8_t>{idx.raw}));
   8867 #endif  // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
   8868 #endif  // HWY_TARGET <= HWY_AVX3
   8869 }
   8870 
   8871 #if HWY_HAVE_FLOAT16
   8872 HWY_API Vec128<float16_t> TwoTablesLookupLanes(Vec128<float16_t> a,
   8873                                               Vec128<float16_t> b,
   8874                                               Indices128<float16_t> idx) {
   8875  return Vec128<float16_t>{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)};
   8876 }
   8877 #endif  // HWY_HAVE_FLOAT16
   8878 HWY_API Vec128<float> TwoTablesLookupLanes(Vec128<float> a, Vec128<float> b,
   8879                                           Indices128<float> idx) {
   8880 #if HWY_TARGET <= HWY_AVX3
   8881  return Vec128<float>{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)};
   8882 #elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2
   8883  const DFromV<decltype(a)> d;
   8884 
   8885 #if HWY_TARGET <= HWY_AVX2
   8886  const auto sel_hi_mask =
   8887      MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128<int32_t>{idx.raw})));
   8888 #else
   8889  const RebindToSigned<decltype(d)> di;
   8890  const auto sel_hi_mask =
   8891      RebindMask(d, Vec128<int32_t>{idx.raw} > Set(di, int32_t{3}));
   8892 #endif
   8893 
   8894  const auto lo_lookup_result = TableLookupLanes(a, idx);
   8895  const auto hi_lookup_result = TableLookupLanes(b, idx);
   8896  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
   8897 #else  // SSSE3 or SSE4
   8898  const DFromV<decltype(a)> d;
   8899  const Repartition<uint8_t, decltype(d)> du8;
   8900  return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
   8901                                         Indices128<uint8_t>{idx.raw}));
   8902 #endif
   8903 }
   8904 
   8905 template <typename T, HWY_IF_UI64(T)>
   8906 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
   8907                                       Indices128<T> idx) {
   8908 #if HWY_TARGET <= HWY_AVX3
   8909  return Vec128<T>{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)};
   8910 #else
   8911  const DFromV<decltype(a)> d;
   8912  const Vec128<T> idx_vec{idx.raw};
   8913  const Indices128<T> idx_mod{And(idx_vec, Set(d, T{1})).raw};
   8914 
   8915 #if HWY_TARGET <= HWY_SSE4
   8916  const RebindToFloat<decltype(d)> d_sel;
   8917  const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec)));
   8918 #else   // SSE2 or SSSE3
   8919  const Repartition<int32_t, decltype(d)> di32;
   8920  const RebindToSigned<decltype(d)> d_sel;
   8921  const auto sel_hi_mask = MaskFromVec(
   8922      BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) >
   8923                                           Set(di32, int32_t{1}))));
   8924 #endif  // HWY_TARGET <= HWY_SSE4
   8925 
   8926  const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod));
   8927  const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod));
   8928  return BitCast(d,
   8929                 IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
   8930 #endif  // HWY_TARGET <= HWY_AVX3
   8931 }
   8932 
   8933 HWY_API Vec128<double> TwoTablesLookupLanes(Vec128<double> a, Vec128<double> b,
   8934                                            Indices128<double> idx) {
   8935 #if HWY_TARGET <= HWY_AVX3
   8936  return Vec128<double>{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)};
   8937 #else
   8938  const DFromV<decltype(a)> d;
   8939  const RebindToSigned<decltype(d)> di;
   8940  const Vec128<int64_t> idx_vec{idx.raw};
   8941  const Indices128<double> idx_mod{And(idx_vec, Set(di, int64_t{1})).raw};
   8942 
   8943 #if HWY_TARGET <= HWY_SSE4
   8944  const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec)));
   8945 #else   // SSE2 or SSSE3
   8946  const Repartition<int32_t, decltype(d)> di32;
   8947  const auto sel_hi_mask =
   8948      MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) >
   8949                                                   Set(di32, int32_t{1}))));
   8950 #endif  // HWY_TARGET <= HWY_SSE4
   8951 
   8952  const auto lo_lookup_result = TableLookupLanes(a, idx_mod);
   8953  const auto hi_lookup_result = TableLookupLanes(b, idx_mod);
   8954  return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result);
   8955 #endif  // HWY_TARGET <= HWY_AVX3
   8956 }
   8957 
   8958 // ------------------------------ OddEven (IfThenElse)
   8959 
   8960 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   8961 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   8962  const DFromV<decltype(a)> d;
   8963  const Repartition<uint8_t, decltype(d)> d8;
   8964  alignas(16) static constexpr uint8_t mask[16] = {
   8965      0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
   8966  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
   8967 }
   8968 
   8969 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   8970 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   8971  const DFromV<decltype(a)> d;
   8972 #if HWY_TARGET >= HWY_SSSE3
   8973  const Repartition<uint8_t, decltype(d)> d8;
   8974  alignas(16) static constexpr uint8_t mask[16] = {
   8975      0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
   8976  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
   8977 #else
   8978  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   8979  return BitCast(d, VFromD<decltype(du)>{_mm_blend_epi16(
   8980                        BitCast(du, a).raw, BitCast(du, b).raw, 0x55)});
   8981 #endif
   8982 }
   8983 
   8984 template <typename T, size_t N, HWY_IF_UI32(T)>
   8985 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   8986 #if HWY_TARGET >= HWY_SSSE3
   8987  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
   8988  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
   8989  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
   8990 #else
   8991  // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle.
   8992  const DFromV<decltype(a)> d;
   8993  const RebindToFloat<decltype(d)> df;
   8994  return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,
   8995                                                  BitCast(df, b).raw, 5)});
   8996 #endif
   8997 }
   8998 
   8999 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   9000 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   9001  // Same as ConcatUpperLower for full vectors; do not call that because this
   9002  // is more efficient for 64x1 vectors.
   9003  const DFromV<decltype(a)> d;
   9004  const RebindToFloat<decltype(d)> dd;
   9005 #if HWY_TARGET >= HWY_SSSE3
   9006  return BitCast(
   9007      d, Vec128<double, N>{_mm_shuffle_pd(
   9008             BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))});
   9009 #else
   9010  // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
   9011  return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,
   9012                                                   BitCast(dd, b).raw, 1)});
   9013 #endif
   9014 }
   9015 
   9016 template <size_t N>
   9017 HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
   9018 #if HWY_TARGET >= HWY_SSSE3
   9019  // SHUFPS must fill the lower half of the output from one input, so we
   9020  // need another shuffle. Unpack avoids another immediate byte.
   9021  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
   9022  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
   9023  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
   9024 #else
   9025  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
   9026 #endif
   9027 }
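
        // Illustrative usage (a sketch, not from the original source): OddEven
        // takes odd lanes from a and even lanes from b. For u32x4:
        //   OddEven({a0,a1,a2,a3}, {b0,b1,b2,b3}) == {b0, a1, b2, a3}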
   9028 
   9029 // -------------------------- InterleaveEven
   9030 
   9031 template <class D, HWY_IF_LANES_LE_D(D, 2)>
   9032 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
   9033  return ConcatEven(d, b, a);
   9034 }
   9035 
   9036 // I8/U8 InterleaveEven is generic for all vector lengths that are >= 4 bytes
   9037 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
   9038 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
   9039  const Repartition<uint16_t, decltype(d)> du16;
   9040  return OddEven(BitCast(d, ShiftLeft<8>(BitCast(du16, b))), a);
   9041 }
   9042 
   9043 // I16/U16 InterleaveEven is generic for all vector lengths that are >= 8 bytes
   9044 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
   9045 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
   9046  const Repartition<uint32_t, decltype(d)> du32;
   9047  return OddEven(BitCast(d, ShiftLeft<16>(BitCast(du32, b))), a);
   9048 }
   9049 
   9050 #if HWY_TARGET <= HWY_AVX3
   9051 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
   9052 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
   9053  return VFromD<D>{_mm_mask_shuffle_epi32(
   9054      a.raw, static_cast<__mmask8>(0x0A), b.raw,
   9055      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
   9056 }
   9057 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
   9058 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
   9059  return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A),
   9060                                       b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))};
   9061 }
   9062 #else
   9063 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
   9064 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
   9065  const RebindToFloat<decltype(d)> df;
   9066  const auto b2_b0_a2_a0 = ConcatEven(df, BitCast(df, b), BitCast(df, a));
   9067  return BitCast(
   9068      d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw,
   9069                                             _MM_SHUFFLE(3, 1, 2, 0))});
   9070 }
   9071 #endif
   9072 
   9073 // -------------------------- InterleaveOdd
   9074 
   9075 template <class D, HWY_IF_LANES_LE_D(D, 2)>
   9076 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
   9077  return ConcatOdd(d, b, a);
   9078 }
   9079 
   9080 // I8/U8 InterleaveOdd is generic for all vector lengths that are >= 4 bytes
   9081 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
   9082 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
   9083  const Repartition<uint16_t, decltype(d)> du16;
   9084  return OddEven(b, BitCast(d, ShiftRight<8>(BitCast(du16, a))));
   9085 }
   9086 
   9087 // I16/U16 InterleaveOdd is generic for all vector lengths that are >= 8 bytes
   9088 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
   9089 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
   9090  const Repartition<uint32_t, decltype(d)> du32;
   9091  return OddEven(b, BitCast(d, ShiftRight<16>(BitCast(du32, a))));
   9092 }
   9093 
   9094 #if HWY_TARGET <= HWY_AVX3
   9095 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
   9096 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
   9097  return VFromD<D>{_mm_mask_shuffle_epi32(
   9098      b.raw, static_cast<__mmask8>(0x05), a.raw,
   9099      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
   9100 }
   9101 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
   9102 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
   9103  return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05),
   9104                                       a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))};
   9105 }
   9106 #else
   9107 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
   9108 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
   9109  const RebindToFloat<decltype(d)> df;
   9110  const auto b3_b1_a3_a1 = ConcatOdd(df, BitCast(df, b), BitCast(df, a));
   9111  return BitCast(
   9112      d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw,
   9113                                             _MM_SHUFFLE(3, 1, 2, 0))});
   9114 }
   9115 #endif
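
        // Illustrative usage (a sketch, not from the original source): unlike
        // ConcatEven/ConcatOdd, these interleave same-parity lanes of both
        // inputs. For u32x4:
        //   InterleaveEven(d, a, b) == {a0, b0, a2, b2}
        //   InterleaveOdd(d, a, b)  == {a1, b1, a3, b3}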
   9116 
   9117 // ------------------------------ OddEvenBlocks
   9118 template <typename T, size_t N>
   9119 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
   9120  return even;
   9121 }
   9122 
   9123 // ------------------------------ SwapAdjacentBlocks
   9124 template <typename T, size_t N>
   9125 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
   9126  return v;
   9127 }
   9128 
   9129 // ------------------------------ InterleaveEvenBlocks
   9130 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
   9131 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
   9132  return a;
   9133 }
   9134 // ------------------------------ InterleaveOddBlocks
   9135 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
   9136 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
   9137  return a;
   9138 }
   9139 
   9140 // ------------------------------ Shl (ZipLower, Mul)
   9141 
   9142 // Use AVX2/3 variable shifts where available; otherwise, multiply by powers
   9143 // of two obtained by loading float exponents, which is considerably faster
   9144 // (according to LLVM-MCA) than scalar code or bit tests: https://gcc.godbolt.org/z/9G7Y9v.
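        //
        // Illustrative scalar sketch of the trick for one u32 lane (not library
        // code; memcpy stands in for the vector bit casts):
        //   uint32_t count = 5;
        //   uint32_t bits = (count << 23) + 0x3F800000;  // f32 exponent 127+count
        //   float f;
        //   memcpy(&f, &bits, sizeof(f));             // f == 32.0f == 2^count
        //   uint32_t mul = static_cast<uint32_t>(f);  // per-lane multiplier
        //   // Now x << count == x * mul for any uint32_t x.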
   9145 
   9146 namespace detail {
   9147 
   9148 #if HWY_TARGET == HWY_AVX2  // Unused for AVX3 - we use sllv directly
   9149 template <class V>
   9150 HWY_API V AVX2ShlU16Vec128(V v, V bits) {
   9151  const DFromV<decltype(v)> d;
   9152  const Rebind<uint32_t, decltype(d)> du32;
   9153  return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
   9154 }
   9155 #elif HWY_TARGET > HWY_AVX2
   9156 
   9157 template <class D32>
   9158 static HWY_INLINE VFromD<D32> Pow2ConvF32ToI32(
   9159    D32 d32, VFromD<RebindToFloat<D32>> vf32) {
   9160  const RebindToSigned<decltype(d32)> di32;
   9161 #if HWY_COMPILER_GCC_ACTUAL
   9162  // ConvertInRangeTo is safe with GCC due to the inline assembly workaround
   9163  // used for F32->I32 ConvertInRangeTo.
   9164  return BitCast(d32, ConvertInRangeTo(di32, vf32));
   9165 #else
   9166  // Otherwise, use NearestIntInRange because we rely on the native 0x80..00
   9167  // overflow behavior
   9168  return BitCast(d32, NearestIntInRange(di32, vf32));
   9169 #endif
   9170 }
   9171 
   9172 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
   9173 template <typename T, HWY_IF_T_SIZE(T, 2)>
   9174 HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) {
   9175  const DFromV<decltype(v)> d;
   9176  const RebindToUnsigned<decltype(d)> du;
   9177  const RepartitionToWide<decltype(d)> dw;
   9178  const Rebind<float, decltype(dw)> df;
   9179  const auto zero = Zero(d);
   9180  // Move into exponent (this u16 will become the upper half of an f32)
   9181  const auto exp = ShiftLeft<23 - 16>(v);
   9182  const auto upper = exp + Set(d, 0x3F80);  // upper half of 1.0f
   9183  // Insert 0 into lower halves for reinterpreting as binary32.
   9184  const auto f0 = ZipLower(dw, zero, upper);
   9185  const auto f1 = ZipUpper(dw, zero, upper);
   9186  // See cvtps comment below.
   9187  const VFromD<decltype(dw)> bits0 = Pow2ConvF32ToI32(dw, BitCast(df, f0));
   9188  const VFromD<decltype(dw)> bits1 = Pow2ConvF32ToI32(dw, BitCast(df, f1));
   9189 #if HWY_TARGET <= HWY_SSE4
   9190  return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)};
   9191 #else
   9192  return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0));
   9193 #endif
   9194 }
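
        // For the f32 construction above: a lane holding v = 3 produces the upper
        // half (3 << 7) + 0x3F80 == 0x4100; paired with a zero lower half this is
        // the f32 0x41000000 == 8.0f == 2^3, which converts back to the integer
        // multiplier 8.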
   9195 
   9196 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)>
   9197 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
   9198  const DFromV<decltype(v)> d;
   9199  const RebindToUnsigned<decltype(d)> du;
   9200  const Twice<decltype(du)> dt_u;
   9201  const RepartitionToWide<decltype(dt_u)> dt_w;
   9202  const RebindToFloat<decltype(dt_w)> dt_f;
   9203  // Move into exponent (this u16 will become the upper half of an f32)
   9204  const auto exp = ShiftLeft<23 - 16>(v);
   9205  const auto upper = exp + Set(d, 0x3F80);  // upper half of 1.0f
   9206  // Insert 0 into lower halves for reinterpreting as binary32.
   9207  const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper));
   9208  // See cvtps comment below.
   9209  const VFromD<decltype(dt_w)> bits0 =
   9210      Pow2ConvF32ToI32(dt_w, BitCast(dt_f, f0));
   9211 #if HWY_TARGET <= HWY_SSE4
   9212  return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)};
   9213 #elif HWY_TARGET == HWY_SSSE3
   9214  alignas(16)
   9215      const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
   9216  return TableLookupBytes(bits0, Load(du, kCompactEvenU16));
   9217 #else
   9218  const RebindToSigned<decltype(dt_w)> dt_i32;
   9219  const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0)));
   9220  return VFromD<decltype(du)>{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)};
   9221 #endif
   9222 }
   9223 
   9224 // Same, for 32-bit shifts.
   9225 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
   9226 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
   9227  const DFromV<decltype(v)> d;
   9228  const RebindToFloat<decltype(d)> df;
   9229  const auto exp = ShiftLeft<23>(v);
   9230  const auto f = exp + Set(d, 0x3F800000);  // 1.0f
   9231  // Do not use ConvertTo because we rely on the native 0x80..00 overflow
   9232  // behavior.
   9233  return Pow2ConvF32ToI32(d, BitCast(df, f));
   9234 }
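
        // The native overflow behavior matters for v == 31: 2^31 == 2147483648.0f
        // does not fit in an i32, and CVTPS2DQ then returns the integer indefinite
        // value 0x80000000, which reinterpreted as u32 is exactly 2^31.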
   9235 
   9236 #endif  // HWY_TARGET > HWY_AVX2
   9237 
   9238 template <size_t N>
   9239 HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v,
   9240                                Vec128<uint16_t, N> bits) {
   9241 #if HWY_TARGET <= HWY_AVX3
   9242  return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
   9243 #elif HWY_TARGET == HWY_AVX2
   9244  return AVX2ShlU16Vec128(v, bits);
   9245 #else
   9246  return v * Pow2(bits);
   9247 #endif
   9248 }
   9249 
   9250 #if HWY_TARGET > HWY_AVX3
   9251 HWY_API Vec16<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec16<uint16_t> v,
   9252                            Vec16<uint16_t> bits) {
   9253 #if HWY_TARGET <= HWY_SSE4
   9254  const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
   9255 #else
   9256  const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
   9257 #endif
   9258  return Vec16<uint16_t>{_mm_sll_epi16(v.raw, bits16.raw)};
   9259 }
   9260 #endif
   9261 
   9262 #if HWY_TARGET <= HWY_AVX3
   9263 template <class V>
   9264 HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
   9265  const DFromV<decltype(v)> d;
   9266  const Rebind<uint16_t, decltype(d)> du16;
   9267  return TruncateTo(d, PromoteTo(du16, v) << PromoteTo(du16, bits));
   9268 }
   9269 #elif HWY_TARGET <= HWY_AVX2
   9270 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
   9271 HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
   9272  const DFromV<decltype(v)> d;
   9273  const Rebind<uint32_t, decltype(d)> du32;
   9274  return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits));
   9275 }
   9276 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   9277 HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) {
   9278  const DFromV<decltype(v)> d;
   9279  const Half<decltype(d)> dh;
   9280  const Rebind<uint16_t, decltype(d)> du16;
   9281  const Rebind<uint32_t, decltype(dh)> dh_u32;
   9282 
   9283  const VFromD<decltype(dh_u32)> lo_shl_result =
   9284      PromoteTo(dh_u32, LowerHalf(dh, v))
   9285      << PromoteTo(dh_u32, LowerHalf(dh, bits));
   9286  const VFromD<decltype(dh_u32)> hi_shl_result =
   9287      PromoteTo(dh_u32, UpperHalf(dh, v))
   9288      << PromoteTo(dh_u32, UpperHalf(dh, bits));
   9289  const VFromD<decltype(du16)> u16_shl_result = ConcatEven(
   9290      du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result));
   9291  return TruncateTo(d, u16_shl_result);
   9292 }
   9293 #endif  // HWY_TARGET <= HWY_AVX3
   9294 
   9295 // 8-bit: may use the Shl overload for uint16_t.
   9296 template <size_t N>
   9297 HWY_API Vec128<uint8_t, N> Shl(hwy::UnsignedTag tag, Vec128<uint8_t, N> v,
   9298                               Vec128<uint8_t, N> bits) {
   9299  const DFromV<decltype(v)> d;
   9300 #if HWY_TARGET <= HWY_AVX3_DL
   9301  (void)tag;
   9302  // kMasks[i] = 0xFF >> i
   9303  alignas(16) static constexpr uint8_t kMasks[16] = {
   9304      0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00};
   9305  // kShl[i] = 1 << i
   9306  alignas(16) static constexpr uint8_t kShl[16] = {1,    2,    4,    8,   0x10,
   9307                                                   0x20, 0x40, 0x80, 0x00};
   9308  v = And(v, TableLookupBytes(Load(Full64<uint8_t>(), kMasks), bits));
   9309  const VFromD<decltype(d)> mul =
   9310      TableLookupBytes(Load(Full64<uint8_t>(), kShl), bits);
   9311  return VFromD<decltype(d)>{_mm_gf2p8mul_epi8(v.raw, mul.raw)};
   9312 #elif HWY_TARGET <= HWY_AVX2
   9313  (void)tag;
   9314  (void)d;
   9315  return AVX2ShlU8Vec128(v, bits);
   9316 #else
   9317  const Repartition<uint16_t, decltype(d)> dw;
   9318  using VW = VFromD<decltype(dw)>;
   9319  const VW even_mask = Set(dw, 0x00FF);
   9320  const VW odd_mask = Set(dw, 0xFF00);
   9321  const VW vw = BitCast(dw, v);
   9322  const VW bits16 = BitCast(dw, bits);
   9323  // Shift even lanes in-place
   9324  const VW evens = Shl(tag, vw, And(bits16, even_mask));
   9325  const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16));
   9326  return OddEven(BitCast(d, odds), BitCast(d, evens));
   9327 #endif
   9328 }
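
        // Note on the AVX3_DL branch above: _mm_gf2p8mul_epi8 is a carry-less
        // multiply reduced modulo the AES polynomial 0x11B. Masking v with
        // 0xFF >> i beforehand ensures the carry-less product v * (1 << i) fits
        // in 8 bits, so the reduction never triggers and the result equals an
        // ordinary v << i.
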
   9329 HWY_API Vec128<uint8_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint8_t, 1> v,
   9330                               Vec128<uint8_t, 1> bits) {
   9331 #if HWY_TARGET <= HWY_SSE4
   9332  const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
   9333 #else
   9334  const Vec16<uint16_t> bits8 =
   9335      And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)});
   9336 #endif
   9337  return Vec128<uint8_t, 1>{_mm_sll_epi16(v.raw, bits8.raw)};
   9338 }
   9339 
   9340 template <size_t N>
   9341 HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v,
   9342                                Vec128<uint32_t, N> bits) {
   9343 #if HWY_TARGET >= HWY_SSE4
   9344  return v * Pow2(bits);
   9345 #else
   9346  return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
   9347 #endif
   9348 }
   9349 
   9350 #if HWY_TARGET >= HWY_SSE4
   9351 HWY_API Vec32<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec32<uint32_t> v,
   9352                            const Vec32<uint32_t> bits) {
   9353 #if HWY_TARGET == HWY_SSE4
   9354  const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
   9355 #else
   9356  const auto bits32 =
   9357      Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits);
   9358 #endif
   9359  return Vec32<uint32_t>{_mm_sll_epi32(v.raw, bits32.raw)};
   9360 }
   9361 #endif
   9362 
   9363 HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v,
   9364                             Vec128<uint64_t> bits) {
   9365 #if HWY_TARGET >= HWY_SSE4
   9366  const DFromV<decltype(v)> d;
   9367  // Individual shifts and combine
   9368  const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
   9369  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
   9370  const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
   9371  return ConcatUpperLower(d, out1, out0);
   9372 #else
   9373  return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
   9374 #endif
   9375 }
   9376 HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v,
   9377                            Vec64<uint64_t> bits) {
   9378  return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
   9379 }
   9380 
   9381 // Signed left shift is the same as unsigned.
   9382 template <typename T, size_t N>
   9383 HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
   9384                         Vec128<T, N> bits) {
   9385  const DFromV<decltype(v)> di;
   9386  const RebindToUnsigned<decltype(di)> du;
   9387  return BitCast(di,
   9388                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
   9389 }
   9390 
   9391 }  // namespace detail
   9392 
   9393 template <typename T, size_t N>
   9394 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
   9395  return detail::Shl(hwy::TypeTag<T>(), v, bits);
   9396 }
   9397 
   9398 // ------------------------------ Shr (mul, mask, BroadcastSignBit)
   9399 
   9400 // Use AVX2+ variable shifts except for SSSE3/SSE4. There, we use
   9401 // widening multiplication by powers of two obtained by loading float exponents,
   9402 // followed by a constant right-shift. This is still faster than a scalar or
   9403 // bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
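        //
        // In other words, for lane width L and shift count 1 <= s <= L-1,
        //   x >> s == MulHigh(x, 1 << (L - s)),
        // because x * 2^(L-s) == (x >> s) * 2^L + (bits shifted out), and MulHigh
        // keeps only the upper L bits. s == 0 would require a 2^L multiplier that
        // does not fit in a lane, hence the bits==0 fixups below.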
   9404 
   9405 #if HWY_TARGET <= HWY_AVX2
   9406 namespace detail {
   9407 
   9408 #if HWY_TARGET <= HWY_AVX3
   9409 template <class V>
   9410 HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
   9411  const DFromV<decltype(v)> d;
   9412  const Rebind<uint16_t, decltype(d)> du16;
   9413  const RebindToSigned<decltype(du16)> di16;
   9414  return DemoteTo(d,
   9415                  BitCast(di16, PromoteTo(du16, v) >> PromoteTo(du16, bits)));
   9416 }
   9417 #else   // AVX2
   9418 template <class V>
   9419 HWY_INLINE V AVX2ShrU16Vec128(V v, V bits) {
   9420  const DFromV<decltype(v)> d;
   9421  const Rebind<uint32_t, decltype(d)> du32;
   9422  const RebindToSigned<decltype(du32)> di32;
   9423  return DemoteTo(d,
   9424                  BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits)));
   9425 }
   9426 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
   9427 HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
   9428  const DFromV<decltype(v)> d;
   9429  const Rebind<uint32_t, decltype(d)> du32;
   9430  const RebindToSigned<decltype(du32)> di32;
   9431  return DemoteTo(d,
   9432                  BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits)));
   9433 }
   9434 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   9435 HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) {
   9436  const DFromV<decltype(v)> d;
   9437  const Half<decltype(d)> dh;
   9438  const Rebind<int16_t, decltype(d)> di16;
   9439  const Rebind<uint16_t, decltype(d)> du16;
   9440  const Rebind<int32_t, decltype(dh)> dh_i32;
   9441  const Rebind<uint32_t, decltype(dh)> dh_u32;
   9442 
   9443  const auto lo_shr_result =
   9444      BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >>
   9445                          PromoteTo(dh_u32, LowerHalf(dh, bits)));
   9446  const auto hi_shr_result =
   9447      BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >>
   9448                          PromoteTo(dh_u32, UpperHalf(dh, bits)));
   9449  const auto i16_shr_result =
   9450      BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result));
   9451  return DemoteTo(d, i16_shr_result);
   9452 }
   9453 #endif  // HWY_TARGET <= HWY_AVX3
   9454 
   9455 }  // namespace detail
   9456 #endif  // HWY_TARGET <= HWY_AVX2
   9457 
   9458 template <size_t N>
   9459 HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> in,
   9460                                       const Vec128<uint16_t, N> bits) {
   9461 #if HWY_TARGET <= HWY_AVX3
   9462  return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
   9463 #elif HWY_TARGET <= HWY_AVX2
   9464  return detail::AVX2ShrU16Vec128(in, bits);
   9465 #else
   9466  const DFromV<decltype(in)> d;
   9467  // For bits=0, we cannot mul by 2^16, so fix the result later.
   9468  const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
   9469  // Replace output with input where bits == 0.
   9470  return IfThenElse(bits == Zero(d), in, out);
   9471 #endif
   9472 }
   9473 
   9474 #if HWY_TARGET > HWY_AVX3
   9475 HWY_API Vec16<uint16_t> operator>>(const Vec16<uint16_t> in,
   9476                                   const Vec16<uint16_t> bits) {
   9477 #if HWY_TARGET <= HWY_SSE4
   9478  const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
   9479 #else
   9480  const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)});
   9481 #endif
   9482  return Vec16<uint16_t>{_mm_srl_epi16(in.raw, bits16.raw)};
   9483 }
   9484 #endif
   9485 
   9486 // 8-bit uses 16-bit shifts.
   9487 template <size_t N>
   9488 HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> in,
   9489                                      const Vec128<uint8_t, N> bits) {
   9490 #if HWY_TARGET <= HWY_AVX2
   9491  return detail::AVX2ShrU8Vec128(in, bits);
   9492 #else
   9493  const DFromV<decltype(in)> d;
   9494  const Repartition<uint16_t, decltype(d)> dw;
   9495  using VW = VFromD<decltype(dw)>;
   9496  const VW mask = Set(dw, 0x00FF);
   9497  const VW vw = BitCast(dw, in);
   9498  const VW bits16 = BitCast(dw, bits);
   9499  const VW evens = And(vw, mask) >> And(bits16, mask);
   9500  // Shift odd lanes in-place
   9501  const VW odds = vw >> ShiftRight<8>(bits16);
   9502  return OddEven(BitCast(d, odds), BitCast(d, evens));
   9503 #endif
   9504 }
   9505 HWY_API Vec128<uint8_t, 1> operator>>(const Vec128<uint8_t, 1> in,
   9506                                      const Vec128<uint8_t, 1> bits) {
   9507 #if HWY_TARGET <= HWY_SSE4
   9508  const Vec16<uint16_t> in8{_mm_cvtepu8_epi16(in.raw)};
   9509  const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
   9510 #else
   9511  const Vec16<uint16_t> mask{_mm_set_epi64x(0, 0xFF)};
   9512  const Vec16<uint16_t> in8 = And(Vec16<uint16_t>{in.raw}, mask);
   9513  const Vec16<uint16_t> bits8 = And(Vec16<uint16_t>{bits.raw}, mask);
   9514 #endif
   9515  return Vec128<uint8_t, 1>{_mm_srl_epi16(in8.raw, bits8.raw)};
   9516 }
   9517 
   9518 template <size_t N>
   9519 HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
   9520                                       const Vec128<uint32_t, N> bits) {
   9521 #if HWY_TARGET >= HWY_SSE4
   9522  // 32x32 -> 64 bit mul, then shift right by 32.
   9523  const DFromV<decltype(in)> d32;
   9524  // Move odd lanes into position for the second mul. Shuffle more gracefully
   9525  // handles N=1 than repartitioning to u64 and shifting 32 bits right.
   9526  const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
   9527  // For bits=0, we cannot mul by 2^32, so fix the result later.
   9528  const auto mul = detail::Pow2(Set(d32, 32) - bits);
   9529  const auto out20 = ShiftRight<32>(MulEven(in, mul));  // z 2 z 0
   9530  const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
   9531  // No need to shift right, already in the correct position.
   9532  const auto out31 = BitCast(d32, MulEven(in31, mul31));  // 3 ? 1 ?
   9533  const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
   9534  // Replace output with input where bits == 0.
   9535  return IfThenElse(bits == Zero(d32), in, out);
   9536 #else
   9537  return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
   9538 #endif
   9539 }
   9540 
   9541 #if HWY_TARGET >= HWY_SSE4
   9542 HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
   9543                                       const Vec128<uint32_t, 1> bits) {
   9544 #if HWY_TARGET == HWY_SSE4
   9545  const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
   9546 #else
   9547  const auto bits32 =
   9548      Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits);
   9549 #endif
   9550  return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits32.raw)};
   9551 }
   9552 #endif
   9553 
   9554 HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
   9555                                    const Vec128<uint64_t> bits) {
   9556 #if HWY_TARGET >= HWY_SSE4
   9557  const DFromV<decltype(v)> d;
   9558  // Individual shifts and combine
   9559  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
   9560  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
   9561  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
   9562  return ConcatUpperLower(d, out1, out0);
   9563 #else
   9564  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
   9565 #endif
   9566 }
   9567 HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
   9568                                   const Vec64<uint64_t> bits) {
   9569  return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
   9570 }
   9571 
   9572 namespace detail {
   9573 
   9574 #if HWY_TARGET <= HWY_AVX3
   9575 template <class V>
   9576 HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
   9577  const DFromV<decltype(v)> d;
   9578  const Rebind<int16_t, decltype(d)> di16;
   9579  return DemoteTo(d, PromoteTo(di16, v) >> PromoteTo(di16, bits));
   9580 }
   9581 #elif HWY_TARGET <= HWY_AVX2  // AVX2
   9582 template <class V>
   9583 HWY_INLINE V AVX2ShrI16Vec128(V v, V bits) {
   9584  const DFromV<decltype(v)> d;
   9585  const Rebind<int32_t, decltype(d)> di32;
   9586  return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits));
   9587 }
   9588 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
   9589 HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
   9590  const DFromV<decltype(v)> d;
   9591  const Rebind<int32_t, decltype(d)> di32;
   9592  return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits));
   9593 }
   9594 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   9595 HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) {
   9596  const DFromV<decltype(v)> d;
   9597  const Half<decltype(d)> dh;
   9598  const Rebind<int16_t, decltype(d)> di16;
   9599  const Rebind<int32_t, decltype(dh)> dh_i32;
   9600 
   9601  const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >>
   9602                             PromoteTo(dh_i32, LowerHalf(dh, bits));
   9603  const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >>
   9604                             PromoteTo(dh_i32, UpperHalf(dh, bits));
   9605  const auto i16_shr_result =
   9606      OrderedDemote2To(di16, lo_shr_result, hi_shr_result);
   9607  return DemoteTo(d, i16_shr_result);
   9608 }
   9609 #endif
   9610 
   9611 #if HWY_TARGET > HWY_AVX3
   9612 // Also used in x86_256-inl.h.
   9613 template <class DI, class V>
   9614 HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
   9615  const RebindToUnsigned<DI> du;
   9616  const auto count = BitCast(du, count_i);  // same type as value to shift
   9617  // Clear sign and restore afterwards. This is preferable to shifting the MSB
   9618  // downwards because Shr is somewhat more expensive than Shl.
   9619  const auto sign = BroadcastSignBit(v);
   9620  const auto abs = BitCast(du, v ^ sign);  // off by one, but fixed below
   9621  return BitCast(di, abs >> count) ^ sign;
   9622 }
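
        // Identity used above: for negative v, sign is all-ones, so abs == ~v and
        // the logical shift satisfies (~v) >> count == ~(arithmetic v >> count),
        // because the zeros shifted in at the top are the complement of the sign
        // bits; the final Xor with sign undoes the complement. E.g. for i8,
        // v == -2 (0xFE): ~v == 1, 1 >> 1 == 0, 0 ^ 0xFF == 0xFF == -1 == -2 >> 1.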
   9623 #endif
   9624 
   9625 }  // namespace detail
   9626 
   9627 template <size_t N>
   9628 HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v,
   9629                                      Vec128<int16_t, N> bits) {
   9630 #if HWY_TARGET <= HWY_AVX3
   9631  return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
   9632 #elif HWY_TARGET <= HWY_AVX2
   9633  return detail::AVX2ShrI16Vec128(v, bits);
   9634 #else
   9635  const DFromV<decltype(v)> d;
   9636  return detail::SignedShr(d, v, bits);
   9637 #endif
   9638 }
   9639 
   9640 #if HWY_TARGET > HWY_AVX3
   9641 HWY_API Vec16<int16_t> operator>>(Vec16<int16_t> v, Vec16<int16_t> bits) {
   9642 #if HWY_TARGET <= HWY_SSE4
   9643  const Vec16<int16_t> bits16{_mm_cvtepu16_epi64(bits.raw)};
   9644 #else
   9645  const auto bits16 = And(bits, Vec16<int16_t>{_mm_set_epi64x(0, 0xFFFF)});
   9646 #endif
   9647  return Vec16<int16_t>{_mm_sra_epi16(v.raw, bits16.raw)};
   9648 }
   9649 #endif
   9650 
   9651 template <size_t N>
   9652 HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v,
   9653                                     Vec128<int8_t, N> bits) {
   9654 #if HWY_TARGET <= HWY_AVX2
   9655  return detail::AVX2ShrI8Vec128(v, bits);
   9656 #else
   9657  const DFromV<decltype(v)> d;
   9658  return detail::SignedShr(d, v, bits);
   9659 #endif
   9660 }
   9661 HWY_API Vec128<int8_t, 1> operator>>(Vec128<int8_t, 1> v,
   9662                                     Vec128<int8_t, 1> bits) {
   9663 #if HWY_TARGET <= HWY_SSE4
   9664  const Vec16<int16_t> vi16{_mm_cvtepi8_epi16(v.raw)};
   9665  const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)};
   9666 #else
   9667  const DFromV<decltype(v)> d;
   9668  const Rebind<int16_t, decltype(d)> di16;
   9669  const Twice<decltype(d)> dt;
   9670 
   9671  const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v)));
   9672  const Vec16<uint16_t> bits8 =
   9673      And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)});
   9674 #endif
   9675  return Vec128<int8_t, 1>{_mm_sra_epi16(vi16.raw, bits8.raw)};
   9676 }
   9677 
   9678 template <size_t N>
   9679 HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v,
   9680                                      Vec128<int32_t, N> bits) {
   9681 #if HWY_TARGET <= HWY_AVX2
   9682  return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
   9683 #else
   9684  const DFromV<decltype(v)> d;
   9685  return detail::SignedShr(d, v, bits);
   9686 #endif
   9687 }
   9688 
   9689 #if HWY_TARGET > HWY_AVX2
   9690 HWY_API Vec32<int32_t> operator>>(Vec32<int32_t> v, Vec32<int32_t> bits) {
   9691 #if HWY_TARGET == HWY_SSE4
   9692  const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)};
   9693 #else
   9694  const auto bits32 = Combine(Full64<int32_t>(), Zero(Full32<int32_t>()), bits);
   9695 #endif
   9696  return Vec32<int32_t>{_mm_sra_epi32(v.raw, bits32.raw)};
   9697 }
   9698 #endif
   9699 
   9700 template <size_t N>
   9701 HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v,
   9702                                      Vec128<int64_t, N> bits) {
   9703 #if HWY_TARGET <= HWY_AVX3
   9704  return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
   9705 #else
   9706  const DFromV<decltype(v)> d;
   9707  return detail::SignedShr(d, v, bits);
   9708 #endif
   9709 }
   9710 
   9711 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
   9712 
   9713 namespace detail {
   9714 
   9715 template <class V, HWY_IF_U64(TFromV<V>)>
   9716 static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
   9717  const DFromV<decltype(a)> du64;
   9718  const RepartitionToNarrow<decltype(du64)> du32;
   9719  const auto maskL = Set(du64, 0xFFFFFFFFULL);
   9720  const auto a32 = BitCast(du32, a);
   9721  const auto b32 = BitCast(du32, b);
   9722  // Inputs for MulEven: we only need the lower 32 bits
   9723  const auto aH = Shuffle2301(a32);
   9724  const auto bH = Shuffle2301(b32);
   9725 
   9726  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
   9727  // the even (lower 64 bits of every 128-bit block) results. See
   9728  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt
   9729  const auto aLbL = MulEven(a32, b32);
   9730  const auto w3 = aLbL & maskL;
   9731 
   9732  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
   9733  const auto w2 = t2 & maskL;
   9734  const auto w1 = ShiftRight<32>(t2);
   9735 
   9736  const auto t = MulEven(a32, bH) + w2;
   9737  const auto k = ShiftRight<32>(t);
   9738 
   9739  mulH = MulEven(aH, bH) + w1 + k;
   9740  return ShiftLeft<32>(t) + w3;
   9741 }
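
        // Decomposition used above, with a == aH*2^32 + aL and b == bH*2^32 + bL:
        //   a*b == aH*bH*2^64 + (aH*bL + aL*bH)*2^32 + aL*bL.
        // t2 and t accumulate the cross products plus the carries out of the low
        // 64 bits (w1 and k), so mulH receives the exact upper 64 bits and the
        // return value the lower 64 bits.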
   9742 
   9743 template <class V, HWY_IF_I64(TFromV<V>)>
   9744 static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) {
   9745  const DFromV<decltype(a)> di64;
   9746  const RebindToUnsigned<decltype(di64)> du64;
   9747  using VU64 = VFromD<decltype(du64)>;
   9748 
   9749  VU64 unsigned_mulH;
   9750  const auto mulL = BitCast(
   9751      di64, SSE2Mul128(BitCast(du64, a), BitCast(du64, b), unsigned_mulH));
   9752  mulH = BitCast(di64, unsigned_mulH) - And(BroadcastSignBit(a), b) -
   9753         And(a, BroadcastSignBit(b));
   9754  return mulL;
   9755 }
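
        // Signed correction: reinterpreting two's-complement a as unsigned adds
        // 2^64 when a < 0, so the unsigned high half overstates the signed one by
        // b when a < 0 and by a when b < 0 (mod 2^64). The two And terms with
        // BroadcastSignBit subtract exactly those amounts.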
   9756 
   9757 }  // namespace detail
   9758 
   9759 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
   9760 
   9761 template <class V, HWY_IF_UI64(TFromV<V>),
   9762          HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
   9763 HWY_API V MulEven(V a, V b) {
   9764  V mulH;
   9765  const V mulL = detail::SSE2Mul128(a, b, mulH);
   9766  return InterleaveLower(mulL, mulH);
   9767 }
   9768 
   9769 template <class V, HWY_IF_UI64(TFromV<V>),
   9770          HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))>
   9771 HWY_API V MulOdd(V a, V b) {
   9772  const DFromV<decltype(a)> du64;
   9773  V mulH;
   9774  const V mulL = detail::SSE2Mul128(a, b, mulH);
   9775  return InterleaveUpper(du64, mulL, mulH);
   9776 }
   9777 
   9778 #endif  // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
   9779 
   9780 template <class V, HWY_IF_UI64(TFromV<V>),
   9781          HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 8 : 0))>
   9782 HWY_API V MulHigh(V a, V b) {
   9783  V mulH;
   9784  detail::SSE2Mul128(a, b, mulH);
   9785  return mulH;
   9786 }
   9787 
   9788 #if HWY_ARCH_X86_64
   9789 
   9790 template <class T, HWY_IF_UI64(T)>
   9791 HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
   9792  const DFromV<decltype(a)> d;
   9793  alignas(16) T mul[2];
   9794  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
   9795  return Load(d, mul);
   9796 }
   9797 
   9798 template <class T, HWY_IF_UI64(T)>
   9799 HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
   9800  const DFromV<decltype(a)> d;
   9801  const Half<decltype(d)> d2;
   9802  alignas(16) T mul[2];
   9803  const T a1 = GetLane(UpperHalf(d2, a));
   9804  const T b1 = GetLane(UpperHalf(d2, b));
   9805  mul[0] = Mul128(a1, b1, &mul[1]);
   9806  return Load(d, mul);
   9807 }
   9808 
   9809 template <class T, HWY_IF_UI64(T)>
   9810 HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
   9811  T hi;
   9812  Mul128(GetLane(a), GetLane(b), &hi);
   9813  return Vec64<T>{_mm_cvtsi64_si128(static_cast<int64_t>(hi))};
   9814 }
   9815 
   9816 #endif  // HWY_ARCH_X86_64
   9817 
   9818 // ================================================== CONVERT (2)
   9819 
   9820 // ------------------------------ PromoteEvenTo/PromoteOddTo
   9821 
   9822 #if HWY_TARGET > HWY_AVX3
   9823 namespace detail {
   9824 
   9825 // I32->I64 PromoteEvenTo/PromoteOddTo
   9826 
   9827 template <class D, HWY_IF_LANES_D(D, 1)>
   9828 HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
   9829                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
   9830                                   hwy::SignedTag /*from_type_tag*/, D d_to,
   9831                                   Vec64<int32_t> v) {
   9832  return PromoteLowerTo(d_to, v);
   9833 }
   9834 
   9835 template <class D, HWY_IF_LANES_D(D, 2)>
   9836 HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
   9837                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
   9838                                   hwy::SignedTag /*from_type_tag*/, D d_to,
   9839                                   Vec128<int32_t> v) {
   9840  const Repartition<int32_t, D> d_from;
   9841  return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
   9842 }
   9843 
   9844 template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
   9845 HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
   9846                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
   9847                                  hwy::SignedTag /*from_type_tag*/, D d_to,
   9848                                  V v) {
   9849  const Repartition<int32_t, D> d_from;
   9850  return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
   9851 }
   9852 
   9853 }  // namespace detail
   9854 #endif
   9855 
   9856 // ------------------------------ PromoteEvenTo/PromoteOddTo
   9857 #include "hwy/ops/inside-inl.h"
   9858 
   9859 // ------------------------------ WidenMulPairwiseAdd (PromoteEvenTo)
   9860 
   9861 #if HWY_NATIVE_DOT_BF16
   9862 
   9863 template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
   9864          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
   9865 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
   9866  return VFromD<DF>{_mm_dpbf16_ps(Zero(df).raw,
   9867                                  reinterpret_cast<__m128bh>(a.raw),
   9868                                  reinterpret_cast<__m128bh>(b.raw))};
   9869 }
   9870 
   9871 #else
   9872 
   9873 // Generic for all vector lengths.
   9874 template <class DF, HWY_IF_F32_D(DF),
   9875          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
   9876 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
   9877  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
   9878                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
   9879 }
   9880 
   9881 #endif  // HWY_NATIVE_DOT_BF16
   9882 
   9883 // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
   9884 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
   9885          class V16 = VFromD<RepartitionToNarrow<D32>>>
   9886 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
   9887  return VFromD<D32>{_mm_madd_epi16(a.raw, b.raw)};
   9888 }
   9889 
   9890 // Generic for all vector lengths.
   9891 template <class DU32, HWY_IF_U32_D(DU32),
   9892          class VU16 = VFromD<RepartitionToNarrow<DU32>>>
   9893 HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) {
   9894  const auto p_lo = a * b;
   9895  const auto p_hi = MulHigh(a, b);
   9896 
   9897  const auto p_hi1_lo0 = BitCast(du32, OddEven(p_hi, p_lo));
   9898  const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)),
   9899                            ShiftRight<16>(BitCast(du32, p_lo)));
   9900  return Add(BitCast(du32, p_hi1_lo0), BitCast(du32, p_hi0_lo1));
   9901 }
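
        // Derivation for the u16 emulation above: per lane pair, the full
        // products are a0*b0 == hi0*2^16 + lo0 and a1*b1 == hi1*2^16 + lo1. The
        // two u32 vectors added are hi1*2^16 + lo0 and hi0*2^16 + lo1, whose sum
        // is (a0*b0 + a1*b1) mod 2^32, i.e. the widened pairwise add.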
   9902 
   9903 // ------------------------------ SatWidenMulPairwiseAdd
   9904 
   9905 #if HWY_TARGET <= HWY_SSSE3
   9906 
   9907 #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   9908 #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   9909 #else
   9910 #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   9911 #endif
   9912 
   9913 // Even if N=1, the input is always at least 2 lanes, hence _mm_maddubs_epi16
   9914 // is safe.
   9915 template <class DI16, HWY_IF_I16_D(DI16), HWY_IF_V_SIZE_LE_D(DI16, 16)>
   9916 HWY_API VFromD<DI16> SatWidenMulPairwiseAdd(
   9917    DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a,
   9918    VFromD<Repartition<int8_t, DI16>> b) {
   9919  return VFromD<DI16>{_mm_maddubs_epi16(a.raw, b.raw)};
   9920 }
   9921 
   9922 #endif
   9923 
   9924 // ------------------------------ SatWidenMulPairwiseAccumulate
   9925 
   9926 #if HWY_TARGET <= HWY_AVX3_DL
   9927 
   9928 #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
   9929 #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
   9930 #else
   9931 #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
   9932 #endif
   9933 
   9934 // Even if N=1, the I16 vectors have at least 2 lanes, hence _mm_dpwssds_epi32
   9935 // is safe.
   9936 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
   9937 HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
   9938    DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
   9939    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
   9940  return VFromD<DI32>{_mm_dpwssds_epi32(sum.raw, a.raw, b.raw)};
   9941 }
   9942 
   9943 #endif  // HWY_TARGET <= HWY_AVX3_DL
   9944 
   9945 // ------------------------------ ReorderWidenMulAccumulate (PromoteEvenTo)
   9946 
   9947 #if HWY_NATIVE_DOT_BF16
   9948 
   9949 #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   9950 #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   9951 #else
   9952 #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   9953 #endif
   9954 
   9955 template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
   9956          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
   9957 HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
   9958                                             const VFromD<DF> sum0,
   9959                                             VFromD<DF>& /*sum1*/) {
   9960  return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw),
   9961                                  reinterpret_cast<__m128bh>(b.raw))};
   9962 }
   9963 
   9964 #endif  // HWY_NATIVE_DOT_BF16
   9965 
   9966 // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
   9967 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
   9968          class V16 = VFromD<RepartitionToNarrow<D32>>>
   9969 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b,
   9970                                              const VFromD<D32> sum0,
   9971                                              VFromD<D32>& /*sum1*/) {
   9972  (void)d;
   9973 #if HWY_TARGET <= HWY_AVX3_DL
   9974  return VFromD<D32>{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)};
   9975 #else
   9976  return sum0 + WidenMulPairwiseAdd(d, a, b);
   9977 #endif
   9978 }
   9979 
   9980 template <class DU32, HWY_IF_U32_D(DU32),
   9981          class VU16 = VFromD<RepartitionToNarrow<DU32>>>
   9982 HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b,
   9983                                               const VFromD<DU32> sum0,
   9984                                               VFromD<DU32>& /*sum1*/) {
   9985  (void)d;
   9986  return sum0 + WidenMulPairwiseAdd(d, a, b);
   9987 }
   9988 
   9989 // ------------------------------ RearrangeToOddPlusEven
   9990 template <size_t N>
   9991 HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0,
   9992                                                  Vec128<int32_t, N> /*sum1*/) {
   9993  return sum0;  // invariant already holds
   9994 }
   9995 
   9996 template <size_t N>
   9997 HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
   9998    const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
   9999  return sum0;  // invariant already holds
  10000 }
  10001 
  10002 template <class VW>
  10003 HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  10004  return Add(sum0, sum1);
  10005 }
  10006 
  10007 // ------------------------------ SumOfMulQuadAccumulate
  10008 #if HWY_TARGET <= HWY_AVX3_DL
  10009 
  10010 #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
  10011 #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
  10012 #else
  10013 #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
  10014 #endif
  10015 
  10016 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
  10017 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
  10018    DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
  10019    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  10020  return VFromD<DI32>{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)};
  10021 }
  10022 
  10023 #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
  10024 #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
  10025 #else
  10026 #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
  10027 #endif
  10028 
  10029 #if HWY_X86_HAVE_AVX10_2_OPS
  10030 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
  10031 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/,
  10032                                            VFromD<Repartition<int8_t, DI32>> a,
  10033                                            VFromD<Repartition<int8_t, DI32>> b,
  10034                                            VFromD<DI32> sum) {
  10035  return VFromD<DI32>{_mm_dpbssd_epi32(sum.raw, a.raw, b.raw)};
  10036 }
  10037 #else   // !HWY_X86_HAVE_AVX10_2_OPS
  10038 template <class DI32, HWY_IF_I32_D(DI32)>
  10039 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
  10040                                            VFromD<Repartition<int8_t, DI32>> a,
  10041                                            VFromD<Repartition<int8_t, DI32>> b,
  10042                                            VFromD<DI32> sum) {
  10043  const Repartition<uint8_t, decltype(di32)> du8;
  10044 
  10045  const auto a_u = BitCast(du8, a);
  10046  const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum);
  10047  const auto result_sum_1 = ShiftLeft<8>(
  10048      SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32)));
  10049  return result_sum_0 - result_sum_1;
  10050 }
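
        // Emulation identity: for an i8 lane a with unsigned bit pattern a_u,
        // a == a_u - 256*(a_u >> 7), so per group of four lanes
        //   sum(a*b) == sum(a_u*b) - 256*sum((a_u >> 7)*b).
        // result_sum_0 is the first term; result_sum_1 is the second, with
        // ShiftLeft<8> performing the multiplication by 256.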
  10051 #endif  // HWY_X86_HAVE_AVX10_2_OPS
  10052 
  10053 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  10054 #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  10055 #else
  10056 #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  10057 #endif
  10058 
  10059 #if HWY_X86_HAVE_AVX10_2_OPS
  10060 template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16)>
  10061 HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
  10062    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
  10063    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  10064  return VFromD<DU32>{_mm_dpbuud_epi32(sum.raw, a.raw, b.raw)};
  10065 }
  10066 #else   // !HWY_X86_HAVE_AVX10_2_OPS
  10067 template <class DU32, HWY_IF_U32_D(DU32)>
  10068 HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
  10069    DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
  10070    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  10071  const Repartition<uint8_t, decltype(du32)> du8;
  10072  const RebindToSigned<decltype(du8)> di8;
  10073  const RebindToSigned<decltype(du32)> di32;
  10074 
  10075  const auto b_i = BitCast(di8, b);
  10076  const auto result_sum_0 =
  10077      SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum));
  10078  const auto result_sum_1 = ShiftLeft<8>(
  10079      SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32)));
  10080 
  10081  return BitCast(du32, result_sum_0 - result_sum_1);
  10082 }
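
        // Emulation identity: for a u8 lane b with signed reinterpretation b_i,
        // b == b_i + 256*(b >> 7). BroadcastSignBit(b_i) is 0 or -1 per lane, so
        // the inner dot product computes -sum(a*(b >> 7)); subtracting its
        // ShiftLeft<8> adds back the missing 256*sum(a*(b >> 7)).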
  10083 #endif  // HWY_X86_HAVE_AVX10_2_OPS
  10084 
  10085 #endif  // HWY_TARGET <= HWY_AVX3_DL
  10086 
  10087 // ------------------------------ Demotions (full -> part w/ narrow lanes)
  10088 
  10089 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
  10090 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  10091  return VFromD<D>{_mm_packs_epi32(v.raw, v.raw)};
  10092 }
  10093 
  10094 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  10095 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  10096 #if HWY_TARGET >= HWY_SSSE3
  10097  const Rebind<int32_t, D> di32;
  10098  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
  10099  const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
  10100  const auto clamped = Or(zero_if_neg, too_big);
  10101 #if HWY_TARGET == HWY_SSE2
  10102  const Rebind<uint16_t, decltype(di32)> du16;
  10103  const RebindToSigned<decltype(du16)> di16;
  10104  return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
  10105 #else
  10106  const Repartition<uint16_t, decltype(di32)> du16;
  10107  // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
  10108  alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
  10109      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
  10110  const auto lo2 = Load(du16, kLower2Bytes);
  10111  return VFromD<D>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
  10112 #endif
  10113 #else
  10114  return VFromD<D>{_mm_packus_epi32(v.raw, v.raw)};
  10115 #endif
  10116 }
  10117 
  10118 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  10119 HWY_API VFromD<D> DemoteTo(D du16, VFromD<Rebind<uint32_t, D>> v) {
  10120  const DFromV<decltype(v)> du32;
  10121  const RebindToSigned<decltype(du32)> di32;
  10122 #if HWY_TARGET >= HWY_SSSE3
  10123  const auto too_big =
  10124      VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32)));
  10125  const auto clamped = Or(BitCast(di32, v), too_big);
  10126 #if HWY_TARGET == HWY_SSE2
  10127  const RebindToSigned<decltype(du16)> di16;
  10128  return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped))));
  10129 #else
  10130  (void)du16;
  10131  const Repartition<uint16_t, decltype(di32)> du16_full;
  10132  // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
  10133  alignas(16) static constexpr uint16_t kLower2Bytes[16] = {
  10134      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
  10135  const auto lo2 = Load(du16_full, kLower2Bytes);
  10136  return VFromD<D>{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw};
  10137 #endif
  10138 #else
  10139  return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF))));
  10140 #endif
  10141 }
  10142 
  10143 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  10144 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  10145  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  10146  return VFromD<D>{_mm_packus_epi16(i16, i16)};
  10147 }
  10148 
  10149 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
  10150 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  10151  return VFromD<D>{_mm_packus_epi16(v.raw, v.raw)};
  10152 }
  10153 
  10154 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
  10155 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  10156  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
  10157  return VFromD<D>{_mm_packs_epi16(i16, i16)};
  10158 }
  10159 
  10160 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
  10161 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  10162  return VFromD<D>{_mm_packs_epi16(v.raw, v.raw)};
  10163 }
  10164 
  10165 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  10166 HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint32_t, D>> v) {
  10167 #if HWY_TARGET <= HWY_AVX3
  10168  // NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned
  10169  // integers to 8-bit unsigned integers
  10170  (void)du8;
  10171  return VFromD<D>{_mm_cvtusepi32_epi8(v.raw)};
  10172 #else
  10173  const DFromV<decltype(v)> du32;
  10174  const RebindToSigned<decltype(du32)> di32;
  10175  const auto max_i32 = Set(du32, 0x7FFFFFFFu);
  10176 
  10177 #if HWY_TARGET >= HWY_SSSE3
  10178  // On SSE2/SSSE3, clamp u32 values to the i32 range using the u8 Min
  10179  // operation, which SSE2/SSSE3 can do in a single instruction.
  10180 
  10181  // The u8 Min operation below leaves the lower 24 bits of each 32-bit
  10182  // lane unchanged.
  10183 
  10184  // The u8 Min operation below will leave any values that are less than or
  10185  // equal to 0x7FFFFFFF unchanged.
  10186 
  10187  // For values that are greater than or equal to 0x80000000, the u8 Min
  10188  // operation below will force the upper 8 bits to 0x7F and leave the lower
  10189  // 24 bits unchanged.
  10190 
  10191  // A u8 Min operation is okay here: any value that is greater than or equal
  10192  // to 0x80000000 will be clamped to a value between 0x7F000000 and
  10193  // 0x7FFFFFFF, which the i32->u8 demotion then saturates to 0xFF. For
  10194  // example, 0x80000001 becomes 0x7F000001, which demotes to 0xFF.
  10195  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  10196  const auto clamped = BitCast(
  10197      di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32)));
  10198 #else
  10199  const auto clamped = BitCast(di32, Min(v, max_i32));
  10200 #endif
  10201 
  10202  return DemoteTo(du8, clamped);
  10203 #endif
  10204 }
  10205 
  10206 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
  10207 HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) {
  10208  const DFromV<decltype(v)> du16;
  10209  const RebindToSigned<decltype(du16)> di16;
  10210  const auto max_i16 = Set(du16, 0x7FFF);
  10211 
  10212 #if HWY_TARGET >= HWY_SSSE3
  10213  // On SSE2/SSSE3, clamp u16 values to the i16 range using the u8 Min
  10214  // operation, which SSE2/SSSE3 can do in a single instruction.
  10215 
  10216  // The u8 Min operation below leaves the lower 8 bits of each 16-bit
  10217  // lane unchanged.
  10218 
  10219  // The u8 Min operation below will leave any values that are less than or
  10220  // equal to 0x7FFF unchanged.
  10221 
  10222  // For values that are greater than or equal to 0x8000, the u8 Min
  10223  // operation below will force the upper 8 bits to 0x7F and leave the lower
  10224  // 8 bits unchanged.
  10225 
  10226  // A u8 Min operation is okay here: any value that is greater than or equal
  10227  // to 0x8000 will be clamped to a value between 0x7F00 and 0x7FFF, which
  10228  // the i16->u8 demotion then saturates to 0xFF. For example, 0x8001
  10229  // becomes 0x7F01, which demotes to 0xFF.
  10230  const Repartition<uint8_t, decltype(du16)> du16_as_du8;
  10231  const auto clamped = BitCast(
  10232      di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16)));
  10233 #else
  10234  const auto clamped = BitCast(di16, Min(v, max_i16));
  10235 #endif
  10236 
  10237  return DemoteTo(du8, clamped);
  10238 }
  10239 
  10240 #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C)
  10241 
  10242 // HWY_NATIVE_F16C was already toggled above.
  10243 
  10244 // Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate).
  10245 // clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain.
  10246 HWY_DIAGNOSTICS(push)
  10247 HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
  10248 
  10249 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
  10250 HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  10251  const RebindToUnsigned<decltype(df16)> du16;
  10252  return BitCast(
  10253      df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
  10254 }
  10255 
  10256 HWY_DIAGNOSTICS(pop)
  10257 
  10258 #endif  // F16C
  10259 
  10260 #if HWY_HAVE_FLOAT16
  10261 
  10262 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
  10263 #undef HWY_NATIVE_DEMOTE_F64_TO_F16
  10264 #else
  10265 #define HWY_NATIVE_DEMOTE_F64_TO_F16
  10266 #endif
  10267 
  10268 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
  10269 HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
  10270  return VFromD<D>{_mm_cvtpd_ph(v.raw)};
  10271 }
  10272 
  10273 #endif  // HWY_HAVE_FLOAT16
  10274 
  10275 // The _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics require GCC 9 or
  10276 // later, or Clang 10 or later.
  10277 
  10278 // We also need GCC or Clang to bit-cast the __m128bh, __m256bh, or __m512bh
  10279 // vector returned by the _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics to
  10280 // a __m128i, __m256i, or __m512i, as there are currently no intrinsics
  10281 // available (as of GCC 13 and Clang 17) to bit-cast a __m128bh, __m256bh, or
  10282 // __m512bh vector to a __m128i, __m256i, or __m512i vector.
  10283 
  10284 #if HWY_AVX3_HAVE_F32_TO_BF16C
  10285 #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
  10286 #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
  10287 #else
  10288 #define HWY_NATIVE_DEMOTE_F32_TO_BF16
  10289 #endif
  10290 
  10291 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
  10292 HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
  10293 #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
  10294  // Inline assembly workaround for LLVM codegen bug
  10295  __m128i raw_result;
  10296  __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw));
  10297  return VFromD<D>{raw_result};
  10298 #else
  10299  // The _mm_cvtneps_pbh intrinsic returns a __m128bh vector that needs to
  10300  // be bit-cast to a __m128i vector
  10301  return VFromD<D>{detail::BitCastToInteger(_mm_cvtneps_pbh(v.raw))};
  10302 #endif
  10303 }
  10304 
  10305 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
  10306 HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec128<float> a,
  10307                                   Vec128<float> b) {
  10308 #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000
  10309  // Inline assembly workaround for LLVM codegen bug
  10310  __m128i raw_result;
  10311  __asm__("vcvtne2ps2bf16 %2, %1, %0"
  10312          : "=v"(raw_result)
  10313          : "v"(b.raw), "v"(a.raw));
  10314  return VFromD<D>{raw_result};
  10315 #else
  10316  // The _mm_cvtne2ps_pbh intrinsic returns a __m128bh vector that needs to
  10317  // be bit-cast to a __m128i vector
  10318  return VFromD<D>{detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw))};
  10319 #endif
  10320 }
  10321 
  10322 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
  10323 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<float> a,
  10324                                   Vec64<float> b) {
  10325  return VFromD<D>{_mm_shuffle_epi32(
  10326      detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw)),
  10327      _MM_SHUFFLE(2, 0, 2, 0))};
  10328 }
  10329 
  10330 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
  10331 HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec32<float> a, Vec32<float> b) {
  10332  const DFromV<decltype(a)> d;
  10333  const Twice<decltype(d)> dt;
  10334  return DemoteTo(dbf16, Combine(dt, b, a));
  10335 }
  10336 #endif  // HWY_AVX3_HAVE_F32_TO_BF16C
  10337 
  10338 // Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
  10339 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
  10340 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
  10341  const DFromV<decltype(a)> d;
  10342  const Twice<decltype(d)> dt;
  10343  return DemoteTo(dn, Combine(dt, b, a));
  10344 }
  10345 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
  10346 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int32_t> a,
  10347                                   Vec64<int32_t> b) {
  10348  return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw),
  10349                                     _MM_SHUFFLE(2, 0, 2, 0))};
  10350 }
  10351 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
  10352 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
  10353                                   Vec128<int32_t> b) {
  10354  return VFromD<D>{_mm_packs_epi32(a.raw, b.raw)};
  10355 }
  10356 
  10357 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
  10358 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) {
  10359  const DFromV<decltype(a)> d;
  10360  const Twice<decltype(d)> dt;
  10361  return DemoteTo(dn, Combine(dt, b, a));
  10362 }
  10363 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
  10364 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec64<int32_t> a, Vec64<int32_t> b) {
  10365 #if HWY_TARGET >= HWY_SSSE3
  10366  const DFromV<decltype(a)> d;
  10367  const Twice<decltype(d)> dt;
  10368  return DemoteTo(dn, Combine(dt, b, a));
  10369 #else
  10370  (void)dn;
  10371  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw),
  10372                                     _MM_SHUFFLE(2, 0, 2, 0))};
  10373 #endif
  10374 }
  10375 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
  10376 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<int32_t> a, Vec128<int32_t> b) {
  10377 #if HWY_TARGET >= HWY_SSSE3
  10378  const Half<decltype(dn)> dnh;
  10379  const auto u16_a = DemoteTo(dnh, a);
  10380  const auto u16_b = DemoteTo(dnh, b);
  10381  return Combine(dn, u16_b, u16_a);
  10382 #else
  10383  (void)dn;
  10384  return VFromD<D>{_mm_packus_epi32(a.raw, b.raw)};
  10385 #endif
  10386 }
  10387 
  10388 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
  10389 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a,
  10390                                   Vec128<uint32_t> b) {
  10391  const DFromV<decltype(a)> du32;
  10392  const RebindToSigned<decltype(du32)> di32;
  10393  const auto max_i32 = Set(du32, 0x7FFFFFFFu);
  10394 
  10395 #if HWY_TARGET >= HWY_SSSE3
  10396  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  10397  // On SSE2/SSSE3, clamp a and b using u8 Min operation
  10398  const auto clamped_a = BitCast(
  10399      di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32)));
  10400  const auto clamped_b = BitCast(
  10401      di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32)));
  10402 #else
  10403  const auto clamped_a = BitCast(di32, Min(a, max_i32));
  10404  const auto clamped_b = BitCast(di32, Min(b, max_i32));
  10405 #endif
  10406 
  10407  return ReorderDemote2To(dn, clamped_a, clamped_b);
  10408 }
  10409 
  10410 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  10411 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a,
  10412                                   VFromD<Repartition<uint32_t, D>> b) {
  10413  const DFromV<decltype(a)> d;
  10414  const Twice<decltype(d)> dt;
  10415  return DemoteTo(dn, Combine(dt, b, a));
  10416 }
  10417 
  10418 // Specializations for partial vectors because packs_epi16 sets lanes above 2*N.
  10419 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
  10420 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
  10421                                   VFromD<Repartition<int16_t, D>> b) {
  10422  const DFromV<decltype(a)> d;
  10423  const Twice<decltype(d)> dt;
  10424  return DemoteTo(dn, Combine(dt, b, a));
  10425 }
  10426 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
  10427 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a,
  10428                                   Vec64<int16_t> b) {
  10429  return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw),
  10430                                     _MM_SHUFFLE(2, 0, 2, 0))};
  10431 }
  10432 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
  10433 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
  10434                                   Vec128<int16_t> b) {
  10435  return VFromD<D>{_mm_packs_epi16(a.raw, b.raw)};
  10436 }
  10437 
  10438 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  10439 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a,
  10440                                   VFromD<Repartition<int16_t, D>> b) {
  10441  const DFromV<decltype(a)> d;
  10442  const Twice<decltype(d)> dt;
  10443  return DemoteTo(dn, Combine(dt, b, a));
  10444 }
  10445 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
  10446 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a,
  10447                                   Vec64<int16_t> b) {
  10448  return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw),
  10449                                     _MM_SHUFFLE(2, 0, 2, 0))};
  10450 }
  10451 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
  10452 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
  10453                                   Vec128<int16_t> b) {
  10454  return VFromD<D>{_mm_packus_epi16(a.raw, b.raw)};
  10455 }
  10456 
  10457 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
  10458 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a,
  10459                                   Vec128<uint16_t> b) {
  10460  const DFromV<decltype(a)> du16;
  10461  const RebindToSigned<decltype(du16)> di16;
  10462  const auto max_i16 = Set(du16, 0x7FFFu);
  10463 
  10464 #if HWY_TARGET >= HWY_SSSE3
  10465  const Repartition<uint8_t, decltype(du16)> du16_as_du8;
  10466  // On SSE2/SSSE3, clamp a and b using the u8 Min operation
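         // Same trick as in the u32->u16 overload above: the byte-wise Min is
         // only an approximate clamp, but out-of-range lanes (>= 0x8000)
         // become at least 0x7F00 and still saturate to 0xFF below.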
  10467  const auto clamped_a = BitCast(
  10468      di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16)));
  10469  const auto clamped_b = BitCast(
  10470      di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16)));
  10471 #else
  10472  const auto clamped_a = BitCast(di16, Min(a, max_i16));
  10473  const auto clamped_b = BitCast(di16, Min(b, max_i16));
  10474 #endif
  10475 
  10476  return ReorderDemote2To(dn, clamped_a, clamped_b);
  10477 }
  10478 
  10479 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
  10480 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a,
  10481                                   VFromD<Repartition<uint16_t, D>> b) {
  10482  const DFromV<decltype(a)> d;
  10483  const Twice<decltype(d)> dt;
  10484  return DemoteTo(dn, Combine(dt, b, a));
  10485 }
  10486 
  10487 template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
  10488          HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
  10489          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
  10490          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
  10491 HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  10492  return ReorderDemote2To(d, a, b);
  10493 }
  10494 
  10495 #if HWY_AVX3_HAVE_F32_TO_BF16C
  10496 // F32 to BF16 OrderedDemote2To is generic for all vector lengths on targets
  10497 // that support AVX512BF16
  10498 template <class D, HWY_IF_BF16_D(D)>
  10499 HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
  10500                                   VFromD<Repartition<float, D>> b) {
  10501  return ReorderDemote2To(dbf16, a, b);
  10502 }
  10503 #endif  // HWY_AVX3_HAVE_F32_TO_BF16C
  10504 
  10505 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  10506 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
  10507  return VFromD<D>{_mm_cvtpd_ps(v.raw)};
  10508 }
  10509 
  10510 namespace detail {
  10511 
  10512 // Generic for all vector lengths.
  10513 template <class D>
  10514 HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) {
  10515  // The max can be exactly represented in binary64, so clamping beforehand
  10516  // prevents the x86 conversion from raising an exception and returning 0x80..00.
  10517  return Min(v, Set(d, 2147483647.0));
  10518 }
  10519 
  10520 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  10521 template <class TTo, class TF>
  10522 static constexpr HWY_INLINE TTo
  10523 X86ConvertScalarFromFloat(hwy::FloatTag /* to_type_tag */, TF from_val) {
  10524  return ConvertScalarTo<TTo>(from_val);
  10525 }
  10526 
  10527 template <class TTo, class TF>
  10528 static HWY_BITCASTSCALAR_CONSTEXPR HWY_INLINE TTo
  10529 X86ConvertScalarFromFloat(hwy::SpecialTag /* to_type_tag */, TF from_val) {
  10530  return ConvertScalarTo<TTo>(from_val);
  10531 }
  10532 
  10533 template <class TTo, class TF>
  10534 static HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_INLINE TTo
  10535 X86ConvertScalarFromFloat(hwy::SignedTag /* to_type_tag */, TF from_val) {
  10536 #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
  10537  using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
  10538                     RemoveCvRef<TF>>;
  10539 #else
  10540  using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
  10541 #endif
  10542 
  10543  const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val);
  10544  constexpr TTo kMinResultVal = LimitsMin<TTo>();
  10545  HWY_BITCASTSCALAR_CONSTEXPR const TFArith kMinOutOfRangePosVal =
  10546      ScalarAbs(ConvertScalarTo<TFArith>(kMinResultVal));
  10547 
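         // Out-of-range magnitudes and NaN (for which the comparison below is
         // false) yield LimitsMin<TTo>(), matching the 0x80..00 "integer
         // indefinite" result of the x86 signed truncating conversions.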
  10548  return (ScalarAbs(from_val_in_arith_type) < kMinOutOfRangePosVal)
  10549             ? ConvertScalarTo<TTo>(from_val_in_arith_type)
  10550             : kMinResultVal;
  10551 }
  10552 
  10553 template <class TTo, class TF>
  10554 static HWY_CXX14_CONSTEXPR HWY_INLINE TTo
  10555 X86ConvertScalarFromFloat(hwy::UnsignedTag /* to_type_tag */, TF from_val) {
  10556 #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
  10557  using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float,
  10558                     RemoveCvRef<TF>>;
  10559 #else
  10560  using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
  10561 #endif
  10562 
  10563  const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val);
  10564  constexpr TTo kTToMsb = static_cast<TTo>(TTo{1} << (sizeof(TTo) * 8 - 1));
  10565  constexpr const TFArith kNegOne = ConvertScalarTo<TFArith>(-1.0);
  10566  constexpr const TFArith kMinOutOfRangePosVal =
  10567      ConvertScalarTo<TFArith>(static_cast<double>(kTToMsb) * 2.0);
  10568 
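         // Inputs in (-1, 0) truncate to zero; NaN and values outside
         // (-1, 2^N) fail the comparisons below and yield LimitsMax<TTo>(),
         // matching the all-ones result of the x86 unsigned truncating
         // conversions.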
  10569  return (from_val_in_arith_type > kNegOne &&
  10570          from_val_in_arith_type < kMinOutOfRangePosVal)
  10571             ? ConvertScalarTo<TTo>(from_val_in_arith_type)
  10572             : LimitsMax<TTo>();
  10573 }
  10574 
  10575 template <class TTo, class TF>
  10576 static constexpr HWY_INLINE HWY_MAYBE_UNUSED TTo
  10577 X86ConvertScalarFromFloat(TF from_val) {
  10578  return X86ConvertScalarFromFloat<TTo>(hwy::TypeTag<RemoveCvRef<TTo>>(),
  10579                                        from_val);
  10580 }
  10581 
  10582 #endif  // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  10583 
  10584 }  // namespace detail
  10585 
  10586 #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
  10587 #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
  10588 #else
  10589 #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
  10590 #endif
  10591 
  10592 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  10593 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
  10594 #if HWY_X86_HAVE_AVX10_2_OPS
  10595  return VFromD<D>{_mm_cvtts_pd_epi32(v.raw)};
  10596 #elif HWY_COMPILER_GCC_ACTUAL
  10597  // Workaround for undefined behavior in _mm_cvttpd_epi32 with GCC if any
  10598  // values of v[i] are not within the range of an int32_t
  10599 
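         // The constant-operand branch below still permits compile-time
         // folding (via the scalar helpers above, which define the
         // out-of-range cases); for runtime values, the inline asm hides the
         // conversion from the optimizer so the UB cannot be exploited.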
  10600 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  10601  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
  10602    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  10603    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  10604    return Dup128VecFromValues(
  10605        D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
  10606        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]), int32_t{0},
  10607        int32_t{0});
  10608  }
  10609 #endif
  10610 
  10611  __m128i raw_result;
  10612  __asm__("%vcvttpd2dq {%1, %0|%0, %1}"
  10613          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  10614          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  10615          :);
  10616  return VFromD<D>{raw_result};
  10617 #else  // !HWY_COMPILER_GCC_ACTUAL
  10618  return VFromD<D>{_mm_cvttpd_epi32(v.raw)};
  10619 #endif
  10620 }
  10621 
  10622 // F64 to I32 DemoteTo is generic for all vector lengths
  10623 template <class D, HWY_IF_I32_D(D)>
  10624 HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) {
  10625  const Rebind<double, decltype(di32)> df64;
  10626  const VFromD<decltype(df64)> clamped = detail::ClampF64ToI32Max(df64, v);
  10627  return DemoteInRangeTo(di32, clamped);
  10628 }
  10629 
  10630 #if HWY_TARGET <= HWY_AVX3
  10631 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
  10632 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) {
  10633 #if HWY_X86_HAVE_AVX10_2_OPS
  10634  return VFromD<D>{_mm_cvtts_pd_epu32(v.raw)};
  10635 #elif HWY_COMPILER_GCC_ACTUAL
  10636  // Workaround for undefined behavior in _mm_cvttpd_epu32 with GCC if any
  10637  // values of v[i] are not within the range of a uint32_t
  10638 
  10639 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  10640  if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
  10641    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  10642    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  10643    return Dup128VecFromValues(
  10644        D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
  10645        detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]), uint32_t{0},
  10646        uint32_t{0});
  10647  }
  10648 #endif
  10649 
  10650  __m128i raw_result;
  10651  __asm__("vcvttpd2udq {%1, %0|%0, %1}"
  10652          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  10653          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  10654          :);
  10655  return VFromD<D>{raw_result};
  10656 #else
  10657  return VFromD<D>{_mm_cvttpd_epu32(v.raw)};
  10658 #endif
  10659 }
  10660 
  10661 // F64->U32 DemoteTo is generic for all vector lengths
  10662 template <class D, HWY_IF_U32_D(D)>
  10663 HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
  10664 #if HWY_X86_HAVE_AVX10_2_OPS
  10665  return DemoteInRangeTo(du32, v);
  10666 #else
  10667  return DemoteInRangeTo(du32, ZeroIfNegative(v));
  10668 #endif
  10669 }
  10670 #else   // HWY_TARGET > HWY_AVX3
  10671 
  10672 // F64 to U32 DemoteInRangeTo is generic for all vector lengths on
  10673 // SSE2/SSSE3/SSE4/AVX2
  10674 template <class D, HWY_IF_U32_D(D)>
  10675 HWY_API VFromD<D> DemoteInRangeTo(D du32, VFromD<Rebind<double, D>> v) {
  10676  const RebindToSigned<decltype(du32)> di32;
  10677  const Rebind<double, decltype(du32)> df64;
  10678  const RebindToUnsigned<decltype(df64)> du64;
  10679 
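         // Split each value into its low 31 bits and the 2^31 bit: lanes
         // >= 2^31 are reduced by 2^31 so the signed conversion is in range,
         // then the subtracted bit is OR'ed back in as the MSB. E.g.
         // 3000000000.0 converts as 3000000000 - 2^31 = 852516352, and
         // OR-ing 0x80000000 restores 3000000000.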
  10680  const auto k2_31 = Set(df64, 2147483648.0);
  10681  const auto v_is_ge_k2_31 = (v >= k2_31);
  10682  const auto clamped_lo31_f64 = v - IfThenElseZero(v_is_ge_k2_31, k2_31);
  10683  const auto clamped_lo31_u32 =
  10684      BitCast(du32, DemoteInRangeTo(di32, clamped_lo31_f64));
  10685  const auto clamped_u32_msb = ShiftLeft<31>(
  10686      TruncateTo(du32, BitCast(du64, VecFromMask(df64, v_is_ge_k2_31))));
  10687  return Or(clamped_lo31_u32, clamped_u32_msb);
  10688 }
  10689 
  10690 // F64 to U32 DemoteTo is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  10691 template <class D, HWY_IF_U32_D(D)>
  10692 HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
  10693  const Rebind<double, decltype(du32)> df64;
  10694  const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0));
  10695  return DemoteInRangeTo(du32, clamped);
  10696 }
  10697 #endif  // HWY_TARGET <= HWY_AVX3
  10698 
  10699 #if HWY_TARGET <= HWY_AVX3
  10700 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  10701 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  10702  return VFromD<D>{_mm_cvtepi64_ps(v.raw)};
  10703 }
  10704 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
  10705 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
  10706  return VFromD<D>{_mm_cvtepu64_ps(v.raw)};
  10707 }
  10708 #else
  10709 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  10710 template <class D, HWY_IF_F32_D(D)>
  10711 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
  10712  const Rebind<double, decltype(df32)> df64;
  10713  const RebindToUnsigned<decltype(df64)> du64;
  10714  const RebindToSigned<decltype(df32)> di32;
  10715  const RebindToUnsigned<decltype(df32)> du32;
  10716 
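         // Split the i64 into its top 52 bits (exactly representable in f64)
         // and low 12 bits. f64_carry recovers the rounding error of the sum
         // (a TwoSum step), and the bit twiddling below effectively rounds
         // the f64 sum to odd so that the final f64->f32 demotion cannot
         // double-round.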
  10717  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  10718  const auto f64_hi52 =
  10719      Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
  10720  const auto f64_lo12 =
  10721      PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
  10722                                        Set(du32, uint32_t{0x00000FFF}))));
  10723 
  10724  const auto f64_sum = f64_hi52 + f64_lo12;
  10725  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  10726 
  10727  const auto f64_sum_is_inexact =
  10728      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
  10729  const auto f64_bits_decrement =
  10730      And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
  10731          f64_sum_is_inexact);
  10732 
  10733  const auto adj_f64_val = BitCast(
  10734      df64,
  10735      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
  10736 
  10737  return DemoteTo(df32, adj_f64_val);
  10738 }
  10739 
  10740 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  10741 template <class D, HWY_IF_F32_D(D)>
  10742 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
  10743  const Rebind<double, decltype(df32)> df64;
  10744  const RebindToUnsigned<decltype(df64)> du64;
  10745  const RebindToSigned<decltype(df32)> di32;
  10746  const RebindToUnsigned<decltype(df32)> du32;
  10747 
  10748  const auto k2p64 = Set(df64, 18446744073709551616.0);
  10749  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  10750  const auto f64_lo12 =
  10751      PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
  10752                                        Set(du32, uint32_t{0x00000FFF}))));
  10753 
  10754  const auto f64_sum = f64_hi52 + f64_lo12;
  10755  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  10756  const auto f64_sum_is_inexact =
  10757      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
  10758 
  10759  const auto adj_f64_val = BitCast(
  10760      df64,
  10761      Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
  10762         f64_sum_is_inexact));
  10763 
  10764  return DemoteTo(df32, adj_f64_val);
  10765 }
  10766 #endif
  10767 
  10768 // For already range-limited input [0, 255].
  10769 template <size_t N>
  10770 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  10771 #if HWY_TARGET == HWY_SSE2
  10772  const RebindToSigned<DFromV<decltype(v)>> di32;
  10773  const Rebind<uint8_t, decltype(di32)> du8;
  10774  return DemoteTo(du8, BitCast(di32, v));
  10775 #else
  10776  const DFromV<decltype(v)> d32;
  10777  const Repartition<uint8_t, decltype(d32)> d8;
  10778  alignas(16) static constexpr uint32_t k8From32[4] = {
  10779      0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
  10780  // Also replicate bytes into all 32-bit lanes for safety.
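         // 0x0C080400 is the little-endian byte-index sequence {0, 4, 8, 12},
         // i.e. the low byte of each u32 lane, so every 32-bit quarter of
         // `quad` holds the packed result.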
  10781  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
  10782  return LowerHalf(LowerHalf(BitCast(d8, quad)));
  10783 #endif
  10784 }
  10785 
  10786 // ------------------------------ F32->UI64 PromoteTo
  10787 #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
  10788 #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
  10789 #else
  10790 #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
  10791 #endif
  10792 
  10793 #if HWY_TARGET <= HWY_AVX3
  10794 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
  10795 HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
  10796 #if HWY_X86_HAVE_AVX10_2_OPS
  10797  return VFromD<D>{_mm_cvtts_ps_epi64(v.raw)};
  10798 #elif HWY_COMPILER_GCC_ACTUAL
  10799  // Workaround for undefined behavior with GCC if any values of v[i] are not
  10800  // within the range of an int64_t
  10801 
  10802 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  10803  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
  10804    typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  10805    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
  10806    return Dup128VecFromValues(
  10807        D(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
  10808        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]));
  10809  }
  10810 #endif
  10811 
  10812  __m128i raw_result;
  10813  __asm__("vcvttps2qq {%1, %0|%0, %1}"
  10814          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  10815          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  10816          :);
  10817  return VFromD<D>{raw_result};
  10818 #else
  10819  return VFromD<D>{_mm_cvttps_epi64(v.raw)};
  10820 #endif
  10821 }
  10822 
  10823 // Generic for all vector lengths.
  10824 template <class D, HWY_IF_I64_D(D)>
  10825 HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
  10826 #if HWY_X86_HAVE_AVX10_2_OPS
  10827  return PromoteInRangeTo(di64, v);
  10828 #else
  10829  const Rebind<float, decltype(di64)> df32;
  10830  const RebindToFloat<decltype(di64)> df64;
  10831  // We now avoid GCC UB in PromoteInRangeTo via assembly, see #2189 and
  10832  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115115. Previously we fixed up
  10833  // the result afterwards using three instructions. Now we instead check if
  10834  // v >= 2^63, and if so replace the output with 2^63-1, which is likely more
  10835  // efficient. Note that the largest f32 value below 2^63 is still
  10836  // representable in i64.
  10837  const MFromD<D> overflow = RebindMask(
  10838      di64, PromoteMaskTo(df64, df32, Ge(v, Set(df32, 9.223372e18f))));
  10839  return IfThenElse(overflow, Set(di64, LimitsMax<int64_t>()),
  10840                    PromoteInRangeTo(di64, v));
  10841 #endif
  10842 }
  10843 template <class D, HWY_IF_U64_D(D)>
  10844 HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
  10845 #if HWY_X86_HAVE_AVX10_2_OPS
  10846  return PromoteInRangeTo(du64, v);
  10847 #else
  10848  return PromoteInRangeTo(du64, ZeroIfNegative(v));
  10849 #endif
  10850 }
  10851 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
  10852 HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  10853 #if HWY_X86_HAVE_AVX10_2_OPS
  10854  return VFromD<D>{_mm_cvtts_ps_epu64(v.raw)};
  10855 #elif HWY_COMPILER_GCC_ACTUAL
  10856  // Workaround for undefined behavior with GCC if any values of v[i] are not
  10857  // within the range of a uint64_t
  10858 
  10859 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  10860  if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
  10861    typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  10862    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
  10863    return Dup128VecFromValues(
  10864        D(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]),
  10865        detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1]));
  10866  }
  10867 #endif
  10868 
  10869  __m128i raw_result;
  10870  __asm__("vcvttps2uqq {%1, %0|%0, %1}"
  10871          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  10872          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  10873          :);
  10874  return VFromD<D>{raw_result};
  10875 #else
  10876  return VFromD<D>{_mm_cvttps_epu64(v.raw)};
  10877 #endif
  10878 }
  10879 #else   // AVX2 or below
  10880 
  10881 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  10882 template <class D, HWY_IF_I64_D(D)>
  10883 HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
  10884  const Rebind<int32_t, decltype(di64)> di32;
  10885  const RebindToFloat<decltype(di32)> df32;
  10886  const RebindToUnsigned<decltype(di32)> du32;
  10887  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  10888 
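         // exponent_adj = clamp(biased_exponent - 157, 0, 32): lanes with
         // 2^30 <= |v| < 2^63 are scaled down by 2^adj (an exact exponent
         // adjustment) into [2^30, 2^31), so the truncating f32->i32
         // conversion keeps all 24 mantissa bits; widening and shifting left
         // by adj then restores the magnitude. For |v| >= 2^63 the i32
         // conversion saturates, and lo64_or_mask fills the low 32 bits so
         // that positive lanes come out as LimitsMax<int64_t>().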
  10889  const auto exponent_adj = BitCast(
  10890      du32,
  10891      Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
  10892                       BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
  10893          BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  10894  const auto adj_v =
  10895      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
  10896 
  10897  const auto f32_to_i32_result = ConvertTo(di32, adj_v);
  10898  const auto lo64_or_mask = PromoteTo(
  10899      di64,
  10900      BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result,
  10901                                         Set(di32, LimitsMax<int32_t>())))));
  10902 
  10903  return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result))
  10904                << PromoteTo(di64, exponent_adj),
  10905            lo64_or_mask);
  10906 }
  10907 
  10908 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  10909 template <class D, HWY_IF_UI64_D(D)>
  10910 HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
  10911  const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
  10912  const RebindToSigned<decltype(d32)> di32;
  10913  const RebindToFloat<decltype(d32)> df32;
  10914  const RebindToUnsigned<decltype(d32)> du32;
  10915  const Repartition<uint8_t, decltype(d32)> du32_as_du8;
  10916 
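         // 0xFFFFFF9D has the little-endian bytes {0x9D, 0xFF, 0xFF, 0xFF}:
         // the saturated byte subtract leaves max(biased_exponent - 157, 0)
         // in the low byte and zeroes the sign byte. No upper clamp is needed
         // here because in-range inputs keep the adjustment small enough.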
  10917  const auto exponent_adj = BitCast(
  10918      du32,
  10919      SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
  10920                   BitCast(du32_as_du8, Set(du32, uint32_t{0xFFFFFF9Du}))));
  10921  const auto adj_v =
  10922      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
  10923 
  10924  const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
  10925  return PromoteTo(d64, BitCast(d32, f32_to_i32_result))
  10926         << PromoteTo(d64, exponent_adj);
  10927 }
  10928 
  10929 namespace detail {
  10930 
  10931 template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)>
  10932 HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64(
  10933    DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {
  10934  const Rebind<int32_t, decltype(du64)> di32;
  10935  const Twice<decltype(di32)> dt_i32;
  10936 
  10937  const auto vt_i32_overflow_mask = ResizeBitCast(dt_i32, i32_overflow_mask);
  10938  return BitCast(du64,
  10939                 InterleaveLower(vt_i32_overflow_mask, vt_i32_overflow_mask));
  10940 }
  10941 
  10942 template <class DU64, HWY_IF_V_SIZE_GT_D(DU64, 16)>
  10943 HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64(
  10944    DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) {
  10945  const RebindToSigned<decltype(du64)> di64;
  10946  return BitCast(du64, PromoteTo(di64, i32_overflow_mask));
  10947 }
  10948 
  10949 }  // namespace detail
  10950 
  10951 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  10952 template <class D, HWY_IF_U64_D(D)>
  10953 HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
  10954  const Rebind<int32_t, decltype(du64)> di32;
  10955  const RebindToFloat<decltype(di32)> df32;
  10956  const RebindToUnsigned<decltype(di32)> du32;
  10957  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  10958 
  10959  const auto non_neg_v = ZeroIfNegative(v);
  10960 
  10961  const auto exponent_adj = BitCast(
  10962      du32, Min(SaturatedSub(BitCast(du32_as_du8,
  10963                                     ShiftRight<23>(BitCast(du32, non_neg_v))),
  10964                             BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
  10965                BitCast(du32_as_du8, Set(du32, uint32_t{33}))));
  10966 
  10967  const auto adj_v =
  10968      BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj));
  10969  const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v);
  10970 
  10971  const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result);
  10972  const auto overflow_result =
  10973      detail::PromoteF32ToU64OverflowMaskToU64(du64, i32_overflow_mask);
  10974 
  10975  return Or(PromoteTo(du64, BitCast(du32, f32_to_i32_result))
  10976                << PromoteTo(du64, exponent_adj),
  10977            overflow_result);
  10978 }
  10979 #endif  // HWY_TARGET <= HWY_AVX3
  10980 
  10981 // ------------------------------ MulFixedPoint15
  10982 
  10983 #if HWY_TARGET == HWY_SSE2
  10984 HWY_API Vec128<int16_t> MulFixedPoint15(const Vec128<int16_t> a,
  10985                                        const Vec128<int16_t> b) {
  10986  const DFromV<decltype(a)> d;
  10987  const Repartition<int32_t, decltype(d)> di32;
  10988 
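         // Rebuild the full 32-bit products from their low/high 16-bit
         // halves, then compute (a*b + 0x4000) >> 15, i.e. a fixed-point
         // multiply rounded to nearest, which is exactly what
         // _mm_mulhrs_epi16 computes on SSSE3 and later.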
  10989  auto lo_product = a * b;
  10990  auto hi_product = MulHigh(a, b);
  10991 
  10992  const VFromD<decltype(di32)> i32_product_lo{
  10993      _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
  10994  const VFromD<decltype(di32)> i32_product_hi{
  10995      _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)};
  10996 
  10997  const auto round_up_incr = Set(di32, 0x4000);
  10998  return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr),
  10999                          ShiftRight<15>(i32_product_hi + round_up_incr));
  11000 }
  11001 
  11002 template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
  11003 HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
  11004                                           const Vec128<int16_t, N> b) {
  11005  const DFromV<decltype(a)> d;
  11006  const Rebind<int32_t, decltype(d)> di32;
  11007 
  11008  const auto lo_product = a * b;
  11009  const auto hi_product = MulHigh(a, b);
  11010  const VFromD<decltype(di32)> i32_product{
  11011      _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)};
  11012 
  11013  return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000)));
  11014 }
  11015 #else
  11016 template <size_t N>
  11017 HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
  11018                                           const Vec128<int16_t, N> b) {
  11019  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
  11020 }
  11021 #endif
  11022 
  11023 // ------------------------------ Truncations
  11024 
  11025 template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
  11026 HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) {
  11027  // BitCast requires the same size; DTo might be u8x1 and v u16x1.
  11028  const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto;
  11029  return VFromD<DTo>{BitCast(dto, v).raw};
  11030 }
  11031 
  11032 template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
  11033 HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) {
  11034 #if HWY_TARGET == HWY_SSE2
  11035  const Vec128<uint8_t, 1> lo{v.raw};
  11036  const Vec128<uint8_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)};
  11037  return Combine(d, hi, lo);
  11038 #else
  11039  const Repartition<uint8_t, DFromV<decltype(v)>> d8;
  11040  (void)d;
  11041  alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8,
  11042                                                   0, 8, 0, 8, 0, 8, 0, 8};
  11043  const Vec128<uint8_t> v8 = TableLookupBytes(v, Load(d8, kIdx));
  11044  return LowerHalf(LowerHalf(LowerHalf(v8)));
  11045 #endif
  11046 }
  11047 
  11048 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
  11049 HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) {
  11050 #if HWY_TARGET == HWY_SSE2
  11051  const Vec128<uint16_t, 1> lo{v.raw};
  11052  const Vec128<uint16_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)};
  11053  return Combine(d, hi, lo);
  11054 #else
  11055  (void)d;
  11056  const Repartition<uint16_t, DFromV<decltype(v)>> d16;
  11057  alignas(16) static constexpr uint16_t kIdx[8] = {
  11058      0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
  11059  const Vec128<uint16_t> v16 = TableLookupBytes(v, Load(d16, kIdx));
  11060  return LowerHalf(LowerHalf(v16));
  11061 #endif
  11062 }
  11063 
  11064 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
  11065 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  11066  return VFromD<D>{_mm_shuffle_epi32(v.raw, 0x88)};
  11067 }
  11068 
  11069 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
  11070 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  11071  const DFromV<decltype(v)> du32;
  11072 #if HWY_TARGET == HWY_SSE2
  11073  const RebindToSigned<decltype(du32)> di32;
  11074  const Rebind<uint8_t, decltype(di32)> du8;
  11075  return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v))));
  11076 #else
  11077  const Repartition<uint8_t, decltype(du32)> d;
  11078  alignas(16) static constexpr uint8_t kIdx[16] = {
  11079      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
  11080      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
  11081  return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx))));
  11082 #endif
  11083 }
  11084 
  11085 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
  11086 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  11087  const DFromV<decltype(v)> du32;
  11088 #if HWY_TARGET == HWY_SSE2
  11089  const RebindToSigned<decltype(du32)> di32;
  11090  const Rebind<uint16_t, decltype(di32)> du16;
  11091  const RebindToSigned<decltype(du16)> di16;
  11092  return BitCast(
  11093      du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v)))));
  11094 #else
  11095  const Repartition<uint16_t, decltype(du32)> d;
  11096  return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v)));
  11097 #endif
  11098 }
  11099 
  11100 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
  11101 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  11102  const DFromV<decltype(v)> du16;
  11103 #if HWY_TARGET == HWY_SSE2
  11104  const RebindToSigned<decltype(du16)> di16;
  11105  const Rebind<uint8_t, decltype(di16)> du8;
  11106  const RebindToSigned<decltype(du8)> di8;
  11107  return BitCast(du8,
  11108                 DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v)))));
  11109 #else
  11110  const Repartition<uint8_t, decltype(du16)> d;
  11111  return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v)));
  11112 #endif
  11113 }
  11114 
  11115 // ------------------------------ Demotions to/from i64
  11116 
  11117 #if HWY_TARGET <= HWY_AVX3
  11118 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
  11119 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  11120  return VFromD<D>{_mm_cvtsepi64_epi32(v.raw)};
  11121 }
  11122 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
  11123 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  11124  return VFromD<D>{_mm_cvtsepi64_epi16(v.raw)};
  11125 }
  11126 template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
  11127 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  11128  return VFromD<D>{_mm_cvtsepi64_epi8(v.raw)};
  11129 }
  11130 
  11131 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
  11132 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  11133  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
  11134  return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
  11135 }
  11136 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
  11137 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  11138  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
  11139  return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
  11140 }
  11141 template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
  11142 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  11143  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
  11144  return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
  11145 }
  11146 
  11147 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
  11148 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
  11149  return VFromD<D>{_mm_cvtusepi64_epi32(v.raw)};
  11150 }
  11151 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
  11152 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
  11153  return VFromD<D>{_mm_cvtusepi64_epi16(v.raw)};
  11154 }
  11155 template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
  11156 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
  11157  return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)};
  11158 }
  11159 #else  // AVX2 or below
  11160 
  11161 // Disable the default unsigned-to-signed DemoteTo/ReorderDemote2To
  11162 // implementations in generic_ops-inl.h for U64->I8/I16/I32 demotions on
  11163 // SSE2/SSSE3/SSE4/AVX2, since those demotions are implemented below in
  11164 // x86_128-inl.h.
  11165 
  11166 // The default unsigned-to-signed DemoteTo/ReorderDemote2To
  11167 // implementations in generic_ops-inl.h are still used for U32->I8/I16 and
  11168 // U16->I8 demotions on SSE2/SSSE3/SSE4/AVX2
  11169 
  11170 #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
  11171 #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) HWY_IF_NOT_T_SIZE_V(V, 8)
  11172 
  11173 namespace detail {
  11174 template <class D, HWY_IF_UNSIGNED_D(D)>
  11175 HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
  11176    D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
  11177  return v;
  11178 }
  11179 
  11180 template <class D, HWY_IF_SIGNED_D(D)>
  11181 HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
  11182    D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
  11183  const DFromV<decltype(v)> du64;
  11184  return And(v,
  11185             Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>())));
  11186 }
  11187 
  11188 template <class D>
  11189 HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate(
  11190    D dn, VFromD<Rebind<uint64_t, D>> v) {
  11191  const Rebind<uint64_t, D> du64;
  11192  const RebindToSigned<decltype(du64)> di64;
  11193  constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) -
  11194                            static_cast<int>(hwy::IsSigned<TFromD<D>>());
  11195 
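         // kShiftAmt is the number of value bits in TFromD<D>; any lane with
         // a bit set at or above that position is out of range. E.g. for a u8
         // target, kShiftAmt = 8 and v = 300 gives 300 >> 8 = 1 > 0, so
         // too_big is all-ones and Or(v, too_big) saturates the lane (0xFF
         // after truncation).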
  11196  const auto too_big = BitCast(
  11197      du64, VecFromMask(
  11198                di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64))));
  11199  return DemoteFromU64MaskOutResult(dn, Or(v, too_big));
  11200 }
  11201 
  11202 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class V>
  11203 HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) {
  11204  return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
  11205 }
  11206 
  11207 }  // namespace detail
  11208 
  11209 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
  11210          HWY_IF_SIGNED_D(D)>
  11211 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
  11212  const DFromV<decltype(v)> di64;
  11213  const RebindToUnsigned<decltype(di64)> du64;
  11214  const RebindToUnsigned<decltype(dn)> dn_u;
  11215 
  11216  // Negative values are saturated by first saturating their bitwise inverse
  11217  // and then inverting the saturation result
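         // E.g. for an i8 target and v = -1000: invert_mask is all-ones, the
         // inverted value 999 saturates to 0x7F, and XOR-ing with the mask
         // again gives ...FF80, i.e. -128 after truncation.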
  11218  const auto invert_mask = BitCast(du64, BroadcastSignBit(v));
  11219  const auto saturated_vals = Xor(
  11220      invert_mask,
  11221      detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v))));
  11222  return BitCast(dn, TruncateTo(dn_u, saturated_vals));
  11223 }
  11224 
  11225 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
  11226          HWY_IF_UNSIGNED_D(D)>
  11227 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
  11228  const DFromV<decltype(v)> di64;
  11229  const RebindToUnsigned<decltype(di64)> du64;
  11230 
  11231  const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v));
  11232  return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
  11233 }
  11234 
  11235 template <class D,
  11236          HWY_IF_T_SIZE_ONE_OF_D(
  11237              D, ((HWY_TARGET != HWY_SSE2) ? ((1 << 1) | (1 << 2)) : 0) |
  11238                     (1 << 4)),
  11239          HWY_IF_SIGNED_D(D)>
  11240 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
  11241  const RebindToUnsigned<decltype(dn)> dn_u;
  11242  return BitCast(dn, TruncateTo(dn_u, detail::DemoteFromU64Saturate(dn, v)));
  11243 }
  11244 
  11245 #if HWY_TARGET == HWY_SSE2
  11246 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
  11247          HWY_IF_SIGNED_D(D)>
  11248 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
  11249  const Rebind<int32_t, decltype(dn)> di32;
  11250  return DemoteTo(dn, DemoteTo(di32, v));
  11251 }
  11252 #endif  // HWY_TARGET == HWY_SSE2
  11253 
  11254 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
  11255          HWY_IF_UNSIGNED_D(D)>
  11256 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
  11257  return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v));
  11258 }
  11259 #endif  // HWY_TARGET <= HWY_AVX3
  11260 
  11261 template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2),
  11262          HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)>
  11263 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a,
  11264                                   VFromD<Repartition<int64_t, D>> b) {
  11265  const DFromV<decltype(a)> d;
  11266  const Twice<decltype(d)> dt;
  11267  return DemoteTo(dn, Combine(dt, b, a));
  11268 }
  11269 
  11270 template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_U32_D(D)>
  11271 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
  11272                                   VFromD<Repartition<uint64_t, D>> b) {
  11273  const DFromV<decltype(a)> d;
  11274  const Twice<decltype(d)> dt;
  11275  return DemoteTo(dn, Combine(dt, b, a));
  11276 }
  11277 
  11278 #if HWY_TARGET > HWY_AVX3
  11279 template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_I32_D(D)>
  11280 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
  11281                                   VFromD<Repartition<uint64_t, D>> b) {
  11282  const DFromV<decltype(a)> d;
  11283  const Twice<decltype(d)> dt;
  11284  return DemoteTo(dn, Combine(dt, b, a));
  11285 }
  11286 #endif
  11287 
  11288 #if HWY_TARGET > HWY_AVX2
  11289 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
  11290 HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
  11291                                         Vec128<int64_t> b) {
  11292  const DFromV<decltype(a)> di64;
  11293  const RebindToUnsigned<decltype(di64)> du64;
  11294  const Half<decltype(dn)> dnh;
  11295 
  11296  // Negative values are saturated by first saturating their bitwise inverse
  11297  // and then inverting the saturation result
  11298  const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a));
  11299  const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b));
  11300  const auto saturated_a = Xor(
  11301      invert_mask_a,
  11302      detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a))));
  11303  const auto saturated_b = Xor(
  11304      invert_mask_b,
  11305      detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b))));
  11306 
  11307  return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
  11308 }
  11309 
  11310 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
  11311 HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
  11312                                          Vec128<int64_t> b) {
  11313  const DFromV<decltype(a)> di64;
  11314  const RebindToUnsigned<decltype(di64)> du64;
  11315  const Half<decltype(dn)> dnh;
  11316 
  11317  const auto saturated_a = detail::DemoteFromU64Saturate(
  11318      dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a)));
  11319  const auto saturated_b = detail::DemoteFromU64Saturate(
  11320      dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b)));
  11321 
  11322  return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
  11323 }
  11324 
  11325 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
  11326 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint64_t> a,
  11327                                   Vec128<uint64_t> b) {
  11328  const Half<decltype(dn)> dnh;
  11329 
  11330  const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
  11331  const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b);
  11332 
  11333  return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
  11334 }
  11335 #endif  // HWY_TARGET > HWY_AVX2
  11336 
  11337 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
  11338 
  11339 #if HWY_HAVE_FLOAT16
  11340 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
  11341 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  11342  return VFromD<D>{_mm_cvtepu16_ph(v.raw)};
  11343 }
  11344 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
  11345 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  11346  return VFromD<D>{_mm_cvtepi16_ph(v.raw)};
  11347 }
  11348 #endif  // HWY_HAVE_FLOAT16
  11349 
  11350 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  11351 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  11352  return VFromD<D>{_mm_cvtepi32_ps(v.raw)};
  11353 }
  11354 
  11355 #if HWY_TARGET <= HWY_AVX3
  11356 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
  11357 HWY_API VFromD<D> ConvertTo(D /*df*/, VFromD<Rebind<uint32_t, D>> v) {
  11358  return VFromD<D>{_mm_cvtepu32_ps(v.raw)};
  11359 }
  11360 
  11361 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  11362 HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<int64_t, D>> v) {
  11363  return VFromD<D>{_mm_cvtepi64_pd(v.raw)};
  11364 }
  11365 
  11366 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
  11367 HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<uint64_t, D>> v) {
  11368  return VFromD<D>{_mm_cvtepu64_pd(v.raw)};
  11369 }
  11370 #else   // AVX2 or below
  11371 // Generic for all vector lengths.
  11372 template <class D, HWY_IF_F32_D(D)>
  11373 HWY_API VFromD<D> ConvertTo(D df, VFromD<Rebind<uint32_t, D>> v) {
  11374  // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
  11375  const RebindToUnsigned<decltype(df)> du32;
  11376  const RebindToSigned<decltype(df)> d32;
  11377 
  11378  const auto msk_lo = Set(du32, 0xFFFF);
  11379  const auto cnst2_16_flt = Set(df, 65536.0f);  // 2^16
  11380 
  11381  // Extract the lower/upper 16 bits of v and cast to signed int
  11382  const auto v_lo = BitCast(d32, And(v, msk_lo));
  11383  const auto v_hi = BitCast(d32, ShiftRight<16>(v));
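         // Scaling by 2^16 only adjusts the exponent and is exact, so the one
         // rounding step happens in the final addition, yielding a correctly
         // rounded u32 -> f32 conversion.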
  11384  return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
  11385 }
  11386 
  11387 // Generic for all vector lengths.
  11388 template <class D, HWY_IF_F64_D(D)>
  11389 HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) {
  11390  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
  11391  const Repartition<uint32_t, decltype(dd)> d32;
  11392  const Repartition<uint64_t, decltype(dd)> d64;
  11393 
  11394  // Toggle the MSB of the lower 32 bits and insert the exponent for 2^84 + 2^63
  11395  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
  11396  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
  11397 
  11398  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
  11399  const auto k52 = Set(d32, 0x43300000);
  11400  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
  11401 
  11402  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
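         // Subtract the merged constant first so its 2^84/2^63/2^52 bias
         // terms cancel exactly; adding v_lower before the subtraction could
         // round away its low bits.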
  11403  return (v_upper - k84_63_52) + v_lower;  // order matters!
  11404 }
  11405 
  11406 namespace detail {
  11407 template <class VW>
  11408 HWY_INLINE VFromD<Rebind<double, DFromV<VW>>> U64ToF64VecFast(VW w) {
  11409  const DFromV<decltype(w)> d64;
  11410  const RebindToFloat<decltype(d64)> dd;
  11411  const auto cnst2_52_dbl = Set(dd, 0x0010000000000000);  // 2^52
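         // For w < 2^52, OR-ing w into the mantissa of 2^52 produces exactly
         // the f64 value 2^52 + w; subtracting 2^52 recovers w without
         // rounding (callers pass 32-bit halves, which always fit).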
  11412  return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl;
  11413 }
  11414 }  // namespace detail
  11415 
  11416 // Generic for all vector lengths.
  11417 template <class D, HWY_IF_F64_D(D)>
  11418 HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) {
  11419  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
  11420  const RebindToUnsigned<decltype(dd)> d64;
  11421  using VU = VFromD<decltype(d64)>;
  11422 
  11423  const VU msk_lo = Set(d64, 0xFFFFFFFF);
  11424  const auto cnst2_32_dbl = Set(dd, 4294967296.0);  // 2^32
  11425 
  11426  // Extract the lower/upper 32 bits of v
  11427  const VU v_lo = And(v, msk_lo);
  11428  const VU v_hi = ShiftRight<32>(v);
  11429 
  11430  const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo);
  11431  return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl);
  11432 }
  11433 #endif  // HWY_TARGET <= HWY_AVX3
  11434 
  11435 // Truncates (rounds toward zero).
  11436 
  11437 #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
  11438 #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
  11439 #else
  11440 #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
  11441 #endif
  11442 
  11443 #if HWY_HAVE_FLOAT16
  11444 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
  11445 HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
  11446 #if HWY_COMPILER_GCC_ACTUAL
  11447  // Workaround for undefined behavior in _mm_cvttph_epi16 if any values of v[i]
  11448  // are not within the range of an int16_t
  11449 
  11450 #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
  11451    HWY_HAVE_SCALAR_F16_TYPE
  11452  if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
  11453    typedef hwy::float16_t::Native GccF16RawVectType
  11454        __attribute__((__vector_size__(16)));
  11455    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
  11456    return Dup128VecFromValues(
  11457        D(), detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]),
  11458        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]),
  11459        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]),
  11460        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]),
  11461        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]),
  11462        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]),
  11463        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]),
  11464        detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7]));
  11465  }
  11466 #endif
  11467 
  11468  __m128i raw_result;
  11469  __asm__("vcvttph2w {%1, %0|%0, %1}"
  11470          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  11471          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11472          :);
  11473  return VFromD<D>{raw_result};
  11474 #else  // !HWY_COMPILER_GCC_ACTUAL
  11475  return VFromD<D>{_mm_cvttph_epi16(v.raw)};
  11476 #endif
  11477 }
  11478 
  11479 // F16 to I16 ConvertTo is generic for all vector lengths
  11480 template <class D, HWY_IF_I16_D(D)>
  11481 HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
  11482  const RebindToFloat<decltype(di)> df;
  11483  // See comment at the first occurrence of "IfThenElse(overflow,".
  11484  const MFromD<D> overflow =
  11485      RebindMask(di, Ge(v, Set(df, ConvertScalarTo<hwy::float16_t>(32768.0f))));
  11486  return IfThenElse(overflow, Set(di, LimitsMax<int16_t>()),
  11487                    ConvertInRangeTo(di, v));
  11488 }
  11489 
  11490 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
  11491 HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
  11492 #if HWY_COMPILER_GCC_ACTUAL
  11493  // Workaround for undefined behavior in _mm_cvttph_epu16 if any values of v[i]
  11494  // are not within the range of a uint16_t
  11495 
  11496 #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
  11497    HWY_HAVE_SCALAR_F16_TYPE
  11498  if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) {
  11499    typedef hwy::float16_t::Native GccF16RawVectType
  11500        __attribute__((__vector_size__(16)));
  11501    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
  11502    return Dup128VecFromValues(
  11503        D(), detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0]),
  11504        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1]),
  11505        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2]),
  11506        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3]),
  11507        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4]),
  11508        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5]),
  11509        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6]),
  11510        detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7]));
  11511  }
  11512 #endif
  11513 
  11514  __m128i raw_result;
  11515  __asm__("vcvttph2uw {%1, %0|%0, %1}"
  11516          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  11517          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11518          :);
  11519  return VFromD<D>{raw_result};
  11520 #else  // !HWY_COMPILER_GCC_ACTUAL
  11521  return VFromD<D>{_mm_cvttph_epu16(v.raw)};
  11522 #endif
  11523 }
  11524 
  11525 // F16->U16 ConvertTo is generic for all vector lengths
  11526 template <class D, HWY_IF_U16_D(D)>
  11527 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
  11528  return ConvertInRangeTo(D(), ZeroIfNegative(v));
  11529 }
  11530 #endif  // HWY_HAVE_FLOAT16
  11531 
  11532 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
  11533 HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) {
  11534 #if HWY_X86_HAVE_AVX10_2_OPS
  11535  return VFromD<D>{_mm_cvtts_ps_epi32(v.raw)};
  11536 #elif HWY_COMPILER_GCC_ACTUAL
  11537  // Workaround for undefined behavior in _mm_cvttps_epi32 with GCC if any
  11538  // values of v[i] are not within the range of an int32_t
  11539 
  11540 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11541  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
  11542    typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  11543    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
  11544    return Dup128VecFromValues(
  11545        D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]),
  11546        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]),
  11547        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]),
  11548        detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3]));
  11549  }
  11550 #endif
  11551 
  11552  __m128i raw_result;
  11553  __asm__("%vcvttps2dq {%1, %0|%0, %1}"
  11554          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  11555          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11556          :);
  11557  return VFromD<D>{raw_result};
  11558 #else  // !HWY_COMPILER_GCC_ACTUAL
  11559  return VFromD<D>{_mm_cvttps_epi32(v.raw)};
  11560 #endif
  11561 }
  11562 
  11563 // F32 to I32 ConvertTo is generic for all vector lengths
  11564 template <class D, HWY_IF_I32_D(D)>
  11565 HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
  11566 #if HWY_X86_HAVE_AVX10_2_OPS
  11567  return ConvertInRangeTo(di, v);
  11568 #else
  11569  const RebindToFloat<decltype(di)> df;
  11570  // See comment at the first occurrence of "IfThenElse(overflow,".
  11571  const MFromD<D> overflow = RebindMask(di, Ge(v, Set(df, 2147483648.0f)));
  11572  return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()),
  11573                    ConvertInRangeTo(di, v));
  11574 #endif
  11575 }
  11576 
  11577 #if HWY_TARGET <= HWY_AVX3
  11578 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
  11579 HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
  11580 #if HWY_X86_HAVE_AVX10_2_OPS
  11581  return VFromD<DI>{_mm_cvtts_pd_epi64(v.raw)};
  11582 #elif HWY_COMPILER_GCC_ACTUAL
  11583  // Workaround for undefined behavior in _mm_cvttpd_epi64 with GCC if any
  11584  // values of v[i] are not within the range of an int64_t
  11585 
  11586 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11587  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
  11588    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  11589    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  11590    return Dup128VecFromValues(
  11591        DI(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]),
  11592        detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1]));
  11593  }
  11594 #endif
  11595 
  11596  __m128i raw_result;
  11597  __asm__("vcvttpd2qq {%1, %0|%0, %1}"
  11598          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  11599          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11600          :);
  11601  return VFromD<DI>{raw_result};
  11602 #else  // !HWY_COMPILER_GCC_ACTUAL
  11603  return VFromD<DI>{_mm_cvttpd_epi64(v.raw)};
  11604 #endif
  11605 }
  11606 
  11607 // F64 to I64 ConvertTo is generic for all vector lengths on AVX3
  11608 template <class DI, HWY_IF_I64_D(DI)>
  11609 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
  11610 #if HWY_X86_HAVE_AVX10_2_OPS
  11611  return ConvertInRangeTo(di, v);
  11612 #else
  11613  const RebindToFloat<decltype(di)> df;
  11614  // See comment at the first occurrence of "IfThenElse(overflow,".
  11615  const MFromD<DI> overflow =
  11616      RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
  11617  return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
  11618                    ConvertInRangeTo(di, v));
  11619 #endif
  11620 }
  11621 
  11622 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
  11623 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  11624 #if HWY_X86_HAVE_AVX10_2_OPS
  11625  return VFromD<DU>{_mm_cvtts_ps_epu32(v.raw)};
  11626 #elif HWY_COMPILER_GCC_ACTUAL
  11627  // Workaround for undefined behavior in _mm_cvttps_epu32 with GCC if any
  11628  // values of v[i] are not within the range of a uint32_t
  11629 
  11630 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11631  if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) {
  11632    typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  11633    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
  11634    return Dup128VecFromValues(
  11635        DU(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]),
  11636        detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]),
  11637        detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]),
  11638        detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3]));
  11639  }
  11640 #endif
  11641 
  11642  __m128i raw_result;
  11643  __asm__("vcvttps2udq {%1, %0|%0, %1}"
  11644          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  11645          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11646          :);
  11647  return VFromD<DU>{raw_result};
  11648 #else  // !HWY_COMPILER_GCC_ACTUAL
  11649  return VFromD<DU>{_mm_cvttps_epu32(v.raw)};
  11650 #endif
  11651 }
  11652 
  11653 // F32->U32 ConvertTo is generic for all vector lengths
  11654 template <class DU, HWY_IF_U32_D(DU)>
  11655 HWY_API VFromD<DU> ConvertTo(DU du32, VFromD<RebindToFloat<DU>> v) {
  11656 #if HWY_X86_HAVE_AVX10_2_OPS
  11657  return ConvertInRangeTo(du32, v);
  11658 #else
  11659  return ConvertInRangeTo(du32, ZeroIfNegative(v));
  11660 #endif
  11661 }
  11662 
  11663 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
  11664 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  11665 #if HWY_X86_HAVE_AVX10_2_OPS
  11666  return VFromD<DU>{_mm_cvtts_pd_epu64(v.raw)};
  11667 #elif HWY_COMPILER_GCC_ACTUAL
  11668  // Workaround for undefined behavior in _mm_cvttpd_epu64 with GCC if any
  11669  // values of v[i] are not within the range of a uint64_t
  11670 
  11671 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11672  if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) {
  11673    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  11674    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  11675    return Dup128VecFromValues(
  11676        DU(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]),
  11677        detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1]));
  11678  }
  11679 #endif
  11680 
  11681  __m128i raw_result;
  11682  __asm__("vcvttpd2uqq {%1, %0|%0, %1}"
  11683          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  11684          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11685          :);
  11686  return VFromD<DU>{raw_result};
  11687 #else  // !HWY_COMPILER_GCC_ACTUAL
  11688  return VFromD<DU>{_mm_cvttpd_epu64(v.raw)};
  11689 #endif
  11690 }
  11691 
  11692 // F64->U64 ConvertTo is generic for all vector lengths
  11693 template <class DU, HWY_IF_U64_D(DU)>
  11694 HWY_API VFromD<DU> ConvertTo(DU du64, VFromD<RebindToFloat<DU>> v) {
  11695 #if HWY_X86_HAVE_AVX10_2_OPS
  11696  return ConvertInRangeTo(du64, v);
  11697 #else
  11698  return ConvertInRangeTo(du64, ZeroIfNegative(v));
  11699 #endif
  11700 }
  11701 
  11702 #else  // AVX2 or below
  11703 
  11704 namespace detail {
  11705 
  11706 template <class DU32, HWY_IF_U32_D(DU32)>
  11707 static HWY_INLINE VFromD<DU32> ConvInRangeF32ToU32(
  11708    DU32 du32, VFromD<RebindToFloat<DU32>> v, VFromD<DU32>& exp_diff) {
  11709  const RebindToSigned<decltype(du32)> di32;
  11710  const RebindToFloat<decltype(du32)> df32;
  11711 
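         // 158 is the biased exponent of 2^31, so exp_diff[i] is zero exactly
         // when v[i] is in [2^31, 2^32). Such lanes are halved below (by
         // decrementing the exponent) so the signed conversion stays in range;
         // the masked add at the end doubles the result back.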
  11712  exp_diff = Set(du32, uint32_t{158}) - ShiftRight<23>(BitCast(du32, v));
  11713  const auto scale_down_f32_val_mask =
  11714      VecFromMask(du32, Eq(exp_diff, Zero(du32)));
  11715 
  11716  const auto v_scaled =
  11717      BitCast(df32, BitCast(du32, v) + ShiftLeft<23>(scale_down_f32_val_mask));
  11718  const auto f32_to_u32_result =
  11719      BitCast(du32, ConvertInRangeTo(di32, v_scaled));
  11720 
  11721  return f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask);
  11722 }
  11723 
  11724 }  // namespace detail
  11725 
  11726 // F32 to U32 ConvertInRangeTo is generic for all vector lengths on
  11727 // SSE2/SSSE3/SSE4/AVX2
  11728 template <class DU32, HWY_IF_U32_D(DU32)>
  11729 HWY_API VFromD<DU32> ConvertInRangeTo(DU32 du32,
  11730                                      VFromD<RebindToFloat<DU32>> v) {
  11731  VFromD<DU32> exp_diff;
  11732  const auto f32_to_u32_result = detail::ConvInRangeF32ToU32(du32, v, exp_diff);
  11733  return f32_to_u32_result;
  11734 }
  11735 
  11736 // F32 to U32 ConvertTo is generic for all vector lengths on
  11737 // SSE2/SSSE3/SSE4/AVX2
  11738 template <class DU32, HWY_IF_U32_D(DU32)>
  11739 HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) {
  11740  const RebindToSigned<decltype(du32)> di32;
  11741 
  11742  const auto non_neg_v = ZeroIfNegative(v);
  11743  VFromD<DU32> exp_diff;
  11744  const auto f32_to_u32_result =
  11745      detail::ConvInRangeF32ToU32(du32, non_neg_v, exp_diff);
  11746 
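         // exp_diff[i] has its sign bit set exactly when v[i] >= 2^32 (or is
         // Inf/NaN); OR-ing in the broadcast sign bit saturates those lanes
         // to 0xFFFFFFFF.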
  11747  return Or(f32_to_u32_result,
  11748            BitCast(du32, BroadcastSignBit(BitCast(di32, exp_diff))));
  11749 }
  11750 
  11751 namespace detail {
  11752 
  11753 template <class D64, HWY_IF_UI64_D(D64)>
  11754 HWY_API VFromD<D64> ConvAbsInRangeF64ToUI64(D64 d64,
  11755                                            VFromD<Rebind<double, D64>> v,
  11756                                            VFromD<D64>& biased_exp) {
  11757  const RebindToSigned<decltype(d64)> di64;
  11758  const RebindToUnsigned<decltype(d64)> du64;
  11759  using VU64 = VFromD<decltype(du64)>;
  11760  const Repartition<uint16_t, decltype(di64)> du16;
  11761  const VU64 k1075 = Set(du64, 1075); /* biased exponent of 2^52 */
  11762 
  11763  // Exponent indicates whether the number can be represented as int64_t.
  11764  biased_exp = BitCast(d64, ShiftRight<52>(BitCast(du64, v)));
  11765  HWY_IF_CONSTEXPR(IsSigned<TFromD<D64>>()) {
  11766    biased_exp = And(biased_exp, Set(d64, TFromD<D64>{0x7FF}));
  11767  }
  11768 
  11769  // If we were to cap the exponent at 51 and add 2^52, the number would be in
  11770  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
  11771  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
  11772  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
  11773  // manually shift the mantissa into place (we already have many of the
  11774  // inputs anyway).
  11775 
  11776  // Use 16-bit saturated unsigned subtraction to compute shift_mnt and
  11777  // shift_int since biased_exp[i] is a non-negative integer that is less than
  11778  // or equal to 2047.
  11779 
  11780  // 16-bit saturated unsigned subtraction is also more efficient than a
  11781  // 64-bit subtraction followed by a 64-bit signed Max operation on
  11782  // SSE2/SSSE3/SSE4/AVX2.
  11783 
  11784  // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
  11785  // zero as the upper 48 bits of both k1075 and biased_exp are zero.
  11786 
  11787  const VU64 shift_mnt = BitCast(
  11788      du64, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
  11789  const VU64 shift_int = BitCast(
  11790      du64, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
  11791  const VU64 mantissa = BitCast(du64, v) & Set(du64, (1ULL << 52) - 1);
  11792  // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86
  11793  // returning zero in that case.
  11794  const VU64 int53 = (mantissa | Set(du64, 1ULL << 52)) >> shift_mnt;
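         // e.g. v[i] = 3.0 has biased exponent 1024, so shift_mnt = 1075 -
         // 1024 = 51, (implicit bit | mantissa) == 3 << 51, and int53 == 3.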
  11795 
  11796  // For inputs larger than 2^53 - 1, insert zeros at the bottom.
  11797 
  11798  // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
  11799  // shifted out of the left shift result below as shift_int[i] <= 11 is true
  11800  // for any inputs that are less than 2^64.
  11801 
  11802  return BitCast(d64, int53 << shift_int);
  11803 }
  11804 
  11805 }  // namespace detail
  11806 
  11807 #if HWY_ARCH_X86_64
  11808 
  11809 namespace detail {
  11810 
  11811 template <size_t N>
  11812 static HWY_INLINE int64_t SSE2ConvFirstF64LaneToI64(Vec128<double, N> v) {
  11813 #if HWY_COMPILER_GCC_ACTUAL
  11814  // Workaround for undefined behavior in _mm_cvttsd_si64 with GCC if v[0] is
  11815  // not within the range of an int64_t
  11816 
  11817 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11818  if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) {
  11819    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  11820    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  11821    return X86ConvertScalarFromFloat<int64_t>(raw_v[0]);
  11822  }
  11823 #endif
  11824 
  11825  int64_t result;
  11826  __asm__("%vcvttsd2si {%1, %0|%0, %1}"
  11827          : "=r"(result)
  11828          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11829          :);
  11830  return result;
  11831 #else
  11832  return _mm_cvttsd_si64(v.raw);
  11833 #endif
  11834 }
  11835 
  11836 }  // namespace detail
  11837 
  11838 template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
  11839 HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec64<double> v) {
  11840  return VFromD<DI>{_mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v))};
  11841 }
  11842 template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
  11843 HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec128<double> v) {
  11844  const __m128i i0 = _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v));
  11845  const Full64<double> dd2;
  11846  const __m128i i1 =
  11847      _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(UpperHalf(dd2, v)));
  11848  return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
  11849 }
  11850 
  11851 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
  11852 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
  11853  const RebindToFloat<decltype(di)> df;
  11854  // See comment at the first occurrence of "IfThenElse(overflow,".
  11855  const MFromD<DI> overflow =
  11856      RebindMask(di, Ge(v, Set(df, 9.223372036854776e18)));
  11857  return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()),
  11858                    ConvertInRangeTo(di, v));
  11859 }
  11860 #endif  // HWY_ARCH_X86_64
  11861 
  11862 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
  11863 template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
  11864          HWY_IF_I64_D(DI)>
  11865 HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<Rebind<double, DI>> v) {
  11866  using VI = VFromD<DI>;
  11867 
  11868  VI biased_exp;
  11869  const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
  11870  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
  11871 
  11872  // If the input was negative, negate the integer (two's complement).
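         // (x ^ m) - m equals x when m == 0 and -x when m is all-ones.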
  11873  return (shifted ^ sign_mask) - sign_mask;
  11874 }
  11875 
  11876 template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
  11877          HWY_IF_I64_D(DI)>
  11878 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
  11879  using VI = VFromD<DI>;
  11880 
  11881  VI biased_exp;
  11882  const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp);
  11883 
  11884 #if HWY_TARGET <= HWY_SSE4
  11885  const auto in_range = biased_exp < Set(di, 1086);
  11886 #else
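         // Pre-SSE4 targets lack 64-bit signed compares; DupEven replicates
         // each lane's low 32 bits (which hold the entire exponent) so an i32
         // compare produces a correct 64-bit mask.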
  11887  const Repartition<int32_t, decltype(di)> di32;
  11888  const auto in_range = MaskFromVec(BitCast(
  11889      di,
  11890      VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086))));
  11891 #endif
  11892 
  11893  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
  11894  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
  11895  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
  11896  const VI magnitude = IfThenElse(in_range, shifted, limit);
  11897 
  11898  // If the input was negative, negate the integer (two's complement).
  11899  return (magnitude ^ sign_mask) - sign_mask;
  11900 }
  11901 #endif  // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
  11902 
  11903 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  11904 template <class DU, HWY_IF_U64_D(DU)>
  11905 HWY_API VFromD<DU> ConvertInRangeTo(DU du, VFromD<Rebind<double, DU>> v) {
  11906  VFromD<DU> biased_exp;
  11907  const auto shifted = detail::ConvAbsInRangeF64ToUI64(du, v, biased_exp);
  11908  return shifted;
  11909 }
  11910 
  11911 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
  11912 template <class DU, HWY_IF_U64_D(DU)>
  11913 HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
  11914  const RebindToSigned<DU> di;
  11915  using VU = VFromD<DU>;
  11916 
  11917  VU biased_exp;
  11918  const VU shifted =
  11919      detail::ConvAbsInRangeF64ToUI64(du, ZeroIfNegative(v), biased_exp);
  11920 
  11921  // Exponent indicates whether the number can be represented as uint64_t.
  11922 #if HWY_TARGET <= HWY_SSE4
  11923  const VU out_of_range =
  11924      BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086)));
  11925 #else
  11926  const Repartition<int32_t, decltype(di)> di32;
  11927  const VU out_of_range = BitCast(
  11928      du,
  11929      VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086)));
  11930 #endif
  11931 
  11932  return (shifted | out_of_range);
  11933 }
  11934 #endif  // HWY_TARGET <= HWY_AVX3
  11935 
  11936 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11937 namespace detail {
  11938 
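        // Scalar round-to-nearest with ties to even, used by the
        // constant-folding paths below. For example, 2.5 truncates to 2
        // (even, so no adjustment) while 3.5 truncates to 3 (odd, so it
        // rounds away to 4).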
  11939 template <class TTo, class TF, HWY_IF_SIGNED(TTo)>
  11940 static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CXX14_CONSTEXPR TTo
  11941 X86ScalarNearestInt(TF flt_val) {
  11942 #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS
  11943  using TFArith = If<hwy::IsSame<RemoveCvRef<TF>, hwy::bfloat16_t>(), float,
  11944                     RemoveCvRef<TF>>;
  11945 #else
  11946  using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
  11947 #endif
  11948 
  11949  const TTo trunc_int_val = X86ConvertScalarFromFloat<TTo>(flt_val);
  11950  const TFArith abs_val_diff = ScalarAbs(
  11951      ConvertScalarTo<TFArith>(ConvertScalarTo<TFArith>(flt_val) -
  11952                               ConvertScalarTo<TFArith>(trunc_int_val)));
  11953  constexpr TFArith kHalf = ConvertScalarTo<TFArith>(0.5);
  11954 
  11955  const bool round_result_up =
  11956      ((trunc_int_val ^ ScalarShr(trunc_int_val, sizeof(TTo) * 8 - 1)) !=
  11957       LimitsMax<TTo>()) &&
  11958      (abs_val_diff > kHalf ||
  11959       (abs_val_diff == kHalf && (trunc_int_val & 1) != 0));
  11960  return static_cast<TTo>(
  11961      trunc_int_val +
  11962      (round_result_up ? (ScalarSignBit(flt_val) ? (-1) : 1) : 0));
  11963 }
  11964 
  11965 }  // namespace detail
  11966 #endif  // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11967 
  11968 // If these are in namespace detail, the x86_256/512 templates are not found.
  11969 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I32_D(DI)>
  11970 static HWY_INLINE VFromD<DI> NearestIntInRange(DI,
  11971                                               VFromD<RebindToFloat<DI>> v) {
  11972 #if HWY_COMPILER_GCC_ACTUAL
  11973  // Workaround for undefined behavior in _mm_cvtps_epi32 with GCC if any values
  11974  // of v[i] are not within the range of an int32_t
  11975 
  11976 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  11977  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
  11978    typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  11979    const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw);
  11980    return Dup128VecFromValues(DI(),
  11981                               detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
  11982                               detail::X86ScalarNearestInt<int32_t>(raw_v[1]),
  11983                               detail::X86ScalarNearestInt<int32_t>(raw_v[2]),
  11984                               detail::X86ScalarNearestInt<int32_t>(raw_v[3]));
  11985  }
  11986 #endif
  11987 
  11988  __m128i raw_result;
  11989  __asm__("%vcvtps2dq {%1, %0|%0, %1}"
  11990          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  11991          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  11992          :);
  11993  return VFromD<DI>{raw_result};
  11994 #else  // !HWY_COMPILER_GCC_ACTUAL
  11995  return VFromD<DI>{_mm_cvtps_epi32(v.raw)};
  11996 #endif
  11997 }
  11998 
  11999 #if HWY_HAVE_FLOAT16
  12000 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I16_D(DI)>
  12001 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/,
  12002                                               VFromD<RebindToFloat<DI>> v) {
  12003 #if HWY_COMPILER_GCC_ACTUAL
  12004  // Workaround for undefined behavior in _mm_cvtph_epi16 if any values of v[i]
  12005  // are not within the range of an int16_t
  12006 
  12007 #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \
  12008    HWY_HAVE_SCALAR_F16_TYPE
  12009  if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) {
  12010    typedef hwy::float16_t::Native GccF16RawVectType
  12011        __attribute__((__vector_size__(16)));
  12012    const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw);
  12013    return Dup128VecFromValues(DI(),
  12014                               detail::X86ScalarNearestInt<int16_t>(raw_v[0]),
  12015                               detail::X86ScalarNearestInt<int16_t>(raw_v[1]),
  12016                               detail::X86ScalarNearestInt<int16_t>(raw_v[2]),
  12017                               detail::X86ScalarNearestInt<int16_t>(raw_v[3]),
  12018                               detail::X86ScalarNearestInt<int16_t>(raw_v[4]),
  12019                               detail::X86ScalarNearestInt<int16_t>(raw_v[5]),
  12020                               detail::X86ScalarNearestInt<int16_t>(raw_v[6]),
  12021                               detail::X86ScalarNearestInt<int16_t>(raw_v[7]));
  12022  }
  12023 #endif
  12024 
  12025  __m128i raw_result;
  12026  __asm__("vcvtph2w {%1, %0|%0, %1}"
  12027          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  12028          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  12029          :);
  12030  return VFromD<DI>{raw_result};
  12031 #else  // !HWY_COMPILER_GCC_ACTUAL
  12032  return VFromD<DI>{_mm_cvtph_epi16(v.raw)};
  12033 #endif
  12034 }
  12035 #endif  // HWY_HAVE_FLOAT16
  12036 
  12037 #if HWY_TARGET <= HWY_AVX3
  12038 
  12039 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)>
  12040 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/,
  12041                                               VFromD<RebindToFloat<DI>> v) {
  12042 #if HWY_COMPILER_GCC_ACTUAL
  12043  // Workaround for undefined behavior in _mm_cvtpd_epi64 with GCC if any
  12044  // values of v[i] are not within the range of an int64_t
  12045 
  12046 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  12047  if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) {
  12048    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  12049    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  12050    return Dup128VecFromValues(DI(),
  12051                               detail::X86ScalarNearestInt<int64_t>(raw_v[0]),
  12052                               detail::X86ScalarNearestInt<int64_t>(raw_v[1]));
  12053  }
  12054 #endif
  12055 
  12056  __m128i raw_result;
  12057  __asm__("vcvtpd2qq {%1, %0|%0, %1}"
  12058          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  12059          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  12060          :);
  12061  return VFromD<DI>{raw_result};
  12062 #else  // !HWY_COMPILER_GCC_ACTUAL
  12063  return VFromD<DI>{_mm_cvtpd_epi64(v.raw)};
  12064 #endif
  12065 }
  12066 
  12067 #else  // HWY_TARGET > HWY_AVX3
  12068 
  12069 namespace detail {
  12070 
  12071 #if HWY_ARCH_X86_64
  12072 template <size_t N>
  12073 static HWY_INLINE int64_t
  12074 SSE2ConvFirstF64LaneToNearestI64(Vec128<double, N> v) {
  12075 #if HWY_COMPILER_GCC_ACTUAL
  12076  // Workaround for undefined behavior in _mm_cvtsd_si64 with GCC if v[0] is
  12077  // not within the range of an int64_t
  12078 
  12079 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  12080  if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) {
  12081    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  12082    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  12083    return X86ScalarNearestInt<int64_t>(raw_v[0]);
  12084  }
  12085 #endif
  12086 
  12087  int64_t result;
  12088  __asm__("%vcvtsd2si {%1, %0|%0, %1}"
  12089          : "=r"(result)
  12090          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  12091          :);
  12092  return result;
  12093 #else
  12094  return _mm_cvtsd_si64(v.raw);
  12095 #endif
  12096 }
  12097 #endif  // HWY_ARCH_X86_64
  12098 
  12099 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
  12100 template <class DI64, HWY_IF_I64_D(DI64)>
  12101 static HWY_INLINE VFromD<DI64> SSE2NearestI64InRange(
  12102    DI64 di64, VFromD<RebindToFloat<DI64>> v) {
  12103  const RebindToFloat<DI64> df64;
  12104  const RebindToUnsigned<DI64> du64;
  12105  using VI64 = VFromD<decltype(di64)>;
  12106 
  12107  const auto mant_end = Set(df64, MantissaEnd<double>());
  12108  const auto is_small = Lt(Abs(v), mant_end);
  12109 
  12110  const auto adj_v = Max(v, Set(df64, -9223372036854775808.0)) +
  12111                     IfThenElseZero(is_small, CopySignToAbs(mant_end, v));
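         // Adding +/-2^52 to a small-magnitude lane rounds it to an integer
         // in the current (nearest-even) mode; the rounded value is then read
         // directly from the mantissa bits below.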
  12112  const auto adj_v_biased_exp =
  12113      And(BitCast(di64, ShiftRight<52>(BitCast(du64, adj_v))),
  12114          Set(di64, int64_t{0x7FF}));
  12115 
  12116  // We can simply subtract 1075 from adj_v_biased_exp[i] to get shift_int since
  12117  // adj_v_biased_exp[i] is at least 1075
  12118  const VI64 shift_int = adj_v_biased_exp + Set(di64, int64_t{-1075});
  12119 
  12120  const VI64 mantissa = BitCast(di64, adj_v) & Set(di64, (1LL << 52) - 1);
  12121  // Include implicit 1-bit if is_small[i] is 0. NOTE: the shift count may
  12122  // exceed 63; we rely on x86 returning zero in that case.
  12123  const VI64 int53 = mantissa | IfThenZeroElse(RebindMask(di64, is_small),
  12124                                               Set(di64, 1LL << 52));
  12125 
  12126  const VI64 sign_mask = BroadcastSignBit(BitCast(di64, v));
  12127  // If the input was negative, negate the integer (two's complement).
  12128  return ((int53 << shift_int) ^ sign_mask) - sign_mask;
  12129 }
  12130 #endif  // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
  12131 
  12132 }  // namespace detail
  12133 
  12134 #if HWY_ARCH_X86_64
  12135 template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)>
  12136 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec64<double> v) {
  12137  return VFromD<DI>{
  12138      _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v))};
  12139 }
  12140 template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)>
  12141 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec128<double> v) {
  12142  const __m128i i0 =
  12143      _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v));
  12144  const Full64<double> dd2;
  12145  const __m128i i1 = _mm_cvtsi64_si128(
  12146      detail::SSE2ConvFirstF64LaneToNearestI64(UpperHalf(dd2, v)));
  12147  return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)};
  12148 }
  12149 #endif  // HWY_ARCH_X86_64
  12150 
  12151 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
  12152 template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)),
  12153          HWY_IF_I64_D(DI)>
  12154 static HWY_INLINE VFromD<DI> NearestIntInRange(DI di,
  12155                                               VFromD<RebindToFloat<DI>> v) {
  12156  return detail::SSE2NearestI64InRange(di, v);
  12157 }
  12158 #endif  //  !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2
  12159 
  12160 #endif  // HWY_TARGET <= HWY_AVX3
  12161 
  12162 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 8), HWY_IF_I32_D(DI)>
  12163 static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange(
  12164    DI, VFromD<Rebind<double, DI>> v) {
  12165 #if HWY_COMPILER_GCC_ACTUAL
  12166  // Workaround for undefined behavior in _mm_cvtpd_epi32 with GCC if any values
  12167  // of v[i] are not within the range of an int32_t
  12168 
  12169 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD
  12170  if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) {
  12171    typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  12172    const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw);
  12173    return Dup128VecFromValues(
  12174        DI(), detail::X86ScalarNearestInt<int32_t>(raw_v[0]),
  12175        detail::X86ScalarNearestInt<int32_t>(raw_v[1]), int32_t{0}, int32_t{0});
  12176  }
  12177 #endif
  12178 
  12179  __m128i raw_result;
  12180  __asm__("%vcvtpd2dq {%1, %0|%0, %1}"
  12181          : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result)
  12182          : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw)
  12183          :);
  12184  return VFromD<DI>{raw_result};
  12185 #else  // !HWY_COMPILER_GCC_ACTUAL
  12186  return VFromD<DI>{_mm_cvtpd_epi32(v.raw)};
  12187 #endif
  12188 }
  12189 
  12190 // F16/F32/F64 NearestInt is generic for all vector lengths
  12191 template <class VF, class DF = DFromV<VF>, class DI = RebindToSigned<DF>,
  12192          HWY_IF_FLOAT_D(DF),
  12193          HWY_IF_T_SIZE_ONE_OF_D(DF, (1 << 4) | (1 << 8) |
  12194                                         (HWY_HAVE_FLOAT16 ? (1 << 2) : 0))>
  12195 HWY_API VFromD<DI> NearestInt(const VF v) {
  12196  const DI di;
  12197  using TI = TFromD<DI>;
  12198  using TF = TFromD<DF>;
  12199  using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>;
  12200 
  12201  constexpr TFArith kMinOutOfRangePosVal =
  12202      static_cast<TFArith>(-static_cast<TFArith>(LimitsMin<TI>()));
  12203  static_assert(kMinOutOfRangePosVal > static_cast<TFArith>(0.0),
  12204                "kMinOutOfRangePosVal > 0.0 must be true");
  12205 
  12206  // See comment at the first occurrence of "IfThenElse(overflow,".
  12207  // Here we are rounding, whereas previous occurrences truncate, but there is
  12208  // no difference because the previous float value is well below the max i32.
  12209  const auto overflow = RebindMask(
  12210      di, Ge(v, Set(DF(), ConvertScalarTo<TF>(kMinOutOfRangePosVal))));
  12211  auto result =
  12212      IfThenElse(overflow, Set(di, LimitsMax<TI>()), NearestIntInRange(di, v));
  12213 
  12214  return result;
  12215 }
  12216 
  12217 template <class DI, HWY_IF_I32_D(DI)>
  12218 HWY_API VFromD<DI> DemoteToNearestInt(DI, VFromD<Rebind<double, DI>> v) {
  12219  const DI di;
  12220  const Rebind<double, DI> df64;
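         // Clamp to 2^31 - 1 (exactly representable as f64) so lanes above
         // the i32 range saturate to LimitsMax instead of taking the
         // out-of-range conversion path.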
  12221  return DemoteToNearestIntInRange(di, Min(v, Set(df64, 2147483647.0)));
  12222 }
  12223 
  12224 // ------------------------------ Floating-point rounding (ConvertTo)
  12225 
  12226 #if HWY_TARGET >= HWY_SSSE3
  12227 
  12228 // Toward nearest integer, ties to even
  12229 template <typename T, size_t N>
  12230 HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
  12231  static_assert(IsFloat<T>(), "Only for float");
  12232  // Rely on rounding after addition with a large value such that no mantissa
  12233  // bits remain (assuming the current mode is nearest-even). We may need a
  12234  // compiler flag for precise floating-point to prevent "optimizing" this out.
  12235  const DFromV<decltype(v)> df;
  12236  const auto max = Set(df, MantissaEnd<T>());
  12237  const auto large = CopySignToAbs(max, v);
  12238  const auto added = large + v;
  12239  const auto rounded = added - large;
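         // e.g. v = 2.5f: added = 2^23 + 2.5f rounds to 2^23 + 2 (ties to
         // even; the spacing in [2^23, 2^24) is 1.0), so rounded == 2.0f.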
  12240  // Keep original if NaN or the magnitude is large (already an int).
  12241  return IfThenElse(Abs(v) < max, rounded, v);
  12242 }
  12243 
  12244 namespace detail {
  12245 
  12246 // Truncating to integer and converting back to float is correct except when the
  12247 // input magnitude is large, in which case the input was already an integer
  12248 // (because mantissa >> exponent is zero).
  12249 template <typename T, size_t N>
  12250 HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
  12251  static_assert(IsFloat<T>(), "Only for float");
  12252  const DFromV<decltype(v)> d;
  12253  return Abs(v) < Set(d, MantissaEnd<T>());
  12254 }
  12255 
  12256 }  // namespace detail
  12257 
  12258 // Toward zero, aka truncate
  12259 template <typename T, size_t N>
  12260 HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
  12261  static_assert(IsFloat<T>(), "Only for float");
  12262  const DFromV<decltype(v)> df;
  12263  const RebindToSigned<decltype(df)> di;
  12264 
  12265  const auto integer = ConvertInRangeTo(di, v);  // round toward 0
  12266  const auto int_f = ConvertTo(df, integer);
  12267 
  12268  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
  12269 }
  12270 
  12271 // Toward +infinity, aka ceiling
  12272 template <typename T, size_t N>
  12273 HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
  12274  static_assert(IsFloat<T>(), "Only for float");
  12275  const DFromV<decltype(v)> df;
  12276  const RebindToSigned<decltype(df)> di;
  12277 
  12278  const auto integer = ConvertInRangeTo(di, v);  // round toward 0
  12279  const auto int_f = ConvertTo(df, integer);
  12280 
  12281  // Truncating a positive non-integer ends up smaller; if so, add 1.
  12282  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
  12283 
  12284  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
  12285 }
  12286 
  12287 #ifdef HWY_NATIVE_CEIL_FLOOR_INT
  12288 #undef HWY_NATIVE_CEIL_FLOOR_INT
  12289 #else
  12290 #define HWY_NATIVE_CEIL_FLOOR_INT
  12291 #endif
  12292 
  12293 template <class V, HWY_IF_FLOAT_V(V)>
  12294 HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
  12295  const DFromV<decltype(v)> df;
  12296  const RebindToSigned<decltype(df)> di;
  12297 
  12298  const auto integer = ConvertTo(di, v);  // round toward 0
  12299  const auto int_f = ConvertTo(df, integer);
  12300 
  12301  // Truncating a positive non-integer ends up smaller; if so, add 1.
  12302  return integer -
  12303         VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f < v)));
  12304 }
  12305 
  12306 // Toward -infinity, aka floor
  12307 template <typename T, size_t N>
  12308 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
  12309  static_assert(IsFloat<T>(), "Only for float");
  12310  const DFromV<decltype(v)> df;
  12311  const RebindToSigned<decltype(df)> di;
  12312 
  12313  const auto integer = ConvertInRangeTo(di, v);  // round toward 0
  12314  const auto int_f = ConvertTo(df, integer);
  12315 
  12316  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  12317  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
  12318 
  12319  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
  12320 }
  12321 
  12322 template <class V, HWY_IF_FLOAT_V(V)>
  12323 HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
  12324  const DFromV<decltype(v)> df;
  12325  const RebindToSigned<decltype(df)> di;
  12326 
  12327  const auto integer = ConvertTo(di, v);  // round toward 0
  12328  const auto int_f = ConvertTo(df, integer);
  12329 
  12330  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  12331  return integer +
  12332         VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f > v)));
  12333 }
  12334 
  12335 #else
  12336 
  12337 // Toward nearest integer, ties to even
  12338 #if HWY_HAVE_FLOAT16
  12339 template <size_t N>
  12340 HWY_API Vec128<float16_t, N> Round(const Vec128<float16_t, N> v) {
  12341  return Vec128<float16_t, N>{
  12342      _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
  12343 }
  12344 #endif  // HWY_HAVE_FLOAT16
  12345 template <size_t N>
  12346 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  12347  return Vec128<float, N>{
  12348      _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
  12349 }
  12350 template <size_t N>
  12351 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
  12352  return Vec128<double, N>{
  12353      _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
  12354 }
  12355 
  12356 // Toward zero, aka truncate
  12357 #if HWY_HAVE_FLOAT16
  12358 template <size_t N>
  12359 HWY_API Vec128<float16_t, N> Trunc(const Vec128<float16_t, N> v) {
  12360  return Vec128<float16_t, N>{
  12361      _mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
  12362 }
  12363 #endif  // HWY_HAVE_FLOAT16
  12364 template <size_t N>
  12365 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  12366  return Vec128<float, N>{
  12367      _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
  12368 }
  12369 template <size_t N>
  12370 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
  12371  return Vec128<double, N>{
  12372      _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
  12373 }
  12374 
  12375 // Toward +infinity, aka ceiling
  12376 #if HWY_HAVE_FLOAT16
  12377 template <size_t N>
  12378 HWY_API Vec128<float16_t, N> Ceil(const Vec128<float16_t, N> v) {
  12379  return Vec128<float16_t, N>{
  12380      _mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
  12381 }
  12382 #endif  // HWY_HAVE_FLOAT16
  12383 template <size_t N>
  12384 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  12385  return Vec128<float, N>{
  12386      _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
  12387 }
  12388 template <size_t N>
  12389 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
  12390  return Vec128<double, N>{
  12391      _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
  12392 }
  12393 
  12394 // Toward -infinity, aka floor
  12395 #if HWY_HAVE_FLOAT16
  12396 template <size_t N>
  12397 HWY_API Vec128<float16_t, N> Floor(const Vec128<float16_t, N> v) {
  12398  return Vec128<float16_t, N>{
  12399      _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
  12400 }
  12401 #endif  // HWY_HAVE_FLOAT16
  12402 template <size_t N>
  12403 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  12404  return Vec128<float, N>{
  12405      _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
  12406 }
  12407 template <size_t N>
  12408 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
  12409  return Vec128<double, N>{
  12410      _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
  12411 }
  12412 
  12413 #endif  // !HWY_SSSE3
  12414 
  12415 // ------------------------------ Floating-point classification
  12416 
  12417 #define HWY_X86_FPCLASS_QNAN 0x01
  12418 #define HWY_X86_FPCLASS_POS0 0x02
  12419 #define HWY_X86_FPCLASS_NEG0 0x04
  12420 #define HWY_X86_FPCLASS_POS_INF 0x08
  12421 #define HWY_X86_FPCLASS_NEG_INF 0x10
  12422 #define HWY_X86_FPCLASS_SUBNORMAL 0x20
  12423 #define HWY_X86_FPCLASS_NEG 0x40
  12424 #define HWY_X86_FPCLASS_SNAN 0x80
  12425 
  12426 #if HWY_HAVE_FLOAT16 || HWY_IDE
  12427 
  12428 template <size_t N>
  12429 HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) {
  12430  return Mask128<float16_t, N>{
  12431      _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
  12432 }
  12433 
  12434 template <size_t N>
  12435 HWY_API Mask128<float16_t, N> IsEitherNaN(Vec128<float16_t, N> a,
  12436                                          Vec128<float16_t, N> b) {
  12437  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
  12438  HWY_DIAGNOSTICS(push)
  12439  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  12440  return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)};
  12441  HWY_DIAGNOSTICS(pop)
  12442 }
  12443 
  12444 template <size_t N>
  12445 HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) {
  12446  return Mask128<float16_t, N>{_mm_fpclass_ph_mask(
  12447      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
  12448 }
  12449 
  12450 template <size_t N>
  12451 HWY_API Mask128<float16_t, N> IsFinite(const Vec128<float16_t, N> v) {
  12452  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
  12453  // and negate the mask.
  12454  return Not(Mask128<float16_t, N>{_mm_fpclass_ph_mask(
  12455      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
  12456                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
  12457 }
  12458 
  12459 #endif  // HWY_HAVE_FLOAT16
  12460 
  12461 template <size_t N>
  12462 HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
  12463 #if HWY_TARGET <= HWY_AVX3
  12464  return Mask128<float, N>{
  12465      _mm_fpclass_ps_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
  12466 #else
  12467  return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)};
  12468 #endif
  12469 }
  12470 template <size_t N>
  12471 HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
  12472 #if HWY_TARGET <= HWY_AVX3
  12473  return Mask128<double, N>{
  12474      _mm_fpclass_pd_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
  12475 #else
  12476  return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)};
  12477 #endif
  12478 }
  12479 
  12480 #ifdef HWY_NATIVE_IS_EITHER_NAN
  12481 #undef HWY_NATIVE_IS_EITHER_NAN
  12482 #else
  12483 #define HWY_NATIVE_IS_EITHER_NAN
  12484 #endif
  12485 
  12486 template <size_t N>
  12487 HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) {
  12488 #if HWY_TARGET <= HWY_AVX3
  12489  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)};
  12490 #else
  12491  return Mask128<float, N>{_mm_cmpunord_ps(a.raw, b.raw)};
  12492 #endif
  12493 }
  12494 
  12495 template <size_t N>
  12496 HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a,
  12497                                       Vec128<double, N> b) {
  12498 #if HWY_TARGET <= HWY_AVX3
  12499  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)};
  12500 #else
  12501  return Mask128<double, N>{_mm_cmpunord_pd(a.raw, b.raw)};
  12502 #endif
  12503 }
  12504 
  12505 #if HWY_TARGET <= HWY_AVX3
  12506 
  12507 // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
  12508 #ifdef HWY_NATIVE_ISINF
  12509 #undef HWY_NATIVE_ISINF
  12510 #else
  12511 #define HWY_NATIVE_ISINF
  12512 #endif
  12513 
  12514 template <size_t N>
  12515 HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
  12516  return Mask128<float, N>{_mm_fpclass_ps_mask(
  12517      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
  12518 }
  12519 template <size_t N>
  12520 HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) {
  12521  return Mask128<double, N>{_mm_fpclass_pd_mask(
  12522      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
  12523 }
  12524 
  12525 // Returns whether normal/subnormal/zero.
  12526 template <size_t N>
  12527 HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) {
  12528  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
  12529  // and negate the mask.
  12530  return Not(Mask128<float, N>{_mm_fpclass_ps_mask(
  12531      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
  12532                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
  12533 }
  12534 template <size_t N>
  12535 HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
  12536  return Not(Mask128<double, N>{_mm_fpclass_pd_mask(
  12537      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
  12538                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
  12539 }
  12540 
  12541 #endif  // HWY_TARGET <= HWY_AVX3
  12542 
  12543 // ================================================== CRYPTO
  12544 
  12545 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4
  12546 
  12547 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
  12548 #ifdef HWY_NATIVE_AES
  12549 #undef HWY_NATIVE_AES
  12550 #else
  12551 #define HWY_NATIVE_AES
  12552 #endif
  12553 
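        // One full AES encryption round: aesenc applies ShiftRows, SubBytes
        // and MixColumns to `state`, then XORs `round_key`.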
  12554 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
  12555                                 Vec128<uint8_t> round_key) {
  12556  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
  12557 }
  12558 
  12559 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
  12560                                     Vec128<uint8_t> round_key) {
  12561  return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
  12562 }
  12563 
  12564 HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
  12565  return Vec128<uint8_t>{_mm_aesimc_si128(state.raw)};
  12566 }
  12567 
  12568 HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
  12569                                    Vec128<uint8_t> round_key) {
  12570  return Vec128<uint8_t>{_mm_aesdec_si128(state.raw, round_key.raw)};
  12571 }
  12572 
  12573 HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
  12574                                        Vec128<uint8_t> round_key) {
  12575  return Vec128<uint8_t>{_mm_aesdeclast_si128(state.raw, round_key.raw)};
  12576 }
  12577 
  12578 template <uint8_t kRcon>
  12579 HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
  12580  return Vec128<uint8_t>{_mm_aeskeygenassist_si128(v.raw, kRcon)};
  12581 }
  12582 
  12583 template <size_t N>
  12584 HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
  12585                                       Vec128<uint64_t, N> b) {
  12586  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
  12587 }
  12588 
  12589 template <size_t N>
  12590 HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
  12591                                       Vec128<uint64_t, N> b) {
  12592  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
  12593 }
  12594 
  12595 #endif  // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4
  12596 
  12597 // ================================================== MISC
  12598 
  12599 // ------------------------------ LoadMaskBits (TestBit)
  12600 
  12601 #if HWY_TARGET > HWY_AVX3
  12602 namespace detail {
  12603 
  12604 template <class D, HWY_IF_T_SIZE_D(D, 1)>
  12605 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
  12606  const RebindToUnsigned<decltype(d)> du;
  12607  // Easier than Set(), which would require a >8-bit type, which would not
  12608  // compile for T=uint8_t, kN=1.
  12609  const VFromD<D> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
  12610 
  12611 #if HWY_TARGET == HWY_SSE2
  12612  // {b0, b1, ...} ===> {b0, b0, b1, b1, ...}
  12613  __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw);
  12614  // {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...}
  12615  unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits);
  12616  // {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==>
  12617  // {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1}
  12618  const VFromD<decltype(du)> rep8{
  12619      _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)};
  12620 #else
  12621  // Replicate bytes 8x such that each byte contains the bit that governs it.
  12622  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
  12623                                                    1, 1, 1, 1, 1, 1, 1, 1};
  12624  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
  12625 #endif
  12626  const VFromD<decltype(du)> bit = Dup128VecFromValues(
  12627      du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
  12628  return RebindMask(d, TestBit(rep8, bit));
  12629 }
  12630 
  12631 template <class D, HWY_IF_T_SIZE_D(D, 2)>
  12632 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
  12633  const RebindToUnsigned<decltype(d)> du;
  12634  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  12635  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  12636  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
  12637 }
  12638 
  12639 template <class D, HWY_IF_T_SIZE_D(D, 4)>
  12640 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
  12641  const RebindToUnsigned<decltype(d)> du;
  12642  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  12643  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
  12644  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
  12645 }
  12646 
  12647 template <class D, HWY_IF_T_SIZE_D(D, 8)>
  12648 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
  12649  const RebindToUnsigned<decltype(d)> du;
  12650  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
  12651  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
  12652 }
  12653 
  12654 }  // namespace detail
  12655 #endif  // HWY_TARGET > HWY_AVX3
  12656 
  12657 // `p` points to at least 8 readable bytes, not all of which need be valid.
  12658 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12659 HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  12660  constexpr size_t kN = MaxLanes(d);
  12661 #if HWY_TARGET <= HWY_AVX3
  12662  (void)d;
  12663  uint64_t mask_bits = 0;
  12664  constexpr size_t kNumBytes = (kN + 7) / 8;
  12665  CopyBytes<kNumBytes>(bits, &mask_bits);
  12666  if (kN < 8) {
  12667    mask_bits &= (1ull << kN) - 1;
  12668  }
  12669 
  12670  return MFromD<D>::FromBits(mask_bits);
  12671 #else
  12672  uint64_t mask_bits = 0;
  12673  constexpr size_t kNumBytes = (kN + 7) / 8;
  12674  CopyBytes<kNumBytes>(bits, &mask_bits);
  12675  if (kN < 8) {
  12676    mask_bits &= (1ull << kN) - 1;
  12677  }
  12678 
  12679  return detail::LoadMaskBits128(d, mask_bits);
  12680 #endif
  12681 }
  12682 
  12683 // ------------------------------ Dup128MaskFromMaskBits
  12684 
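        // For example, with 4 x u32 lanes, Dup128MaskFromMaskBits(d, 0x5u)
        // sets lanes 0 and 2.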
  12685 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12686 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  12687  constexpr size_t kN = MaxLanes(d);
  12688  if (kN < 8) mask_bits &= (1u << kN) - 1;
  12689 
  12690 #if HWY_TARGET <= HWY_AVX3
  12691  return MFromD<D>::FromBits(mask_bits);
  12692 #else
  12693  return detail::LoadMaskBits128(d, mask_bits);
  12694 #endif
  12695 }
  12696 
  12697 template <typename T>
  12698 struct CompressIsPartition {
  12699 #if HWY_TARGET <= HWY_AVX3
  12700  // AVX3 supports native compress, but a table-based approach allows
  12701  // 'partitioning' (also moving mask=false lanes to the top), which helps
  12702  // vqsort. This is only feasible for eight or fewer lanes, i.e. sizeof(T) ==
  12703  // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
  12704  // u32x8 etc.).
  12705  enum { value = (sizeof(T) == 8) };
  12706 #else
  12707  // generic_ops-inl does not guarantee IsPartition for 8-bit.
  12708  enum { value = (sizeof(T) != 1) };
  12709 #endif
  12710 };
  12711 
  12712 namespace detail {
  12713 
  12714 // Returns `mask_bits` (from movemask) with the upper bits cleared, if there
  12715 // are 8 or fewer valid bits.
  12716 template <class D>
  12717 constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
  12718  return (d.MaxBytes() >= 16) ? mask_bits
  12719                              : mask_bits & ((1ull << d.MaxLanes()) - 1);
  12720 }
  12721 
  12722 }  // namespace detail
  12723 
  12724 #if HWY_TARGET <= HWY_AVX3
  12725 
  12726 // ------------------------------ BitsFromMask (MFromD, OnlyActive)
  12727 // Generic for all vector lengths.
  12728 template <class D>
  12729 HWY_INLINE uint64_t BitsFromMask(D d, MFromD<D> mask) {
  12730  return detail::OnlyActive(d, mask.raw);
  12731 }
  12732 
  12733 // ------------------------------ StoreMaskBits
  12734 
  12735 // `p` points to at least 8 writable bytes.
  12736 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12737 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  12738  constexpr size_t kN = MaxLanes(d);
  12739  constexpr size_t kNumBytes = (kN + 7) / 8;
  12740  CopyBytes<kNumBytes>(&mask.raw, bits);
  12741 
  12742  // Non-full byte, need to clear the undefined upper bits.
  12743  if (kN < 8) {
  12744    const int mask_bits = (1 << kN) - 1;
  12745    bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
  12746  }
  12747 
  12748  return kNumBytes;
  12749 }
  12750 
  12751 // ------------------------------ Mask testing
  12752 
  12753 // Beware: the suffix indicates the number of mask bits, not lane size!
  12754 
  12755 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12756 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  12757  constexpr size_t kN = MaxLanes(d);
  12758  const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
  12759  return PopCount(mask_bits);
  12760 }
  12761 
  12762 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12763 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  12764  constexpr size_t kN = MaxLanes(d);
  12765  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  12766  return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
  12767 }
  12768 
  12769 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12770 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  12771  constexpr size_t kN = MaxLanes(d);
  12772  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  12773  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
  12774 }
  12775 
  12776 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12777 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  12778  constexpr size_t kN = MaxLanes(d);
  12779  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  12780  return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
  12781 }
  12782 
  12783 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12784 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  12785  constexpr size_t kN = MaxLanes(d);
  12786  const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1);
  12787  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
  12788                   : -1;
  12789 }
  12790 
  12791 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12792 HWY_API bool AllFalse(D d, MFromD<D> mask) {
  12793  constexpr size_t kN = MaxLanes(d);
  12794  const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
  12795  return mask_bits == 0;
  12796 }
  12797 
  12798 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12799 HWY_API bool AllTrue(D d, MFromD<D> mask) {
  12800  constexpr size_t kN = MaxLanes(d);
  12801  const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1);
  12802  // Cannot use _kortestc because we may have less than 8 mask bits.
  12803  return mask_bits == (1ull << kN) - 1;
  12804 }
  12805 
  12806 // ------------------------------ Compress
  12807 
  12808 // 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512.
  12809 
  12810 // Single lane: no-op
  12811 template <typename T>
  12812 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  12813  return v;
  12814 }
  12815 
  12816 template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)>
  12817 HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
  12818  return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
  12819 }
  12820 
  12821 template <typename T, HWY_IF_T_SIZE(T, 8)>
  12822 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
  12823  HWY_DASSERT(mask.raw < 4);
  12824 
  12825  // There are only 2 lanes, so we can afford to load the index vector directly.
  12826  alignas(16) static constexpr uint8_t u8_indices[64] = {
  12827      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
  12828      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
  12829      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
  12830      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
  12831 
  12832  const DFromV<decltype(v)> d;
  12833  const Repartition<uint8_t, decltype(d)> d8;
  12834  const auto index = Load(d8, u8_indices + 16 * mask.raw);
  12835  return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
  12836 }
  12837 
  12838 // ------------------------------ CompressNot (Compress)
  12839 
  12840 // Single lane: no-op
  12841 template <typename T>
  12842 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  12843  return v;
  12844 }
  12845 
  12846 template <typename T, HWY_IF_T_SIZE(T, 8)>
  12847 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  12848  // See CompressIsPartition, PrintCompressNot64x2NibbleTables
  12849  alignas(16) static constexpr uint64_t packed_array[16] = {
  12850      0x00000010, 0x00000001, 0x00000010, 0x00000010};
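         // Each entry packs two 4-bit lane indices. e.g. mask_bits == 1 (only
         // lane 0 true) maps to 0x01 = indices {1, 0}, moving the mask=false
         // lane 1 to the front.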
  12851 
  12852  // For lane i, shift the i-th 4-bit index down to bits [0, 2).
  12853  const DFromV<decltype(v)> d;
  12854  const RebindToUnsigned<decltype(d)> du64;
  12855  const auto packed = Set(du64, packed_array[mask.raw]);
  12856  alignas(16) static constexpr uint64_t kShifts[2] = {0, 4};
  12857  Vec128<uint64_t> indices = packed >> Load(du64, kShifts);
  12858  // _mm_permutevar_pd will ignore the upper bits, but TableLookupLanes uses
  12859  // a fallback in MSAN builds, so mask there.
  12860  HWY_IF_CONSTEXPR(HWY_IS_MSAN) indices &= Set(du64, 1);
  12861  return TableLookupLanes(v, Indices128<T>{indices.raw});
  12862 }
  12863 
  12864 // ------------------------------ CompressBlocksNot
  12865 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
  12866                                           Mask128<uint64_t> /* m */) {
  12867  return v;
  12868 }
  12869 
  12870 // ------------------------------ CompressStore (defined in x86_512)
  12871 
  12872 // ------------------------------ CompressBlendedStore (defined in x86_avx3)
  12873 
  12874 // ------------------------------ CompressBitsStore (defined in x86_512)
  12875 
  12876 #else  // AVX2 or below
  12877 
  12878 // ------------------------------ BitsFromMask
  12879 
  12880 namespace detail {
  12881 
  12882 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
  12883  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
  12884 }
  12885 
  12886 }  // namespace detail
  12887 
  12888 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
  12889 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  12890  const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
  12891  return detail::OnlyActive(d,
  12892                            detail::U64FromInt(_mm_movemask_epi8(sign_bits)));
  12893 }
  12894 
  12895 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
  12896 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  12897  // Remove useless lower half of each u16 while preserving the sign bit.
  12898  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
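         // The low 8 bytes now hold the saturated lanes (sign preserved) and
         // the high 8 bytes are zero, so movemask returns one bit per u16 lane.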
  12899  return detail::OnlyActive(d,
  12900                            detail::U64FromInt(_mm_movemask_epi8(sign_bits)));
  12901 }
  12902 
  12903 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
  12904 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  12905  const RebindToFloat<decltype(d)> df;
  12906  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
  12907  return detail::OnlyActive(d,
  12908                            detail::U64FromInt(_mm_movemask_ps(sign_bits.raw)));
  12909 }
  12910 
  12911 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
  12912 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  12913  const RebindToFloat<D> df;
  12914  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
  12915  return detail::OnlyActive(d,
  12916                            detail::U64FromInt(_mm_movemask_pd(sign_bits.raw)));
  12917 }
  12918 
  12919 // ------------------------------ StoreMaskBits
  12920 // `p` points to at least 8 writable bytes.
  12921 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12922 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  12923  constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
  12924  const uint64_t mask_bits = BitsFromMask(d, mask);
  12925  CopyBytes<kNumBytes>(&mask_bits, bits);
  12926  return kNumBytes;
  12927 }
  12928 
  12929 // ------------------------------ Mask testing
  12930 
  12931 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12932 HWY_API bool AllFalse(D d, MFromD<D> mask) {
  12933  // Cheaper than PTEST, which is 2 uops with 3 cycles of latency.
  12934  return BitsFromMask(d, mask) == 0;
  12935 }
  12936 
  12937 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12938 HWY_API bool AllTrue(D d, MFromD<D> mask) {
  12939  constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1;
  12940  return BitsFromMask(d, mask) == kAllBits;
  12941 }
  12942 
  12943 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12944 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  12945  return PopCount(BitsFromMask(d, mask));
  12946 }
  12947 
  12948 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12949 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  12950  return Num0BitsBelowLS1Bit_Nonzero32(
  12951      static_cast<uint32_t>(BitsFromMask(d, mask)));
  12952 }
  12953 
  12954 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12955 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  12956  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  12957  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
  12958 }
  12959 
  12960 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12961 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  12962  return 31 - Num0BitsAboveMS1Bit_Nonzero32(
  12963                  static_cast<uint32_t>(BitsFromMask(d, mask)));
  12964 }
  12965 
  12966 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  12967 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  12968  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  12969  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
  12970                   : -1;
  12971 }
  12972 
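        // Example (sketch): if BitsFromMask yields 0b0110, FindKnownFirstTrue
        // returns 1 (the trailing zero count) and FindLastTrue returns
        // 31 - 29 = 2 (via the leading zero count of the 32-bit value); for an
        // all-false mask, FindFirstTrue and FindLastTrue return -1 instead.
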
  12973 // ------------------------------ Compress, CompressBits
  12974 
  12975 namespace detail {
  12976 
  12977 // Also works for N < 8: the first 16 rows' leading 4-tuples only reference bytes 0-6.
  12978 template <class D, HWY_IF_T_SIZE_D(D, 2)>
  12979 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  12980  HWY_DASSERT(mask_bits < 256);
  12981  const Rebind<uint8_t, decltype(d)> d8;
  12982  const Twice<decltype(d8)> d8t;
  12983  const RebindToUnsigned<decltype(d)> du;
  12984 
  12985  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
  12986  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
  12987  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
  12988  // store lane indices and convert to byte indices (2*lane + 0..1), with the
  12989  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
  12990  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
  12991  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
  12992  // is likely more costly than the higher cache footprint from storing bytes.
  12993  alignas(16) static constexpr uint8_t table[2048] = {
  12994      // PrintCompress16x8Tables
  12995      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
  12996      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
  12997      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
  12998      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
  12999      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0, 6,  2,  4,  8,  10, 12, 14,  //
  13000      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0, 2,  6,  4,  8,  10, 12, 14,  //
  13001      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0, 4,  6,  2,  8,  10, 12, 14,  //
  13002      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
  13003      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0, 8,  2,  4,  6,  10, 12, 14,  //
  13004      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0, 2,  8,  4,  6,  10, 12, 14,  //
  13005      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0, 4,  8,  2,  6,  10, 12, 14,  //
  13006      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0, 2,  4,  8,  6,  10, 12, 14,  //
  13007      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0, 6,  8,  2,  4,  10, 12, 14,  //
  13008      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0, 2,  6,  8,  4,  10, 12, 14,  //
  13009      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0, 4,  6,  8,  2,  10, 12, 14,  //
  13010      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
  13011      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0, 10, 2,  4,  6,  8,  12, 14,  //
  13012      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0, 2,  10, 4,  6,  8,  12, 14,  //
  13013      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0, 4,  10, 2,  6,  8,  12, 14,  //
  13014      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0, 2,  4,  10, 6,  8,  12, 14,  //
  13015      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0, 6,  10, 2,  4,  8,  12, 14,  //
  13016      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0, 2,  6,  10, 4,  8,  12, 14,  //
  13017      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0, 4,  6,  10, 2,  8,  12, 14,  //
  13018      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0, 2,  4,  6,  10, 8,  12, 14,  //
  13019      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0, 8,  10, 2,  4,  6,  12, 14,  //
  13020      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0, 2,  8,  10, 4,  6,  12, 14,  //
  13021      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0, 4,  8,  10, 2,  6,  12, 14,  //
  13022      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0, 2,  4,  8,  10, 6,  12, 14,  //
  13023      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0, 6,  8,  10, 2,  4,  12, 14,  //
  13024      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0, 2,  6,  8,  10, 4,  12, 14,  //
  13025      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0, 4,  6,  8,  10, 2,  12, 14,  //
  13026      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
  13027      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0, 12, 2,  4,  6,  8,  10, 14,  //
  13028      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0, 2,  12, 4,  6,  8,  10, 14,  //
  13029      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0, 4,  12, 2,  6,  8,  10, 14,  //
  13030      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0, 2,  4,  12, 6,  8,  10, 14,  //
  13031      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0, 6,  12, 2,  4,  8,  10, 14,  //
  13032      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0, 2,  6,  12, 4,  8,  10, 14,  //
  13033      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0, 4,  6,  12, 2,  8,  10, 14,  //
  13034      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0, 2,  4,  6,  12, 8,  10, 14,  //
  13035      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0, 8,  12, 2,  4,  6,  10, 14,  //
  13036      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0, 2,  8,  12, 4,  6,  10, 14,  //
  13037      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0, 4,  8,  12, 2,  6,  10, 14,  //
  13038      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0, 2,  4,  8,  12, 6,  10, 14,  //
  13039      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0, 6,  8,  12, 2,  4,  10, 14,  //
  13040      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0, 2,  6,  8,  12, 4,  10, 14,  //
  13041      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0, 4,  6,  8,  12, 2,  10, 14,  //
  13042      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0, 2,  4,  6,  8,  12, 10, 14,  //
  13043      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0, 10, 12, 2,  4,  6,  8,  14,  //
  13044      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0, 2,  10, 12, 4,  6,  8,  14,  //
  13045      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0, 4,  10, 12, 2,  6,  8,  14,  //
  13046      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0, 2,  4,  10, 12, 6,  8,  14,  //
  13047      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0, 6,  10, 12, 2,  4,  8,  14,  //
  13048      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0, 2,  6,  10, 12, 4,  8,  14,  //
  13049      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0, 4,  6,  10, 12, 2,  8,  14,  //
  13050      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0, 2,  4,  6,  10, 12, 8,  14,  //
  13051      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0, 8,  10, 12, 2,  4,  6,  14,  //
  13052      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0, 2,  8,  10, 12, 4,  6,  14,  //
  13053      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0, 4,  8,  10, 12, 2,  6,  14,  //
  13054      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0, 2,  4,  8,  10, 12, 6,  14,  //
  13055      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0, 6,  8,  10, 12, 2,  4,  14,  //
  13056      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0, 2,  6,  8,  10, 12, 4,  14,  //
  13057      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0, 4,  6,  8,  10, 12, 2,  14,  //
  13058      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
  13059      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0, 14, 2,  4,  6,  8,  10, 12,  //
  13060      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0, 2,  14, 4,  6,  8,  10, 12,  //
  13061      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0, 4,  14, 2,  6,  8,  10, 12,  //
  13062      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0, 2,  4,  14, 6,  8,  10, 12,  //
  13063      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0, 6,  14, 2,  4,  8,  10, 12,  //
  13064      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0, 2,  6,  14, 4,  8,  10, 12,  //
  13065      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0, 4,  6,  14, 2,  8,  10, 12,  //
  13066      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0, 2,  4,  6,  14, 8,  10, 12,  //
  13067      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0, 8,  14, 2,  4,  6,  10, 12,  //
  13068      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0, 2,  8,  14, 4,  6,  10, 12,  //
  13069      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0, 4,  8,  14, 2,  6,  10, 12,  //
  13070      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0, 2,  4,  8,  14, 6,  10, 12,  //
  13071      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0, 6,  8,  14, 2,  4,  10, 12,  //
  13072      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0, 2,  6,  8,  14, 4,  10, 12,  //
  13073      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0, 4,  6,  8,  14, 2,  10, 12,  //
  13074      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0, 2,  4,  6,  8,  14, 10, 12,  //
  13075      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0, 10, 14, 2,  4,  6,  8,  12,  //
  13076      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0, 2,  10, 14, 4,  6,  8,  12,  //
  13077      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0, 4,  10, 14, 2,  6,  8,  12,  //
  13078      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0, 2,  4,  10, 14, 6,  8,  12,  //
  13079      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0, 6,  10, 14, 2,  4,  8,  12,  //
  13080      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0, 2,  6,  10, 14, 4,  8,  12,  //
  13081      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0, 4,  6,  10, 14, 2,  8,  12,  //
  13082      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0, 2,  4,  6,  10, 14, 8,  12,  //
  13083      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0, 8,  10, 14, 2,  4,  6,  12,  //
  13084      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0, 2,  8,  10, 14, 4,  6,  12,  //
  13085      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0, 4,  8,  10, 14, 2,  6,  12,  //
  13086      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0, 2,  4,  8,  10, 14, 6,  12,  //
  13087      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0, 6,  8,  10, 14, 2,  4,  12,  //
  13088      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0, 2,  6,  8,  10, 14, 4,  12,  //
  13089      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0, 4,  6,  8,  10, 14, 2,  12,  //
  13090      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0, 2,  4,  6,  8,  10, 14, 12,  //
  13091      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0, 12, 14, 2,  4,  6,  8,  10,  //
  13092      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0, 2,  12, 14, 4,  6,  8,  10,  //
  13093      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0, 4,  12, 14, 2,  6,  8,  10,  //
  13094      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0, 2,  4,  12, 14, 6,  8,  10,  //
  13095      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0, 6,  12, 14, 2,  4,  8,  10,  //
  13096      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0, 2,  6,  12, 14, 4,  8,  10,  //
  13097      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0, 4,  6,  12, 14, 2,  8,  10,  //
  13098      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0, 2,  4,  6,  12, 14, 8,  10,  //
  13099      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0, 8,  12, 14, 2,  4,  6,  10,  //
  13100      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0, 2,  8,  12, 14, 4,  6,  10,  //
  13101      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0, 4,  8,  12, 14, 2,  6,  10,  //
  13102      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0, 2,  4,  8,  12, 14, 6,  10,  //
  13103      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0, 6,  8,  12, 14, 2,  4,  10,  //
  13104      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0, 2,  6,  8,  12, 14, 4,  10,  //
  13105      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0, 4,  6,  8,  12, 14, 2,  10,  //
  13106      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0, 2,  4,  6,  8,  12, 14, 10,  //
  13107      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0, 10, 12, 14, 2,  4,  6,  8,   //
  13108      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0, 2,  10, 12, 14, 4,  6,  8,   //
  13109      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0, 4,  10, 12, 14, 2,  6,  8,   //
  13110      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0, 2,  4,  10, 12, 14, 6,  8,   //
  13111      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0, 6,  10, 12, 14, 2,  4,  8,   //
  13112      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0, 2,  6,  10, 12, 14, 4,  8,   //
  13113      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0, 4,  6,  10, 12, 14, 2,  8,   //
  13114      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0, 2,  4,  6,  10, 12, 14, 8,   //
  13115      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0, 8,  10, 12, 14, 2,  4,  6,   //
  13116      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0, 2,  8,  10, 12, 14, 4,  6,   //
  13117      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0, 4,  8,  10, 12, 14, 2,  6,   //
  13118      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0, 2,  4,  8,  10, 12, 14, 6,   //
  13119      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0, 6,  8,  10, 12, 14, 2,  4,   //
  13120      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0, 2,  6,  8,  10, 12, 14, 4,   //
  13121      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0, 4,  6,  8,  10, 12, 14, 2,   //
  13122      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0, 2,  4,  6,  8,  10, 12, 14};
  13123 
  13124  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  13125  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
  13126  return BitCast(d, pairs + Set(du, 0x0100));
  13127 }
  13128 
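        // Worked example of the pair expansion above (sketch): for mask_bits =
        // 0b10, the table row is {2, 0, 4, 6, 8, 10, 12, 14}. ZipLower
        // duplicates each byte into a u16 (lane 0 becomes 0x0202) and adding
        // 0x0100 increments the upper byte, yielding bytes {2, 3}: exactly the
        // byte indices of u16 lane 1, which PSHUFB then moves to the front.
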
  13129 template <class D, HWY_IF_T_SIZE_D(D, 2)>
  13130 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  13131  HWY_DASSERT(mask_bits < 256);
  13132  const Rebind<uint8_t, decltype(d)> d8;
  13133  const Twice<decltype(d8)> d8t;
  13134  const RebindToUnsigned<decltype(d)> du;
  13135 
  13136  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
  13137  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
  13138  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
  13139  // store lane indices and convert to byte indices (2*lane + 0..1), with the
  13140  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
  13141  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
  13142  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
  13143  // is likely more costly than the higher cache footprint from storing bytes.
  13144  alignas(16) static constexpr uint8_t table[2048] = {
  13145      // PrintCompressNot16x8Tables
  13146      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
  13147      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
  13148      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
  13149      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
  13150      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
  13151      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
  13152      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
  13153      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
  13154      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
  13155      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
  13156      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
  13157      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
  13158      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
  13159      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
  13160      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
  13161      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
  13162      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
  13163      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
  13164      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
  13165      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
  13166      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
  13167      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
  13168      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
  13169      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
  13170      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
  13171      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
  13172      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
  13173      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
  13174      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
  13175      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
  13176      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
  13177      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
  13178      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
  13179      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
  13180      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
  13181      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
  13182      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
  13183      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
  13184      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
  13185      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
  13186      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
  13187      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
  13188      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
  13189      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
  13190      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
  13191      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
  13192      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
  13193      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
  13194      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
  13195      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
  13196      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
  13197      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
  13198      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
  13199      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
  13200      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
  13201      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
  13202      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
  13203      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
  13204      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
  13205      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
  13206      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
  13207      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
  13208      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
  13209      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
  13210      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
  13211      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
  13212      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
  13213      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
  13214      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
  13215      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
  13216      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
  13217      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
  13218      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
  13219      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
  13220      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
  13221      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
  13222      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
  13223      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
  13224      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
  13225      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
  13226      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
  13227      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
  13228      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
  13229      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
  13230      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
  13231      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
  13232      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
  13233      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
  13234      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
  13235      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
  13236      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
  13237      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
  13238      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
  13239      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
  13240      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
  13241      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
  13242      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
  13243      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
  13244      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
  13245      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
  13246      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
  13247      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
  13248      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
  13249      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
  13250      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
  13251      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
  13252      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
  13253      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
  13254      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
  13255      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
  13256      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
  13257      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
  13258      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
  13259      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
  13260      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
  13261      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
  13262      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
  13263      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
  13264      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
  13265      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
  13266      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
  13267      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
  13268      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
  13269      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
  13270      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
  13271      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
  13272      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
  13273      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
  13274 
  13275  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  13276  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
  13277  return BitCast(d, pairs + Set(du, 0x0100));
  13278 }
  13279 
  13280 template <class D, HWY_IF_T_SIZE_D(D, 4)>
  13281 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  13282  HWY_DASSERT(mask_bits < 16);
  13283 
  13284  // There are only 4 lanes, so we can afford to load the index vector directly.
  13285  alignas(16) static constexpr uint8_t u8_indices[256] = {
  13286      // PrintCompress32x4Tables
  13287      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
  13288      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
  13289      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
  13290      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
  13291      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
  13292      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
  13293      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
  13294      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
  13295      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
  13296      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
  13297      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
  13298      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
  13299      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
  13300      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
  13301      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
  13302      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
  13303 
  13304  const Repartition<uint8_t, decltype(d)> d8;
  13305  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
  13306 }
  13307 
  13308 template <class D, HWY_IF_T_SIZE_D(D, 4)>
  13309 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  13310  HWY_DASSERT(mask_bits < 16);
  13311 
  13312  // There are only 4 lanes, so we can afford to load the index vector directly.
  13313  alignas(16) static constexpr uint8_t u8_indices[256] = {
  13314      // PrintCompressNot32x4Tables
  13315      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
  13316      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
  13317      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
  13318      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
  13319      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
  13320      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
  13321      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
  13322      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
  13323      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
  13324      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
  13325      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
  13326      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
  13327      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
  13328      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
  13329      12, 13, 14, 15};
  13330 
  13331  const Repartition<uint8_t, decltype(d)> d8;
  13332  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
  13333 }
  13334 
  13335 template <class D, HWY_IF_T_SIZE_D(D, 8)>
  13336 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  13337  HWY_DASSERT(mask_bits < 4);
  13338 
  13339  // There are only 2 lanes, so we can afford to load the index vector directly.
  13340  alignas(16) static constexpr uint8_t u8_indices[64] = {
  13341      // PrintCompress64x2Tables
  13342      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
  13343      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
  13344      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
  13345      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
  13346 
  13347  const Repartition<uint8_t, decltype(d)> d8;
  13348  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
  13349 }
  13350 
  13351 template <class D, HWY_IF_T_SIZE_D(D, 8)>
  13352 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  13353  HWY_DASSERT(mask_bits < 4);
  13354 
  13355  // There are only 2 lanes, so we can afford to load the index vector directly.
  13356  alignas(16) static constexpr uint8_t u8_indices[64] = {
  13357      // PrintCompressNot64x2Tables
  13358      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
  13359      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
  13360      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
  13361      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
  13362 
  13363  const Repartition<uint8_t, decltype(d)> d8;
  13364  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
  13365 }
  13366 
  13367 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
  13368 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
  13369  const DFromV<decltype(v)> d;
  13370  const RebindToUnsigned<decltype(d)> du;
  13371 
  13372  HWY_DASSERT(mask_bits < (1ull << N));
  13373  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  13374  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  13375 }
  13376 
  13377 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
  13378 HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
  13379  const DFromV<decltype(v)> d;
  13380  const RebindToUnsigned<decltype(d)> du;
  13381 
  13382  HWY_DASSERT(mask_bits < (1ull << N));
  13383  const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
  13384  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  13385 }
  13386 
  13387 }  // namespace detail
  13388 
  13389 // Single lane: no-op
  13390 template <typename T>
  13391 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  13392  return v;
  13393 }
  13394 
  13395 // Two lanes: conditional swap
  13396 template <typename T, HWY_IF_T_SIZE(T, 8)>
  13397 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
  13398  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
  13399  const DFromV<decltype(v)> d;
  13400  const Vec128<T> m = VecFromMask(d, mask);
  13401  const Vec128<T> maskL = DupEven(m);
  13402  const Vec128<T> maskH = DupOdd(m);
  13403  const Vec128<T> swap = AndNot(maskL, maskH);
  13404  return IfVecThenElse(swap, Shuffle01(v), v);
  13405 }
  13406 
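        // Example (sketch): with v = {a, b} and only mask[1] set, swap is
        // all-ones (maskH & ~maskL), so Shuffle01 yields {b, a} and the single
        // active element b lands in lane 0, as Compress requires.
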
  13407 // General case, 2 or 4 bytes
  13408 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
  13409 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  13410  const DFromV<decltype(v)> d;
  13411  return detail::CompressBits(v, BitsFromMask(d, mask));
  13412 }
  13413 
  13414 // ------------------------------ CompressNot
  13415 
  13416 // Single lane: no-op
  13417 template <typename T>
  13418 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  13419  return v;
  13420 }
  13421 
  13422 // Two lanes: conditional swap
  13423 template <typename T, HWY_IF_T_SIZE(T, 8)>
  13424 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  13425  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
  13426  const DFromV<decltype(v)> d;
  13427  const Vec128<T> m = VecFromMask(d, mask);
  13428  const Vec128<T> maskL = DupEven(m);
  13429  const Vec128<T> maskH = DupOdd(m);
  13430  const Vec128<T> swap = AndNot(maskH, maskL);
  13431  return IfVecThenElse(swap, Shuffle01(v), v);
  13432 }
  13433 
  13434 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
  13435 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  13436  const DFromV<decltype(v)> d;
  13437  // For partial vectors, we cannot pull the Not() into the table because
  13438  // BitsFromMask clears the upper bits.
  13439  if (N < 16 / sizeof(T)) {
  13440    return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
  13441  }
  13442  return detail::CompressNotBits(v, BitsFromMask(d, mask));
  13443 }
  13444 
  13445 // ------------------------------ CompressBlocksNot
  13446 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
  13447                                           Mask128<uint64_t> /* m */) {
  13448  return v;
  13449 }
  13450 
  13451 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
  13452 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
  13453                                  const uint8_t* HWY_RESTRICT bits) {
  13454  uint64_t mask_bits = 0;
  13455  constexpr size_t kNumBytes = (N + 7) / 8;
  13456  CopyBytes<kNumBytes>(bits, &mask_bits);
  13457  if (N < 8) {
  13458    mask_bits &= (1ull << N) - 1;
  13459  }
  13460 
  13461  return detail::CompressBits(v, mask_bits);
  13462 }
  13463 
  13464 // ------------------------------ CompressStore, CompressBitsStore
  13465 
  13466 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
  13467 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
  13468                             TFromD<D>* HWY_RESTRICT unaligned) {
  13469  const RebindToUnsigned<decltype(d)> du;
  13470 
  13471  const uint64_t mask_bits = BitsFromMask(d, m);
  13472  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  13473  const size_t count = PopCount(mask_bits);
  13474 
  13475  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
  13476  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  13477  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  13478  StoreU(compressed, d, unaligned);
  13479  detail::MaybeUnpoison(unaligned, count);
  13480  return count;
  13481 }
  13482 
  13483 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
  13484 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
  13485                                    TFromD<D>* HWY_RESTRICT unaligned) {
  13486  const RebindToUnsigned<decltype(d)> du;
  13487 
  13488  const uint64_t mask_bits = BitsFromMask(d, m);
  13489  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  13490  const size_t count = PopCount(mask_bits);
  13491 
  13492  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
  13493  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  13494  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  13495  BlendedStore(compressed, FirstN(d, count), d, unaligned);
  13496  detail::MaybeUnpoison(unaligned, count);
  13497  return count;
  13498 }
  13499 
  13500 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
  13501 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
  13502                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  13503  const RebindToUnsigned<decltype(d)> du;
  13504 
  13505  uint64_t mask_bits = 0;
  13506  constexpr size_t kN = MaxLanes(d);
  13507  constexpr size_t kNumBytes = (kN + 7) / 8;
  13508  CopyBytes<kNumBytes>(bits, &mask_bits);
  13509  if (kN < 8) {
  13510    mask_bits &= (1ull << kN) - 1;
  13511  }
  13512  const size_t count = PopCount(mask_bits);
  13513 
  13514  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
  13515  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  13516  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  13517  StoreU(compressed, d, unaligned);
  13518 
  13519  detail::MaybeUnpoison(unaligned, count);
  13520  return count;
  13521 }
  13522 
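        // Usage example (illustrative sketch, not part of this header; assumes
        // namespace hn = hwy::HWY_NAMESPACE, and `in`, `out`, `num` are
        // hypothetical caller variables with num a multiple of Lanes(d)):
        //   const hn::ScalableTag<float> d;
        //   size_t written = 0;
        //   for (size_t i = 0; i < num; i += hn::Lanes(d)) {
        //     const auto v = hn::LoadU(d, in + i);
        //     const auto keep = hn::Gt(v, hn::Zero(d));  // keep positive lanes
        //     written += hn::CompressStore(v, keep, d, out + written);
        //   }
        // CompressStore always stores a full vector; CompressBlendedStore
        // instead leaves memory past the `count` valid lanes untouched.
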
  13523 #endif  // HWY_TARGET <= HWY_AVX3
  13524 
  13525 // ------------------------------ Expand
  13526 
  13527 // For other targets, generic_ops-inl.h provides the fallback.
  13528 #if HWY_TARGET <= HWY_AVX3 || HWY_IDE
  13529 
  13530 // The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL),
  13531 // but we still want to override generic_ops-inl's table-based implementation
  13532 // whenever we have the 32-bit expand provided by AVX3.
  13533 #ifdef HWY_NATIVE_EXPAND
  13534 #undef HWY_NATIVE_EXPAND
  13535 #else
  13536 #define HWY_NATIVE_EXPAND
  13537 #endif
  13538 
  13539 namespace detail {
  13540 
  13541 #if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE  // VBMI2
  13542 
  13543 template <size_t N>
  13544 HWY_INLINE Vec128<uint8_t, N> NativeExpand(Vec128<uint8_t, N> v,
  13545                                           Mask128<uint8_t, N> mask) {
  13546  return Vec128<uint8_t, N>{_mm_maskz_expand_epi8(mask.raw, v.raw)};
  13547 }
  13548 
  13549 template <size_t N>
  13550 HWY_INLINE Vec128<uint16_t, N> NativeExpand(Vec128<uint16_t, N> v,
  13551                                            Mask128<uint16_t, N> mask) {
  13552  return Vec128<uint16_t, N>{_mm_maskz_expand_epi16(mask.raw, v.raw)};
  13553 }
  13554 
  13555 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
  13556 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
  13557                                      const uint8_t* HWY_RESTRICT unaligned) {
  13558  return VFromD<D>{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)};
  13559 }
  13560 
  13561 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
  13562 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
  13563                                      const uint16_t* HWY_RESTRICT unaligned) {
  13564  return VFromD<D>{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)};
  13565 }
  13566 
  13567 #endif  // HWY_TARGET <= HWY_AVX3_DL
  13568 
  13569 template <size_t N>
  13570 HWY_INLINE Vec128<uint32_t, N> NativeExpand(Vec128<uint32_t, N> v,
  13571                                            Mask128<uint32_t, N> mask) {
  13572  return Vec128<uint32_t, N>{_mm_maskz_expand_epi32(mask.raw, v.raw)};
  13573 }
  13574 
  13575 template <size_t N>
  13576 HWY_INLINE Vec128<uint64_t, N> NativeExpand(Vec128<uint64_t, N> v,
  13577                                            Mask128<uint64_t, N> mask) {
  13578  return Vec128<uint64_t, N>{_mm_maskz_expand_epi64(mask.raw, v.raw)};
  13579 }
  13580 
  13581 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
  13582 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
  13583                                      const uint32_t* HWY_RESTRICT unaligned) {
  13584  return VFromD<D>{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)};
  13585 }
  13586 
  13587 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
  13588 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */,
  13589                                      const uint64_t* HWY_RESTRICT unaligned) {
  13590  return VFromD<D>{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)};
  13591 }
  13592 
  13593 }  // namespace detail
  13594 
  13595 // Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo.
  13596 #if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE  // VBMI2
  13597 
  13598 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
  13599 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  13600  const DFromV<decltype(v)> d;
  13601  const RebindToUnsigned<decltype(d)> du;
  13602  const MFromD<decltype(du)> mu = RebindMask(du, mask);
  13603  return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
  13604 }
  13605 
  13606 #endif  // HWY_TARGET <= HWY_AVX3_DL
  13607 
  13608 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
  13609 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  13610  const DFromV<decltype(v)> d;
  13611  const RebindToUnsigned<decltype(d)> du;
  13612  const MFromD<decltype(du)> mu = RebindMask(du, mask);
  13613  return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
  13614 }
  13615 
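        // Example (sketch): Expand is the inverse of Compress. With u32x4
        //   v    = {1, 2, 3, 4}
        //   mask = {1, 0, 1, 0}
        // the first two source lanes are written to the active positions,
        // giving {1, 0, 2, 0}; inactive lanes are zeroed (maskz semantics).
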
  13616 // ------------------------------ LoadExpand
  13617 
  13618 template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
  13619          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
  13620 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
  13621                             const TFromD<D>* HWY_RESTRICT unaligned) {
  13622 #if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  13623  const RebindToUnsigned<decltype(d)> du;
  13624  using TU = TFromD<decltype(du)>;
  13625  const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
  13626  const MFromD<decltype(du)> mu = RebindMask(du, mask);
  13627  return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
  13628 #else
  13629  return Expand(LoadU(d, unaligned), mask);
  13630 #endif
  13631 }
  13632 
  13633 template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
  13634          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
  13635 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
  13636                             const TFromD<D>* HWY_RESTRICT unaligned) {
  13637 #if HWY_TARGET <= HWY_AVX3
  13638  const RebindToUnsigned<decltype(d)> du;
  13639  using TU = TFromD<decltype(du)>;
  13640  const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned);
  13641  const MFromD<decltype(du)> mu = RebindMask(du, mask);
  13642  return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
  13643 #else
  13644  return Expand(LoadU(d, unaligned), mask);
  13645 #endif
  13646 }
  13647 
  13648 #endif  // HWY_TARGET <= HWY_AVX3
  13649 
  13650 // ------------------------------ StoreInterleaved2/3/4
  13651 
  13652 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
  13653 // generic_ops-inl.h.
  13654 
  13655 // ------------------------------ Additional mask logical operations
  13656 
  13657 #if HWY_TARGET <= HWY_AVX3
  13658 namespace detail {
  13659 
  13660 template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
  13661 static HWY_INLINE uint32_t AVX3Blsi(T x) {
  13662  using TU = MakeUnsigned<T>;
  13663  const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
  13664 #if HWY_COMPILER_CLANGCL
  13665  return static_cast<uint32_t>(u32_val & (0u - u32_val));
  13666 #else
  13667  return static_cast<uint32_t>(_blsi_u32(u32_val));
  13668 #endif
  13669 }
  13670 template <class T, HWY_IF_T_SIZE(T, 8)>
  13671 static HWY_INLINE uint64_t AVX3Blsi(T x) {
  13672  const auto u64_val = static_cast<uint64_t>(x);
  13673 #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
  13674  return static_cast<uint64_t>(u64_val & (0ULL - u64_val));
  13675 #else
  13676  return static_cast<uint64_t>(_blsi_u64(u64_val));
  13677 #endif
  13678 }
  13679 
  13680 template <class T, HWY_IF_LANES_LE(sizeof(T), 4)>
  13681 static HWY_INLINE uint32_t AVX3Blsmsk(T x) {
  13682  using TU = MakeUnsigned<T>;
  13683  const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x));
  13684 #if HWY_COMPILER_CLANGCL
  13685  return static_cast<uint32_t>(u32_val ^ (u32_val - 1u));
  13686 #else
  13687  return static_cast<uint32_t>(_blsmsk_u32(u32_val));
  13688 #endif
  13689 }
  13690 template <class T, HWY_IF_T_SIZE(T, 8)>
  13691 static HWY_INLINE uint64_t AVX3Blsmsk(T x) {
  13692  const auto u64_val = static_cast<uint64_t>(x);
  13693 #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32
  13694  return static_cast<uint64_t>(u64_val ^ (u64_val - 1ULL));
  13695 #else
  13696  return static_cast<uint64_t>(_blsmsk_u64(u64_val));
  13697 #endif
  13698 }
  13699 
  13700 }  // namespace detail
  13701 
  13702 template <class T, size_t N>
  13703 HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  13704  constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
  13705  return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
  13706      (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)};
  13707 }
  13708 template <class T, size_t N>
  13709 HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  13710  constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
  13711  return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
  13712      (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)};
  13713 }
  13714 template <class T, size_t N>
  13715 HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  13716  constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1;
  13717  return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>(
  13718      detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)};
  13719 }
  13720 template <class T, size_t N>
  13721 HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  13722  return Mask128<T, N>{
  13723      static_cast<typename Mask128<T, N>::Raw>(detail::AVX3Blsi(mask.raw))};
  13724 }
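
        // Worked example (sketch) for mask bits 0b10100 with N = 5:
        //   Blsi     = 0b00100  -> SetOnlyFirst
        //   0 - Blsi = 0b11100  -> SetAtOrAfterFirst (after masking to N bits)
        //   Blsi - 1 = 0b00011  -> SetBeforeFirst
        //   Blsmsk   = 0b00111  -> SetAtOrBeforeFirst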
  13725 #else   // AVX2 or below
  13726 template <class T>
  13727 HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  13728  return mask;
  13729 }
  13730 template <class T>
  13731 HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
  13732  const FixedTag<T, 2> d;
  13733  const auto vmask = VecFromMask(d, mask);
  13734  return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
  13735 }
  13736 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
  13737 HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  13738  const Simd<T, N, 0> d;
  13739  const auto vmask = VecFromMask(d, mask);
  13740  const auto neg_vmask =
  13741      ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
  13742  return MaskFromVec(Or(vmask, neg_vmask));
  13743 }
  13744 template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
  13745 HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
  13746  const Full128<T> d;
  13747  const Repartition<int64_t, decltype(d)> di64;
  13748  const Repartition<float, decltype(d)> df32;
  13749  const Repartition<int32_t, decltype(d)> di32;
  13750  using VF = VFromD<decltype(df32)>;
  13751 
  13752  auto vmask = BitCast(di64, VecFromMask(d, mask));
  13753  vmask = Or(vmask, Neg(vmask));
  13754 
  13755  // Copy the sign bit of the first int64_t lane to the second int64_t lane
  13756  const auto vmask2 = BroadcastSignBit(
  13757      BitCast(di32, VF{_mm_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw,
  13758                                      _MM_SHUFFLE(1, 1, 0, 0))}));
  13759  return MaskFromVec(BitCast(d, Or(vmask, BitCast(di64, vmask2))));
  13760 }
  13761 
  13762 template <class T, size_t N>
  13763 HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  13764  return Not(SetAtOrAfterFirst(mask));
  13765 }
  13766 
  13767 template <class T>
  13768 HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
  13769  return mask;
  13770 }
  13771 template <class T>
  13772 HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
  13773  const FixedTag<T, 2> d;
  13774  const RebindToSigned<decltype(d)> di;
  13775 
  13776  const auto vmask = BitCast(di, VecFromMask(d, mask));
  13777  const auto zero = Zero(di);
  13778  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
  13779  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
  13780 }
  13781 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
  13782 HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  13783  const Simd<T, N, 0> d;
  13784  const RebindToSigned<decltype(d)> di;
  13785 
  13786  const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
  13787  const auto only_first_vmask =
  13788      BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
  13789  return MaskFromVec(only_first_vmask);
  13790 }
  13791 template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
  13792 HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
  13793  const Full128<T> d;
  13794  const RebindToSigned<decltype(d)> di;
  13795  const Repartition<int64_t, decltype(d)> di64;
  13796 
  13797  const auto zero = Zero(di64);
  13798  const auto vmask = BitCast(di64, VecFromMask(d, mask));
  13799  const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero);
  13800  const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
  13801  return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
  13802 }
  13803 
  13804 template <class T>
  13805 HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  13806  const FixedTag<T, 1> d;
  13807  const RebindToSigned<decltype(d)> di;
  13808  using TI = MakeSigned<T>;
  13809 
  13810  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
  13811 }
  13812 template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
  13813 HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  13814  const Simd<T, N, 0> d;
  13815  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
  13816 }
  13817 #endif  // HWY_TARGET <= HWY_AVX3
  13818 
  13819 // ------------------------------ Reductions
  13820 
  13821 // Nothing here is fully native; generic_ops-inl defines SumOfLanes and ReduceSum.
  13822 
  13823 // We provide specializations of u8x8 and u8x16, so exclude those.
  13824 #undef HWY_IF_SUM_OF_LANES_D
  13825 #define HWY_IF_SUM_OF_LANES_D(D)                                        \
  13826  HWY_IF_LANES_GT_D(D, 1),                                              \
  13827      hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() ||               \
  13828                    (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
  13829          nullptr
  13830 
  13831 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
  13832 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  13833  return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
  13834 }
  13835 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
  13836 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  13837  const Repartition<uint64_t, decltype(d)> d64;
  13838  VFromD<decltype(d64)> sums = SumsOf8(v);
  13839  sums = SumOfLanes(d64, sums);
  13840  return Broadcast<0>(BitCast(d, sums));
  13841 }
  13842 
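        // Example (sketch): for a u8x16 vector of all-ones, SumsOf8 yields two
        // u64 sums of 8 each, SumOfLanes(d64) adds them to 16, and
        // broadcasting byte 0 splats that value, so every u8 lane holds 16
        // (the sum is naturally taken modulo 256 in the u8 result).
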
  13843 #if HWY_TARGET <= HWY_SSE4
  13844 // We provide specializations of u8x8, u8x16, and u16x8, so exclude those.
  13845 #undef HWY_IF_MINMAX_OF_LANES_D
  13846 #define HWY_IF_MINMAX_OF_LANES_D(D)                                        \
  13847  HWY_IF_LANES_GT_D(D, 1),                                                 \
  13848      hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() ||                 \
  13849                     ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \
  13850                    (!hwy::IsSame<TFromD<D>, uint16_t>() ||                \
  13851                     (HWY_V_SIZE_D(D) != 16))>* = nullptr
  13852 
  13853 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
  13854 HWY_API Vec128<uint16_t> MinOfLanes(D /* tag */, Vec128<uint16_t> v) {
  13855  return Broadcast<0>(Vec128<uint16_t>{_mm_minpos_epu16(v.raw)});
  13856 }
  13857 
  13858 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
  13859 HWY_API Vec128<uint16_t> MaxOfLanes(D d, Vec128<uint16_t> v) {
  13860  const Vec128<uint16_t> max = Set(d, LimitsMax<uint16_t>());
  13861  return max - MinOfLanes(d, max - v);
  13862 }
  13863 
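        // The identity behind MaxOfLanes above (sketch): SSE4 only provides a
        // horizontal u16 minimum (PHMINPOSUW), so we rely on
        //   max(v) == 65535 - min(65535 - v)
        // e.g. for v containing 9 as its maximum, the minimum of the
        // complements is 65535 - 9, and the outer subtraction recovers 9.
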
  13864 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
  13865 HWY_API Vec64<uint8_t> MinOfLanes(D d, Vec64<uint8_t> v) {
  13866  const Rebind<uint16_t, decltype(d)> d16;
  13867  return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v)));
  13868 }
  13869 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
  13870 HWY_API Vec128<uint8_t> MinOfLanes(D d, Vec128<uint8_t> v) {
  13871  const Half<decltype(d)> dh;
  13872  Vec64<uint8_t> result =
  13873      Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v)));
  13874  return Combine(d, result, result);
  13875 }
  13876 
  13877 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
  13878 HWY_API Vec64<uint8_t> MaxOfLanes(D d, Vec64<uint8_t> v) {
  13879  const Vec64<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
  13880  return m - MinOfLanes(d, m - v);
  13881 }
  13882 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
  13883 HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) {
  13884  const Vec128<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
  13885  return m - MinOfLanes(d, m - v);
  13886 }
  13887 
  13888 #endif  // HWY_TARGET <= HWY_SSE4
  13889 
  13890 // ------------------------------ BitShuffle
  13891 #if HWY_TARGET <= HWY_AVX3_DL
  13892 
  13893 #ifdef HWY_NATIVE_BITSHUFFLE
  13894 #undef HWY_NATIVE_BITSHUFFLE
  13895 #else
  13896 #define HWY_NATIVE_BITSHUFFLE
  13897 #endif
  13898 
  13899 template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
  13900          HWY_IF_V_SIZE_LE_V(V, 16),
  13901          HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
  13902 HWY_API V BitShuffle(V v, VI idx) {
  13903  const DFromV<decltype(v)> d64;
  13904  const RebindToUnsigned<decltype(d64)> du64;
  13905  const Rebind<uint8_t, decltype(d64)> du8;
  13906 
  13907  int32_t i32_bit_shuf_result = static_cast<int32_t>(
  13908      static_cast<uint16_t>(_mm_bitshuffle_epi64_mask(v.raw, idx.raw)));
  13909 
  13910  return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128(
  13911                                          i32_bit_shuf_result)}));
  13912 }
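
        // Example (sketch): each of the 8 index bytes selects one bit (mod 64)
        // of the corresponding u64 lane, so with idx bytes {0, 1, ..., 7} per
        // lane, bit i of the result's low byte equals bit i of the source
        // lane, i.e. each result lane is the low byte of the matching v lane.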
  13913 #endif  // HWY_TARGET <= HWY_AVX3_DL
  13914 
  13915 // ------------------------------ MultiRotateRight
  13916 
  13917 #if HWY_TARGET <= HWY_AVX3_DL
  13918 
  13919 #ifdef HWY_NATIVE_MULTIROTATERIGHT
  13920 #undef HWY_NATIVE_MULTIROTATERIGHT
  13921 #else
  13922 #define HWY_NATIVE_MULTIROTATERIGHT
  13923 #endif
  13924 
  13925 template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
  13926          HWY_IF_V_SIZE_LE_V(V, 16),
  13927          HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
  13928 HWY_API V MultiRotateRight(V v, VI idx) {
  13929  return V{_mm_multishift_epi64_epi8(idx.raw, v.raw)};
  13930 }
  13931 
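        // Example (sketch): VPMULTISHIFTQB extracts, for each index byte i,
        // the 8 bits of the u64 lane starting at bit offset idx[i] (mod 64,
        // with wraparound), so with every idx byte equal to 8, each result
        // byte holds bits [8, 15] of its lane.
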
  13932 #endif  // HWY_TARGET <= HWY_AVX3_DL
  13933 
  13934 // ------------------------------ Lt128
  13935 
  13936 namespace detail {
  13937 
  13938 // Returns vector-mask for Lt128. Generic for all vector lengths.
  13939 template <class D, HWY_IF_U64_D(D)>
  13940 HWY_INLINE VFromD<D> Lt128Vec(const D d, VFromD<D> a, VFromD<D> b) {
  13941  // Truth table of Eq and Lt for Hi and Lo u64.
  13942  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  13943  // =H =L cH cL  | out = cH | (=H & cL)
  13944  //  0  0  0  0  |  0
  13945  //  0  0  0  1  |  0
  13946  //  0  0  1  0  |  1
  13947  //  0  0  1  1  |  1
  13948  //  0  1  0  0  |  0
  13949  //  0  1  0  1  |  0
  13950  //  0  1  1  0  |  1
  13951  //  1  0  0  0  |  0
  13952  //  1  0  0  1  |  1
  13953  //  1  1  0  0  |  0
  13954  const auto eqHL = Eq(a, b);
  13955  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  13956  const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL);
  13957  const VFromD<D> vecHx = IfThenElse(eqHL, ltLX, ltHL);
  13958  return InterleaveUpper(d, vecHx, vecHx);
  13959 }
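
// Worked example for one 128-bit block, writing lanes as {lo, hi}:
// a = {5, 1}, b = {9, 1}. Then eqHL = {0, ~0}, ltHL = {~0, 0} and
// ltLX = {0, ~0}, so vecHx = IfThenElse(eqHL, ltLX, ltHL) = {~0, ~0}.
// Only the upper lane of vecHx matters; it is ~0 because the high halves are
// equal and 5 < 9, and InterleaveUpper copies it to both lanes of the block.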

// Returns vector-mask for Eq128. Generic for all vector lengths.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Eq128Vec(D d, VFromD<D> a, VFromD<D> b) {
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
}

// Returns vector-mask for Ne128. Generic for all vector lengths.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Ne128Vec(D d, VFromD<D> a, VFromD<D> b) {
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Lt128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  // No specialization required for AVX-512: Mask <-> Vec is fast, and
  // copying mask bits to their neighbor seems infeasible.
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Eq128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  // No specialization required for AVX-512: Mask <-> Vec is fast, and
  // copying mask bits to their neighbor seems infeasible.
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  return InterleaveUpper(d, eqHL, eqHL);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Ne128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  // No specialization required for AVX-512: Mask <-> Vec is fast, and
  // copying mask bits to their neighbor seems infeasible.
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}
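
// Usage sketch (illustrative; lo_a/hi_a/lo_b/hi_b are hypothetical u64 halves
// of two 128-bit keys, low half first):
//   alignas(16) const uint64_t ka[2] = {lo_a, hi_a};
//   alignas(16) const uint64_t kb[2] = {lo_b, hi_b};
//   const Full128<uint64_t> d;
//   const bool a_lt_b = AllTrue(d, Lt128(d, Load(d, ka), Load(d, kb)));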

// ------------------------------ Min128, Max128 (Lt128)

// Avoids the extra MaskFromVec in Lt128.
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}
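
// Usage sketch (illustrative): Min128/Max128 form the conditional swap at the
// heart of sorting networks over 128-bit keys, without materializing a mask:
//   const VFromD<D> smaller = Min128(d, a, b);
//   const VFromD<D> larger = Max128(d, a, b);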

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#if HWY_TARGET <= HWY_AVX3

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm_lzcnt_epi32(v.raw)};
}

template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm_lzcnt_epi64(v.raw)};
}
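
// Examples: for u32 lanes {1, 0x80000000u, 0}, LeadingZeroCount returns
// {31, 0, 32}; for u64 lanes, an input of 1 yields 63 and 0 yields 64.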

// HighestSetBitIndex and TrailingZeroCount are implemented in x86_512-inl.h
// for AVX3 targets.

#endif  // HWY_TARGET <= HWY_AVX3

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#undef HWY_X86_IF_EMULATED_D

// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
// the warning seems to be issued at the call site of intrinsics, i.e. our code.
HWY_DIAGNOSTICS(pop)