tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

emu128-inl.h (90027B)


      1 // Copyright 2022 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // Single-element vectors and operations.
     17 // External include guard in highway.h - see comment there.
     18 
     19 #include "hwy/base.h"
     20 
     21 #ifndef HWY_NO_LIBCXX
     22 #include <math.h>  // sqrtf
     23 #endif
     24 
     25 #include "hwy/ops/shared-inl.h"
     26 
     27 HWY_BEFORE_NAMESPACE();
     28 namespace hwy {
     29 namespace HWY_NAMESPACE {
     30 
// Descriptor (tag) for a full 128-bit vector of T, i.e. 16 / sizeof(T) lanes.
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
     33 
// (Wrapper class required for overloading comparison operators.)
// Emulated vector of N lanes of T; the storage is always padded to 128 bits.
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;

  // Compound assignments forward to the free operators defined later in this
  // header, so they inherit the same wraparound/NaN semantics.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
  // prevents using range for, especially in SumOfLanes, where it would be
  // incorrect. Moving padding to another field would require handling the case
  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
  // Zero-initialized so the padding lanes are deterministic.
  T raw[16 / sizeof(T)] = {};
};
     76 
// 0 or FF..FF, same size as Vec128.
// Each lane of a mask is all-zero (false) or all-one (true) bits, stored as
// the unsigned integer of the same width as T.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = hwy::MakeUnsigned<T>;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  // Converts a bool to the canonical lane encoding (all bits set or cleared).
  static HWY_INLINE Raw FromBool(bool b) {
    return b ? static_cast<Raw>(~Raw{0}) : 0;
  }

  // Must match the size of Vec128.
  Raw bits[16 / sizeof(T)] = {};
};
     92 
// Descriptor type recovered from a vector type (uses the Private* members).
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

// Descriptor type recovered from a mask type.
template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

// Lane type of a vector.
template <class V>
using TFromV = typename V::PrivateT;
    101 
// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
// Returns an all-zero vector; Vec128's member initializer does the zeroing.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> v;  // zero-initialized
  return v;
}

// Vector type corresponding to a descriptor D (must come after Zero).
template <class D>
using VFromD = decltype(Zero(D()));
    113 
// ------------------------------ BitCast

// Reinterprets the bytes of v as lanes of TFromD<D>; source and destination
// must have the same total size (enforced by CopySameSize).
template <class D, class VFrom>
HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
  VFromD<D> to;
  CopySameSize(&v.raw, &to.raw);
  return to;
}
    122 
// ------------------------------ ResizeBitCast

// BitCast between possibly differently-sized vectors: copies the overlapping
// byte prefix and zero-fills any remaining destination bytes.
template <class D, class VFrom>
HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
  using DFrom = DFromV<VFrom>;
  using TFrom = TFromD<DFrom>;
  using TTo = TFromD<D>;

  // Copy at most the smaller of the two byte sizes.
  constexpr size_t kFromByteLen = sizeof(TFrom) * HWY_MAX_LANES_D(DFrom);
  constexpr size_t kToByteLen = sizeof(TTo) * HWY_MAX_LANES_D(D);
  constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);

  VFromD<D> to = Zero(d);  // ensures bytes beyond the copy are zero
  CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
  return to;
}
    139 
namespace detail {

// ResizeBitCast on the HWY_EMU128 target has zero-extending semantics if
// VFromD<DTo> is a larger vector than FromV
// (the size tags are unused here because ResizeBitCast already zero-fills).
template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
                                               ToSizeTag /* to_size_tag */,
                                               DTo d_to, DFrom /* d_from */,
                                               VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

}  // namespace detail
    153 
    154 // ------------------------------ Set
    155 template <class D, typename T2>
    156 HWY_API VFromD<D> Set(D d, const T2 t) {
    157  VFromD<D> v;
    158  for (size_t i = 0; i < MaxLanes(d); ++i) {
    159    v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
    160  }
    161  return v;
    162 }
    163 
// ------------------------------ Undefined
// Returns a vector with unspecified contents; the emulated target simply
// returns zero to keep behavior deterministic.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}
    169 
    170 // ------------------------------ Dup128VecFromValues
    171 
    172 template <class D, HWY_IF_T_SIZE_D(D, 1)>
    173 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    174                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    175                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
    176                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
    177                                      TFromD<D> t11, TFromD<D> t12,
    178                                      TFromD<D> t13, TFromD<D> t14,
    179                                      TFromD<D> t15) {
    180  VFromD<D> result;
    181  result.raw[0] = t0;
    182  result.raw[1] = t1;
    183  result.raw[2] = t2;
    184  result.raw[3] = t3;
    185  result.raw[4] = t4;
    186  result.raw[5] = t5;
    187  result.raw[6] = t6;
    188  result.raw[7] = t7;
    189  result.raw[8] = t8;
    190  result.raw[9] = t9;
    191  result.raw[10] = t10;
    192  result.raw[11] = t11;
    193  result.raw[12] = t12;
    194  result.raw[13] = t13;
    195  result.raw[14] = t14;
    196  result.raw[15] = t15;
    197  return result;
    198 }
    199 
    200 template <class D, HWY_IF_T_SIZE_D(D, 2)>
    201 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    202                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    203                                      TFromD<D> t5, TFromD<D> t6,
    204                                      TFromD<D> t7) {
    205  VFromD<D> result;
    206  result.raw[0] = t0;
    207  result.raw[1] = t1;
    208  result.raw[2] = t2;
    209  result.raw[3] = t3;
    210  result.raw[4] = t4;
    211  result.raw[5] = t5;
    212  result.raw[6] = t6;
    213  result.raw[7] = t7;
    214  return result;
    215 }
    216 
    217 template <class D, HWY_IF_T_SIZE_D(D, 4)>
    218 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    219                                      TFromD<D> t2, TFromD<D> t3) {
    220  VFromD<D> result;
    221  result.raw[0] = t0;
    222  result.raw[1] = t1;
    223  result.raw[2] = t2;
    224  result.raw[3] = t3;
    225  return result;
    226 }
    227 
    228 template <class D, HWY_IF_T_SIZE_D(D, 8)>
    229 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
    230  VFromD<D> result;
    231  result.raw[0] = t0;
    232  result.raw[1] = t1;
    233  return result;
    234 }
    235 
// ------------------------------ Iota

// Returns {first, first+1, first+2, ...}. AddWithWraparound avoids signed
// overflow UB when first + i exceeds the lane type's range.
template <class D, typename T = TFromD<D>, typename T2>
HWY_API VFromD<D> Iota(D d, T2 first) {
  VFromD<D> v;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
  }
  return v;
}
    246 
// ================================================== LOGICAL

// ------------------------------ Not
// Bitwise NOT. Operates on the unsigned representation so the complement is
// well-defined for signed and floating-point lane types too.
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  VFromD<decltype(du)> vu = BitCast(du, v);
  for (size_t i = 0; i < N; ++i) {
    // Cast back to TU: ~ promotes small types to int.
    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
  }
  return BitCast(d, vu);
}
    261 
    262 // ------------------------------ And
    263 template <typename T, size_t N>
    264 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
    265  const DFromV<decltype(a)> d;
    266  const RebindToUnsigned<decltype(d)> du;
    267  auto au = BitCast(du, a);
    268  auto bu = BitCast(du, b);
    269  for (size_t i = 0; i < N; ++i) {
    270    au.raw[i] &= bu.raw[i];
    271  }
    272  return BitCast(d, au);
    273 }
    274 template <typename T, size_t N>
    275 HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
    276  return And(a, b);
    277 }
    278 
// ------------------------------ AndNot
// Returns (~a) & b, matching the SSE ANDN convention.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> a, Vec128<T, N> b) {
  return And(Not(a), b);
}
    284 
    285 // ------------------------------ Or
    286 template <typename T, size_t N>
    287 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
    288  const DFromV<decltype(a)> d;
    289  const RebindToUnsigned<decltype(d)> du;
    290  auto au = BitCast(du, a);
    291  auto bu = BitCast(du, b);
    292  for (size_t i = 0; i < N; ++i) {
    293    au.raw[i] |= bu.raw[i];
    294  }
    295  return BitCast(d, au);
    296 }
    297 template <typename T, size_t N>
    298 HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
    299  return Or(a, b);
    300 }
    301 
    302 // ------------------------------ Xor
    303 template <typename T, size_t N>
    304 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
    305  const DFromV<decltype(a)> d;
    306  const RebindToUnsigned<decltype(d)> du;
    307  auto au = BitCast(du, a);
    308  auto bu = BitCast(du, b);
    309  for (size_t i = 0; i < N; ++i) {
    310    au.raw[i] ^= bu.raw[i];
    311  }
    312  return BitCast(d, au);
    313 }
    314 template <typename T, size_t N>
    315 HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
    316  return Xor(a, b);
    317 }
    318 
    319 // ------------------------------ Xor3
    320 template <typename T, size_t N>
    321 HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
    322  return Xor(x1, Xor(x2, x3));
    323 }
    324 
    325 // ------------------------------ Or3
    326 template <typename T, size_t N>
    327 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
    328  return Or(o1, Or(o2, o3));
    329 }
    330 
// ------------------------------ OrAnd
// Returns o | (a1 & a2).
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}
    336 
// ------------------------------ IfVecThenElse
// Bitwise select: for each bit, takes yes where mask is 1, else no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Or(And(mask, yes), AndNot(mask, no));
}
    343 
// ------------------------------ CopySign
// Returns a value with the magnitude of `magn` and the sign bit of `sign`.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  // Select the sign bit from `sign`, all other bits from `magn`.
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}
    351 
// ------------------------------ CopySignToAbs
// Like CopySign, but `abs` is known to be non-negative (sign bit clear),
// so a cheaper OR of the masked sign bit suffices.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}
    359 
// ------------------------------ BroadcastSignBit
// Replicates the sign bit into every bit of each lane (0 or all-ones) via an
// arithmetic right shift by (bit width - 1).
template <typename T, size_t N>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
  }
  return v;
}
    368 
// ------------------------------ Mask

// v must be 0 or FF..FF.
// Copies the lane bytes of v directly into the mask storage.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
  Mask128<T, N> mask;
  CopySameSize(&v.raw, &mask.bits);
  return mask;
}

// Mask type corresponding to descriptor D.
template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

// Reinterprets a mask for a same-sized lane type; bit patterns are preserved.
template <class DTo, class MFrom>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
  MFromD<DTo> to;
  CopySameSize(&mask.bits, &to.bits);
  return to;
}
    388 
// Converts a mask back to a vector of 0 / FF..FF lanes (inverse of
// MaskFromVec).
template <class D>
VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
  VFromD<D> v;
  CopySameSize(&mask.bits, &v.raw);
  return v;
}
    395 
    396 template <class D>
    397 uint64_t BitsFromMask(D d, MFromD<D> mask) {
    398  uint64_t bits = 0;
    399  for (size_t i = 0; i < Lanes(d); ++i) {
    400    bits |= mask.bits[i] ? (1ull << i) : 0;
    401  }
    402  return bits;
    403 }
    404 
// Returns a mask with the first n lanes true and the rest false.
template <class D>
HWY_API MFromD<D> FirstN(D d, size_t n) {
  MFromD<D> m;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    m.bits[i] = MFromD<D>::FromBool(i < n);
  }
  return m;
}
    413 
// Returns mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  // Masks are all-zero/all-one per lane, so a bitwise select is exact.
  return IfVecThenElse(VecFromMask(d, mask), yes, no);
}

// Returns mask ? yes : 0.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  const DFromV<decltype(yes)> d;
  return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
}

// Returns mask ? 0 : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  const DFromV<decltype(no)> d;
  return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
}
    433 
// Per lane: yes if v is negative (sign bit set, tested via the signed
// reinterpretation), else no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const auto vi = BitCast(di, v);

  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = vi.raw[i] < 0 ? yes.raw[i] : no.raw[i];
  }
  return v;
}
    446 
// ------------------------------ Mask logical
// Implemented by round-tripping through vectors, which preserves the
// all-zero/all-one lane encoding.

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// True where neither a nor b is true: (~a) & (~b).
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
    484 
// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)

// Shifts each lane left by the compile-time constant kBits. The shift is done
// in the unsigned domain to avoid UB on signed left shift of negative values.
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  using TU = hwy::MakeUnsigned<T>;
  for (size_t i = 0; i < N; ++i) {
    const TU raw_u = static_cast<TU>(v.raw[i]);
    const auto shifted = raw_u << kBits;  // separate line to avoid MSVC warning
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

// Shifts each lane right by kBits; arithmetic (sign-extending) for signed T.
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], kBits);
  }

  return v;
}
    512 
// ------------------------------ RotateRight (ShiftRight)
// Rotates each lane right by kBits: (v >> kBits) | (v << (width - kBits)).
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // The right shift uses the unsigned type to get a logical shift. HWY_MIN
  // clamps the left-shift count so it stays valid even when kBits == 0
  // (that case already returned above, but the expression must still compile).
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
    526 
// ------------------------------ ShiftLeftSame

// Shifts every lane left by the same runtime `bits` (unsigned domain to
// avoid UB for negative signed values).
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

// Shifts every lane right by the same runtime `bits`; arithmetic for signed T.
template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], bits);
  }

  return v;
}
    546 
// ------------------------------ Shl

// Per-lane variable left shift: lane i is shifted by bits.raw[i].
template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
                         << bits.raw[i];
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

// Per-lane variable right shift; arithmetic for signed T (via ScalarShr).
template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
  }

  return v;
}
    567 
// ================================================== ARITHMETIC

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

// Integer add with two's-complement wraparound. Arithmetic is performed in
// uint64_t to avoid signed-overflow UB, then masked to the lane width.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
// Integer subtract with two's-complement wraparound (same uint64_t trick).
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}

// Floating-point add: plain IEEE addition per lane.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] += b.raw[i];
  }
  return a;
}

// Floating-point subtract: plain IEEE subtraction per lane.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] -= b.raw[i];
  }
  return a;
}

}  // namespace detail
    613 
// Public entry points: dispatch to the float or integer implementation.
template <typename T, size_t N>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Sub(hwy::IsFloatTag<T>(), a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Add(hwy::IsFloatTag<T>(), a, b);
}
    622 
// ------------------------------ SumsOf8

// Sums each group of 8 consecutive u8 lanes into one u64 lane.
// `sums` starts zero-initialized (Vec128 member initializer).
template <size_t N>
HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
  Vec128<uint64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}

// Signed variant: sums each group of 8 i8 lanes into one i64 lane.
template <size_t N>
HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
  Vec128<int64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}
    642 
// ------------------------------ SaturatedAdd
// Addition clamped to [LowestValue<T>, HighestValue<T>]. The sum is computed
// in the wider signed type TW, which cannot overflow for 1/2-byte lanes.
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeSigned<MakeWide<T>>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(HWY_MIN(
        HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) + b.raw[i]),
        hwy::HighestValue<T>()));
  }
  return a;
}
    655 
// ------------------------------ SaturatedSub
// Subtraction clamped to [LowestValue<T>, HighestValue<T>]; the difference is
// computed in the wider signed type TW to avoid overflow.
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeSigned<MakeWide<T>>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(HWY_MIN(
        HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) - b.raw[i]),
        hwy::HighestValue<T>()));
  }
  return a;
}
    668 
// ------------------------------ AverageRound

// Per-target flags: this target provides 32- and 64-bit AverageRound natively,
// so prevent generic_ops-inl.h from defining them.
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI64
#endif

// Rounded-up average: uses the overflow-free identity
// ceil((a + b) / 2) == (a | b) - ((a ^ b) >> 1).
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const T a_val = a.raw[i];
    const T b_val = b.raw[i];
    a.raw[i] = static_cast<T>((a_val | b_val) - ScalarShr(a_val ^ b_val, 1));
  }
  return a;
}
    692 
// ------------------------------ Abs

// Per-lane absolute value (ScalarAbs handles each lane type).
template <typename T, size_t N>
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = ScalarAbs(a.raw[i]);
  }
  return a;
}
    702 
// ------------------------------ Min/Max

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

// Integer minimum per lane.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
  }
  return a;
}
// Integer maximum per lane.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
  }
  return a;
}

// Float minimum: if exactly one operand is NaN, returns the other (matching
// IEEE 754 minNum); if both are NaN, the result is NaN.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (ScalarIsNaN(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (ScalarIsNaN(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
    }
  }
  return a;
}
// Float maximum with the same NaN handling as Min above.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (ScalarIsNaN(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (ScalarIsNaN(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

}  // namespace detail
    755 
// Public entry points: dispatch to the float or integer implementation.
template <typename T, size_t N>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Min(hwy::IsFloatTag<T>(), a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Max(hwy::IsFloatTag<T>(), a, b);
}
    765 
// ------------------------------ Neg

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

// Integer negation: 0 - v, with the wraparound semantics of operator-.
template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Zero(d) - v;
}

// Float negation: flip the sign bit (also negates NaN/inf payload-preserving).
template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

// Special (e.g. bf16/f16) types: also just flip the sign bit.
template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

}  // namespace detail
    790 
// Public entry point: dispatches on the lane type category.
template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return detail::Neg(hwy::IsFloatTag<T>(), v);
}
    795 
// ------------------------------ Mul/Div

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

// Floating-point multiply per lane.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] *= b.raw[i];
  }
  return a;
}

// Signed integer multiply: performed in uint64_t to avoid signed-overflow UB;
// the truncating cast back to T yields two's-complement wraparound.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

// Unsigned integer multiply with wraparound (same uint64_t widening).
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

}  // namespace detail
    830 
// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

// Dispatches on the lane type category (unsigned/signed/float).
template <typename T, size_t N>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Mul(hwy::TypeTag<T>(), a, b);
}
    847 
// Floating-point divide per lane. NOTE: division by zero yields 0 here
// (deterministic emulation) rather than inf/NaN.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
  }
  return a;
}
    855 
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
// Computed exactly in the double-wide type TW, then shifted down.
template <class T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeWide<T>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        (static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
        (sizeof(T) * 8));
  }
  return a;
}
    869 
    870 template <class T, HWY_IF_UI64(T)>
    871 HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
    872  T hi;
    873  Mul128(GetLane(a), GetLane(b), &hi);
    874  return Set(Full64<T>(), hi);
    875 }
    876 
    877 template <class T, HWY_IF_UI64(T)>
    878 HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
    879  T hi_0;
    880  T hi_1;
    881 
    882  Mul128(GetLane(a), GetLane(b), &hi_0);
    883  Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);
    884 
    885  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
    886 }
    887 
// Fixed-point (Q15) multiply with rounding: (a*b + 2^14) >> 15 per lane.
// The int16*int16 product is computed in int via integer promotion.
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((a.raw[i] * b.raw[i] + 16384) >> 15);
  }
  return a;
}
    896 
    897 // Multiplies even lanes (0, 2, ..) and returns the double-wide result.
    898 template <class T, size_t N,
    899          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
    900          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
    901 HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
    902                                                 Vec128<T, N> b) {
    903  using TW = MakeWide<T>;
    904  Vec128<TW, (N + 1) / 2> mul;
    905  for (size_t i = 0; i < N; i += 2) {
    906    const TW a_wide = a.raw[i];
    907    mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i]);
    908  }
    909  return mul;
    910 }
    911 
    912 // Multiplies odd lanes (1, 3, ..) and returns the double-wide result.
    913 template <class T, size_t N,
    914          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
    915          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
    916 HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
    917                                                Vec128<T, N> b) {
    918  using TW = MakeWide<T>;
    919  Vec128<TW, (N + 1) / 2> mul;
    920  for (size_t i = 0; i < N; i += 2) {
    921    const TW a_wide = a.raw[i + 1];
    922    mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i + 1]);
    923  }
    924  return mul;
    925 }
    926 
    927 template <size_t N>
    928 HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
    929  for (size_t i = 0; i < N; ++i) {
    930    // Zero inputs are allowed, but callers are responsible for replacing the
    931    // return value with something else (typically using IfThenElse). This check
    932    // avoids a ubsan error. The result is arbitrary.
    933    v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
    934  }
    935  return v;
    936 }
    937 
    938 // generic_ops takes care of integer T.
    939 template <typename T, size_t N, HWY_IF_FLOAT(T)>
    940 HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
    941  return Abs(a - b);
    942 }
    943 
    944 // ------------------------------ Floating-point multiply-add variants
    945 
    946 template <typename T, size_t N, HWY_IF_FLOAT(T)>
    947 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
    948                            Vec128<T, N> add) {
    949  return mul * x + add;
    950 }
    951 
    952 template <typename T, size_t N, HWY_IF_FLOAT(T)>
    953 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
    954                               Vec128<T, N> add) {
    955  return add - mul * x;
    956 }
    957 
    958 template <typename T, size_t N, HWY_IF_FLOAT(T)>
    959 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
    960                            Vec128<T, N> sub) {
    961  return mul * x - sub;
    962 }
    963 
    964 template <typename T, size_t N, HWY_IF_FLOAT(T)>
    965 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
    966                               Vec128<T, N> sub) {
    967  return Neg(mul) * x - sub;
    968 }
    969 
    970 // ------------------------------ Floating-point square root
    971 
// Fast approximate 1/sqrt(x) using the well-known bit-level initial guess
// (magic constant 0x5F3759DF) refined by one Newton-Raphson step. Low
// precision by design; callers needing accuracy should refine further.
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    const float half = v.raw[i] * 0.5f;
    // Initial guess based on log2(f)
    v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
        0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
    // One Newton-Raphson iteration
    v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
  }
  return v;
}
    984 
    985 namespace detail {
    986 
// Scalar sqrt for float. Uses sqrtf when libc is available; without libc,
// falls back to the compiler builtin (GCC) or a coarse bit-manipulation
// approximation that is NOT accurate — acceptable only for no-libc builds.
static HWY_INLINE float ScalarSqrt(float v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint32_t bits = BitCastScalar<uint32_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1 << 29) + (bits >> 1) - (1 << 22);
  return BitCastScalar<float>(bits);
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return sqrtf(v);
#endif  // !HWY_NO_LIBCXX
}
// Scalar sqrt for double; same strategy as the float overload above:
// libc sqrt, else compiler builtin, else a coarse bit-level approximation.
static HWY_INLINE double ScalarSqrt(double v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint64_t bits = BitCastScalar<uint64_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
  return BitCastScalar<double>(bits);
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return sqrt(v);
#endif  // HWY_NO_LIBCXX
}
   1015 
   1016 }  // namespace detail
   1017 
   1018 template <typename T, size_t N>
   1019 HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
   1020  for (size_t i = 0; i < N; ++i) {
   1021    v.raw[i] = detail::ScalarSqrt(v.raw[i]);
   1022  }
   1023  return v;
   1024 }
   1025 
   1026 // ------------------------------ Floating-point rounding
   1027 
   1028 template <typename T, size_t N>
   1029 HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
   1030  using TI = MakeSigned<T>;
   1031  const T k0 = ConvertScalarTo<T>(0);
   1032  const Vec128<T, N> a = Abs(v);
   1033  for (size_t i = 0; i < N; ++i) {
   1034    if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
   1035      continue;
   1036    }
   1037    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
   1038    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
   1039    if (rounded == 0) {
   1040      v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
   1041      continue;
   1042    }
   1043    const T rounded_f = ConvertScalarTo<T>(rounded);
   1044    // Round to even
   1045    if ((rounded & 1) &&
   1046        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
   1047      v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
   1048      continue;
   1049    }
   1050    v.raw[i] = rounded_f;
   1051  }
   1052  return v;
   1053 }
   1054 
// Round-to-nearest even. Converts to the same-width signed integer type.
// Out-of-range values and NaN saturate: negative/NaN-with-sign to
// LimitsMin, positive to LimitsMax.
template <class T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<MakeSigned<T>, N> NearestInt(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const T k0 = ConvertScalarTo<T>(0);

  const Vec128<T, N> abs = Abs(v);
  Vec128<TI, N> ret;
  for (size_t i = 0; i < N; ++i) {
    const bool signbit = ScalarSignBit(v.raw[i]);

    if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      // Check if too large to cast or NaN
      if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
        ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
        continue;
      }
      // Large but representable: no fractional part remains, cast directly.
      ret.raw[i] = static_cast<TI>(v.raw[i]);
      continue;
    }
    // Bias by +/-0.5 then truncate = round half away from zero...
    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      ret.raw[i] = 0;
      continue;
    }
    const T rounded_f = ConvertScalarTo<T>(rounded);
    // ...then fix up exact ties so an odd result moves to its even neighbor.
    if ((rounded & 1) &&
        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
      ret.raw[i] = rounded - (signbit ? -1 : 1);
      continue;
    }
    ret.raw[i] = rounded;
  }
  return ret;
}
   1092 
// Demotes double lanes to int32 with round-to-nearest-even; out-of-range and
// NaN inputs saturate to LimitsMin/LimitsMax of int32. Same tie-to-even
// fixup as NearestInt above.
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 /*di32*/,
                                        VFromD<Rebind<double, DI32>> v) {
  using T = double;
  using TI = int32_t;
  const T k0 = ConvertScalarTo<T>(0);

  constexpr size_t N = HWY_MAX_LANES_D(DI32);

  const VFromD<Rebind<double, DI32>> abs = Abs(v);
  VFromD<DI32> ret;
  for (size_t i = 0; i < N; ++i) {
    const bool signbit = ScalarSignBit(v.raw[i]);

    // Check if too large to cast or NaN
    if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
      ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
      continue;
    }

    // Bias by +/-0.5 then truncate = round half away from zero...
    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      ret.raw[i] = 0;
      continue;
    }
    const T rounded_f = ConvertScalarTo<T>(rounded);
    // ...then fix up exact ties so an odd result moves to its even neighbor.
    if ((rounded & 1) &&
        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
      ret.raw[i] = rounded - (signbit ? -1 : 1);
      continue;
    }
    ret.raw[i] = rounded;
  }
  return ret;
}
   1130 
// Truncates toward zero. Inputs too large to have a fractional part (or NaN)
// pass through unchanged; negative inputs in (-1, 0] yield -0.
template <typename T, size_t N>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const Vec128<T, N> abs = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(abs.raw[i] <= MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const TI truncated = static_cast<TI>(v.raw[i]);
    if (truncated == 0) {
      // -T{0} preserves the sign of zero for negative inputs.
      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
      continue;
    }
    v.raw[i] = static_cast<T>(truncated);
  }
  return v;
}
   1148 
// Toward +infinity, aka ceiling. Operates directly on the IEEE-754 bit
// representation: clears fractional mantissa bits and, for positive inputs,
// first adds one ULP-at-the-integer-boundary to round up.
template <typename Float, size_t N>
Vec128<Float, N> Ceil(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  // Exponent bias: 127 for float, 1023 for double.
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool positive = v.raw[i] > Float(0.0);

    Bits bits = BitCastScalar<Bits>(v.raw[i]);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => 0 or 1.
    // NOTE(review): a +0.0 input takes this branch with positive==false and
    // returns -0.0; IEEE-754 ceil(+0) is +0 — confirm callers ignore the
    // sign of zero here.
    if (exponent < 0) {
      v.raw[i] = positive ? Float{1} : Float{-0.0};
      continue;
    }

    // Bits of the mantissa that represent the fractional part.
    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round up
    if (positive) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    v.raw[i] = BitCastScalar<Float>(bits);
  }
  return v;
}
   1185 
// Toward -infinity, aka floor. Mirror image of Ceil above: clears fractional
// mantissa bits and, for negative inputs, first increments the magnitude so
// the truncation rounds down.
template <typename Float, size_t N>
Vec128<Float, N> Floor(Vec128<Float, N> v) {
  constexpr int kMantissaBits = MantissaBits<Float>();
  using Bits = MakeUnsigned<Float>;
  const Bits kExponentMask = MaxExponentField<Float>();
  const Bits kMantissaMask = MantissaMask<Float>();
  // Exponent bias: 127 for float, 1023 for double.
  const Bits kBias = kExponentMask / 2;

  for (size_t i = 0; i < N; ++i) {
    const bool negative = v.raw[i] < Float(0.0);

    Bits bits = BitCastScalar<Bits>(v.raw[i]);

    const int exponent =
        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
    // Already an integer.
    if (exponent >= kMantissaBits) continue;
    // |v| <= 1 => -1 or 0.
    // NOTE(review): a -0.0 input takes this branch with negative==false and
    // returns +0.0; IEEE-754 floor(-0) is -0 — confirm callers ignore the
    // sign of zero here.
    if (exponent < 0) {
      v.raw[i] = negative ? Float(-1.0) : Float(0.0);
      continue;
    }

    // Bits of the mantissa that represent the fractional part.
    const Bits mantissa_mask = kMantissaMask >> exponent;
    // Already an integer
    if ((bits & mantissa_mask) == 0) continue;

    // Clear fractional bits and round down
    if (negative) bits += (kMantissaMask + 1) >> exponent;
    bits &= ~mantissa_mask;

    v.raw[i] = BitCastScalar<Float>(bits);
  }
  return v;
}
   1222 
   1223 // ------------------------------ Floating-point classification
   1224 
   1225 template <typename T, size_t N>
   1226 HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
   1227  Mask128<T, N> ret;
   1228  for (size_t i = 0; i < N; ++i) {
   1229    // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
   1230    ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i]));
   1231  }
   1232  return ret;
   1233 }
   1234 
   1235 // ================================================== COMPARE
   1236 
   1237 template <typename T, size_t N>
   1238 HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
   1239  Mask128<T, N> m;
   1240  for (size_t i = 0; i < N; ++i) {
   1241    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
   1242  }
   1243  return m;
   1244 }
   1245 
   1246 template <typename T, size_t N>
   1247 HWY_API Mask128<T, N> operator!=(Vec128<T, N> a, Vec128<T, N> b) {
   1248  Mask128<T, N> m;
   1249  for (size_t i = 0; i < N; ++i) {
   1250    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
   1251  }
   1252  return m;
   1253 }
   1254 
   1255 template <typename T, size_t N>
   1256 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
   1257  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
   1258  return (v & bit) == bit;
   1259 }
   1260 
   1261 template <typename T, size_t N>
   1262 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
   1263  Mask128<T, N> m;
   1264  for (size_t i = 0; i < N; ++i) {
   1265    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
   1266  }
   1267  return m;
   1268 }
   1269 template <typename T, size_t N>
   1270 HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
   1271  Mask128<T, N> m;
   1272  for (size_t i = 0; i < N; ++i) {
   1273    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
   1274  }
   1275  return m;
   1276 }
   1277 
   1278 template <typename T, size_t N>
   1279 HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
   1280  Mask128<T, N> m;
   1281  for (size_t i = 0; i < N; ++i) {
   1282    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
   1283  }
   1284  return m;
   1285 }
   1286 template <typename T, size_t N>
   1287 HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
   1288  Mask128<T, N> m;
   1289  for (size_t i = 0; i < N; ++i) {
   1290    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
   1291  }
   1292  return m;
   1293 }
   1294 
   1295 // ------------------------------ Lt128
   1296 
   1297 // Only makes sense for full vectors of u64.
   1298 template <class D>
   1299 HWY_API MFromD<D> Lt128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) {
   1300  const bool lt =
   1301      (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
   1302  Mask128<uint64_t> ret;
   1303  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
   1304  return ret;
   1305 }
   1306 
   1307 template <class D>
   1308 HWY_API MFromD<D> Lt128Upper(D /* tag */, Vec128<uint64_t> a,
   1309                             Vec128<uint64_t> b) {
   1310  const bool lt = a.raw[1] < b.raw[1];
   1311  Mask128<uint64_t> ret;
   1312  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
   1313  return ret;
   1314 }
   1315 
   1316 // ------------------------------ Eq128
   1317 
   1318 // Only makes sense for full vectors of u64.
   1319 template <class D>
   1320 HWY_API MFromD<D> Eq128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) {
   1321  const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0];
   1322  Mask128<uint64_t> ret;
   1323  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
   1324  return ret;
   1325 }
   1326 
   1327 template <class D>
   1328 HWY_API Mask128<uint64_t> Ne128(D /* tag */, Vec128<uint64_t> a,
   1329                                Vec128<uint64_t> b) {
   1330  const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
   1331  Mask128<uint64_t> ret;
   1332  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
   1333  return ret;
   1334 }
   1335 
   1336 template <class D>
   1337 HWY_API MFromD<D> Eq128Upper(D /* tag */, Vec128<uint64_t> a,
   1338                             Vec128<uint64_t> b) {
   1339  const bool eq = a.raw[1] == b.raw[1];
   1340  Mask128<uint64_t> ret;
   1341  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
   1342  return ret;
   1343 }
   1344 
   1345 template <class D>
   1346 HWY_API MFromD<D> Ne128Upper(D /* tag */, Vec128<uint64_t> a,
   1347                             Vec128<uint64_t> b) {
   1348  const bool ne = a.raw[1] != b.raw[1];
   1349  Mask128<uint64_t> ret;
   1350  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
   1351  return ret;
   1352 }
   1353 
   1354 // ------------------------------ Min128, Max128 (Lt128)
   1355 
   1356 template <class D>
   1357 HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
   1358  return IfThenElse(Lt128(d, a, b), a, b);
   1359 }
   1360 
   1361 template <class D>
   1362 HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
   1363  return IfThenElse(Lt128(d, b, a), a, b);
   1364 }
   1365 
   1366 template <class D>
   1367 HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
   1368  return IfThenElse(Lt128Upper(d, a, b), a, b);
   1369 }
   1370 
   1371 template <class D>
   1372 HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
   1373  return IfThenElse(Lt128Upper(d, b, a), a, b);
   1374 }
   1375 
   1376 // ================================================== MEMORY
   1377 
   1378 // ------------------------------ Load
   1379 
// Loads d.MaxBytes() from memory into a new vector. In this emulation the
// copy is a plain memcpy, so alignment is not actually enforced (LoadU
// forwards here), but callers should still pass aligned pointers.
template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  VFromD<D> v;
  CopyBytes<d.MaxBytes()>(aligned, v.raw);  // copy from array
  return v;
}
   1386 
   1387 template <class D>
   1388 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
   1389                             const TFromD<D>* HWY_RESTRICT p) {
   1390  return IfThenElseZero(m, LoadU(d, p));
   1391 }
   1392 
   1393 template <class D>
   1394 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
   1395                               const TFromD<D>* HWY_RESTRICT p) {
   1396  return IfThenElse(m, LoadU(d, p), v);
   1397 }
   1398 
// Unaligned load; the emulated Load has no alignment requirement, so this
// simply forwards.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}
   1403 
// In some use cases, "load single lane" is sufficient; otherwise avoid this.
// (There is no wider vector to broadcast into here, so this is just Load.)
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}
   1409 
// Per-target flag: this target provides its own LoadN/LoadNOr, so
// generic_ops-inl.h must not define a fallback.
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif
   1415 
   1416 template <class D>
   1417 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   1418                        size_t max_lanes_to_load) {
   1419  VFromD<D> v = Zero(d);
   1420  const size_t N = Lanes(d);
   1421  const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
   1422  CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
   1423  return v;
   1424 }
   1425 
   1426 template <class D>
   1427 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   1428                          size_t max_lanes_to_load) {
   1429  VFromD<D> v = no;
   1430  const size_t N = Lanes(d);
   1431  const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N);
   1432  CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>));
   1433  return v;
   1434 }
   1435 
   1436 // ------------------------------ Store
   1437 
// Stores all d.MaxBytes() of the vector; plain memcpy in this emulation.
template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  CopyBytes<d.MaxBytes()>(v.raw, aligned);  // copy to array
}
   1442 
// Unaligned store; the emulated Store has no alignment requirement.
template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}
   1447 
   1448 template <class D>
   1449 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
   1450                          TFromD<D>* HWY_RESTRICT p) {
   1451  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1452    if (m.bits[i]) p[i] = v.raw[i];
   1453  }
   1454 }
   1455 
// Per-target flag: this target provides its own StoreN, so generic_ops-inl.h
// must not define a fallback.
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif
   1461 
   1462 template <class D>
   1463 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
   1464                    size_t max_lanes_to_store) {
   1465  const size_t N = Lanes(d);
   1466  const size_t num_of_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
   1467  CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>));
   1468 }
   1469 
   1470 // ================================================== COMBINE
   1471 
   1472 template <typename T, size_t N>
   1473 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
   1474  Vec128<T, N / 2> ret;
   1475  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
   1476  return ret;
   1477 }
   1478 
// Tag-dispatched overload; forwards to the tag-less version above.
template <class D>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
  return LowerHalf(v);
}
   1483 
// Returns the upper half of `v`: copies from the lane at index MaxLanes(d)
// (the start of the upper half) onward.
template <class D>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  VFromD<D> ret;
  CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
  return ret;
}
   1490 
// Widens `v` to a full vector whose upper half is zero.
template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;  // zero-initialized
  // Only the lower half is overwritten; the upper half stays zero.
  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
  return ret;
}
   1498 
// Concatenates two half vectors: lo_half -> lower lanes, hi_half -> upper.
template <class D, class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}
   1507 
// Result: lower half = lo's lower half, upper half = hi's lower half.
template <class D>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}
   1516 
// Result: lower half = lo's upper half, upper half = hi's upper half.
template <class D>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
  return ret;
}
   1525 
// Result: lower half = lo's upper half, upper half = hi's lower half.
template <class D>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}
   1534 
// Result: lower half = lo's lower half, upper half = hi's upper half.
template <class D>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
  return ret;
}
   1543 
   1544 template <class D>
   1545 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   1546  const Half<decltype(d)> dh;
   1547  VFromD<D> ret;
   1548  for (size_t i = 0; i < MaxLanes(dh); ++i) {
   1549    ret.raw[i] = lo.raw[2 * i];
   1550  }
   1551  for (size_t i = 0; i < MaxLanes(dh); ++i) {
   1552    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
   1553  }
   1554  return ret;
   1555 }
   1556 
// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
// Forcing ConcatOdd out-of-line on RISC-V clang avoids the miscompile.
#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
#else
#define HWY_EMU128_CONCAT_INLINE HWY_API
#endif
   1564 
// Odd lanes of lo fill the lower half of the result; odd lanes of hi fill
// the upper half. Kept byte-identical because of the codegen workaround
// (HWY_EMU128_CONCAT_INLINE) above.
template <class D>
HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[i] = lo.raw[2 * i + 1];
  }
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
  }
  return ret;
}
   1577 
   1578 // ------------------------------ CombineShiftRightBytes
// Returns bytes [kBytes, kBytes + MaxBytes) of the 2*MaxBytes concatenation
// hi:lo, i.e. the bytewise right-shift of the 256-bit pair by kBytes.
template <int kBytes, class D>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  VFromD<D> ret;
  const uint8_t* HWY_RESTRICT lo8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  // Upper part of lo becomes the low bytes of the result...
  CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
  // ...followed by the first kBytes of hi.
  CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
  return ret;
}
   1590 
   1591 // ------------------------------ ShiftLeftBytes
   1592 
// Shifts the vector's bytes toward higher addresses, filling the low kBytes
// with zero (bytes shifted past the end are discarded).
template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  VFromD<D> ret;
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  ZeroBytes<kBytes>(ret8);
  CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
  return ret;
}
   1603 
   1604 template <int kBytes, typename T, size_t N>
   1605 HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
   1606  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
   1607 }
   1608 
   1609 // ------------------------------ ShiftLeftLanes
   1610 
// Shifts whole lanes toward higher indices by reinterpreting as bytes.
template <int kLanes, class D, typename T = TFromD<D>>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}
   1616 
   1617 template <int kLanes, typename T, size_t N>
   1618 HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
   1619  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
   1620 }
   1621 
   1622 // ------------------------------ ShiftRightBytes
// Shifts the vector's bytes toward lower addresses, filling the top kBytes
// with zero (bytes shifted past index 0 are discarded).
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  VFromD<D> ret;
  const uint8_t* HWY_RESTRICT v8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
  ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
  return ret;
}
   1635 
   1636 // ------------------------------ ShiftRightLanes
// Shifts whole lanes toward lower indices by reinterpreting as bytes.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}
   1643 
   1644 // ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
   1645 #include "hwy/ops/inside-inl.h"
   1646 
   1647 // ------------------------------ LoadInterleaved2/3/4
   1648 
// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
// We implement those here because scalar code is likely faster than emulation
// via shuffles.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

// Same for Load/StoreInterleaved of special floats.
#ifdef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#endif
   1664 
   1665 template <class D, typename T = TFromD<D>>
   1666 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
   1667                              VFromD<D>& v0, VFromD<D>& v1) {
   1668  alignas(16) T buf0[MaxLanes(d)];
   1669  alignas(16) T buf1[MaxLanes(d)];
   1670  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1671    buf0[i] = *unaligned++;
   1672    buf1[i] = *unaligned++;
   1673  }
   1674  v0 = Load(d, buf0);
   1675  v1 = Load(d, buf1);
   1676 }
   1677 
   1678 template <class D, typename T = TFromD<D>>
   1679 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
   1680                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   1681  alignas(16) T buf0[MaxLanes(d)];
   1682  alignas(16) T buf1[MaxLanes(d)];
   1683  alignas(16) T buf2[MaxLanes(d)];
   1684  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1685    buf0[i] = *unaligned++;
   1686    buf1[i] = *unaligned++;
   1687    buf2[i] = *unaligned++;
   1688  }
   1689  v0 = Load(d, buf0);
   1690  v1 = Load(d, buf1);
   1691  v2 = Load(d, buf2);
   1692 }
   1693 
   1694 template <class D, typename T = TFromD<D>>
   1695 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
   1696                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
   1697                              VFromD<D>& v3) {
   1698  alignas(16) T buf0[MaxLanes(d)];
   1699  alignas(16) T buf1[MaxLanes(d)];
   1700  alignas(16) T buf2[MaxLanes(d)];
   1701  alignas(16) T buf3[MaxLanes(d)];
   1702  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1703    buf0[i] = *unaligned++;
   1704    buf1[i] = *unaligned++;
   1705    buf2[i] = *unaligned++;
   1706    buf3[i] = *unaligned++;
   1707  }
   1708  v0 = Load(d, buf0);
   1709  v1 = Load(d, buf1);
   1710  v2 = Load(d, buf2);
   1711  v3 = Load(d, buf3);
   1712 }
   1713 
   1714 // ------------------------------ StoreInterleaved2/3/4
   1715 
   1716 template <class D>
   1717 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
   1718                               TFromD<D>* HWY_RESTRICT unaligned) {
   1719  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1720    *unaligned++ = v0.raw[i];
   1721    *unaligned++ = v1.raw[i];
   1722  }
   1723 }
   1724 
   1725 template <class D>
   1726 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   1727                               TFromD<D>* HWY_RESTRICT unaligned) {
   1728  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1729    *unaligned++ = v0.raw[i];
   1730    *unaligned++ = v1.raw[i];
   1731    *unaligned++ = v2.raw[i];
   1732  }
   1733 }
   1734 
   1735 template <class D>
   1736 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
   1737                               VFromD<D> v3, D d,
   1738                               TFromD<D>* HWY_RESTRICT unaligned) {
   1739  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1740    *unaligned++ = v0.raw[i];
   1741    *unaligned++ = v1.raw[i];
   1742    *unaligned++ = v2.raw[i];
   1743    *unaligned++ = v3.raw[i];
   1744  }
   1745 }
   1746 
   1747 // ------------------------------ Stream
// Non-temporal store hint; this emulation has no streaming stores, so it
// simply forwards to the regular aligned Store.
template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
 Store(v, d, aligned);
}
   1752 
   1753 // ------------------------------ Scatter in generic_ops-inl.h
   1754 // ------------------------------ Gather in generic_ops-inl.h
   1755 
   1756 // ================================================== CONVERT
   1757 
   1758 // ConvertTo and DemoteTo with floating-point input and integer output truncate
   1759 // (rounding toward zero).
   1760 
   1761 namespace detail {
   1762 
// Saturating float->integer cast: in-range values convert exactly; positive
// overflow (including +inf) yields LimitsMax<ToT>(); negative overflow yields
// LimitsMin<ToT>() for signed ToT and 0 for unsigned ToT (LimitsMax + sign
// bit, computed in unsigned arithmetic so the wrap-around is well-defined).
template <class ToT, class FromT>
HWY_INLINE ToT CastValueForF2IConv(FromT val) {
 // Prevent ubsan errors when converting float to narrower integer

 using FromTU = MakeUnsigned<FromT>;
 using ToTU = MakeUnsigned<ToT>;

 constexpr unsigned kMaxExpField =
     static_cast<unsigned>(MaxExponentField<FromT>());
 constexpr unsigned kExpBias = kMaxExpField >> 1;
 constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
     kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
     kMaxExpField));

 // If ToT is signed, compare only the exponent bits of val against
 // kMinOutOfRangeExpField.
 //
 // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
 // val against kMinOutOfRangeExpField as a negative value is outside of the
 // range of an unsigned integer type.
 const FromT val_to_compare =
     static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

 // val is within the range of ToT if
 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
 // than kMinOutOfRangeExpField
 //
 // Otherwise, val is either outside of the range of ToT or equal to
 // LimitsMin<ToT>() if
 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
 // than or equal to kMinOutOfRangeExpField.

 return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                               MantissaBits<FromT>()) < kMinOutOfRangeExpField)
            ? static_cast<ToT>(val)
            : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
                               static_cast<ToTU>(ScalarSignBit(val)));
}
   1801 
// Generic promotion: widening within the same class is exact, so a plain
// scalar conversion suffices.
template <class ToT, class ToTypeTag, class FromT>
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
 return ConvertScalarTo<ToT>(val);
}
   1806 
// float -> wider signed integer: must saturate, so route through the
// range-checked cast.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
                                    float val) {
 return CastValueForF2IConv<ToT>(val);
}
   1812 
// float -> wider unsigned integer: must saturate (negatives clamp to 0), so
// route through the range-checked cast.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
                                    float val) {
 return CastValueForF2IConv<ToT>(val);
}
   1818 // If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
   1819 // returns static_cast<ToT>(val)
   1820 //
   1821 // Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
   1822 // implementation-defined result if val is not within the range of ToT.
template <class ToT, class FromT>
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
 // Prevent ubsan errors when converting float to narrower integer

 using FromTU = MakeUnsigned<FromT>;

 constexpr unsigned kMaxExpField =
     static_cast<unsigned>(MaxExponentField<FromT>());
 constexpr unsigned kExpBias = kMaxExpField >> 1;
 constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
     kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
     kMaxExpField));

 // If ToT is signed, compare only the exponent bits of val against
 // kMinOutOfRangeExpField.
 //
 // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
 // val against kMinOutOfRangeExpField as a negative value is outside of the
 // range of an unsigned integer type.
 const FromT val_to_compare =
     static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

 // val is within the range of ToT if
 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
 // than kMinOutOfRangeExpField
 //
 // Otherwise, val is either outside of the range of ToT or equal to
 // LimitsMin<ToT>() if
 // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
 // than or equal to kMinOutOfRangeExpField.

 // Out-of-range (and NaN) inputs return LimitsMin<ToT>() here; callers must
 // treat that as an implementation-defined placeholder, not a guarantee.
 return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                               MantissaBits<FromT>()) < kMinOutOfRangeExpField)
            ? static_cast<ToT>(val)
            : static_cast<ToT>(LimitsMin<ToT>());
}
   1859 
   1860 }  // namespace detail
   1861 
   1862 template <class DTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TFrom)>
   1863 HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
   1864  static_assert(sizeof(TFromD<DTo>) > sizeof(TFrom), "Not promoting");
   1865  VFromD<DTo> ret;
   1866  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1867    // For bits Y > X, floatX->floatY and intX->intY are always representable.
   1868    ret.raw[i] = detail::CastValueForPromoteTo<TFromD<DTo>>(
   1869        hwy::TypeTag<TFromD<DTo>>(), from.raw[i]);
   1870  }
   1871  return ret;
   1872 }
   1873 
   1874 #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   1875 #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   1876 #else
   1877 #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   1878 #endif
   1879 
   1880 template <class D64, HWY_IF_UI64_D(D64)>
   1881 HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
   1882  VFromD<D64> ret;
   1883  for (size_t i = 0; i < MaxLanes(d64); ++i) {
   1884    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
   1885  }
   1886  return ret;
   1887 }
   1888 
   1889 // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
   1890 // so we overload for TFrom=double and ToT={float,int32_t}.
   1891 template <class D, HWY_IF_F32_D(D)>
   1892 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
   1893  VFromD<D> ret;
   1894  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1895    // Prevent ubsan errors when converting float to narrower integer/float
   1896    if (ScalarIsInf(from.raw[i]) ||
   1897        ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
   1898      ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
   1899                                              : HighestValue<float>();
   1900      continue;
   1901    }
   1902    ret.raw[i] = static_cast<float>(from.raw[i]);
   1903  }
   1904  return ret;
   1905 }
   1906 template <class D, HWY_IF_UI32_D(D)>
   1907 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
   1908  VFromD<D> ret;
   1909  for (size_t i = 0; i < MaxLanes(d); ++i) {
   1910    // Prevent ubsan errors when converting double to narrower integer/int32_t
   1911    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
   1912  }
   1913  return ret;
   1914 }
   1915 
   1916 template <class DTo, typename TFrom, size_t N, HWY_IF_SIGNED(TFrom),
   1917          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
   1918 HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
   1919  using TTo = TFromD<DTo>;
   1920  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
   1921 
   1922  VFromD<DTo> ret;
   1923  for (size_t i = 0; i < N; ++i) {
   1924    // Int to int: choose closest value in ToT to `from` (avoids UB)
   1925    from.raw[i] =
   1926        HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw[i]), LimitsMax<TTo>());
   1927    ret.raw[i] = static_cast<TTo>(from.raw[i]);
   1928  }
   1929  return ret;
   1930 }
   1931 
   1932 // Disable the default unsigned to signed DemoteTo/ReorderDemote2To
   1933 // implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
   1934 // target-specific implementations of the unsigned to signed DemoteTo and
   1935 // ReorderDemote2To ops
   1936 
   1937 // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
   1938 // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
   1939 // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
   1940 // SFINAE to occur instead of a hard error due to a dependency on the V template
   1941 // argument
   1942 #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
   1943 #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
   1944  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
   1945 
   1946 template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
   1947          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
   1948 HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
   1949  using TTo = TFromD<DTo>;
   1950  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
   1951 
   1952  const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());
   1953 
   1954  VFromD<DTo> ret;
   1955  for (size_t i = 0; i < N; ++i) {
   1956    // Int to int: choose closest value in ToT to `from` (avoids UB)
   1957    ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
   1958  }
   1959  return ret;
   1960 }
   1961 
   1962 template <class DTo, typename TFrom, size_t N, HWY_IF_UI64(TFrom),
   1963          HWY_IF_F32_D(DTo)>
   1964 HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
   1965  using TTo = TFromD<DTo>;
   1966  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");
   1967 
   1968  VFromD<DTo> ret;
   1969  for (size_t i = 0; i < N; ++i) {
   1970    // int64_t/uint64_t to float: okay to cast to float as an int64_t/uint64_t
   1971    // value is always within the range of a float
   1972    ret.raw[i] = static_cast<TTo>(from.raw[i]);
   1973  }
   1974  return ret;
   1975 }
   1976 
// Truncating f32 -> bf16 demotion of two vectors: each 32-bit lane of the
// result packs the upper 16 bits of the corresponding `a` lane in its upper
// half and the upper 16 bits of the `b` lane in its lower half.
template <class DBF16, HWY_IF_BF16_D(DBF16), class VF32>
HWY_API VFromD<DBF16> ReorderDemote2To(DBF16 dbf16, VF32 a, VF32 b) {
 const Repartition<uint32_t, decltype(dbf16)> du32;
 // b's bf16 bits (top 16 of each f32), shifted into the lower half.
 const VFromD<decltype(du32)> b_in_lower = ShiftRight<16>(BitCast(du32, b));
 // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
 const VFromD<decltype(du32)> a_mask = Set(du32, 0xFFFF0000);
 return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}
   1985 
   1986 template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
   1987          HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   1988          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   1989 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   1990  const RepartitionToWide<decltype(dn)> dw;
   1991  const size_t NW = Lanes(dw);
   1992  using TN = TFromD<DN>;
   1993  const TN min = LimitsMin<TN>();
   1994  const TN max = LimitsMax<TN>();
   1995  VFromD<DN> ret;
   1996  for (size_t i = 0; i < NW; ++i) {
   1997    ret.raw[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
   1998  }
   1999  for (size_t i = 0; i < NW; ++i) {
   2000    ret.raw[NW + i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
   2001  }
   2002  return ret;
   2003 }
   2004 
   2005 template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
   2006          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   2007          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   2008 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   2009  const RepartitionToWide<decltype(dn)> dw;
   2010  const size_t NW = Lanes(dw);
   2011  using TN = TFromD<DN>;
   2012  using TN_U = MakeUnsigned<TN>;
   2013  const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
   2014  VFromD<DN> ret;
   2015  for (size_t i = 0; i < NW; ++i) {
   2016    ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
   2017  }
   2018  for (size_t i = 0; i < NW; ++i) {
   2019    ret.raw[NW + i] = static_cast<TN>(HWY_MIN(b.raw[i], max));
   2020  }
   2021  return ret;
   2022 }
   2023 
// With a single block (EMU128), the "reordered" lane order is already the
// natural order, so OrderedDemote2To coincides with ReorderDemote2To.
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
         HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
         HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
         HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
 return ReorderDemote2To(dn, a, b);
}
   2031 
   2032 template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
   2033          HWY_IF_F32_D(DFromV<V>),
   2034          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   2035 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
   2036  const size_t NW = Lanes(dn) / 2;
   2037  using TN = TFromD<DN>;
   2038  VFromD<DN> ret;
   2039  for (size_t i = 0; i < NW; ++i) {
   2040    ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
   2041  }
   2042  for (size_t i = 0; i < NW; ++i) {
   2043    ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
   2044  }
   2045  return ret;
   2046 }
   2047 
   2048 namespace detail {
   2049 
// Writes the raw 16-bit pattern `val` into a float16_t without any
// conversion (bit copy).
HWY_INLINE void StoreU16ToF16(const uint16_t val,
                             hwy::float16_t* HWY_RESTRICT to) {
 CopySameSize(&val, to);
}
   2054 
// Reads the raw 16-bit pattern of a float16_t without any conversion
// (bit copy).
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
 uint16_t bits16;
 CopySameSize(from, &bits16);
 return bits16;
}
   2060 
   2061 }  // namespace detail
   2062 
   2063 template <class D, HWY_IF_F32_D(D), size_t N>
   2064 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
   2065  VFromD<D> ret;
   2066  for (size_t i = 0; i < N; ++i) {
   2067    ret.raw[i] = F32FromBF16(v.raw[i]);
   2068  }
   2069  return ret;
   2070 }
   2071 
   2072 #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
   2073 #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
   2074 #else
   2075 #define HWY_NATIVE_DEMOTE_F32_TO_BF16
   2076 #endif
   2077 
   2078 template <class D, HWY_IF_BF16_D(D), size_t N>
   2079 HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
   2080  VFromD<D> ret;
   2081  for (size_t i = 0; i < N; ++i) {
   2082    ret.raw[i] = BF16FromF32(v.raw[i]);
   2083  }
   2084  return ret;
   2085 }
   2086 
   2087 #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   2088 #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   2089 #else
   2090 #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   2091 #endif
   2092 
   2093 template <class D32, HWY_IF_UI32_D(D32)>
   2094 HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
   2095  VFromD<D32> ret;
   2096  for (size_t i = 0; i < MaxLanes(d32); ++i) {
   2097    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
   2098  }
   2099  return ret;
   2100 }
   2101 
   2102 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility
   2103 namespace detail {
   2104 
   2105 template <typename TFrom, typename DTo>
   2106 HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
   2107                              Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
   2108  using ToT = TFromD<DTo>;
   2109  static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
   2110  VFromD<DTo> ret;
   2111  constexpr size_t N = HWY_MAX_LANES_D(DTo);
   2112 
   2113  for (size_t i = 0; i < N; ++i) {
   2114    // float## -> int##: return closest representable value
   2115    ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
   2116  }
   2117  return ret;
   2118 }
   2119 
   2120 template <typename TFrom, typename DTo>
   2121 HWY_API VFromD<DTo> ConvertTo(hwy::NonFloatTag /*tag*/, DTo /* tag */,
   2122                              Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
   2123  using ToT = TFromD<DTo>;
   2124  static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
   2125  VFromD<DTo> ret;
   2126  constexpr size_t N = HWY_MAX_LANES_D(DTo);
   2127  for (size_t i = 0; i < N; ++i) {
   2128    // int## -> float##: no check needed
   2129    ret.raw[i] = static_cast<ToT>(from.raw[i]);
   2130  }
   2131  return ret;
   2132 }
   2133 
   2134 }  // namespace detail
   2135 
// Public entry point: dispatches on whether the source type is floating-point
// (tag dispatch instead of SFINAE for MSVC 2017 compatibility).
template <class DTo, typename TFrom>
HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
 return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
}
   2140 
   2141 #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   2142 #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   2143 #else
   2144 #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   2145 #endif
   2146 
   2147 template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
   2148          HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
   2149 HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
   2150  VFromD<DI> ret;
   2151  for (size_t i = 0; i < MaxLanes(di); i++) {
   2152    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
   2153  }
   2154  return ret;
   2155 }
   2156 
// Convenience wrapper: saturating u32 -> u8 demotion.
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
 return DemoteTo(Simd<uint8_t, N, 0>(), v);
}
   2161 
   2162 // ------------------------------ Truncations
   2163 
   2164 template <class D, HWY_IF_U8_D(D), size_t N>
   2165 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
   2166  VFromD<D> ret;
   2167  for (size_t i = 0; i < N; ++i) {
   2168    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
   2169  }
   2170  return ret;
   2171 }
   2172 
   2173 template <class D, HWY_IF_U16_D(D), size_t N>
   2174 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
   2175  VFromD<D> ret;
   2176  for (size_t i = 0; i < N; ++i) {
   2177    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
   2178  }
   2179  return ret;
   2180 }
   2181 
   2182 template <class D, HWY_IF_U32_D(D), size_t N>
   2183 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
   2184  VFromD<D> ret;
   2185  for (size_t i = 0; i < N; ++i) {
   2186    ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
   2187  }
   2188  return ret;
   2189 }
   2190 
   2191 template <class D, HWY_IF_U8_D(D), size_t N>
   2192 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
   2193  VFromD<D> ret;
   2194  for (size_t i = 0; i < N; ++i) {
   2195    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
   2196  }
   2197  return ret;
   2198 }
   2199 
   2200 template <class D, HWY_IF_U16_D(D), size_t N>
   2201 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
   2202  VFromD<D> ret;
   2203  for (size_t i = 0; i < N; ++i) {
   2204    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
   2205  }
   2206  return ret;
   2207 }
   2208 
   2209 template <class D, HWY_IF_U8_D(D), size_t N>
   2210 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint16_t, N> v) {
   2211  VFromD<D> ret;
   2212  for (size_t i = 0; i < N; ++i) {
   2213    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
   2214  }
   2215  return ret;
   2216 }
   2217 
   2218 #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
   2219 #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
   2220 #else
   2221 #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
   2222 #endif
   2223 
   2224 template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
   2225          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   2226          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   2227 HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
   2228  const RepartitionToWide<decltype(dn)> dw;
   2229  const size_t NW = Lanes(dw);
   2230  using TW = TFromD<decltype(dw)>;
   2231  using TN = TFromD<decltype(dn)>;
   2232  VFromD<DN> ret;
   2233  constexpr TW max_val{LimitsMax<TN>()};
   2234 
   2235  for (size_t i = 0; i < NW; ++i) {
   2236    ret.raw[i] = static_cast<TN>(a.raw[i] & max_val);
   2237  }
   2238  for (size_t i = 0; i < NW; ++i) {
   2239    ret.raw[NW + i] = static_cast<TN>(b.raw[i] & max_val);
   2240  }
   2241  return ret;
   2242 }
   2243 
   2244 // ================================================== SWIZZLE
   2245 
// Returns lane 0.
template <typename T, size_t N>
HWY_API T GetLane(Vec128<T, N> v) {
 return v.raw[0];
}
   2250 
// Returns a copy of v with lane i set to t. Caller must ensure i < N.
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
 v.raw[i] = t;
 return v;
}
   2256 
// Returns lane i. Caller must ensure i < N.
template <typename T, size_t N>
HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
 return v.raw[i];
}
   2261 
   2262 template <typename T, size_t N>
   2263 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
   2264  for (size_t i = 0; i < N; i += 2) {
   2265    v.raw[i + 1] = v.raw[i];
   2266  }
   2267  return v;
   2268 }
   2269 
   2270 template <typename T, size_t N>
   2271 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
   2272  for (size_t i = 0; i < N; i += 2) {
   2273    v.raw[i] = v.raw[i + 1];
   2274  }
   2275  return v;
   2276 }
   2277 
   2278 template <typename T, size_t N>
   2279 HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
   2280  for (size_t i = 0; i < N; i += 2) {
   2281    odd.raw[i] = even.raw[i];
   2282  }
   2283  return odd;
   2284 }
   2285 
   2286 template <class D>
   2287 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
   2288  constexpr size_t N = HWY_MAX_LANES_D(D);
   2289  for (size_t i = 1; i < N; i += 2) {
   2290    a.raw[i] = b.raw[i - 1];
   2291  }
   2292  return a;
   2293 }
   2294 
   2295 template <class D>
   2296 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
   2297  constexpr size_t N = HWY_MAX_LANES_D(D);
   2298  for (size_t i = 1; i < N; i += 2) {
   2299    b.raw[i - 1] = a.raw[i];
   2300  }
   2301  return b;
   2302 }
   2303 
// Single-block target: block 0 is an even block, so the result is `even`.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
 return even;
}
   2308 
   2309 // ------------------------------ SwapAdjacentBlocks
// Single-block target: nothing to swap, identity.
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
 return v;
}
   2314 
   2315 // ------------------------------ InterleaveEvenBlocks
// Single-block target: the only (even) block comes from `a`.
template <class D, class V = VFromD<D>>
HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
 return a;
}
   2320 // ------------------------------ InterleaveOddBlocks
// Single-block target: the only block comes from `a`.
template <class D, class V = VFromD<D>>
HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
 return a;
}
   2325 
   2326 // ------------------------------ TableLookupLanes
   2327 
   2328 // Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
 // Lane indices; stored signed so TwoTablesLookupLanes can compare them
 // against N.
 MakeSigned<T> raw[N];
};
   2333 
// Converts a vector of integer indices into Indices128 for TableLookupLanes.
template <class D, typename TI, size_t N>
HWY_API Indices128<TFromD<D>, N> IndicesFromVec(D d, Vec128<TI, N> vec) {
 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size must match");
 Indices128<TFromD<D>, N> ret;
 // Indices are copied verbatim; CopyBytes sidesteps signed/unsigned aliasing.
 CopyBytes<d.MaxBytes()>(vec.raw, ret.raw);
 return ret;
}
   2341 
// Loads lane indices from memory and packages them for TableLookupLanes.
template <class D, typename TI>
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
   D d, const TI* idx) {
 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
}
   2347 
   2348 template <typename T, size_t N>
   2349 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
   2350  Vec128<T, N> ret;
   2351  for (size_t i = 0; i < N; ++i) {
   2352    ret.raw[i] = v.raw[idx.raw[i]];
   2353  }
   2354  return ret;
   2355 }
   2356 
// Gathers lanes from a pair of tables: indices in [0, N) select from `a`,
// indices in [N, 2N) select from `b`.
template <typename T, size_t N>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                         Indices128<T, N> idx) {
 using TI = MakeSigned<T>;
 Vec128<T, N> ret;
 // Low bits address the lane within the chosen table (assumes N is a power
 // of two, which holds for Vec128 lane counts).
 constexpr TI kVecLaneIdxMask = static_cast<TI>(N - 1);
 for (size_t i = 0; i < N; ++i) {
   const auto src_idx = idx.raw[i];
   const auto masked_src_lane_idx = src_idx & kVecLaneIdxMask;
   ret.raw[i] = (src_idx < static_cast<TI>(N)) ? a.raw[masked_src_lane_idx]
                                               : b.raw[masked_src_lane_idx];
 }
 return ret;
}
   2371 
   2372 // ------------------------------ ReverseBlocks
// Reverses the order of 16-byte blocks; with at most one block this is the
// identity.
template <class D>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
 return v;  // Single block: no change
}
   2377 
   2378 // ------------------------------ Reverse
   2379 
   2380 template <class D>
   2381 HWY_API VFromD<D> Reverse(D d, VFromD<D> v) {
   2382  VFromD<D> ret;
   2383  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2384    ret.raw[i] = v.raw[MaxLanes(d) - 1 - i];
   2385  }
   2386  return ret;
   2387 }
   2388 
   2389 // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
   2390 #ifdef HWY_NATIVE_REVERSE2_8
   2391 #undef HWY_NATIVE_REVERSE2_8
   2392 #else
   2393 #define HWY_NATIVE_REVERSE2_8
   2394 #endif
   2395 
   2396 template <class D>
   2397 HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
   2398  VFromD<D> ret;
   2399  for (size_t i = 0; i < MaxLanes(d); i += 2) {
   2400    ret.raw[i + 0] = v.raw[i + 1];
   2401    ret.raw[i + 1] = v.raw[i + 0];
   2402  }
   2403  return ret;
   2404 }
   2405 
   2406 template <class D>
   2407 HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
   2408  VFromD<D> ret;
   2409  for (size_t i = 0; i < MaxLanes(d); i += 4) {
   2410    ret.raw[i + 0] = v.raw[i + 3];
   2411    ret.raw[i + 1] = v.raw[i + 2];
   2412    ret.raw[i + 2] = v.raw[i + 1];
   2413    ret.raw[i + 3] = v.raw[i + 0];
   2414  }
   2415  return ret;
   2416 }
   2417 
   2418 template <class D>
   2419 HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
   2420  VFromD<D> ret;
   2421  for (size_t i = 0; i < MaxLanes(d); i += 8) {
   2422    ret.raw[i + 0] = v.raw[i + 7];
   2423    ret.raw[i + 1] = v.raw[i + 6];
   2424    ret.raw[i + 2] = v.raw[i + 5];
   2425    ret.raw[i + 3] = v.raw[i + 4];
   2426    ret.raw[i + 4] = v.raw[i + 3];
   2427    ret.raw[i + 5] = v.raw[i + 2];
   2428    ret.raw[i + 6] = v.raw[i + 1];
   2429    ret.raw[i + 7] = v.raw[i + 0];
   2430  }
   2431  return ret;
   2432 }
   2433 
   2434 // ------------------------------ SlideUpLanes
   2435 
   2436 template <class D>
   2437 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   2438  VFromD<D> ret = Zero(d);
   2439  constexpr size_t N = HWY_MAX_LANES_D(D);
   2440  const size_t clamped_amt = HWY_MIN(amt, N);
   2441  CopyBytes(v.raw, ret.raw + clamped_amt,
   2442            (N - clamped_amt) * sizeof(TFromD<D>));
   2443  return ret;
   2444 }
   2445 
   2446 // ------------------------------ SlideDownLanes
   2447 
   2448 template <class D>
   2449 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   2450  VFromD<D> ret = Zero(d);
   2451  constexpr size_t N = HWY_MAX_LANES_D(D);
   2452  const size_t clamped_amt = HWY_MIN(amt, N);
   2453  CopyBytes(v.raw + clamped_amt, ret.raw,
   2454            (N - clamped_amt) * sizeof(TFromD<D>));
   2455  return ret;
   2456 }
   2457 
   2458 // ================================================== BLOCKWISE
   2459 
   2460 // ------------------------------ Shuffle*
   2461 
   2462 // Swap 32-bit halves in 64-bit halves.
// Swaps 32-bit lanes within each 64-bit half: lane order {1,0,3,2}.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
 static_assert(sizeof(T) == 4, "Only for 32-bit");
 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
 return Reverse2(DFromV<decltype(v)>(), v);
}
   2469 
   2470 // Swap 64-bit halves
   2471 template <typename T>
   2472 HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
   2473  static_assert(sizeof(T) == 4, "Only for 32-bit");
   2474  Vec128<T> ret;
   2475  ret.raw[3] = v.raw[1];
   2476  ret.raw[2] = v.raw[0];
   2477  ret.raw[1] = v.raw[3];
   2478  ret.raw[0] = v.raw[2];
   2479  return ret;
   2480 }
// Swaps the two 64-bit lanes.
template <typename T>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
 static_assert(sizeof(T) == 8, "Only for 64-bit");
 return Reverse2(DFromV<decltype(v)>(), v);
}
   2486 
   2487 // Rotate right 32 bits
   2488 template <typename T>
   2489 HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
   2490  Vec128<T> ret;
   2491  ret.raw[3] = v.raw[0];
   2492  ret.raw[2] = v.raw[3];
   2493  ret.raw[1] = v.raw[2];
   2494  ret.raw[0] = v.raw[1];
   2495  return ret;
   2496 }
   2497 
   2498 // Rotate left 32 bits
   2499 template <typename T>
   2500 HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
   2501  Vec128<T> ret;
   2502  ret.raw[3] = v.raw[2];
   2503  ret.raw[2] = v.raw[1];
   2504  ret.raw[1] = v.raw[0];
   2505  ret.raw[0] = v.raw[3];
   2506  return ret;
   2507 }
   2508 
// Reverses the four 32-bit lanes: lane order {3,2,1,0}.
template <typename T>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
 return Reverse4(DFromV<decltype(v)>(), v);
}
   2513 
   2514 // ------------------------------ Broadcast
   2515 template <int kLane, typename T, size_t N>
   2516 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
   2517  for (size_t i = 0; i < N; ++i) {
   2518    v.raw[i] = v.raw[kLane];
   2519  }
   2520  return v;
   2521 }
   2522 
   2523 // ------------------------------ TableLookupBytes, TableLookupBytesOr0
   2524 
// Byte-level shuffle: result byte i is v's byte `indices[i]`, or 0 when the
// index is outside the input's byte range. Operates on the raw byte
// representation of both vectors.
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> v,
                                       Vec128<TI, NI> indices) {
 const uint8_t* HWY_RESTRICT v_bytes =
     reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
 const uint8_t* HWY_RESTRICT idx_bytes =
     reinterpret_cast<const uint8_t*>(indices.raw);
 Vec128<TI, NI> ret;
 uint8_t* HWY_RESTRICT ret_bytes =
     reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
 for (size_t i = 0; i < NI * sizeof(TI); ++i) {
   const size_t idx = idx_bytes[i];
   // Avoid out of bounds reads.
   ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
 }
 return ret;
}
   2542 
   2543 template <typename T, size_t N, typename TI, size_t NI>
   2544 HWY_API Vec128<TI, NI> TableLookupBytesOr0(Vec128<T, N> v,
   2545                                           Vec128<TI, NI> indices) {
   2546  // Same as TableLookupBytes, which already returns 0 if out of bounds.
   2547  return TableLookupBytes(v, indices);
   2548 }
   2549 
   2550 // ------------------------------ InterleaveLower/InterleaveUpper
   2551 
   2552 template <typename T, size_t N>
   2553 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   2554  Vec128<T, N> ret;
   2555  for (size_t i = 0; i < N / 2; ++i) {
   2556    ret.raw[2 * i + 0] = a.raw[i];
   2557    ret.raw[2 * i + 1] = b.raw[i];
   2558  }
   2559  return ret;
   2560 }
   2561 
   2562 // Additional overload for the optional tag.
   2563 template <class D>
   2564 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
   2565  return InterleaveLower(a, b);
   2566 }
   2567 
   2568 template <class D>
   2569 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
   2570  const Half<decltype(d)> dh;
   2571  VFromD<D> ret;
   2572  for (size_t i = 0; i < MaxLanes(dh); ++i) {
   2573    ret.raw[2 * i + 0] = a.raw[MaxLanes(dh) + i];
   2574    ret.raw[2 * i + 1] = b.raw[MaxLanes(dh) + i];
   2575  }
   2576  return ret;
   2577 }
   2578 
   2579 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
   2580 
   2581 // Same as Interleave*, except that the return lanes are double-width integers;
   2582 // this is necessary because the single-lane scalar cannot return two values.
   2583 template <class V, class DW = RepartitionToWide<DFromV<V>>>
   2584 HWY_API VFromD<DW> ZipLower(V a, V b) {
   2585  return BitCast(DW(), InterleaveLower(a, b));
   2586 }
   2587 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   2588 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
   2589  return BitCast(dw, InterleaveLower(D(), a, b));
   2590 }
   2591 
   2592 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   2593 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
   2594  return BitCast(dw, InterleaveUpper(D(), a, b));
   2595 }
   2596 
   2597 // ================================================== MASK
   2598 
   2599 template <class D>
   2600 HWY_API bool AllFalse(D d, MFromD<D> mask) {
   2601  typename MFromD<D>::Raw or_sum = 0;
   2602  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2603    or_sum |= mask.bits[i];
   2604  }
   2605  return or_sum == 0;
   2606 }
   2607 
   2608 template <class D>
   2609 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   2610  constexpr uint64_t kAll = LimitsMax<typename MFromD<D>::Raw>();
   2611  uint64_t and_sum = kAll;
   2612  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2613    and_sum &= mask.bits[i];
   2614  }
   2615  return and_sum == kAll;
   2616 }
   2617 
// `bits` points to at least 8 readable bytes, not all of which need be valid.
   2619 template <class D>
   2620 HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   2621  MFromD<D> m;
   2622  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2623    const size_t bit = size_t{1} << (i & 7);
   2624    const size_t idx_byte = i >> 3;
   2625    m.bits[i] = MFromD<D>::FromBool((bits[idx_byte] & bit) != 0);
   2626  }
   2627  return m;
   2628 }
   2629 
   2630 template <class D>
   2631 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   2632  MFromD<D> m;
   2633  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2634    m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
   2635  }
   2636  return m;
   2637 }
   2638 
// `bits` points to at least 8 writable bytes.
   2640 template <class D>
   2641 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
   2642  bits[0] = 0;
   2643  if (MaxLanes(d) > 8) bits[1] = 0;  // MaxLanes(d) <= 16, so max two bytes
   2644  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2645    const size_t bit = size_t{1} << (i & 7);
   2646    const size_t idx_byte = i >> 3;
   2647    if (mask.bits[i]) {
   2648      bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
   2649    }
   2650  }
   2651  return MaxLanes(d) > 8 ? 2 : 1;
   2652 }
   2653 
   2654 template <class D>
   2655 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
   2656  size_t count = 0;
   2657  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2658    count += mask.bits[i] != 0;
   2659  }
   2660  return count;
   2661 }
   2662 
   2663 template <class D>
   2664 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
   2665  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2666    if (mask.bits[i] != 0) return i;
   2667  }
   2668  HWY_DASSERT(false);
   2669  return 0;
   2670 }
   2671 
   2672 template <class D>
   2673 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
   2674  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2675    if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
   2676  }
   2677  return intptr_t{-1};
   2678 }
   2679 
   2680 template <class D>
   2681 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
   2682  for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
   2683    if (mask.bits[i] != 0) return static_cast<size_t>(i);
   2684  }
   2685  HWY_DASSERT(false);
   2686  return 0;
   2687 }
   2688 
   2689 template <class D>
   2690 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
   2691  for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
   2692    if (mask.bits[i] != 0) return i;
   2693  }
   2694  return intptr_t{-1};
   2695 }
   2696 
   2697 // ------------------------------ Compress
   2698 
// Trait advertising whether Compress for lane type T is a partition, i.e.
// kept lanes are followed by the rejected lanes (as implemented below).
// Reported false for 8-bit lanes; presumably those are handled by
// generic_ops-inl.h -- confirm against that header.
template <typename T>
struct CompressIsPartition {
 enum { value = (sizeof(T) != 1) };
};
   2703 
   2704 template <typename T, size_t N>
   2705 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
   2706  size_t count = 0;
   2707  Vec128<T, N> ret;
   2708  for (size_t i = 0; i < N; ++i) {
   2709    if (mask.bits[i]) {
   2710      ret.raw[count++] = v.raw[i];
   2711    }
   2712  }
   2713  for (size_t i = 0; i < N; ++i) {
   2714    if (!mask.bits[i]) {
   2715      ret.raw[count++] = v.raw[i];
   2716    }
   2717  }
   2718  HWY_DASSERT(count == N);
   2719  return ret;
   2720 }
   2721 
   2722 // ------------------------------ Expand
   2723 
   2724 // Could also just allow generic_ops-inl.h to implement these, but use our
   2725 // simple implementation below to ensure the test is correct.
   2726 #ifdef HWY_NATIVE_EXPAND
   2727 #undef HWY_NATIVE_EXPAND
   2728 #else
   2729 #define HWY_NATIVE_EXPAND
   2730 #endif
   2731 
   2732 template <typename T, size_t N>
   2733 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
   2734  size_t in_pos = 0;
   2735  Vec128<T, N> ret;
   2736  for (size_t i = 0; i < N; ++i) {
   2737    if (mask.bits[i]) {
   2738      ret.raw[i] = v.raw[in_pos++];
   2739    } else {
   2740      ret.raw[i] = ConvertScalarTo<T>(0);
   2741    }
   2742  }
   2743  return ret;
   2744 }
   2745 
   2746 // ------------------------------ LoadExpand
   2747 
   2748 template <class D>
   2749 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
   2750                             const TFromD<D>* HWY_RESTRICT unaligned) {
   2751  size_t in_pos = 0;
   2752  VFromD<D> ret;
   2753  for (size_t i = 0; i < Lanes(d); ++i) {
   2754    if (mask.bits[i]) {
   2755      ret.raw[i] = unaligned[in_pos++];
   2756    } else {
   2757      ret.raw[i] = TFromD<D>();  // zero, also works for float16_t
   2758    }
   2759  }
   2760  return ret;
   2761 }
   2762 
   2763 // ------------------------------ CompressNot
   2764 template <typename T, size_t N>
   2765 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
   2766  size_t count = 0;
   2767  Vec128<T, N> ret;
   2768  for (size_t i = 0; i < N; ++i) {
   2769    if (!mask.bits[i]) {
   2770      ret.raw[count++] = v.raw[i];
   2771    }
   2772  }
   2773  for (size_t i = 0; i < N; ++i) {
   2774    if (mask.bits[i]) {
   2775      ret.raw[count++] = v.raw[i];
   2776    }
   2777  }
   2778  HWY_DASSERT(count == N);
   2779  return ret;
   2780 }
   2781 
   2782 // ------------------------------ CompressBlocksNot
   2783 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
   2784                                           Mask128<uint64_t> /* m */) {
   2785  return v;
   2786 }
   2787 
   2788 // ------------------------------ CompressBits
   2789 template <typename T, size_t N>
   2790 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
   2791                                  const uint8_t* HWY_RESTRICT bits) {
   2792  return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
   2793 }
   2794 
   2795 // ------------------------------ CompressStore
   2796 
   2797 // generic_ops-inl defines the 8-bit versions.
   2798 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   2799 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
   2800                             TFromD<D>* HWY_RESTRICT unaligned) {
   2801  size_t count = 0;
   2802  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2803    if (mask.bits[i]) {
   2804      unaligned[count++] = v.raw[i];
   2805    }
   2806  }
   2807  return count;
   2808 }
   2809 
   2810 // ------------------------------ CompressBlendedStore
   2811 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   2812 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> mask, D d,
   2813                                    TFromD<D>* HWY_RESTRICT unaligned) {
   2814  return CompressStore(v, mask, d, unaligned);
   2815 }
   2816 
   2817 // ------------------------------ CompressBitsStore
   2818 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   2819 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
   2820                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
   2821  const MFromD<D> mask = LoadMaskBits(d, bits);
   2822  StoreU(Compress(v, mask), d, unaligned);
   2823  return CountTrue(d, mask);
   2824 }
   2825 
   2826 // ------------------------------ Additional mask logical operations
   2827 template <class T>
   2828 HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
   2829  return mask;
   2830 }
   2831 
   2832 template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
   2833 HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
   2834  using TU = hwy::MakeUnsigned<T>;
   2835 
   2836  Mask128<T, N> result;
   2837  TU result_lane_mask{0};
   2838  for (size_t i = 0; i < N; i++) {
   2839    result_lane_mask = static_cast<TU>(result_lane_mask | mask.bits[i]);
   2840    result.bits[i] = result_lane_mask;
   2841  }
   2842  return result;
   2843 }
   2844 
   2845 template <class T, size_t N>
   2846 HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
   2847  return Not(SetAtOrAfterFirst(mask));
   2848 }
   2849 
// Returns a mask with only the first set lane of `mask` set (all lanes clear
// if `mask` is all-false).
template <class T, size_t N>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  using TU = hwy::MakeUnsigned<T>;
  using TI = hwy::MakeSigned<T>;

  Mask128<T, N> result;
  // All-ones until a set lane has been passed, all-zero afterwards.
  TU result_lane_mask = static_cast<TU>(~TU{0});
  for (size_t i = 0; i < N; i++) {
    const auto curr_lane_mask_bits = mask.bits[i];
    // Keep this lane only if no earlier lane was set.
    result.bits[i] = static_cast<TU>(curr_lane_mask_bits & result_lane_mask);
    // -(bits == 0) is all-ones while lanes are clear and 0 once one is set,
    // so the running mask turns off permanently after the first set lane.
    result_lane_mask =
        static_cast<TU>(result_lane_mask &
                        static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
  }
  return result;
}
   2866 
// Sets every lane at or before the first set lane of `mask` (all lanes set
// if `mask` is all-false).
template <class T, size_t N>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  using TU = hwy::MakeUnsigned<T>;
  using TI = hwy::MakeSigned<T>;

  Mask128<T, N> result;
  // All-ones until a set lane has been passed, all-zero afterwards. Note the
  // write happens BEFORE the update, so the first set lane itself is included.
  TU result_lane_mask = static_cast<TU>(~TU{0});
  for (size_t i = 0; i < N; i++) {
    result.bits[i] = result_lane_mask;
    // -(bits == 0) is all-ones while lanes are clear and 0 once one is set.
    result_lane_mask =
        static_cast<TU>(result_lane_mask &
                        static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
  }
  return result;
}
   2882 
   2883 // ------------------------------ WidenMulPairwiseAdd
   2884 
   2885 template <class DF, HWY_IF_F32_D(DF), class VBF>
   2886 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
   2887  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
   2888                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
   2889 }
   2890 
   2891 template <class D, HWY_IF_UI32_D(D), class V16>
   2892 HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
   2893  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
   2894                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
   2895 }
   2896 
   2897 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
   2898 
   2899 template <class D, HWY_IF_UI32_D(D), class V16>
   2900 HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
   2901                                            const VFromD<D> sum0,
   2902                                            VFromD<D>& sum1) {
   2903  sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
   2904  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
   2905 }
   2906 
   2907 // ------------------------------ RearrangeToOddPlusEven
   2908 template <class VW>
   2909 HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
   2910  return Add(sum0, sum1);
   2911 }
   2912 
   2913 // ================================================== REDUCTIONS
   2914 
   2915 #ifdef HWY_NATIVE_REDUCE_SCALAR
   2916 #undef HWY_NATIVE_REDUCE_SCALAR
   2917 #else
   2918 #define HWY_NATIVE_REDUCE_SCALAR
   2919 #endif
   2920 
   2921 template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
   2922 HWY_API T ReduceSum(D d, VFromD<D> v) {
   2923  T sum = T{0};
   2924  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2925    sum += v.raw[i];
   2926  }
   2927  return sum;
   2928 }
   2929 
   2930 template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
   2931 HWY_API T ReduceMin(D d, VFromD<D> v) {
   2932  T min = PositiveInfOrHighestValue<T>();
   2933  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2934    min = HWY_MIN(min, v.raw[i]);
   2935  }
   2936  return min;
   2937 }
   2938 template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
   2939 HWY_API T ReduceMax(D d, VFromD<D> v) {
   2940  T max = NegativeInfOrLowestValue<T>();
   2941  for (size_t i = 0; i < MaxLanes(d); ++i) {
   2942    max = HWY_MAX(max, v.raw[i]);
   2943  }
   2944  return max;
   2945 }
   2946 
   2947 // ------------------------------ SumOfLanes
   2948 
   2949 template <class D, HWY_IF_LANES_GT_D(D, 1)>
   2950 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
   2951  return Set(d, ReduceSum(d, v));
   2952 }
   2953 template <class D, HWY_IF_LANES_GT_D(D, 1)>
   2954 HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
   2955  return Set(d, ReduceMin(d, v));
   2956 }
   2957 template <class D, HWY_IF_LANES_GT_D(D, 1)>
   2958 HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
   2959  return Set(d, ReduceMax(d, v));
   2960 }
   2961 
   2962 // ================================================== OPS WITH DEPENDENCIES
   2963 
   2964 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
   2965 
   2966 template <class T, HWY_IF_UI64(T)>
   2967 HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
   2968  alignas(16) T mul[2];
   2969  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
   2970  return Load(Full128<T>(), mul);
   2971 }
   2972 
   2973 template <class T, HWY_IF_UI64(T)>
   2974 HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
   2975  alignas(16) T mul[2];
   2976  const Half<Full128<T>> d2;
   2977  mul[0] =
   2978      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
   2979  return Load(Full128<T>(), mul);
   2980 }
   2981 
   2982 // NOLINTNEXTLINE(google-readability-namespace-comments)
   2983 }  // namespace HWY_NAMESPACE
   2984 }  // namespace hwy
   2985 HWY_AFTER_NAMESPACE();