tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

scalar-inl.h (69788B)


      1 // Copyright 2019 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // Single-element vectors and operations.
     17 // External include guard in highway.h - see comment there.
     18 
     19 #include <stdint.h>
     20 #ifndef HWY_NO_LIBCXX
     21 #include <math.h>  // sqrtf
     22 #endif
     23 
     24 #include "hwy/ops/shared-inl.h"
     25 
     26 HWY_BEFORE_NAMESPACE();
     27 namespace hwy {
     28 namespace HWY_NAMESPACE {
     29 
// Single instruction, single data.
// Descriptor (tag) for the scalar target: exactly one lane of type T.
template <typename T>
using Sisd = Simd<T, 1, 0>;
     33 
// (Wrapper class required for overloading comparison operators.)
// Single-lane vector: wraps one value of type T.
template <typename T>
struct Vec1 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = 1;  // only for DFromV

  HWY_INLINE Vec1() = default;
  Vec1(const Vec1&) = default;
  Vec1& operator=(const Vec1&) = default;
  HWY_INLINE explicit Vec1(const T t) : raw(t) {}

  // Compound assignment operators forward to the corresponding free binary
  // operators defined later in this file.
  HWY_INLINE Vec1& operator*=(const Vec1 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec1& operator/=(const Vec1 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec1& operator+=(const Vec1 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec1& operator-=(const Vec1 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec1& operator%=(const Vec1 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec1& operator&=(const Vec1 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec1& operator|=(const Vec1 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec1& operator^=(const Vec1 other) {
    return *this = (*this ^ other);
  }

  // The single lane's value.
  T raw;
};
     72 
// 0 or FF..FF, same size as Vec1.
// Single-lane mask: all-zero bits (false) or all-one bits (true).
template <typename T>
struct Mask1 {
  using Raw = hwy::MakeUnsigned<T>;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = 1;  // only for DFromM

  // Converts a bool into the all-zeros/all-ones representation.
  static HWY_INLINE Mask1<T> FromBool(bool b) {
    Mask1<T> mask;
    mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
    return mask;
  }

  // All-zero or all-one bit pattern, same width as T.
  Raw bits;
};
     89 
// Descriptor corresponding to a vector type V.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

// Descriptor corresponding to a mask type M.
template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

// Lane type of a vector type V.
template <class V>
using TFromV = typename V::PrivateT;
     98 
     99 // ------------------------------ BitCast
    100 
    101 template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
    102 HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) {
    103  static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined");
    104  TTo to;
    105  CopyBytes<sizeof(TTo)>(&v.raw, &to);  // not same size - ok to shrink
    106  return Vec1<TTo>(to);
    107 }
    108 
// ------------------------------ Zero

// Returns a single-lane vector whose lane is zero.
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> Zero(D /* tag */) {
  return Vec1<T>(ConvertScalarTo<T>(0));
}

// Vector type corresponding to descriptor D (always Vec1<TFromD<D>> here).
template <class D>
using VFromD = decltype(Zero(D()));
    118 
    119 // ------------------------------ Set
    120 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
    121 HWY_API Vec1<T> Set(D /* tag */, const T2 t) {
    122  return Vec1<T>(static_cast<T>(t));
    123 }
    124 
// ------------------------------ Undefined
// Returns a vector with unspecified contents; the scalar target just
// returns zero, which is always a valid choice.
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> Undefined(D d) {
  return Zero(d);
}
    130 
// ------------------------------ Iota
// first, first+1, ... per lane; with a single lane this is just `first`.
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2>
HWY_API Vec1<T> Iota(const D /* tag */, const T2 first) {
  return Vec1<T>(static_cast<T>(first));
}
    136 
    137 // ------------------------------ ResizeBitCast
    138 
    139 template <class D, typename FromV>
    140 HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) {
    141  using TFrom = TFromV<FromV>;
    142  using TTo = TFromD<D>;
    143  constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo));
    144  TTo to{};
    145  CopyBytes<kCopyLen>(&v.raw, &to);
    146  return VFromD<D>(to);
    147 }
    148 
namespace detail {

// ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if
// sizeof(TFromD<DTo>) is greater than sizeof(TFromV<FromV>)
// The size tags are unused here because ResizeBitCast already zero-extends.
template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
                                               ToSizeTag /* to_size_tag */,
                                               DTo d_to, DFrom /*d_from*/,
                                               VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

}  // namespace detail
    162 
    163 // ------------------------------ Dup128VecFromValues
    164 
// Only lane 0 exists on the scalar target; the other values are ignored.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/,
                                      TFromD<D> /*t8*/, TFromD<D> /*t9*/,
                                      TFromD<D> /*t10*/, TFromD<D> /*t11*/,
                                      TFromD<D> /*t12*/, TFromD<D> /*t13*/,
                                      TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
  return VFromD<D>(t0);
}

// 16-bit lanes: returns lane 0 only.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
  return VFromD<D>(t0);
}

// 32-bit lanes: returns lane 0 only.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
  return VFromD<D>(t0);
}

// 64-bit lanes: returns lane 0 only.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/) {
  return VFromD<D>(t0);
}
    195 
    196 // ================================================== LOGICAL
    197 
    198 // ------------------------------ Not
    199 
    200 template <typename T>
    201 HWY_API Vec1<T> Not(const Vec1<T> v) {
    202  using TU = MakeUnsigned<T>;
    203  const Sisd<TU> du;
    204  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
    205 }
    206 
    207 // ------------------------------ And
    208 
    209 template <typename T>
    210 HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
    211  using TU = MakeUnsigned<T>;
    212  const Sisd<TU> du;
    213  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
    214 }
    215 template <typename T>
    216 HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
    217  return And(a, b);
    218 }
    219 
    220 // ------------------------------ AndNot
    221 
    222 template <typename T>
    223 HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
    224  using TU = MakeUnsigned<T>;
    225  const Sisd<TU> du;
    226  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
    227                                                     BitCast(du, b).raw)));
    228 }
    229 
    230 // ------------------------------ Or
    231 
    232 template <typename T>
    233 HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
    234  using TU = MakeUnsigned<T>;
    235  const Sisd<TU> du;
    236  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
    237 }
    238 template <typename T>
    239 HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
    240  return Or(a, b);
    241 }
    242 
    243 // ------------------------------ Xor
    244 
    245 template <typename T>
    246 HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
    247  using TU = MakeUnsigned<T>;
    248  const Sisd<TU> du;
    249  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
    250 }
    251 template <typename T>
    252 HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
    253  return Xor(a, b);
    254 }
    255 
    256 // ------------------------------ Xor3
    257 
    258 template <typename T>
    259 HWY_API Vec1<T> Xor3(Vec1<T> x1, Vec1<T> x2, Vec1<T> x3) {
    260  return Xor(x1, Xor(x2, x3));
    261 }
    262 
    263 // ------------------------------ Or3
    264 
    265 template <typename T>
    266 HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) {
    267  return Or(o1, Or(o2, o3));
    268 }
    269 
    270 // ------------------------------ OrAnd
    271 
    272 template <typename T>
    273 HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
    274  return Or(o, And(a1, a2));
    275 }
    276 
    277 // ------------------------------ Mask
    278 
    279 template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
    280 HWY_API Mask1<TTo> RebindMask(DTo /*tag*/, Mask1<TFrom> m) {
    281  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
    282  return Mask1<TTo>{m.bits};
    283 }
    284 
// v must be 0 or FF..FF.
// Reinterprets an all-zeros/all-ones vector as a mask (bit-identical copy).
template <typename T>
HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
  Mask1<T> mask;
  CopySameSize(&v, &mask);
  return mask;
}

// Mask type corresponding to descriptor D.
template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));
    295 
// Returns a vector whose bits equal the mask's (0 or FF..FF).
template <class D, typename T = TFromD<D>>
Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) {
  Vec1<T> v;
  CopySameSize(&mask, &v);
  return v;
}
    302 
    303 template <class D>
    304 uint64_t BitsFromMask(D, MFromD<D> mask) {
    305  return mask.bits ? 1 : 0;
    306 }
    307 
    308 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
    309 HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) {
    310  return Mask1<T>::FromBool(n != 0);
    311 }
    312 
// Advertise a native SetMask so generic_ops-inl.h does not emulate it.
#ifdef HWY_NATIVE_SET_MASK
#undef HWY_NATIVE_SET_MASK
#else
#define HWY_NATIVE_SET_MASK
#endif

// Returns a mask that is all-true or all-false according to `val`.
template <class D>
HWY_API MFromD<D> SetMask(D /*d*/, bool val) {
  return MFromD<D>::FromBool(val);
}
    323 
    324 // ------------------------------ IfVecThenElse
    325 template <typename T>
    326 HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
    327  return IfThenElse(MaskFromVec(mask), yes, no);
    328 }
    329 
// ------------------------------ CopySign
// Returns a value with the magnitude of `magn` and the sign bit of `sign`.
template <typename T>
HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  // Select the sign bit from `sign`, all other bits from `magn`.
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}
    337 
// ------------------------------ CopySignToAbs
// Like CopySign, but `abs` must already be non-negative (sign bit clear),
// so OR-ing in the sign bit suffices.
template <typename T>
HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const Sisd<T> d;
  return OrAnd(abs, SignBit(d), sign);
}
    345 
// ------------------------------ BroadcastSignBit
// Arithmetic shift right by (bit width - 1): yields 0 for non-negative
// inputs and all-ones for negative inputs.
template <typename T>
HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
  return Vec1<T>(ScalarShr(v.raw, sizeof(T) * 8 - 1));
}
    351 
    352 // ------------------------------ PopulationCount
    353 
// Advertise a native PopulationCount so generic_ops-inl.h does not emulate it.
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// Number of set bits in the lane.
template <typename T>
HWY_API Vec1<T> PopulationCount(Vec1<T> v) {
  return Vec1<T>(static_cast<T>(PopCount(v.raw)));
}
    364 
    365 // ------------------------------ IfThenElse
    366 
// Returns mask ? yes : no.
template <typename T>
HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
                           const Vec1<T> no) {
  return mask.bits ? yes : no;
}

// Returns mask ? yes : 0.
template <typename T>
HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
  return mask.bits ? yes : Vec1<T>(ConvertScalarTo<T>(0));
}

// Returns mask ? 0 : no.
template <typename T>
HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
  return mask.bits ? Vec1<T>(ConvertScalarTo<T>(0)) : no;
}

// Returns (v < 0) ? yes : no, testing the sign in the signed domain so the
// comparison is well-defined for unsigned and floating-point T as well.
template <typename T>
HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const auto vi = BitCast(di, v);

  return vi.raw < 0 ? yes : no;
}
    392 
    393 // ------------------------------ Mask logical
    394 
// Mask logical operators, implemented by round-tripping through the
// equivalent vector bit operations.
template <typename T>
HWY_API Mask1<T> Not(const Mask1<T> m) {
  return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
}

template <typename T>
HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// True iff neither a nor b is true: (~a) & (~b).
template <typename T>
HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
  const Sisd<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
    429 
// With a single lane, "at or after the first true lane" is the mask itself.
template <class T>
HWY_API Mask1<T> SetAtOrAfterFirst(Mask1<T> mask) {
  return mask;
}

// Lanes strictly before the first true lane: with one lane, the negation.
template <class T>
HWY_API Mask1<T> SetBeforeFirst(Mask1<T> mask) {
  return Not(mask);
}

// Only the first true lane: with one lane, the mask itself.
template <class T>
HWY_API Mask1<T> SetOnlyFirst(Mask1<T> mask) {
  return mask;
}

// Lanes at or before the first true lane: with one lane, always true.
template <class T>
HWY_API Mask1<T> SetAtOrBeforeFirst(Mask1<T> /*mask*/) {
  return Mask1<T>::FromBool(true);
}
    449 
    450 // ------------------------------ LowerHalfOfMask
    451 
// Advertise a native LowerHalfOfMask so generic_ops-inl.h does not emulate it.
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif

// With a single lane, the lower half is the mask itself.
template <class D>
HWY_API MFromD<D> LowerHalfOfMask(D /*d*/, MFromD<D> m) {
  return m;
}
    462 
    463 // ================================================== SHIFTS
    464 
    465 // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
    466 
// Left shift by a compile-time count; performed on the unsigned
// representation to avoid undefined behavior for negative signed values.
template <int kBits, typename T>
HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return Vec1<T>(
      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
}

// Right shift by a compile-time count; ScalarShr is arithmetic for signed T
// and logical for unsigned T.
template <int kBits, typename T>
HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return Vec1<T>(ScalarShr(v.raw, kBits));
}
    479 
    480 // ------------------------------ RotateRight (ShiftRight)
// Rotates right by kBits: (v >> kBits) | (v << (width - kBits)).
template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // The right shift is done in the unsigned domain (logical shift). The
  // HWY_MIN clamp keeps the ShiftLeft template argument valid even when this
  // overload is instantiated with kBits == 0 (unreachable at runtime).
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
    493 
    494 // ------------------------------ ShiftLeftSame (BroadcastSignBit)
    495 
// Left shift by a runtime count, in the unsigned domain to avoid UB.
template <typename T>
HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
  return Vec1<T>(
      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
}

// Right shift by a runtime count; arithmetic for signed T via ScalarShr.
template <typename T>
HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
  return Vec1<T>(ScalarShr(v.raw, bits));
}
    506 
    507 // ------------------------------ Shl
    508 
// Single-lane => same as ShiftLeftSame except for the argument type.
template <typename T>
HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
  return ShiftLeftSame(v, static_cast<int>(bits.raw));
}

// Per-lane variable right shift; forwards to ShiftRightSame.
template <typename T>
HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
  return ShiftRightSame(v, static_cast<int>(bits.raw));
}
    519 
    520 // ================================================== ARITHMETIC
    521 
// Integer add with wraparound semantics: computed in uint64_t so signed
// overflow is well-defined, then truncated back to T's width.
template <typename T>
HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
}
HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw + b.raw);
}
HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw + b.raw);
}

// Integer subtract with wraparound, same uint64_t technique as operator+.
template <typename T>
HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
}
HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw - b.raw);
}
HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw - b.raw);
}
    547 
    548 // ------------------------------ SumsOf8
    549 
// Sum of groups of eight 8-bit lanes; with a single lane, the "sum" is just
// that lane widened to 64 bits.
HWY_API Vec1<int64_t> SumsOf8(const Vec1<int8_t> v) {
  return Vec1<int64_t>(v.raw);
}
HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
  return Vec1<uint64_t>(v.raw);
}
    556 
    557 // ------------------------------ SumsOf2
    558 
// Sum of adjacent lane pairs; with a single lane, just promote it to the
// double-width type.
template <class T>
HWY_API Vec1<MakeWide<T>> SumsOf2(const Vec1<T> v) {
  const DFromV<decltype(v)> d;
  const Rebind<MakeWide<T>, decltype(d)> dw;
  return PromoteTo(dw, v);
}
    565 
    566 // ------------------------------ SaturatedAdd
    567 
    568 // Returns a + b clamped to the destination range.
    569 
// Unsigned
// 8-bit: integer promotion makes a.raw + b.raw exact (no wraparound), so
// clamping to [0, 255] yields the saturated result.
HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
                                   const Vec1<uint8_t> b) {
  return Vec1<uint8_t>(
      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
}
// 16-bit: sum in int32_t so the intermediate cannot overflow.
HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
                                    const Vec1<uint16_t> b) {
  return Vec1<uint16_t>(static_cast<uint16_t>(
      HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) + b.raw), 65535)));
}

// Signed
HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) {
  return Vec1<int8_t>(
      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
}
HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
                                   const Vec1<int16_t> b) {
  return Vec1<int16_t>(static_cast<int16_t>(
      HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) + b.raw), 32767)));
}
    592 
    593 // ------------------------------ Saturating subtraction
    594 
    595 // Returns a - b clamped to the destination range.
    596 
// Unsigned
// 8-bit: integer promotion makes a.raw - b.raw exact; clamping to [0, 255]
// yields the saturated result (negative differences clamp to 0).
HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
                                   const Vec1<uint8_t> b) {
  return Vec1<uint8_t>(
      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
}
// 16-bit: difference in int32_t so the intermediate cannot wrap.
HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
                                    const Vec1<uint16_t> b) {
  return Vec1<uint16_t>(static_cast<uint16_t>(
      HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) - b.raw), 65535)));
}

// Signed
HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) {
  return Vec1<int8_t>(
      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
}
HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
                                   const Vec1<int16_t> b) {
  return Vec1<int16_t>(static_cast<int16_t>(
      HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) - b.raw), 32767)));
}
    619 
    620 // ------------------------------ Average
    621 
    622 // Returns (a + b + 1) / 2
    623 
// Advertise native rounded averages for 32/64-bit lanes.
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI64
#endif

// Overflow-free identity: (a + b + 1) / 2 == (a | b) - ((a ^ b) >> 1).
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> AverageRound(const Vec1<T> a, const Vec1<T> b) {
  const T a_val = a.raw;
  const T b_val = b.raw;
  return Vec1<T>(static_cast<T>((a_val | b_val) - ScalarShr(a_val ^ b_val, 1)));
}
    642 
    643 // ------------------------------ Absolute value
    644 
    645 template <typename T>
    646 HWY_API Vec1<T> Abs(const Vec1<T> a) {
    647  return Vec1<T>(ScalarAbs(a.raw));
    648 }
    649 
    650 // ------------------------------ Min/Max
    651 
    652 // <cmath> may be unavailable, so implement our own.
    653 
// Non-float: plain minimum.
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

// Float: if exactly one operand is NaN, return the other operand.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  if (ScalarIsNaN(a.raw)) return b;
  if (ScalarIsNaN(b.raw)) return a;
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

// Non-float: plain maximum.
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}

// Float: if exactly one operand is NaN, return the other operand.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  if (ScalarIsNaN(a.raw)) return b;
  if (ScalarIsNaN(b.raw)) return a;
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}
    677 
    678 // ------------------------------ Floating-point negate
    679 
// Float/special: negate by flipping the sign bit (also handles NaN payloads).
template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)>
HWY_API Vec1<T> Neg(const Vec1<T> v) {
  return Xor(v, SignBit(Sisd<T>()));
}

// Integer: 0 - v, using the wraparound subtraction defined above.
template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> Neg(const Vec1<T> v) {
  return Zero(Sisd<T>()) - v;
}
    689 
    690 // ------------------------------ mul/div
    691 
// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

// Float multiply; computed in double precision, then narrowed back to T.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(static_cast<T>(double{a.raw} * b.raw));
}

// Integer multiply with wraparound: computed in uint64_t so signed overflow
// is well-defined, then truncated to T.
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) *
                                static_cast<uint64_t>(b.raw)));
}

// Float divide (no integer overload here).
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(a.raw / b.raw);
}
    719 
    720 // Returns the upper sizeof(T)*8 bits of a * b in each lane.
// Returns the upper sizeof(T)*8 bits of a * b in each lane.
// 8/16/32-bit: widen, multiply exactly, then take the high half.
template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
  using TW = MakeWide<T>;
  return Vec1<T>(static_cast<T>(
      (static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8)));
}
// 64-bit: no wider type available, so use the 128-bit multiply helper.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) {
  T hi;
  Mul128(a.raw, b.raw, &hi);
  return Vec1<T>(hi);
}
    734 
// Q15 fixed-point multiply with rounding: (a * b + 2^14) >> 15.
HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw + 16384) >> 15));
}
    738 
// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
// With a single lane, this is simply the widened product of lane 0.
template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<MakeWide<T>> MulEven(const Vec1<T> a, const Vec1<T> b) {
  using TW = MakeWide<T>;
  const TW a_wide = a.raw;  // widen first so the multiply cannot overflow
  return Vec1<TW>(static_cast<TW>(a_wide * b.raw));
}
    747 
// Deliberately unimplemented: a single-lane vector has no odd lanes, so any
// instantiation triggers the (always-false) static_assert.
template <class T>
HWY_API Vec1<MakeWide<T>> MulOdd(const Vec1<T>, const Vec1<T>) {
  static_assert(sizeof(T) == 0, "There are no odd lanes");
}
    752 
// Approximate reciprocal
HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
  // Zero inputs are allowed, but callers are responsible for replacing the
  // return value with something else (typically using IfThenElse). This check
  // avoids a ubsan error. The return value is arbitrary.
  if (v.raw == 0.0f) return Vec1<float>(0.0f);
  // The scalar target computes the exact reciprocal rather than an estimate.
  return Vec1<float>(1.0f / v.raw);
}
    761 
    762 // generic_ops takes care of integer T.
    763 template <typename T, HWY_IF_FLOAT(T)>
    764 HWY_API Vec1<T> AbsDiff(const Vec1<T> a, const Vec1<T> b) {
    765  return Abs(a - b);
    766 }
    767 
    768 // ------------------------------ Floating-point multiply-add variants
    769 
// Returns mul * x + add (not fused; the scalar target uses two operations).
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
  return mul * x + add;
}

// Returns add - mul * x.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
                          const Vec1<T> add) {
  return add - mul * x;
}

// Returns mul * x - sub.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
  return mul * x - sub;
}

// Returns -(mul * x) - sub.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
                          const Vec1<T> sub) {
  return Neg(mul) * x - sub;
}
    791 
    792 // ------------------------------ Floating-point square root
    793 
// Approximate reciprocal square root
// Classic "fast inverse square root": bit-level initial guess (magic
// constant 0x5F3759DF) refined by one Newton-Raphson step.
HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
  float f = v.raw;
  const float half = f * 0.5f;
  uint32_t bits;
  CopySameSize(&f, &bits);
  // Initial guess based on log2(f)
  bits = 0x5F3759DF - (bits >> 1);
  CopySameSize(&bits, &f);
  // One Newton-Raphson iteration
  return Vec1<float>(f * (1.5f - (half * f * f)));
}
    806 
// Square root
// Without libc, uses the compiler builtin when available, else a coarse
// bit-manipulation estimate (halving the exponent).
HWY_API Vec1<float> Sqrt(Vec1<float> v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return Vec1<float>(__builtin_sqrt(v.raw));
#else
  uint32_t bits;
  CopyBytes<sizeof(bits)>(&v, &bits);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1 << 29) + (bits >> 1) - (1 << 22);
  CopyBytes<sizeof(bits)>(&bits, &v);
  return v;
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return Vec1<float>(sqrtf(v.raw));
#endif  // !HWY_NO_LIBCXX
}
HWY_API Vec1<double> Sqrt(Vec1<double> v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return Vec1<double>(__builtin_sqrt(v.raw));
#else
  uint64_t bits;
  CopyBytes<sizeof(bits)>(&v, &bits);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
  CopyBytes<sizeof(bits)>(&bits, &v);
  return v;
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return Vec1<double>(sqrt(v.raw));
#endif  // HWY_NO_LIBCXX
}
    840 
    841 // ------------------------------ Floating-point rounding
    842 
// Rounds to nearest integer value, ties to even.
template <typename T>
HWY_API Vec1<T> Round(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  if (!(Abs(v).raw < MantissaEnd<T>())) {  // Huge or NaN
    // Values >= 2^mantissa-bits are already integral; NaN passes through.
    return v;
  }
  const T k0 = ConvertScalarTo<T>(0);
  // Bias away from zero, then truncate: rounds halves away from zero...
  const T bias = ConvertScalarTo<T>(v.raw < k0 ? -0.5 : 0.5);
  const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
  // Preserve the sign of +/-0 and of inputs that round to zero.
  if (rounded == 0) return CopySignToAbs(Vec1<T>(k0), v);
  TI offset = 0;
  // Round to even
  // ...then correct exact-half cases so ties go to the even neighbor.
  if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
                           ConvertScalarTo<T>(0.5)) {
    offset = v.raw < k0 ? -1 : 1;
  }
  return Vec1<T>(ConvertScalarTo<T>(rounded - offset));
}
    861 
// Round-to-nearest even, returning a signed integer vector. Out-of-range
// inputs (including NaN) saturate to LimitsMin/LimitsMax.
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec1<MakeSigned<T>> NearestInt(const Vec1<T> v) {
  using TI = MakeSigned<T>;

  const T abs = Abs(v).raw;
  const bool is_sign = ScalarSignBit(v.raw);

  if (!(abs < MantissaEnd<T>())) {  // Huge or NaN
    // Check if too large to cast or NaN
    if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
      // Saturate; a set sign bit (incl. negative NaN) selects LimitsMin.
      return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
    }
    // Already integral and in range: cast directly.
    return Vec1<TI>(ConvertScalarTo<TI>(v.raw));
  }
  // Round half away from zero via a +-0.5 bias, then truncate.
  const T bias =
      ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
  const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
  if (rounded == 0) return Vec1<TI>(0);
  TI offset = 0;
  // Round to even: an odd result produced by an exact .5 tie steps back
  // toward zero.
  if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
                           ConvertScalarTo<T>(0.5)) {
    offset = is_sign ? -1 : 1;
  }
  return Vec1<TI>(rounded - offset);
}
    889 
// Round-to-nearest even, demoting double to int32_t. Out-of-range inputs
// (including NaN) saturate to LimitsMin/LimitsMax of int32_t.
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 /*di32*/, const Vec1<double> v) {
  using T = double;
  using TI = int32_t;

  const T abs = Abs(v).raw;
  const bool is_sign = ScalarSignBit(v.raw);

  // Check if too large to cast or NaN
  if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) {
    // Saturate; a set sign bit (incl. negative NaN) selects LimitsMin.
    return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
  }

  // Round half away from zero via a +-0.5 bias, then truncate.
  const T bias =
      ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5);
  const TI rounded = ConvertScalarTo<TI>(v.raw + bias);
  if (rounded == 0) return Vec1<TI>(0);
  TI offset = 0;
  // Round to even: an odd result produced by an exact .5 tie steps back
  // toward zero.
  if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) ==
                           ConvertScalarTo<T>(0.5)) {
    offset = is_sign ? -1 : 1;
  }
  return Vec1<TI>(rounded - offset);
}
    916 
// Round toward zero; result stays a float vector.
template <typename T>
HWY_API Vec1<T> Trunc(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  // Values above 2^mantissa_bits are already integral; the negated
  // comparison also returns the input unchanged for NaN.
  if (!(Abs(v).raw <= MantissaEnd<T>())) {  // Huge or NaN
    return v;
  }
  const TI truncated = ConvertScalarTo<TI>(v.raw);
  // Preserve the sign of +-0.
  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
  return Vec1<T>(ConvertScalarTo<T>(truncated));
}
    927 
// Bit-level ceiling for IEEE-754 floats; shared by the Ceil overloads below.
// `Bits` is the same-sized unsigned integer type.
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Ceiling(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool positive = f > Float(0.0);

  Bits bits;
  CopySameSize(&v, &bits);

  // Unbiased exponent = number of integral bits in the mantissa.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| < 1: ceiling is 1 for positive inputs, -0 otherwise.
  if (exponent < 0) return positive ? V(1) : V(-0.0);

  // Mask covering the fractional mantissa bits for this exponent.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round up
  if (positive) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopySameSize(&bits, &f);
  return V(f);
}
    959 
// Bit-level floor for IEEE-754 floats; shared by the Floor overloads below.
// `Bits` is the same-sized unsigned integer type.
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Floor(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool negative = f < Float(0.0);

  Bits bits;
  CopySameSize(&v, &bits);

  // Unbiased exponent = number of integral bits in the mantissa.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| < 1: floor is -1 for negative inputs, 0 otherwise.
  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));

  // Mask covering the fractional mantissa bits for this exponent.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round down
  if (negative) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopySameSize(&bits, &f);
  return V(f);
}
    991 
    992 // Toward +infinity, aka ceiling
// binary32: 23 mantissa bits, 8 exponent bits.
HWY_API Vec1<float> Ceil(const Vec1<float> v) {
  return Ceiling<float, uint32_t, 23, 8>(v);
}
// binary64: 52 mantissa bits, 11 exponent bits.
HWY_API Vec1<double> Ceil(const Vec1<double> v) {
  return Ceiling<double, uint64_t, 52, 11>(v);
}
    999 
   1000 // Toward -infinity, aka floor
// binary32: 23 mantissa bits, 8 exponent bits.
HWY_API Vec1<float> Floor(const Vec1<float> v) {
  return Floor<float, uint32_t, 23, 8>(v);
}
// binary64: 52 mantissa bits, 11 exponent bits.
HWY_API Vec1<double> Floor(const Vec1<double> v) {
  return Floor<double, uint64_t, 52, 11>(v);
}
   1007 
   1008 // ================================================== COMPARE
   1009 
   1010 template <typename T>
   1011 HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
   1012  return Mask1<T>::FromBool(a.raw == b.raw);
   1013 }
   1014 
   1015 template <typename T>
   1016 HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) {
   1017  return Mask1<T>::FromBool(a.raw != b.raw);
   1018 }
   1019 
   1020 template <typename T>
   1021 HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
   1022  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
   1023  return (v & bit) == bit;
   1024 }
   1025 
   1026 template <typename T>
   1027 HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
   1028  return Mask1<T>::FromBool(a.raw < b.raw);
   1029 }
   1030 template <typename T>
   1031 HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
   1032  return Mask1<T>::FromBool(a.raw > b.raw);
   1033 }
   1034 
   1035 template <typename T>
   1036 HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
   1037  return Mask1<T>::FromBool(a.raw <= b.raw);
   1038 }
   1039 template <typename T>
   1040 HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
   1041  return Mask1<T>::FromBool(a.raw >= b.raw);
   1042 }
   1043 
   1044 // ------------------------------ Floating-point classification (==)
   1045 
   1046 template <typename T>
   1047 HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
   1048  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
   1049  return Mask1<T>::FromBool(ScalarIsNaN(v.raw));
   1050 }
   1051 
   1052 // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
   1053 #ifdef HWY_NATIVE_ISINF
   1054 #undef HWY_NATIVE_ISINF
   1055 #else
   1056 #define HWY_NATIVE_ISINF
   1057 #endif
   1058 
   1059 HWY_API Mask1<float> IsInf(const Vec1<float> v) {
   1060  const Sisd<float> d;
   1061  const RebindToUnsigned<decltype(d)> du;
   1062  const Vec1<uint32_t> vu = BitCast(du, v);
   1063  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
   1064  return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
   1065 }
   1066 HWY_API Mask1<double> IsInf(const Vec1<double> v) {
   1067  const Sisd<double> d;
   1068  const RebindToUnsigned<decltype(d)> du;
   1069  const Vec1<uint64_t> vu = BitCast(du, v);
   1070  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
   1071  return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
   1072 }
   1073 
   1074 HWY_API Mask1<float> IsFinite(const Vec1<float> v) {
   1075  const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
   1076  // Shift left to clear the sign bit, check whether exponent != max value.
   1077  return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
   1078 }
   1079 HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
   1080  const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
   1081  // Shift left to clear the sign bit, check whether exponent != max value.
   1082  return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
   1083 }
   1084 
   1085 // ================================================== MEMORY
   1086 
   1087 // ------------------------------ Load
   1088 
   1089 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
   1090 HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
   1091  T t;
   1092  CopySameSize(aligned, &t);
   1093  return Vec1<T>(t);
   1094 }
   1095 
   1096 template <class D, typename T = TFromD<D>>
   1097 HWY_API Vec1<T> MaskedLoad(Mask1<T> m, D d, const T* HWY_RESTRICT aligned) {
   1098  return IfThenElseZero(m, Load(d, aligned));
   1099 }
   1100 
   1101 template <class D, typename T = TFromD<D>>
   1102 HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d,
   1103                             const T* HWY_RESTRICT aligned) {
   1104  return IfThenElse(m, Load(d, aligned), v);
   1105 }
   1106 
// Unaligned load; for a single lane this is identical to Load.
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}
   1111 
// In some use cases, "load single lane" is sufficient; otherwise avoid this.
// (A real 128-bit broadcast is impossible with one lane.)
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> LoadDup128(D d, const T* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}
   1117 
   1118 #ifdef HWY_NATIVE_LOAD_N
   1119 #undef HWY_NATIVE_LOAD_N
   1120 #else
   1121 #define HWY_NATIVE_LOAD_N
   1122 #endif
   1123 
   1124 template <class D, typename T = TFromD<D>>
   1125 HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
   1126                        size_t max_lanes_to_load) {
   1127  return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d);
   1128 }
   1129 
   1130 template <class D, typename T = TFromD<D>>
   1131 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
   1132                          size_t max_lanes_to_load) {
   1133  return (max_lanes_to_load > 0) ? Load(d, p) : no;
   1134 }
   1135 
   1136 // ------------------------------ Store
   1137 
// Stores the single lane to `aligned` via a byte copy (strict-aliasing safe).
template <class D, typename T = TFromD<D>>
HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
  CopySameSize(&v.raw, aligned);
}
   1142 
// Unaligned store; for a single lane this is identical to Store.
template <class D, typename T = TFromD<D>>
HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
  return Store(v, d, p);
}
   1147 
   1148 template <class D, typename T = TFromD<D>>
   1149 HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, D d, T* HWY_RESTRICT p) {
   1150  if (!m.bits) return;
   1151  StoreU(v, d, p);
   1152 }
   1153 
   1154 #ifdef HWY_NATIVE_STORE_N
   1155 #undef HWY_NATIVE_STORE_N
   1156 #else
   1157 #define HWY_NATIVE_STORE_N
   1158 #endif
   1159 
   1160 template <class D, typename T = TFromD<D>>
   1161 HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
   1162                    size_t max_lanes_to_store) {
   1163  if (max_lanes_to_store > 0) {
   1164    Store(v, d, p);
   1165  }
   1166 }
   1167 
   1168 // ------------------------------ Tuples
   1169 #include "hwy/ops/inside-inl.h"
   1170 
   1171 // ------------------------------ LoadInterleaved2/3/4
   1172 
   1173 // Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
   1174 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   1175 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   1176 #else
   1177 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
   1178 #endif
   1179 
// De-interleaves two single-lane vectors from consecutive elements.
template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
                              Vec1<T>& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}
   1186 
// De-interleaves three single-lane vectors from consecutive elements.
template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
                              Vec1<T>& v1, Vec1<T>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}
   1194 
// De-interleaves four single-lane vectors from consecutive elements.
template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
                              Vec1<T>& v1, Vec1<T>& v2, Vec1<T>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}
   1203 
   1204 // ------------------------------ StoreInterleaved2/3/4
   1205 
// Interleaves two single-lane vectors into consecutive elements.
template <class D, typename T = TFromD<D>>
HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, D d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
}
   1212 
// Interleaves three single-lane vectors into consecutive elements.
template <class D, typename T = TFromD<D>>
HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, D d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}
   1221 
// Interleaves four single-lane vectors into consecutive elements.
template <class D, typename T = TFromD<D>>
HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, const Vec1<T> v3, D d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
  StoreU(v3, d, unaligned + 3);
}
   1231 
   1232 // ------------------------------ Stream
   1233 
// Non-temporal store; no cache-bypass hint exists for scalars, so just Store.
template <class D, typename T = TFromD<D>>
HWY_API void Stream(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
  return Store(v, d, aligned);
}
   1238 
   1239 // ------------------------------ Scatter
   1240 
   1241 #ifdef HWY_NATIVE_SCATTER
   1242 #undef HWY_NATIVE_SCATTER
   1243 #else
   1244 #define HWY_NATIVE_SCATTER
   1245 #endif
   1246 
// Stores the lane at base plus a BYTE offset (hence the integer address
// arithmetic rather than pointer arithmetic on T*).
template <class D, typename T = TFromD<D>, typename TI>
HWY_API void ScatterOffset(Vec1<T> v, D d, T* base, Vec1<TI> offset) {
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  Store(v, d, reinterpret_cast<T*>(addr));
}
   1254 
   1255 template <class D, typename T = TFromD<D>, typename TI>
   1256 HWY_API void ScatterIndex(Vec1<T> v, D d, T* HWY_RESTRICT base,
   1257                          Vec1<TI> index) {
   1258  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
   1259  Store(v, d, base + index.raw);
   1260 }
   1261 
   1262 template <class D, typename T = TFromD<D>, typename TI>
   1263 HWY_API void MaskedScatterIndex(Vec1<T> v, Mask1<T> m, D d,
   1264                                T* HWY_RESTRICT base, Vec1<TI> index) {
   1265  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
   1266  if (m.bits) Store(v, d, base + index.raw);
   1267 }
   1268 
   1269 // ------------------------------ Gather
   1270 
   1271 #ifdef HWY_NATIVE_GATHER
   1272 #undef HWY_NATIVE_GATHER
   1273 #else
   1274 #define HWY_NATIVE_GATHER
   1275 #endif
   1276 
// Loads the lane from base plus a BYTE offset (hence the integer address
// arithmetic rather than pointer arithmetic on T*).
template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<MakeSigned<T>> offset) {
  HWY_DASSERT(offset.raw >= 0);
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  return Load(d, reinterpret_cast<const T*>(addr));
}
   1284 
   1285 template <class D, typename T = TFromD<D>>
   1286 HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base,
   1287                            Vec1<MakeSigned<T>> index) {
   1288  HWY_DASSERT(index.raw >= 0);
   1289  return Load(d, base + index.raw);
   1290 }
   1291 
   1292 template <class D, typename T = TFromD<D>>
   1293 HWY_API Vec1<T> MaskedGatherIndex(Mask1<T> m, D d, const T* HWY_RESTRICT base,
   1294                                  Vec1<MakeSigned<T>> index) {
   1295  HWY_DASSERT(index.raw >= 0);
   1296  return MaskedLoad(m, d, base + index.raw);
   1297 }
   1298 
   1299 template <class D, typename T = TFromD<D>>
   1300 HWY_API Vec1<T> MaskedGatherIndexOr(Vec1<T> no, Mask1<T> m, D d,
   1301                                    const T* HWY_RESTRICT base,
   1302                                    Vec1<MakeSigned<T>> index) {
   1303  HWY_DASSERT(index.raw >= 0);
   1304  return MaskedLoadOr(no, m, d, base + index.raw);
   1305 }
   1306 
   1307 // ================================================== CONVERT
   1308 
   1309 // ConvertTo and DemoteTo with floating-point input and integer output truncate
   1310 // (rounding toward zero).
   1311 
   1312 namespace detail {
   1313 
// Converts float `val` to integer ToT, saturating out-of-range values (and
// NaN with sign bit set) to LimitsMin/LimitsMax rather than invoking UB.
template <class ToT, class FromT>
HWY_INLINE ToT CastValueForF2IConv(FromT val) {
  // Prevent ubsan errors when converting float to narrower integer

  using FromTU = MakeUnsigned<FromT>;
  using ToTU = MakeUnsigned<ToT>;

  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
      kMaxExpField));

  // If ToT is signed, compare only the exponent bits of val against
  // kMinOutOfRangeExpField.
  //
  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
  // val against kMinOutOfRangeExpField as a negative value is outside of the
  // range of an unsigned integer type.
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

  // val is within the range of ToT if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
  // than kMinOutOfRangeExpField
  //
  // Otherwise, val is either outside of the range of ToT or equal to
  // LimitsMin<ToT>() if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
  // than or equal to kMinOutOfRangeExpField.

  // In-range: plain cast. Out of range: LimitsMax, or (via the +1 wraparound
  // when the sign bit is set) LimitsMin.
  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
                                static_cast<ToTU>(ScalarSignBit(val)));
}
   1352 
// Generic promotion (float->wider float, int->wider int): always exact.
template <class ToT, class ToTypeTag, class FromT>
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
  return ConvertScalarTo<ToT>(val);
}
   1357 
// float -> wider signed int: route through the saturating conversion.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}
   1363 
// float -> wider unsigned int: route through the saturating conversion.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}
   1369 
// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
// returns static_cast<ToT>(val)
//
// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
// implementation-defined result if val is not within the range of ToT.
// (Same range check as CastValueForF2IConv; differs only in the out-of-range
// result, which is LimitsMin here.)
template <class ToT, class FromT>
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
  // Prevent ubsan errors when converting float to narrower integer

  using FromTU = MakeUnsigned<FromT>;

  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
      kMaxExpField));

  // If ToT is signed, compare only the exponent bits of val against
  // kMinOutOfRangeExpField.
  //
  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
  // val against kMinOutOfRangeExpField as a negative value is outside of the
  // range of an unsigned integer type.
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

  // val is within the range of ToT if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
  // than kMinOutOfRangeExpField
  //
  // Otherwise, val is either outside of the range of ToT or equal to
  // LimitsMin<ToT>() if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
  // than or equal to kMinOutOfRangeExpField.

  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             : static_cast<ToT>(LimitsMin<ToT>());
}
   1411 
   1412 }  // namespace detail
   1413 
   1414 #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
   1415 #undef HWY_NATIVE_PROMOTE_F16_TO_F64
   1416 #else
   1417 #define HWY_NATIVE_PROMOTE_F16_TO_F64
   1418 #endif
   1419 
// Widening conversion; dispatches on the destination type tag so that
// float -> int promotions saturate instead of invoking UB.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting");
  // For bits Y > X, floatX->floatY and intX->intY are always representable.
  return Vec1<TTo>(
      detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
}
   1427 
   1428 #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   1429 #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   1430 #else
   1431 #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   1432 #endif
   1433 
// float -> i64/u64 promotion; result is implementation-defined when the
// input is outside the destination's range (see CastValueForInRangeF2IConv).
template <class DTo, HWY_IF_UI64_D(DTo)>
HWY_API VFromD<DTo> PromoteInRangeTo(DTo /* tag */, Vec1<float> from) {
  using TTo = TFromD<DTo>;
  return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw));
}
   1439 
   1440 // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
   1441 // so we overload for TFrom=double and TTo={float,int32_t}.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec1<float> DemoteTo(D /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting float to narrower integer/float.
  // Values too large for float (including +-inf) saturate to the largest
  // finite float of matching sign.
  if (IsInf(from).bits ||
      Abs(from).raw > static_cast<double>(HighestValue<float>())) {
    return Vec1<float>(ScalarSignBit(from.raw) ? LowestValue<float>()
                                               : HighestValue<float>());
  }
  return Vec1<float>(static_cast<float>(from.raw));
}
template <class D, HWY_IF_UI32_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec1<double> from) {
  // Saturating double -> i32/u32 conversion; prevents ubsan errors for
  // out-of-range inputs.
  return Vec1<TFromD<D>>(detail::CastValueForF2IConv<TFromD<D>>(from.raw));
}
   1457 
// Signed integer narrowing: saturates to the destination's range.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  // Int to int: choose closest value in TTo to `from` (avoids UB)
  from.raw = HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw), LimitsMax<TTo>());
  return Vec1<TTo>(static_cast<TTo>(from.raw));
}
   1468 
   1469 // Disable the default unsigned to signed DemoteTo implementation in
   1470 // generic_ops-inl.h on SCALAR as the SCALAR target has a target-specific
   1471 // implementation of the unsigned to signed DemoteTo op and as ReorderDemote2To
   1472 // is not supported on the SCALAR target
   1473 
   1474 // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
   1475 // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
   1476 // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
   1477 // SFINAE to occur instead of a hard error due to a dependency on the V template
   1478 // argument
   1479 #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
   1480 #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
   1481  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
   1482 
// Unsigned integer narrowing: saturates to the destination's maximum (no
// lower clamp needed since the source is non-negative).
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());

  // Int to int: choose closest value in TTo to `from` (avoids UB)
  return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max)));
}
   1494 
// i64/u64 -> float: every 64-bit integer is representable as (a possibly
// rounded) float, so a plain cast suffices.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UI64(TFrom), HWY_IF_F32_D(DTo)>
HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  // int64_t/uint64_t to float: simply cast to TTo
  return Vec1<TTo>(static_cast<TTo>(from.raw));
}
   1501 
   1502 #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   1503 #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   1504 #else
   1505 #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   1506 #endif
   1507 
// double -> i32/u32 demotion; result is implementation-defined when the
// input is outside the destination's range (see CastValueForInRangeF2IConv).
template <class D32, HWY_IF_UI32_D(D32)>
HWY_API VFromD<D32> DemoteInRangeTo(D32 /*d32*/,
                                    VFromD<Rebind<double, D32>> v) {
  using TTo = TFromD<D32>;
  return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
}
   1514 
   1515 // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions;
   1516 // use this scalar version to verify the vector implementation.
   1517 #ifdef HWY_NATIVE_F16C
   1518 #undef HWY_NATIVE_F16C
   1519 #else
   1520 #define HWY_NATIVE_F16C
   1521 #endif
   1522 
// float16 -> float32 (always exact).
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec1<float> PromoteTo(D /* tag */, const Vec1<float16_t> v) {
  return Vec1<float>(F32FromF16(v.raw));
}
   1527 
// bfloat16 -> float32 (always exact).
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec1<float> PromoteTo(D d, const Vec1<bfloat16_t> v) {
  return Set(d, F32FromBF16(v.raw));
}
   1532 
// With a single lane, lane 0 is the only "even" lane, so this is PromoteTo.
template <class DTo, typename TFrom>
HWY_API VFromD<DTo> PromoteEvenTo(DTo d_to, Vec1<TFrom> v) {
  return PromoteTo(d_to, v);
}
   1537 
// float32 -> float16.
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) {
  return Vec1<float16_t>(F16FromF32(v.raw));
}
   1542 
   1543 #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
   1544 #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
   1545 #else
   1546 #define HWY_NATIVE_DEMOTE_F32_TO_BF16
   1547 #endif
   1548 
// float32 -> bfloat16.
template <class D, HWY_IF_BF16_D(D)>
HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) {
  return Set(d, BF16FromF32(v.raw));
}
   1553 
// Same-size float -> integer conversion; saturates out-of-range inputs.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_FLOAT(TFrom)>
HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
  // float## -> int##: return closest representable value.
  return Vec1<TTo>(detail::CastValueForF2IConv<TTo>(from.raw));
}
   1561 
// Same-size integer -> float conversion; always defined.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_NOT_FLOAT(TFrom)>
HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
  // int## -> float##: no check needed
  return Vec1<TTo>(static_cast<TTo>(from.raw));
}
   1569 
   1570 #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   1571 #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   1572 #else
   1573 #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   1574 #endif
   1575 
// Same-size float -> integer conversion; result is implementation-defined
// when the input is outside the destination's range.
template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
          HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
  using TTo = TFromD<DI>;
  return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
}
   1582 
// Saturating u32 -> u8 narrowing (convenience wrapper over DemoteTo).
HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
  return DemoteTo(Sisd<uint8_t>(), v);
}
   1586 
   1587 // ------------------------------ TruncateTo
   1588 
   1589 template <class D, HWY_IF_U8_D(D)>
   1590 HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
   1591  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
   1592 }
   1593 
   1594 template <class D, HWY_IF_U16_D(D)>
   1595 HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
   1596  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
   1597 }
   1598 
   1599 template <class D, HWY_IF_U32_D(D)>
   1600 HWY_API Vec1<uint32_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
   1601  return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
   1602 }
   1603 
   1604 template <class D, HWY_IF_U8_D(D)>
   1605 HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
   1606  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
   1607 }
   1608 
   1609 template <class D, HWY_IF_U16_D(D)>
   1610 HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
   1611  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
   1612 }
   1613 
   1614 template <class D, HWY_IF_U8_D(D)>
   1615 HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint16_t> v) {
   1616  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
   1617 }
   1618 
   1619 // ================================================== COMBINE
   1620 // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
   1621 
   1622 template <typename T>
   1623 HWY_API Vec1<T> LowerHalf(Vec1<T> v) {
   1624  return v;
   1625 }
   1626 
   1627 template <class D, typename T = TFromD<D>>
   1628 HWY_API Vec1<T> LowerHalf(D /* tag */, Vec1<T> v) {
   1629  return v;
   1630 }
   1631 
   1632 // ================================================== SWIZZLE
   1633 
   1634 template <typename T>
   1635 HWY_API T GetLane(const Vec1<T> v) {
   1636  return v.raw;
   1637 }
   1638 
   1639 template <typename T>
   1640 HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
   1641  HWY_DASSERT(i == 0);
   1642  (void)i;
   1643  return v.raw;
   1644 }
   1645 
   1646 template <typename T>
   1647 HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
   1648  HWY_DASSERT(i == 0);
   1649  (void)i;
   1650  v.raw = t;
   1651  return v;
   1652 }
   1653 
   1654 template <typename T>
   1655 HWY_API Vec1<T> DupEven(Vec1<T> v) {
   1656  return v;
   1657 }
   1658 // DupOdd is unsupported.
   1659 
   1660 template <typename T>
   1661 HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
   1662  return even;
   1663 }
   1664 
   1665 template <typename T>
   1666 HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
   1667  return even;
   1668 }
   1669 
   1670 // ------------------------------ SwapAdjacentBlocks
   1671 template <typename T>
   1672 HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
   1673  return v;
   1674 }
   1675 
   1676 // ------------------------------ InterleaveEvenBlocks
   1677 template <class D, class V = VFromD<D>>
   1678 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
   1679  return a;
   1680 }
   1681 // ------------------------------ InterleaveOddBlocks
   1682 template <class D, class V = VFromD<D>>
   1683 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
   1684  return a;
   1685 }
   1686 
   1687 // ------------------------------ TableLookupLanes
   1688 
   1689 // Returned by SetTableIndices for use by TableLookupLanes.
// Holds a single lane index produced by IndicesFromVec/SetTableIndices.
// The value is 0, or 1 for TwoTablesLookupLanes (see HWY_DASSERT below).
template <typename T>
struct Indices1 {
 // Signed type of the same width as T, matching the lane size.
 MakeSigned<T> raw;
};
   1694 
   1695 template <class D, typename T = TFromD<D>, typename TI>
   1696 HWY_API Indices1<T> IndicesFromVec(D, Vec1<TI> vec) {
   1697  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
   1698  HWY_DASSERT(vec.raw <= 1);
   1699  return Indices1<T>{static_cast<MakeSigned<T>>(vec.raw)};
   1700 }
   1701 
   1702 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename TI>
   1703 HWY_API Indices1<T> SetTableIndices(D d, const TI* idx) {
   1704  return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
   1705 }
   1706 
   1707 template <typename T>
   1708 HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
   1709  return v;
   1710 }
   1711 
   1712 template <typename T>
   1713 HWY_API Vec1<T> TwoTablesLookupLanes(const Vec1<T> a, const Vec1<T> b,
   1714                                     const Indices1<T> idx) {
   1715  return (idx.raw == 0) ? a : b;
   1716 }
   1717 
   1718 // ------------------------------ ReverseBlocks
   1719 
   1720 // Single block: no change
   1721 template <class D, typename T = TFromD<D>>
   1722 HWY_API Vec1<T> ReverseBlocks(D /* tag */, const Vec1<T> v) {
   1723  return v;
   1724 }
   1725 
   1726 // ------------------------------ Reverse
   1727 
   1728 template <class D, typename T = TFromD<D>>
   1729 HWY_API Vec1<T> Reverse(D /* tag */, const Vec1<T> v) {
   1730  return v;
   1731 }
   1732 
   1733 // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
   1734 #ifdef HWY_NATIVE_REVERSE2_8
   1735 #undef HWY_NATIVE_REVERSE2_8
   1736 #else
   1737 #define HWY_NATIVE_REVERSE2_8
   1738 #endif
   1739 
   1740 // Must not be called:
   1741 template <class D, typename T = TFromD<D>>
   1742 HWY_API Vec1<T> Reverse2(D /* tag */, const Vec1<T> v) {
   1743  return v;
   1744 }
   1745 
   1746 template <class D, typename T = TFromD<D>>
   1747 HWY_API Vec1<T> Reverse4(D /* tag */, const Vec1<T> v) {
   1748  return v;
   1749 }
   1750 
   1751 template <class D, typename T = TFromD<D>>
   1752 HWY_API Vec1<T> Reverse8(D /* tag */, const Vec1<T> v) {
   1753  return v;
   1754 }
   1755 
   1756 // ------------------------------ ReverseLaneBytes
   1757 
   1758 #ifdef HWY_NATIVE_REVERSE_LANE_BYTES
   1759 #undef HWY_NATIVE_REVERSE_LANE_BYTES
   1760 #else
   1761 #define HWY_NATIVE_REVERSE_LANE_BYTES
   1762 #endif
   1763 
   1764 HWY_API Vec1<uint16_t> ReverseLaneBytes(Vec1<uint16_t> v) {
   1765  const uint32_t val{v.raw};
   1766  return Vec1<uint16_t>(
   1767      static_cast<uint16_t>(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu)));
   1768 }
   1769 
   1770 HWY_API Vec1<uint32_t> ReverseLaneBytes(Vec1<uint32_t> v) {
   1771  const uint32_t val = v.raw;
   1772  return Vec1<uint32_t>(static_cast<uint32_t>(
   1773      ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) |
   1774      ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu)));
   1775 }
   1776 
   1777 HWY_API Vec1<uint64_t> ReverseLaneBytes(Vec1<uint64_t> v) {
   1778  const uint64_t val = v.raw;
   1779  return Vec1<uint64_t>(static_cast<uint64_t>(
   1780      ((val << 56) & 0xFF00000000000000u) |
   1781      ((val << 40) & 0x00FF000000000000u) |
   1782      ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) |
   1783      ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) |
   1784      ((val >> 40) & 0x000000000000FF00u) |
   1785      ((val >> 56) & 0x00000000000000FFu)));
   1786 }
   1787 
   1788 template <class V, HWY_IF_SIGNED_V(V),
   1789          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
   1790 HWY_API V ReverseLaneBytes(V v) {
   1791  const DFromV<decltype(v)> d;
   1792  const RebindToUnsigned<decltype(d)> du;
   1793  return BitCast(d, ReverseLaneBytes(BitCast(du, v)));
   1794 }
   1795 
   1796 // ------------------------------ ReverseBits
   1797 #ifdef HWY_NATIVE_REVERSE_BITS_UI8
   1798 #undef HWY_NATIVE_REVERSE_BITS_UI8
   1799 #else
   1800 #define HWY_NATIVE_REVERSE_BITS_UI8
   1801 #endif
   1802 
   1803 #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
   1804 #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
   1805 #else
   1806 #define HWY_NATIVE_REVERSE_BITS_UI16_32_64
   1807 #endif
   1808 
   1809 namespace detail {
   1810 
   1811 template <class T>
   1812 HWY_INLINE T ReverseBitsOfEachByte(T val) {
   1813  using TU = MakeUnsigned<T>;
   1814  constexpr TU kMaxUnsignedVal{LimitsMax<TU>()};
   1815  constexpr TU kShrMask1 =
   1816      static_cast<TU>(0x5555555555555555u & kMaxUnsignedVal);
   1817  constexpr TU kShrMask2 =
   1818      static_cast<TU>(0x3333333333333333u & kMaxUnsignedVal);
   1819  constexpr TU kShrMask3 =
   1820      static_cast<TU>(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal);
   1821 
   1822  constexpr TU kShlMask1 = static_cast<TU>(~kShrMask1);
   1823  constexpr TU kShlMask2 = static_cast<TU>(~kShrMask2);
   1824  constexpr TU kShlMask3 = static_cast<TU>(~kShrMask3);
   1825 
   1826  TU result = static_cast<TU>(val);
   1827  result = static_cast<TU>(((result << 1) & kShlMask1) |
   1828                           ((result >> 1) & kShrMask1));
   1829  result = static_cast<TU>(((result << 2) & kShlMask2) |
   1830                           ((result >> 2) & kShrMask2));
   1831  result = static_cast<TU>(((result << 4) & kShlMask3) |
   1832                           ((result >> 4) & kShrMask3));
   1833  return static_cast<T>(result);
   1834 }
   1835 
   1836 }  // namespace detail
   1837 
   1838 template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 1)>
   1839 HWY_API V ReverseBits(V v) {
   1840  return V(detail::ReverseBitsOfEachByte(v.raw));
   1841 }
   1842 
   1843 template <class V, HWY_IF_UNSIGNED_V(V),
   1844          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
   1845 HWY_API V ReverseBits(V v) {
   1846  return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw)));
   1847 }
   1848 
   1849 template <class V, HWY_IF_SIGNED_V(V)>
   1850 HWY_API V ReverseBits(V v) {
   1851  const DFromV<decltype(v)> d;
   1852  const RebindToUnsigned<decltype(d)> du;
   1853  return BitCast(d, ReverseBits(BitCast(du, v)));
   1854 }
   1855 
   1856 // ------------------------------ SlideUpLanes
   1857 
   1858 template <typename D>
   1859 HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
   1860  return v;
   1861 }
   1862 
   1863 // ------------------------------ SlideDownLanes
   1864 
   1865 template <typename D>
   1866 HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
   1867  return v;
   1868 }
   1869 
   1870 // ================================================== BLOCKWISE
   1871 // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
   1872 
   1873 // ------------------------------ Broadcast/splat any lane
   1874 
   1875 template <int kLane, typename T>
   1876 HWY_API Vec1<T> Broadcast(const Vec1<T> v) {
   1877  static_assert(kLane == 0, "Scalar only has one lane");
   1878  return v;
   1879 }
   1880 
   1881 // ------------------------------ TableLookupBytes, TableLookupBytesOr0
   1882 
   1883 template <typename T, typename TI>
   1884 HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
   1885  uint8_t in_bytes[sizeof(T)];
   1886  uint8_t idx_bytes[sizeof(T)];
   1887  uint8_t out_bytes[sizeof(T)];
   1888  CopyBytes<sizeof(T)>(&in, &in_bytes);  // copy to bytes
   1889  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
   1890  for (size_t i = 0; i < sizeof(T); ++i) {
   1891    out_bytes[i] = in_bytes[idx_bytes[i]];
   1892  }
   1893  TI out;
   1894  CopyBytes<sizeof(TI)>(&out_bytes, &out);
   1895  return Vec1<TI>{out};
   1896 }
   1897 
   1898 template <typename T, typename TI>
   1899 HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
   1900  uint8_t in_bytes[sizeof(T)];
   1901  uint8_t idx_bytes[sizeof(T)];
   1902  uint8_t out_bytes[sizeof(T)];
   1903  CopyBytes<sizeof(T)>(&in, &in_bytes);  // copy to bytes
   1904  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
   1905  for (size_t i = 0; i < sizeof(T); ++i) {
   1906    out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
   1907  }
   1908  TI out;
   1909  CopyBytes<sizeof(TI)>(&out_bytes, &out);
   1910  return Vec1<TI>{out};
   1911 }
   1912 
   1913 // ------------------------------ ZipLower
   1914 
   1915 HWY_API Vec1<uint16_t> ZipLower(Vec1<uint8_t> a, Vec1<uint8_t> b) {
   1916  return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t{b.raw} << 8) + a.raw));
   1917 }
   1918 HWY_API Vec1<uint32_t> ZipLower(Vec1<uint16_t> a, Vec1<uint16_t> b) {
   1919  return Vec1<uint32_t>((uint32_t{b.raw} << 16) + a.raw);
   1920 }
   1921 HWY_API Vec1<uint64_t> ZipLower(Vec1<uint32_t> a, Vec1<uint32_t> b) {
   1922  return Vec1<uint64_t>((uint64_t{b.raw} << 32) + a.raw);
   1923 }
   1924 HWY_API Vec1<int16_t> ZipLower(Vec1<int8_t> a, Vec1<int8_t> b) {
   1925  return Vec1<int16_t>(static_cast<int16_t>((int32_t{b.raw} << 8) + a.raw));
   1926 }
   1927 HWY_API Vec1<int32_t> ZipLower(Vec1<int16_t> a, Vec1<int16_t> b) {
   1928  return Vec1<int32_t>((int32_t{b.raw} << 16) + a.raw);
   1929 }
   1930 HWY_API Vec1<int64_t> ZipLower(Vec1<int32_t> a, Vec1<int32_t> b) {
   1931  return Vec1<int64_t>((int64_t{b.raw} << 32) + a.raw);
   1932 }
   1933 
   1934 template <class DW, typename TW = TFromD<DW>, typename TN = MakeNarrow<TW>>
   1935 HWY_API Vec1<TW> ZipLower(DW /* tag */, Vec1<TN> a, Vec1<TN> b) {
   1936  return Vec1<TW>(static_cast<TW>((TW{b.raw} << (sizeof(TN) * 8)) + a.raw));
   1937 }
   1938 
   1939 // ================================================== MASK
   1940 
   1941 template <class D, typename T = TFromD<D>>
   1942 HWY_API bool AllFalse(D /* tag */, const Mask1<T> mask) {
   1943  return mask.bits == 0;
   1944 }
   1945 
   1946 template <class D, typename T = TFromD<D>>
   1947 HWY_API bool AllTrue(D /* tag */, const Mask1<T> mask) {
   1948  return mask.bits != 0;
   1949 }
   1950 
   1951 // `p` points to at least 8 readable bytes, not all of which need be valid.
   1952 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
   1953 HWY_API Mask1<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) {
   1954  return Mask1<T>::FromBool((bits[0] & 1) != 0);
   1955 }
   1956 
   1957 template <class D, HWY_IF_LANES_D(D, 1)>
   1958 HWY_API MFromD<D> Dup128MaskFromMaskBits(D /*d*/, unsigned mask_bits) {
   1959  return MFromD<D>::FromBool((mask_bits & 1) != 0);
   1960 }
   1961 
   1962 // `p` points to at least 8 writable bytes.
   1963 template <class D, typename T = TFromD<D>>
   1964 HWY_API size_t StoreMaskBits(D d, const Mask1<T> mask, uint8_t* bits) {
   1965  *bits = AllTrue(d, mask);
   1966  return 1;
   1967 }
   1968 
   1969 template <class D, typename T = TFromD<D>>
   1970 HWY_API size_t CountTrue(D /* tag */, const Mask1<T> mask) {
   1971  return mask.bits == 0 ? 0 : 1;
   1972 }
   1973 
   1974 template <class D, typename T = TFromD<D>>
   1975 HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1<T> mask) {
   1976  return mask.bits == 0 ? -1 : 0;
   1977 }
   1978 
   1979 template <class D, typename T = TFromD<D>>
   1980 HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1<T> /* m */) {
   1981  return 0;  // There is only one lane and we know it is true.
   1982 }
   1983 
   1984 template <class D, typename T = TFromD<D>>
   1985 HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1<T> mask) {
   1986  return mask.bits == 0 ? -1 : 0;
   1987 }
   1988 
   1989 template <class D, typename T = TFromD<D>>
   1990 HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1<T> /* m */) {
   1991  return 0;  // There is only one lane and we know it is true.
   1992 }
   1993 
   1994 // ------------------------------ Compress, CompressBits
   1995 
// value == 1 indicates Compress on this target only partitions lanes (true
// lanes before false lanes) rather than fully packing them; a single lane
// is trivially partitioned (see the comments in Compress/CompressNot below).
template <typename T>
struct CompressIsPartition {
 enum { value = 1 };
};
   2000 
   2001 template <typename T>
   2002 HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
   2003  // A single lane is already partitioned by definition.
   2004  return v;
   2005 }
   2006 
   2007 template <typename T>
   2008 HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
   2009  // A single lane is already partitioned by definition.
   2010  return v;
   2011 }
   2012 
   2013 // ------------------------------ CompressStore
   2014 template <class D, typename T = TFromD<D>>
   2015 HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, D d,
   2016                             T* HWY_RESTRICT unaligned) {
   2017  StoreU(Compress(v, mask), d, unaligned);
   2018  return CountTrue(d, mask);
   2019 }
   2020 
   2021 // ------------------------------ CompressBlendedStore
   2022 template <class D, typename T = TFromD<D>>
   2023 HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, D d,
   2024                                    T* HWY_RESTRICT unaligned) {
   2025  if (!mask.bits) return 0;
   2026  StoreU(v, d, unaligned);
   2027  return 1;
   2028 }
   2029 
   2030 // ------------------------------ CompressBits
   2031 template <typename T>
   2032 HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
   2033  return v;
   2034 }
   2035 
   2036 // ------------------------------ CompressBitsStore
   2037 template <class D, typename T = TFromD<D>>
   2038 HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
   2039                                 D d, T* HWY_RESTRICT unaligned) {
   2040  const Mask1<T> mask = LoadMaskBits(d, bits);
   2041  StoreU(Compress(v, mask), d, unaligned);
   2042  return CountTrue(d, mask);
   2043 }
   2044 
   2045 // ------------------------------ Expand
   2046 
   2047 // generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here.
   2048 #ifdef HWY_NATIVE_EXPAND
   2049 #undef HWY_NATIVE_EXPAND
   2050 #else
   2051 #define HWY_NATIVE_EXPAND
   2052 #endif
   2053 
   2054 template <typename T>
   2055 HWY_API Vec1<T> Expand(Vec1<T> v, const Mask1<T> mask) {
   2056  return IfThenElseZero(mask, v);
   2057 }
   2058 
   2059 // ------------------------------ LoadExpand
   2060 template <class D>
   2061 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
   2062                             const TFromD<D>* HWY_RESTRICT unaligned) {
   2063  return MaskedLoad(mask, d, unaligned);
   2064 }
   2065 
   2066 // ------------------------------ WidenMulPairwiseAdd
   2067 
   2068 template <class D32, HWY_IF_F32_D(D32)>
   2069 HWY_API Vec1<float> WidenMulPairwiseAdd(D32 /* tag */, Vec1<bfloat16_t> a,
   2070                                        Vec1<bfloat16_t> b) {
   2071  return Vec1<float>(F32FromBF16(a.raw)) * Vec1<float>(F32FromBF16(b.raw));
   2072 }
   2073 
   2074 template <class D32, HWY_IF_I32_D(D32)>
   2075 HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a,
   2076                                          Vec1<int16_t> b) {
   2077  return Vec1<int32_t>(a.raw * b.raw);
   2078 }
   2079 
   2080 // ------------------------------ SatWidenMulAccumFixedPoint
   2081 #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   2082 #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   2083 #else
   2084 #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   2085 #endif
   2086 
// Returns SaturatedAdd(sum, 2 * a * b) for fixed-point multiply-accumulate.
// The doubled product 2*a*b overflows int32 only when both inputs are
// -32768; that single case is detected and compensated below.
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
                                               VFromD<Rebind<int16_t, DI32>> a,
                                               VFromD<Rebind<int16_t, DI32>> b,
                                               VFromD<DI32> sum) {
 // Multiplying static_cast<int32_t>(a.raw) by static_cast<int32_t>(b.raw)
 // followed by an addition of the product is okay as
 // (a.raw * b.raw * 2) is between -2147418112 and 2147483648 and as
 // a.raw * b.raw * 2 can only overflow an int32_t if both a.raw and b.raw are
 // equal to -32768.

 const VFromD<DI32> product(static_cast<int32_t>(a.raw) *
                            static_cast<int32_t>(b.raw));
 const VFromD<DI32> product2 = Add(product, product);

 // mul_overflow is all-ones (-1) iff product2 wrapped to LimitsMin<int32_t>,
 // which per the range analysis above happens only when the true product2
 // would be +2^31; otherwise it is zero.
 const auto mul_overflow =
     VecFromMask(di32, Eq(product2, Set(di32, LimitsMin<int32_t>())));

 // In the overflow case, Add(product2, mul_overflow) replaces the wrapped
 // LimitsMin with LimitsMax (one less than the true value), and
 // Sub(sum, And(BroadcastSignBit(sum), mul_overflow)) adds the missing 1 to
 // negative sums so the saturating addition still produces the exact result.
 return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
                     Add(product2, mul_overflow));
}
   2108 
   2109 // ------------------------------ SatWidenMulPairwiseAdd
   2110 
   2111 #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   2112 #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   2113 #else
   2114 #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   2115 #endif
   2116 
   2117 template <class DI16, HWY_IF_I16_D(DI16)>
   2118 HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a,
   2119                                             Vec1<int8_t> b) {
   2120  // Saturation of a.raw * b.raw is not needed on the HWY_SCALAR target as the
   2121  // input vectors only have 1 lane on the HWY_SCALAR target and as
   2122  // a.raw * b.raw is between -32640 and 32385, which is already within the
   2123  // range of an int16_t.
   2124 
   2125  // On other targets, a saturated addition of a[0]*b[0] + a[1]*b[1] is needed
   2126  // as it is possible for the addition of a[0]*b[0] + a[1]*b[1] to overflow if
   2127  // a[0], a[1], b[0], and b[1] are all non-zero and b[0] and b[1] both have the
   2128  // same sign.
   2129 
   2130  return Vec1<int16_t>(static_cast<int16_t>(a.raw) *
   2131                       static_cast<int16_t>(b.raw));
   2132 }
   2133 
   2134 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
   2135 
   2136 #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   2137 #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   2138 #else
   2139 #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   2140 #endif
   2141 
   2142 template <class D32, HWY_IF_F32_D(D32)>
   2143 HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
   2144                                              Vec1<bfloat16_t> b,
   2145                                              const Vec1<float> sum0,
   2146                                              Vec1<float>& /* sum1 */) {
   2147  return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
   2148                Vec1<float>(F32FromBF16(b.raw)), sum0);
   2149 }
   2150 
   2151 template <class D32, HWY_IF_I32_D(D32)>
   2152 HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<int16_t> a,
   2153                                                Vec1<int16_t> b,
   2154                                                const Vec1<int32_t> sum0,
   2155                                                Vec1<int32_t>& /* sum1 */) {
   2156  return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
   2157 }
   2158 
   2159 template <class DU32, HWY_IF_U32_D(DU32)>
   2160 HWY_API Vec1<uint32_t> ReorderWidenMulAccumulate(DU32 /* tag */,
   2161                                                 Vec1<uint16_t> a,
   2162                                                 Vec1<uint16_t> b,
   2163                                                 const Vec1<uint32_t> sum0,
   2164                                                 Vec1<uint32_t>& /* sum1 */) {
   2165  return Vec1<uint32_t>(static_cast<uint32_t>(a.raw) * b.raw + sum0.raw);
   2166 }
   2167 
   2168 // ------------------------------ RearrangeToOddPlusEven
   2169 template <typename TW>
   2170 HWY_API Vec1<TW> RearrangeToOddPlusEven(Vec1<TW> sum0, Vec1<TW> /* sum1 */) {
   2171  return sum0;  // invariant already holds
   2172 }
   2173 
   2174 // ================================================== REDUCTIONS
   2175 
   2176 // Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum.
   2177 
   2178 // NOLINTNEXTLINE(google-readability-namespace-comments)
   2179 }  // namespace HWY_NAMESPACE
   2180 }  // namespace hwy
   2181 HWY_AFTER_NAMESPACE();