tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

loongarch_lsx-inl.h (220759B)


// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>

#ifndef __loongarch_sx
// If LSX is to be runtime dispatched (instead of in baseline), we need
// to enable it *and* define __loongarch_sx or the intrinsic header will
// fail to compile.
//
// We cannot simply move lsxintrin.h after HWY_BEFORE_NAMESPACE because
// doing so may cause the first (the only effective) inclusion of
// lsxintrin.h to be compiled with both LSX and LASX enabled.  Then when
// we call the inline functions in the header with only LSX enabled,
// we'll get an "always_inline function requires lasx but would be inlined
// into a function that is compiled without support for lasx" error.
HWY_PUSH_ATTRIBUTES("lsx")
#define __loongarch_sx
#include <lsxintrin.h>
#undef __loongarch_sx
// Prevent "unused push_attribute" warning from Clang.
HWY_MAYBE_UNUSED static void HWY_CONCAT(hwy_lsx_dummy, __COUNTER__) () {}
HWY_POP_ATTRIBUTES
#else
#include <lsxintrin.h>
#endif
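
// Illustrative sketch (not part of the original header): the failure mode
// described above is the generic GCC/Clang rule that an always_inline
// function built for a wider target cannot be inlined into a narrower
// caller, roughly:
//   __attribute__((target("lasx"), always_inline)) inline void Wide() {}
//   __attribute__((target("lsx"))) void Narrow() { Wide(); }  // error
// Defining __loongarch_sx before including lsxintrin.h inside the "lsx"
// attribute scope avoids ever compiling the header with mismatched targets.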

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Enable generic functions for whichever of (f16, bf16) are not supported.
#define HWY_LSX_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)

template <typename T>
struct Raw128 {
  using type = __m128i;
};
template <>
struct Raw128<float> {
  using type = __m128;
};
template <>
struct Raw128<double> {
  using type = __m128d;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};
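
// Usage sketch (illustrative, not part of the original source): compound
// assignment simply forwards to the non-member operator, so for f32, which
// defines operator*:
//   const Full128<float> d;           // Full128 comes from shared-inl.h
//   Vec128<float> v = Set(d, 2.0f);
//   v *= v;                           // each lane is now 4.0f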

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

namespace detail {

template <typename T>
using RawMask128 = typename Raw128<T>::type;

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = typename detail::RawMask128<T>;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  Raw raw;
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast

namespace detail {

HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
HWY_INLINE __m128i BitCastToInteger(__m128 v) {
  return reinterpret_cast<__m128i>(v);
}
HWY_INLINE __m128i BitCastToInteger(__m128d v) {
  return reinterpret_cast<__m128i>(v);
}

template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __m128i operator()(__m128i v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __m128 operator()(__m128i v) {
    return reinterpret_cast<__m128>(v);
  }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __m128d operator()(__m128i v) {
    return reinterpret_cast<__m128d>(v);
  }
};

}  // namespace detail

// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{(__lsx_vreplgr2vr_w(0))};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_FLOAT3264_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{
      detail::BitCastFromInteger128<TFromD<D>>()(__lsx_vreplgr2vr_w(0))};
}

template <class D>
using VFromD = decltype(Zero(D()));

namespace detail {

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     Vec128<uint8_t, D().MaxBytes()> v) {
  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
}

}  // namespace detail

template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
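
// Usage sketch (illustrative, not part of the original source): BitCast
// reinterprets the 128-bit register without changing any bits, e.g. to
// inspect the bit pattern of a float vector:
//   const Full128<float> df;
//   const RebindToUnsigned<decltype(df)> du;       // u32 lanes
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each lane 0x3F800000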

// ------------------------------ Set

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_b(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_h(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_w(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_d(static_cast<long int>(t))};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Set(D d, float t) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, VFromD<decltype(di)>{__lsx_vldrepl_w(&t, 0)});
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Set(D d, double t) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, VFromD<decltype(di)>{__lsx_vldrepl_d(&t, 0)});
}

// Generic for all vector lengths.
template <class D, HWY_LSX_IF_EMULATED_D(D)>
HWY_API VFromD<D> Set(D df, TFromD<D> t) {
  const RebindToUnsigned<decltype(df)> du;
  static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
  uint16_t bits;
  CopyBytes<2>(&t, &bits);
  return BitCast(df, Set(du, bits));
}
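
// Usage sketch (illustrative, not part of the original source): Set
// broadcasts a scalar to every lane; the [b]f16 overload goes through the
// u16 path because LSX has no native 16-bit float lanes:
//   const Full128<int32_t> d;
//   const auto v = Set(d, -7);  // all four i32 lanes are -7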

// ------------------------------ Undefined

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <class D>
HWY_API VFromD<D> Undefined(D /* tag */) {
  VFromD<D> v;
  return v;
}

HWY_DIAGNOSTICS(pop)

// ------------------------------ GetLane

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_b(v.raw, 0));
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_h(v.raw, 0));
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_w(v.raw, 0));
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_d(v.raw, 0));
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  float f32;
  int32_t i32 = __lsx_vpickve2gr_w(reinterpret_cast<__m128i>(v.raw), 0);
  CopyBytes<4>(&i32, &f32);
  return f32;
}
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  double f64;
  int64_t i64 = __lsx_vpickve2gr_d(reinterpret_cast<__m128i>(v.raw), 0);
  CopyBytes<8>(&i64, &f64);
  return f64;
}
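
// Usage sketch (illustrative, not part of the original source): GetLane
// extracts lane 0; the float/double overloads move the bits through an
// integer because vpickve2gr returns a general-purpose register:
//   const Full128<float> d;
//   const float first = GetLane(Set(d, 3.5f));  // 3.5f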

// ------------------------------ ResizeBitCast

template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
}

// ------------------------------ Dup128VecFromValues

template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}

template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
  const GccI32RawVectType raw = {
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}
template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
                                 static_cast<int64_t>(t1)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  const GccF32RawVectType raw = {t0, t1, t2, t3};
  return VFromD<D>{reinterpret_cast<__m128>(raw)};
}
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  const GccF64RawVectType raw = {t0, t1};
  return VFromD<D>{reinterpret_cast<__m128d>(raw)};
}
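
// Usage sketch (illustrative, not part of the original source): builds a
// vector from per-lane constants via a GCC vector literal:
//   const Full128<int32_t> d;
//   const auto v = Dup128VecFromValues(d, 1, 2, 3, 4);  // lane 0 is 1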

// ================================================== LOGICAL

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vand_v(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lsx_vandn_v(
                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vor_v(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vxor_v(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vnor_v(BitCast(du, v).raw, BitCast(du, v).raw)});
}

// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}
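
// Usage sketch (illustrative, not part of the original source): these are
// plain bitwise ops on the whole 128-bit register; note AndNot's argument
// order matches its comment:
//   const Full128<uint32_t> d;
//   const auto lo = Set(d, 0x0000FFFFu);
//   const auto v = Set(d, 0x12345678u);
//   const auto hi_bits = AndNot(lo, v);  // ~lo & v = 0x12340000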

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

// Generic for all vector lengths.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VecFromMask(v);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  RebindToSigned<decltype(d)> di;
  return BitCast(d, VFromD<decltype(di)>{__lsx_vbitsel_v(
                        BitCast(di, no).raw, BitCast(di, yes).raw,
                        RebindMask(di, mask).raw)});
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}
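
// Usage sketch (illustrative, not part of the original source): vbitsel
// selects bits of `yes` where the mask bit is set, else bits of `no`:
//   const Full128<int32_t> d;
//   const auto m = Set(d, 1) == Set(d, 1);             // all-true mask
//   const auto r = IfThenElse(m, Set(d, 7), Zero(d));  // all lanes 7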

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_b(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_h(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_w(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_d(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}
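
// Usage sketch (illustrative, not part of the original source): the SizeTag
// dispatch picks the vpcnt instruction matching the lane width:
//   const Full128<uint8_t> d;
//   const auto counts = PopulationCount(Set(d, 0x0F));  // each lane 4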

// ================================================== SIGN

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_FLOAT_OR_SPECIAL(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <typename T, size_t N, HWY_IF_UI8(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_b(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI16(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_h(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_w(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_d(v.raw)};
}

// ------------------------------ Floating-point Abs
// Generic for all vector lengths
template <class V, HWY_IF_FLOAT(TFromV<V>)>
HWY_API V Abs(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
}

// ------------------------------ CopySign
// Generic for all vector lengths.
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);
  return BitwiseIfThenElse(msb, sign, magn);
}

// ------------------------------ CopySignToAbs
// Generic for all vector lengths.
template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}
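
// Usage sketch (illustrative, not part of the original source): CopySign
// keeps the magnitude of `magn` and takes only the sign bit of `sign`:
//   const Full128<float> d;
//   const auto r = CopySign(Set(d, 2.0f), Set(d, -0.0f));  // lanes -2.0f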

// ------------------------------ IfThenElseZero

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// ------------------------------ ExclusiveNeither

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ------------------------------ ShiftLeft

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftLeft(const Vec128<uint8_t, N> v) {
  return Vec128<uint8_t, N>{__lsx_vslli_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{__lsx_vslli_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{__lsx_vslli_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{__lsx_vslli_d(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftLeft(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{__lsx_vslli_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{__lsx_vslli_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{__lsx_vslli_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{__lsx_vslli_d(v.raw, kBits)};
}

// ------------------------------ ShiftRight

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(Vec128<uint8_t, N> v) {
  return Vec128<uint8_t, N>{__lsx_vsrli_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{__lsx_vsrli_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{__lsx_vsrli_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{__lsx_vsrli_d(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{__lsx_vsrai_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{__lsx_vsrai_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{__lsx_vsrai_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{__lsx_vsrai_d(v.raw, kBits)};
}
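
// Usage sketch (illustrative, not part of the original source): the shift
// count is a compile-time template argument; signed ShiftRight is
// arithmetic (vsrai), unsigned is logical (vsrli):
//   const Full128<int32_t> d;
//   const auto r = ShiftRight<2>(Set(d, -8));  // each lane -2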

// ------------------------------ RoundingShiftRight

#ifdef HWY_NATIVE_ROUNDING_SHR
#undef HWY_NATIVE_ROUNDING_SHR
#else
#define HWY_NATIVE_ROUNDING_SHR
#endif

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> RoundingShiftRight(Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{__lsx_vsrari_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> RoundingShiftRight(Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{__lsx_vsrari_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> RoundingShiftRight(Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{__lsx_vsrari_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> RoundingShiftRight(Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{__lsx_vsrari_d(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> RoundingShiftRight(Vec128<uint8_t, N> v) {
  return Vec128<uint8_t, N>{__lsx_vsrlri_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> RoundingShiftRight(Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{__lsx_vsrlri_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> RoundingShiftRight(Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{__lsx_vsrlri_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> RoundingShiftRight(Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{__lsx_vsrlri_d(v.raw, kBits)};
}

// ------------------------------ RoundingShr

template <size_t N>
HWY_API Vec128<int8_t, N> RoundingShr(Vec128<int8_t, N> v,
                                      Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>{__lsx_vsrar_b(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> RoundingShr(Vec128<int16_t, N> v,
                                       Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>{__lsx_vsrar_h(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> RoundingShr(Vec128<int32_t, N> v,
                                       Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>{__lsx_vsrar_w(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> RoundingShr(Vec128<int64_t, N> v,
                                       Vec128<int64_t, N> bits) {
  return Vec128<int64_t, N>{__lsx_vsrar_d(v.raw, bits.raw)};
}

template <size_t N>
HWY_API Vec128<uint8_t, N> RoundingShr(Vec128<uint8_t, N> v,
                                       Vec128<uint8_t, N> bits) {
  return Vec128<uint8_t, N>{__lsx_vsrlr_b(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> RoundingShr(Vec128<uint16_t, N> v,
                                        Vec128<uint16_t, N> bits) {
  return Vec128<uint16_t, N>{__lsx_vsrlr_h(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> RoundingShr(Vec128<uint32_t, N> v,
                                        Vec128<uint32_t, N> bits) {
  return Vec128<uint32_t, N>{__lsx_vsrlr_w(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> RoundingShr(Vec128<uint64_t, N> v,
                                        Vec128<uint64_t, N> bits) {
  return Vec128<uint64_t, N>{__lsx_vsrlr_d(v.raw, bits.raw)};
}

// ------------------------------ RoundingShiftRightSame (RoundingShr)

template <typename T, size_t N>
HWY_API Vec128<T, N> RoundingShiftRightSame(const Vec128<T, N> v, int bits) {
  return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
}
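
// Usage sketch (illustrative, not part of the original source): the rounding
// variants add half of the discarded range before shifting, i.e. they round
// to nearest instead of truncating:
//   const Full128<uint32_t> d;
//   const auto r = RoundingShiftRight<1>(Set(d, 5u));  // (5+1)>>1 = 3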

// ================================================== MEMORY (1)

// ------------------------------ Load 128

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lsx_vld(aligned, 0)});
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  VFromD<D> v;
  CopyBytes<d.MaxBytes()>(p, &v);
  return v;
}

// LoadU == Load
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ MaskedLoad

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

// ------------------------------ MaskedLoadOr

template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElse(m, LoadU(d, p), v);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ Store 128

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vst(v.raw, aligned, 0);
}

// ------------------------------ Store 64

template <class D, HWY_IF_V_SIZE_D(D, 8)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_d(v.raw, aligned, 0, 0);
}

// ------------------------------ Store 32

template <class D, HWY_IF_V_SIZE_D(D, 4)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_w(v.raw, aligned, 0, 0);
}

// ------------------------------ Store 16

template <class D, HWY_IF_V_SIZE_D(D, 2)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_h(v.raw, aligned, 0, 0);
}

// ------------------------------ Store 8

template <class D, HWY_IF_V_SIZE_D(D, 1)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_b(v.raw, aligned, 0, 0);
}

template <class D>
HWY_API void StoreU(VFromD<D> v, D d, void* HWY_RESTRICT p) {
  Store(v, d, p);
}
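
// Usage sketch (illustrative, not part of the original source): LSX loads
// and stores have no alignment requirement, so LoadU/StoreU simply forward
// to Load/Store; partial vectors copy only MaxBytes():
//   const Full128<int32_t> d;
//   HWY_ALIGN int32_t in[4] = {1, 2, 3, 4};
//   int32_t out[4];
//   StoreU(LoadU(d, in), d, out);  // out now equals in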

// ================================================== SWIZZLE (1)

// ------------------------------ TableLookupBytes
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
                                        const Vec128<TI, NI> from) {
  const DFromV<decltype(from)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const DFromV<decltype(bytes)> d_bytes;
  const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
  return BitCast(
      d, VFromD<decltype(du8)>{__lsx_vshuf_b(BitCast(du8_bytes, bytes).raw,
                                             BitCast(du8_bytes, bytes).raw,
                                             (BitCast(du8, from).raw))});
}

// ------------------------------ TableLookupBytesOr0
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  const DFromV<VI> d;
  const Repartition<int8_t, decltype(d)> di8;
  return BitCast(d,
                 IfThenZeroElse(Lt(BitCast(di8, from), Zero(di8)),
                                BitCast(di8, TableLookupBytes(bytes, from))));
}
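
// Usage sketch (illustrative, not part of the original source): each byte of
// `from` selects a byte of `bytes`; the Or0 variant additionally zeroes any
// lane whose index byte has the sign bit set (the IfThenZeroElse above):
//   const Full128<uint8_t> d;
//   const auto idx = Iota(d, 0);                             // 0,1,...,15
//   const auto r = TableLookupBytes(Iota(d, 0x40), idx);     // 0x40..0x4F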

// ------------------------------ Shuffles (ShiftRight, TableLookupBytes)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lsx_vshuf4i_w(
                        detail::BitCastToInteger(v.raw), 0xB1)});
}

namespace detail {

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) {
  const int8_t _data_idx[] = {1, 0, 19, 18};
  __m128i shuffle_idx = __lsx_vld(_data_idx, 0);
  return Vec32<T>{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) {
  const int16_t _data_idx[] = {9, 8, 3, 2};
  __m128i shuffle_idx = __lsx_vld(_data_idx, 0);
  return Vec64<T>{__lsx_vshuf_h(shuffle_idx, a.raw, b.raw)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, Vec128<int32_t>{__lsx_vpermi_w(BitCast(di, b).raw,
                                                   BitCast(di, a).raw, 0xB1)});
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) {
  const int8_t _data_idx[] = {0, 3, 18, 17};
  __m128i shuffle_idx = __lsx_vld(_data_idx, 0);
  return Vec32<T>{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) {
  const int16_t _data_idx[] = {10, 11, 2, 1};
  __m128i shuffle_idx = __lsx_vld(_data_idx, 0);
  auto t0 = __lsx_vshuf_h(shuffle_idx, a.raw, b.raw);
  return Vec64<T>{t0};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, Vec128<int32_t>{__lsx_vpermi_w(BitCast(di, b).raw,
                                                   BitCast(di, a).raw, 0x6C)});
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) {
  const int8_t _data_idx[] = {2, 1, 16, 19};
  __m128i shuffle_idx = __lsx_vld(_data_idx, 0);
  return Vec32<T>{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) {
  const int16_t _data_idx[] = {8, 9, 0, 3};
  __m128i shuffle_idx = __lsx_vld(_data_idx, 0);
  return Vec64<T>{__lsx_vshuf_h(shuffle_idx, a.raw, b.raw)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, Vec128<int32_t>{__lsx_vpermi_w(BitCast(di, b).raw,
                                                   BitCast(di, a).raw, 0xC6)});
}

}  // namespace detail

// Swap 64-bit halves
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), 0x4E)});
}
HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
  return Vec128<uint64_t>{__lsx_vshuf4i_w(v.raw, 0x4E)};
}
HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
  return Vec128<int64_t>{__lsx_vshuf4i_w(v.raw, 0x4E)};
}
HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint64_t>{__lsx_vshuf4i_d(
                        reinterpret_cast<__m128i>(v.raw),
                        reinterpret_cast<__m128i>(v.raw), 0x1)});
}

// Rotate right 32 bits
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), 0x39)});
}
// Rotate left 32 bits
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), 0x93)});
}
// Reverse
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), 0x1B)});
}
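
// Usage sketch (illustrative, not part of the original source): the suffix
// names the source lane for each result lane, listed from most- to
// least-significant, so Shuffle0123 reverses the four 32-bit lanes:
//   const Full128<uint32_t> d;
//   const auto v = Dup128VecFromValues(d, 0u, 1u, 2u, 3u);
//   const auto r = Shuffle0123(v);  // lane i now holds old lane (3 - i)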

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <class DTo, typename TFrom, size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)>
HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  const Simd<TFrom, NFrom, 0> d;
  return MaskFromVec(BitCast(dto, VecFromMask(d, m)));
}

// ================================================== COMPARE

template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(Vec128<uint8_t, N> a,
                                       Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{__lsx_vseq_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(Vec128<uint16_t, N> a,
                                        Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{__lsx_vseq_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(Vec128<uint32_t, N> a,
                                        Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{__lsx_vseq_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{__lsx_vseq_d(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(Vec128<int8_t, N> a,
                                      Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{__lsx_vseq_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{__lsx_vseq_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(Vec128<int32_t, N> a,
                                       Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{__lsx_vseq_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{__lsx_vseq_d(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{
      reinterpret_cast<__m128>(__lsx_vfcmp_ceq_s(a.raw, b.raw))};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{
      reinterpret_cast<__m128d>(__lsx_vfcmp_ceq_d(a.raw, b.raw))};
}

// ------------------------------ Inequality

// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
                                       Vec128<uint8_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
                                        Vec128<uint16_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
                                        Vec128<uint32_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
                                        Vec128<uint64_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
                                      Vec128<int8_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
                                       Vec128<int32_t, N> b) {
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
                                       Vec128<int64_t, N> b) {
  return Not(a == b);
}

template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{
      reinterpret_cast<__m128>(__lsx_vfcmp_cune_s(a.raw, b.raw))};
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{
      reinterpret_cast<__m128d>(__lsx_vfcmp_cune_d(a.raw, b.raw))};
}

// ------------------------------ Strict inequality

namespace detail {

template <size_t N>
HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
                                 Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{__lsx_vslt_b(b.raw, a.raw)};
}
template <size_t N>
HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
                                  Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{__lsx_vslt_h(b.raw, a.raw)};
}
template <size_t N>
HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
                                  Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{__lsx_vslt_w(b.raw, a.raw)};
}
template <size_t N>
HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
                                  const Vec128<int64_t, N> a,
                                  const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{__lsx_vslt_d(b.raw, a.raw)};
}

template <size_t N>
HWY_INLINE Mask128<uint8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<uint8_t, N> a,
                                  Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{__lsx_vslt_b(b.raw, a.raw)};
}
template <size_t N>
HWY_INLINE Mask128<uint16_t, N> Gt(hwy::SignedTag /*tag*/,
                                   Vec128<uint16_t, N> a,
                                   Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{__lsx_vslt_h(b.raw, a.raw)};
}
template <size_t N>
HWY_INLINE Mask128<uint32_t, N> Gt(hwy::SignedTag /*tag*/,
                                   Vec128<uint32_t, N> a,
                                   Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{__lsx_vslt_w(b.raw, a.raw)};
}
template <size_t N>
HWY_INLINE Mask128<uint64_t, N> Gt(hwy::SignedTag /*tag*/,
                                   const Vec128<uint64_t, N> a,
                                   const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{__lsx_vslt_d(b.raw, a.raw)};
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
  const auto sa = BitCast(di, Xor(a, msb));
  const auto sb = BitCast(di, Xor(b, msb));
  return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
}

template <size_t N>
HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
                                Vec128<float, N> b) {
  return Mask128<float, N>{
      reinterpret_cast<__m128>(__lsx_vfcmp_clt_s(b.raw, a.raw))};
}
template <size_t N>
HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
                                 Vec128<double, N> b) {
  return Mask128<double, N>{
      reinterpret_cast<__m128d>(__lsx_vfcmp_clt_d(b.raw, a.raw))};
}

}  // namespace detail

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Gt(hwy::TypeTag<T>(), a, b);
}

// ------------------------------ Weak inequality

namespace detail {
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a,
                            Vec128<T, N> b) {
  return Not(Gt(tag, b, a));
}

template <typename T, size_t N>
HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a,
                            Vec128<T, N> b) {
  return Not(Gt(tag, b, a));
}

template <size_t N>
HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a,
                                Vec128<float, N> b) {
  return Mask128<float, N>{
      reinterpret_cast<__m128>(__lsx_vfcmp_cle_s(b.raw, a.raw))};
}
template <size_t N>
HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a,
                                 Vec128<double, N> b) {
  return Mask128<double, N>{
      reinterpret_cast<__m128d>(__lsx_vfcmp_cle_d(b.raw, a.raw))};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Ge(hwy::TypeTag<T>(), a, b);
}

// ------------------------------ Reversed comparisons

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
  return b > a;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
  return b >= a;
}
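
// Usage sketch (illustrative, not part of the original source): comparisons
// return all-ones lanes for true, directly usable with IfThenElse*:
//   const Full128<int32_t> d;
//   const auto m = Set(d, 3) > Set(d, 1);          // all lanes true
//   const auto r = IfThenElseZero(m, Set(d, 9));   // all lanes 9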
   1337 
   1338 // ------------------------------ Iota (Load)
   1339 
   1340 namespace detail {
   1341 
   1342 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   1343 HWY_INLINE VFromD<D> Iota0(D d) {
   1344  return Dup128VecFromValues(
   1345      d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
   1346      TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
   1347      TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
   1348      TFromD<D>{15});
   1349 }
   1350 
   1351 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
   1352 HWY_INLINE VFromD<D> Iota0(D d) {
   1353  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
   1354                             TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
   1355                             TFromD<D>{6}, TFromD<D>{7});
   1356 }
   1357 
   1358 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   1359 HWY_INLINE VFromD<D> Iota0(D d) {
   1360  return Dup128VecFromValues(
   1361      d, static_cast<TFromD<D>>(0), static_cast<TFromD<D>>(1),
   1362      static_cast<TFromD<D>>(2), static_cast<TFromD<D>>(3));
   1363 }
   1364 
   1365 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   1366 HWY_INLINE VFromD<D> Iota0(D d) {
   1367  return Dup128VecFromValues(d, static_cast<TFromD<D>>(0),
   1368                             static_cast<TFromD<D>>(1));
   1369 }
   1370 
   1371 }  // namespace detail
   1372 
   1373 template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)>
   1374 HWY_API VFromD<D> Iota(D d, const T2 first) {
   1375  const auto result_iota =
   1376      detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
   1377  return result_iota;
   1378 }
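
// Hypothetical usage sketch (Full128 is the usual Highway descriptor alias):
//   const Full128<int32_t> d;
//   const auto v = Iota(d, 10);  // lanes are {10, 11, 12, 13}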
   1379 
   1380 // ------------------------------ FirstN (Iota, Lt)
   1381 
   1382 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   1383 HWY_API MFromD<D> FirstN(D d, size_t num) {
   1384  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
   1385  using TI = TFromD<decltype(di)>;
   1386  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
   1387 }
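
// E.g. FirstN(d, 2) on a 4-lane d yields {1, 1, 0, 0}; num >= Lanes(d) sets
// all lanes, provided num is representable in the signed lane type TI.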
   1388 
   1389 // ------------------------------ InterleaveLower
   1390 
   1391 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
   1392 // the least-significant lane) and "b". To concatenate two half-width integers
   1393 // into one, use ZipLower/Upper instead (also works with scalar).
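// E.g. for u32 lanes: InterleaveLower({a0,a1,a2,a3}, {b0,b1,b2,b3}) returns
// {a0, b0, a1, b1}.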
   1394 
   1395 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   1396 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   1397  return Vec128<T, N>{__lsx_vilvl_b(b.raw, a.raw)};
   1398 }
   1399 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   1400 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   1401  return Vec128<T, N>{__lsx_vilvl_h(b.raw, a.raw)};
   1402 }
   1403 template <typename T, size_t N, HWY_IF_UI32(T)>
   1404 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   1405  return Vec128<T, N>{__lsx_vilvl_w(b.raw, a.raw)};
   1406 }
   1407 template <typename T, size_t N, HWY_IF_UI64(T)>
   1408 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
   1409  return Vec128<T, N>{__lsx_vilvl_d(b.raw, a.raw)};
   1410 }
   1411 
   1412 template <size_t N>
   1413 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
   1414                                         Vec128<float, N> b) {
   1415  return Vec128<float, N>{reinterpret_cast<__m128>(__lsx_vilvl_w(
   1416      reinterpret_cast<__m128i>(b.raw), reinterpret_cast<__m128i>(a.raw)))};
   1417 }
   1418 template <size_t N>
   1419 HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
   1420                                          Vec128<double, N> b) {
   1421  return Vec128<double, N>{reinterpret_cast<__m128d>(__lsx_vilvl_d(
   1422      reinterpret_cast<__m128i>(b.raw), reinterpret_cast<__m128i>(a.raw)))};
   1423 }
   1424 
   1425 // Generic for all vector lengths.
   1426 template <class D>
   1427 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
   1428  return InterleaveLower(a, b);
   1429 }
   1430 
   1431 // ------------------------------ BlendedStore
   1432 
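// Emulated via load + blend + full store: unlike a true masked store, this
// reads and rewrites all lanes, so p must be valid for the entire vector.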
   1433 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   1434 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
   1435                          TFromD<D>* HWY_RESTRICT p) {
   1436  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
   1437 }
   1438 
   1439 // ================================================== ARITHMETIC
   1440 
   1441 // ------------------------------ Addition
   1442 
   1443 // Unsigned
   1444 template <size_t N>
   1445 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
   1446                                     const Vec128<uint8_t, N> b) {
   1447  return Vec128<uint8_t, N>{__lsx_vadd_b(a.raw, b.raw)};
   1448 }
   1449 template <size_t N>
   1450 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
   1451                                      const Vec128<uint16_t, N> b) {
   1452  return Vec128<uint16_t, N>{__lsx_vadd_h(a.raw, b.raw)};
   1453 }
   1454 template <size_t N>
   1455 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
   1456                                      const Vec128<uint32_t, N> b) {
   1457  return Vec128<uint32_t, N>{__lsx_vadd_w(a.raw, b.raw)};
   1458 }
   1459 template <size_t N>
   1460 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
   1461                                      const Vec128<uint64_t, N> b) {
   1462  return Vec128<uint64_t, N>{__lsx_vadd_d(a.raw, b.raw)};
   1463 }
   1464 
   1465 // Signed
   1466 template <size_t N>
   1467 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
   1468                                    const Vec128<int8_t, N> b) {
   1469  return Vec128<int8_t, N>{__lsx_vadd_b(a.raw, b.raw)};
   1470 }
   1471 template <size_t N>
   1472 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
   1473                                     const Vec128<int16_t, N> b) {
   1474  return Vec128<int16_t, N>{__lsx_vadd_h(a.raw, b.raw)};
   1475 }
   1476 template <size_t N>
   1477 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
   1478                                     const Vec128<int32_t, N> b) {
   1479  return Vec128<int32_t, N>{__lsx_vadd_w(a.raw, b.raw)};
   1480 }
   1481 template <size_t N>
   1482 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
   1483                                     const Vec128<int64_t, N> b) {
   1484  return Vec128<int64_t, N>{__lsx_vadd_d(a.raw, b.raw)};
   1485 }
   1486 
   1487 template <size_t N>
   1488 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
   1489                                   const Vec128<float, N> b) {
   1490  return Vec128<float, N>{__lsx_vfadd_s(a.raw, b.raw)};
   1491 }
   1492 template <size_t N>
   1493 HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
   1494                                    const Vec128<double, N> b) {
   1495  return Vec128<double, N>{__lsx_vfadd_d(a.raw, b.raw)};
   1496 }
   1497 
   1498 // ------------------------------ Subtraction
   1499 
   1500 // Unsigned
   1501 template <size_t N>
   1502 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
   1503                                     const Vec128<uint8_t, N> b) {
   1504  return Vec128<uint8_t, N>{__lsx_vsub_b(a.raw, b.raw)};
   1505 }
   1506 template <size_t N>
   1507 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
   1508                                      Vec128<uint16_t, N> b) {
   1509  return Vec128<uint16_t, N>{__lsx_vsub_h(a.raw, b.raw)};
   1510 }
   1511 template <size_t N>
   1512 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
   1513                                      const Vec128<uint32_t, N> b) {
   1514  return Vec128<uint32_t, N>{__lsx_vsub_w(a.raw, b.raw)};
   1515 }
   1516 template <size_t N>
   1517 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
   1518                                      const Vec128<uint64_t, N> b) {
   1519  return Vec128<uint64_t, N>{__lsx_vsub_d(a.raw, b.raw)};
   1520 }
   1521 
   1522 // Signed
   1523 template <size_t N>
   1524 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
   1525                                    const Vec128<int8_t, N> b) {
   1526  return Vec128<int8_t, N>{__lsx_vsub_b(a.raw, b.raw)};
   1527 }
   1528 template <size_t N>
   1529 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
   1530                                     const Vec128<int16_t, N> b) {
   1531  return Vec128<int16_t, N>{__lsx_vsub_h(a.raw, b.raw)};
   1532 }
   1533 template <size_t N>
   1534 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
   1535                                     const Vec128<int32_t, N> b) {
   1536  return Vec128<int32_t, N>{__lsx_vsub_w(a.raw, b.raw)};
   1537 }
   1538 template <size_t N>
   1539 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
   1540                                     const Vec128<int64_t, N> b) {
   1541  return Vec128<int64_t, N>{__lsx_vsub_d(a.raw, b.raw)};
   1542 }
   1543 
   1544 template <size_t N>
   1545 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
   1546                                   const Vec128<float, N> b) {
   1547  return Vec128<float, N>{__lsx_vfsub_s(a.raw, b.raw)};
   1548 }
   1549 template <size_t N>
   1550 HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
   1551                                    const Vec128<double, N> b) {
   1552  return Vec128<double, N>{__lsx_vfsub_d(a.raw, b.raw)};
   1553 }
   1554 
   1555 // ------------------------------ SumsOf2
   1556 namespace detail {
   1557 
   1558 template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
   1559 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
   1560    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
   1561  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_h_b(v.raw, v.raw)};
   1562 }
   1563 template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
   1564 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
   1565    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
   1566  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_hu_bu(v.raw, v.raw)};
   1567 }
   1568 template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
   1569 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
   1570    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
   1571  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_w_h(v.raw, v.raw)};
   1572 }
   1573 template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
   1574 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
   1575    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
   1576  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_wu_hu(v.raw, v.raw)};
   1577 }
   1578 template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
   1579 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
   1580    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
   1581  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_d_w(v.raw, v.raw)};
   1582 }
   1583 template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
   1584 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
   1585    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
   1586  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_du_wu(v.raw, v.raw)};
   1587 }
   1588 
   1589 }  // namespace detail
   1590 
   1591 // ------------------------------ SumsOf8
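// Three pairwise widening horizontal adds (8 -> 16 -> 32 -> 64 bits), so each
// output lane is the sum of 8 consecutive input lanes. Sketch for u8 input:
//   v = {0, 1, ..., 15}  =>  SumsOf8(v) = {28, 92}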
   1592 template <size_t N>
   1593 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
   1594  __m128i temp = __lsx_vhaddw_hu_bu(v.raw, v.raw);
   1595  temp = __lsx_vhaddw_wu_hu(temp, temp);
   1596  return Vec128<uint64_t, N / 8>{__lsx_vhaddw_du_wu(temp, temp)};
   1597 }
   1598 template <size_t N>
   1599 HWY_API Vec128<int64_t, N / 8> SumsOf8(const Vec128<int8_t, N> v) {
   1600  __m128i temp = __lsx_vhaddw_h_b(v.raw, v.raw);
   1601  temp = __lsx_vhaddw_w_h(temp, temp);
   1602  return Vec128<int64_t, N / 8>{__lsx_vhaddw_d_w(temp, temp)};
   1603 }
   1604 
   1605 // ------------------------------ SaturatedAdd
   1606 
   1607 // Returns a + b clamped to the destination range.
   1608 
   1609 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
   1610 #undef HWY_NATIVE_I32_SATURATED_ADDSUB
   1611 #else
   1612 #define HWY_NATIVE_I32_SATURATED_ADDSUB
   1613 #endif
   1614 
   1615 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
   1616 #undef HWY_NATIVE_I64_SATURATED_ADDSUB
   1617 #else
   1618 #define HWY_NATIVE_I64_SATURATED_ADDSUB
   1619 #endif
   1620 
   1621 #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
   1622 #undef HWY_NATIVE_U32_SATURATED_ADDSUB
   1623 #else
   1624 #define HWY_NATIVE_U32_SATURATED_ADDSUB
   1625 #endif
   1626 
   1627 #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
   1628 #undef HWY_NATIVE_U64_SATURATED_ADDSUB
   1629 #else
   1630 #define HWY_NATIVE_U64_SATURATED_ADDSUB
   1631 #endif
   1632 
   1633 // Unsigned
   1634 template <size_t N>
   1635 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
   1636                                        const Vec128<uint8_t, N> b) {
   1637  return Vec128<uint8_t, N>{__lsx_vsadd_bu(a.raw, b.raw)};
   1638 }
   1639 template <size_t N>
   1640 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
   1641                                         const Vec128<uint16_t, N> b) {
   1642  return Vec128<uint16_t, N>{__lsx_vsadd_hu(a.raw, b.raw)};
   1643 }
   1644 template <size_t N>
   1645 HWY_API Vec128<uint32_t, N> SaturatedAdd(const Vec128<uint32_t, N> a,
   1646                                         const Vec128<uint32_t, N> b) {
   1647  return Vec128<uint32_t, N>{__lsx_vsadd_wu(a.raw, b.raw)};
   1648 }
   1649 template <size_t N>
   1650 HWY_API Vec128<uint64_t, N> SaturatedAdd(const Vec128<uint64_t, N> a,
   1651                                         const Vec128<uint64_t, N> b) {
   1652  return Vec128<uint64_t, N>{__lsx_vsadd_du(a.raw, b.raw)};
   1653 }
   1654 
// Signed
   1656 template <size_t N>
   1657 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
   1658                                       const Vec128<int8_t, N> b) {
   1659  return Vec128<int8_t, N>{__lsx_vsadd_b(a.raw, b.raw)};
   1660 }
   1661 template <size_t N>
   1662 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
   1663                                        const Vec128<int16_t, N> b) {
   1664  return Vec128<int16_t, N>{__lsx_vsadd_h(a.raw, b.raw)};
   1665 }
   1666 template <size_t N>
   1667 HWY_API Vec128<int32_t, N> SaturatedAdd(const Vec128<int32_t, N> a,
   1668                                        const Vec128<int32_t, N> b) {
   1669  return Vec128<int32_t, N>{__lsx_vsadd_w(a.raw, b.raw)};
   1670 }
   1671 template <size_t N>
   1672 HWY_API Vec128<int64_t, N> SaturatedAdd(const Vec128<int64_t, N> a,
   1673                                        const Vec128<int64_t, N> b) {
   1674  return Vec128<int64_t, N>{__lsx_vsadd_d(a.raw, b.raw)};
   1675 }
   1676 
   1677 // ------------------------------ SaturatedSub
   1678 
   1679 // Returns a - b clamped to the destination range.
   1680 
   1681 // Unsigned
   1682 template <size_t N>
   1683 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
   1684                                        const Vec128<uint8_t, N> b) {
   1685  return Vec128<uint8_t, N>{__lsx_vssub_bu(a.raw, b.raw)};
   1686 }
   1687 template <size_t N>
   1688 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
   1689                                         const Vec128<uint16_t, N> b) {
   1690  return Vec128<uint16_t, N>{__lsx_vssub_hu(a.raw, b.raw)};
   1691 }
   1692 template <size_t N>
   1693 HWY_API Vec128<uint32_t, N> SaturatedSub(const Vec128<uint32_t, N> a,
   1694                                         const Vec128<uint32_t, N> b) {
   1695  return Vec128<uint32_t, N>{__lsx_vssub_wu(a.raw, b.raw)};
   1696 }
   1697 template <size_t N>
   1698 HWY_API Vec128<uint64_t, N> SaturatedSub(const Vec128<uint64_t, N> a,
   1699                                         const Vec128<uint64_t, N> b) {
   1700  return Vec128<uint64_t, N>{__lsx_vssub_du(a.raw, b.raw)};
   1701 }
   1702 
// Signed
   1704 template <size_t N>
   1705 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
   1706                                       const Vec128<int8_t, N> b) {
   1707  return Vec128<int8_t, N>{__lsx_vssub_b(a.raw, b.raw)};
   1708 }
   1709 template <size_t N>
   1710 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
   1711                                        const Vec128<int16_t, N> b) {
   1712  return Vec128<int16_t, N>{__lsx_vssub_h(a.raw, b.raw)};
   1713 }
   1714 template <size_t N>
   1715 HWY_API Vec128<int32_t, N> SaturatedSub(const Vec128<int32_t, N> a,
   1716                                        const Vec128<int32_t, N> b) {
   1717  return Vec128<int32_t, N>{__lsx_vssub_w(a.raw, b.raw)};
   1718 }
   1719 template <size_t N>
   1720 HWY_API Vec128<int64_t, N> SaturatedSub(const Vec128<int64_t, N> a,
   1721                                        const Vec128<int64_t, N> b) {
   1722  return Vec128<int64_t, N>{__lsx_vssub_d(a.raw, b.raw)};
   1723 }
   1724 
   1725 // ------------------------------ AverageRound
   1726 
   1727 // Returns (a + b + 1) / 2
   1728 
   1729 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
   1730 #undef HWY_NATIVE_AVERAGE_ROUND_UI32
   1731 #else
   1732 #define HWY_NATIVE_AVERAGE_ROUND_UI32
   1733 #endif
   1734 
   1735 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
   1736 #undef HWY_NATIVE_AVERAGE_ROUND_UI64
   1737 #else
   1738 #define HWY_NATIVE_AVERAGE_ROUND_UI64
   1739 #endif
   1740 
   1741 // Unsigned
   1742 template <size_t N>
   1743 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
   1744                                        const Vec128<uint8_t, N> b) {
   1745  return Vec128<uint8_t, N>{__lsx_vavgr_bu(a.raw, b.raw)};
   1746 }
   1747 template <size_t N>
   1748 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
   1749                                         const Vec128<uint16_t, N> b) {
   1750  return Vec128<uint16_t, N>{__lsx_vavgr_hu(a.raw, b.raw)};
   1751 }
   1752 template <size_t N>
   1753 HWY_API Vec128<uint32_t, N> AverageRound(const Vec128<uint32_t, N> a,
   1754                                         const Vec128<uint32_t, N> b) {
   1755  return Vec128<uint32_t, N>{__lsx_vavgr_wu(a.raw, b.raw)};
   1756 }
   1757 template <size_t N>
   1758 HWY_API Vec128<uint64_t, N> AverageRound(const Vec128<uint64_t, N> a,
   1759                                         const Vec128<uint64_t, N> b) {
   1760  return Vec128<uint64_t, N>{__lsx_vavgr_du(a.raw, b.raw)};
   1761 }
   1762 
// Signed
   1764 template <size_t N>
   1765 HWY_API Vec128<int8_t, N> AverageRound(const Vec128<int8_t, N> a,
   1766                                       const Vec128<int8_t, N> b) {
   1767  return Vec128<int8_t, N>{__lsx_vavgr_b(a.raw, b.raw)};
   1768 }
   1769 template <size_t N>
   1770 HWY_API Vec128<int16_t, N> AverageRound(const Vec128<int16_t, N> a,
   1771                                        const Vec128<int16_t, N> b) {
   1772  return Vec128<int16_t, N>{__lsx_vavgr_h(a.raw, b.raw)};
   1773 }
   1774 template <size_t N>
   1775 HWY_API Vec128<int32_t, N> AverageRound(const Vec128<int32_t, N> a,
   1776                                        const Vec128<int32_t, N> b) {
   1777  return Vec128<int32_t, N>{__lsx_vavgr_w(a.raw, b.raw)};
   1778 }
   1779 template <size_t N>
   1780 HWY_API Vec128<int64_t, N> AverageRound(const Vec128<int64_t, N> a,
   1781                                        const Vec128<int64_t, N> b) {
   1782  return Vec128<int64_t, N>{__lsx_vavgr_d(a.raw, b.raw)};
   1783 }
   1784 
   1785 // ------------------------------ Integer/Float multiplication
   1786 
   1787 // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
   1788 #ifdef HWY_NATIVE_MUL_8
   1789 #undef HWY_NATIVE_MUL_8
   1790 #else
   1791 #define HWY_NATIVE_MUL_8
   1792 #endif
   1793 #ifdef HWY_NATIVE_MUL_64
   1794 #undef HWY_NATIVE_MUL_64
   1795 #else
   1796 #define HWY_NATIVE_MUL_64
   1797 #endif
   1798 
   1799 template <typename T, size_t N, HWY_IF_UI8(T)>
   1800 HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
   1801  return Vec128<T, N>{__lsx_vmul_b(a.raw, b.raw)};
   1802 }
   1803 template <typename T, size_t N, HWY_IF_UI16(T)>
   1804 HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
   1805  return Vec128<T, N>{__lsx_vmul_h(a.raw, b.raw)};
   1806 }
   1807 template <typename T, size_t N, HWY_IF_UI32(T)>
   1808 HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
   1809  return Vec128<T, N>{__lsx_vmul_w(a.raw, b.raw)};
   1810 }
   1811 template <typename T, size_t N, HWY_IF_UI64(T)>
   1812 HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
   1813  return Vec128<T, N>{__lsx_vmul_d(a.raw, b.raw)};
   1814 }
   1815 
   1816 template <size_t N>
   1817 HWY_API Vec128<float, N> operator*(const Vec128<float, N> a,
   1818                                   const Vec128<float, N> b) {
   1819  return Vec128<float, N>{__lsx_vfmul_s(a.raw, b.raw)};
   1820 }
   1821 template <size_t N>
   1822 HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
   1823                                    const Vec128<double, N> b) {
   1824  return Vec128<double, N>{__lsx_vfmul_d(a.raw, b.raw)};
   1825 }
   1826 
   1827 // ------------------------------ MulHigh
   1828 
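// Returns the upper half of the double-width product of each lane pair.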
// Unsigned
   1830 template <size_t N>
   1831 HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
   1832                                   const Vec128<uint8_t, N> b) {
   1833  return Vec128<uint8_t, N>{__lsx_vmuh_bu(a.raw, b.raw)};
   1834 }
   1835 template <size_t N>
   1836 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
   1837                                    const Vec128<uint16_t, N> b) {
   1838  return Vec128<uint16_t, N>{__lsx_vmuh_hu(a.raw, b.raw)};
   1839 }
   1840 template <size_t N>
   1841 HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a,
   1842                                    const Vec128<uint32_t, N> b) {
   1843  return Vec128<uint32_t, N>{__lsx_vmuh_wu(a.raw, b.raw)};
   1844 }
   1845 template <size_t N>
   1846 HWY_API Vec128<uint64_t, N> MulHigh(const Vec128<uint64_t, N> a,
   1847                                    const Vec128<uint64_t, N> b) {
   1848  return Vec128<uint64_t, N>{__lsx_vmuh_du(a.raw, b.raw)};
   1849 }
   1850 
   1851 // signed
   1852 template <size_t N>
   1853 HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a,
   1854                                  const Vec128<int8_t, N> b) {
   1855  return Vec128<int8_t, N>{__lsx_vmuh_b(a.raw, b.raw)};
   1856 }
   1857 template <size_t N>
   1858 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
   1859                                   const Vec128<int16_t, N> b) {
   1860  return Vec128<int16_t, N>{__lsx_vmuh_h(a.raw, b.raw)};
   1861 }
   1862 template <size_t N>
   1863 HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a,
   1864                                   const Vec128<int32_t, N> b) {
   1865  return Vec128<int32_t, N>{__lsx_vmuh_w(a.raw, b.raw)};
   1866 }
   1867 template <size_t N>
   1868 HWY_API Vec128<int64_t, N> MulHigh(const Vec128<int64_t, N> a,
   1869                                   const Vec128<int64_t, N> b) {
   1870  return Vec128<int64_t, N>{__lsx_vmuh_d(a.raw, b.raw)};
   1871 }
   1872 
   1873 // ------------------------------ MulEven
   1874 
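// MulEven multiplies lanes 0, 2, 4, ... and widens each product to twice the
// lane width; MulOdd (further below) does the same for lanes 1, 3, 5, ....
// The result thus has ceil(N / 2) = (N + 1) / 2 lanes.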
   1875 template <size_t N>
   1876 HWY_API Vec128<int16_t, (N + 1) / 2> MulEven(Vec128<int8_t, N> a,
   1877                                             Vec128<int8_t, N> b) {
   1878  return Vec128<int16_t, (N + 1) / 2>{__lsx_vmulwev_h_b(a.raw, b.raw)};
   1879 }
   1880 
   1881 template <size_t N>
   1882 HWY_API Vec128<uint16_t, (N + 1) / 2> MulEven(Vec128<uint8_t, N> a,
   1883                                              Vec128<uint8_t, N> b) {
   1884  return Vec128<uint16_t, (N + 1) / 2>{__lsx_vmulwev_h_bu(a.raw, b.raw)};
   1885 }
   1886 
   1887 template <size_t N>
   1888 HWY_API Vec128<int32_t, (N + 1) / 2> MulEven(Vec128<int16_t, N> a,
   1889                                             Vec128<int16_t, N> b) {
   1890  return Vec128<int32_t, (N + 1) / 2>{__lsx_vmulwev_w_h(a.raw, b.raw)};
   1891 }
   1892 
   1893 template <size_t N>
   1894 HWY_API Vec128<uint32_t, (N + 1) / 2> MulEven(Vec128<uint16_t, N> a,
   1895                                              Vec128<uint16_t, N> b) {
   1896  return Vec128<uint32_t, (N + 1) / 2>{__lsx_vmulwev_w_hu(a.raw, b.raw)};
   1897 }
   1898 
   1899 template <size_t N>
   1900 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(Vec128<int32_t, N> a,
   1901                                             Vec128<int32_t, N> b) {
   1902  return Vec128<int64_t, (N + 1) / 2>{__lsx_vmulwev_d_w(a.raw, b.raw)};
   1903 }
   1904 
   1905 template <size_t N>
   1906 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
   1907                                              Vec128<uint32_t, N> b) {
   1908  return Vec128<uint64_t, (N + 1) / 2>{__lsx_vmulwev_d_wu(a.raw, b.raw)};
   1909 }
   1910 
   1911 template <typename T, HWY_IF_I64(T)>
   1912 HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
   1913  return Vec128<T>{__lsx_vmulwev_q_d(a.raw, b.raw)};
   1914 }
   1915 
   1916 template <typename T, HWY_IF_U64(T)>
   1917 HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
   1918  return Vec128<T>{__lsx_vmulwev_q_du(a.raw, b.raw)};
   1919 }
   1920 
   1921 // ------------------------------ MulOdd
   1922 
   1923 template <size_t N>
   1924 HWY_API Vec128<int16_t, (N + 1) / 2> MulOdd(Vec128<int8_t, N> a,
   1925                                            Vec128<int8_t, N> b) {
   1926  return Vec128<int16_t, (N + 1) / 2>{__lsx_vmulwod_h_b(a.raw, b.raw)};
   1927 }
   1928 
   1929 template <size_t N>
   1930 HWY_API Vec128<uint16_t, (N + 1) / 2> MulOdd(Vec128<uint8_t, N> a,
   1931                                             Vec128<uint8_t, N> b) {
   1932  return Vec128<uint16_t, (N + 1) / 2>{__lsx_vmulwod_h_bu(a.raw, b.raw)};
   1933 }
   1934 
   1935 template <size_t N>
   1936 HWY_API Vec128<int32_t, (N + 1) / 2> MulOdd(Vec128<int16_t, N> a,
   1937                                            Vec128<int16_t, N> b) {
   1938  return Vec128<int32_t, (N + 1) / 2>{__lsx_vmulwod_w_h(a.raw, b.raw)};
   1939 }
   1940 
   1941 template <size_t N>
   1942 HWY_API Vec128<uint32_t, (N + 1) / 2> MulOdd(Vec128<uint16_t, N> a,
   1943                                             Vec128<uint16_t, N> b) {
   1944  return Vec128<uint32_t, (N + 1) / 2>{__lsx_vmulwod_w_hu(a.raw, b.raw)};
   1945 }
   1946 
   1947 template <size_t N>
   1948 HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(Vec128<int32_t, N> a,
   1949                                            Vec128<int32_t, N> b) {
   1950  return Vec128<int64_t, (N + 1) / 2>{__lsx_vmulwod_d_w(a.raw, b.raw)};
   1951 }
   1952 
   1953 template <size_t N>
   1954 HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
   1955                                             Vec128<uint32_t, N> b) {
   1956  return Vec128<uint64_t, (N + 1) / 2>{__lsx_vmulwod_d_wu(a.raw, b.raw)};
   1957 }
   1958 
   1959 template <typename T, HWY_IF_I64(T)>
   1960 HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
   1961  return Vec128<T>{__lsx_vmulwod_q_d(a.raw, b.raw)};
   1962 }
   1963 
   1964 template <typename T, HWY_IF_U64(T)>
   1965 HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
   1966  return Vec128<T>{__lsx_vmulwod_q_du(a.raw, b.raw)};
   1967 }
   1968 
   1969 // ------------------------------ RotateRight (ShiftRight, Or)
   1970 
   1971 template <int kBits, typename T, size_t N, HWY_IF_UI8(T)>
   1972 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
   1973  return Vec128<T, N>{__lsx_vrotri_b(v.raw, kBits)};
   1974 }
   1975 template <int kBits, typename T, size_t N, HWY_IF_UI16(T)>
   1976 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
   1977  return Vec128<T, N>{__lsx_vrotri_h(v.raw, kBits)};
   1978 }
   1979 template <int kBits, typename T, size_t N, HWY_IF_UI32(T)>
   1980 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
   1981  return Vec128<T, N>{__lsx_vrotri_w(v.raw, kBits)};
   1982 }
   1983 template <int kBits, typename T, size_t N, HWY_IF_UI64(T)>
   1984 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
   1985  return Vec128<T, N>{__lsx_vrotri_d(v.raw, kBits)};
   1986 }
   1987 
   1988 // ------------------------------ Ror
   1989 #ifdef HWY_NATIVE_ROL_ROR_8
   1990 #undef HWY_NATIVE_ROL_ROR_8
   1991 #else
   1992 #define HWY_NATIVE_ROL_ROR_8
   1993 #endif
   1994 
   1995 #ifdef HWY_NATIVE_ROL_ROR_16
   1996 #undef HWY_NATIVE_ROL_ROR_16
   1997 #else
   1998 #define HWY_NATIVE_ROL_ROR_16
   1999 #endif
   2000 
   2001 #ifdef HWY_NATIVE_ROL_ROR_32_64
   2002 #undef HWY_NATIVE_ROL_ROR_32_64
   2003 #else
   2004 #define HWY_NATIVE_ROL_ROR_32_64
   2005 #endif
   2006 
   2007 template <class T, size_t N, HWY_IF_UI8(T)>
   2008 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
   2009  return Vec128<T, N>{__lsx_vrotr_b(a.raw, b.raw)};
   2010 }
   2011 
   2012 template <class T, size_t N, HWY_IF_UI16(T)>
   2013 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
   2014  return Vec128<T, N>{__lsx_vrotr_h(a.raw, b.raw)};
   2015 }
   2016 
   2017 template <class T, size_t N, HWY_IF_UI32(T)>
   2018 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
   2019  return Vec128<T, N>{__lsx_vrotr_w(a.raw, b.raw)};
   2020 }
   2021 
   2022 template <class T, size_t N, HWY_IF_UI64(T)>
   2023 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
   2024  return Vec128<T, N>{__lsx_vrotr_d(a.raw, b.raw)};
   2025 }
   2026 
   2027 // Rol is generic for all vector lengths
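// Rotating left by b equals rotating right by -b, because vrotr takes the
// rotate amount modulo the lane width.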
   2028 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   2029 HWY_API V Rol(V a, V b) {
   2030  const DFromV<decltype(a)> d;
   2031  const RebindToSigned<decltype(d)> di;
   2032 
   2033  return Ror(a, BitCast(d, Neg(BitCast(di, b))));
   2034 }
   2035 
   2036 // ------------------------------ RotateLeftSame/RotateRightSame
   2037 
   2038 #ifdef HWY_NATIVE_ROL_ROR_SAME_8
   2039 #undef HWY_NATIVE_ROL_ROR_SAME_8
   2040 #else
   2041 #define HWY_NATIVE_ROL_ROR_SAME_8
   2042 #endif
   2043 
   2044 #ifdef HWY_NATIVE_ROL_ROR_SAME_16
   2045 #undef HWY_NATIVE_ROL_ROR_SAME_16
   2046 #else
   2047 #define HWY_NATIVE_ROL_ROR_SAME_16
   2048 #endif
   2049 
   2050 #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
   2051 #undef HWY_NATIVE_ROL_ROR_SAME_32_64
   2052 #else
   2053 #define HWY_NATIVE_ROL_ROR_SAME_32_64
   2054 #endif
   2055 
   2056 // RotateLeftSame/RotateRightSame are generic for all vector lengths
   2057 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   2058 HWY_API V RotateLeftSame(V v, int bits) {
   2059  using T = TFromV<V>;
   2060  const DFromV<decltype(v)> d;
   2061  return Rol(v, Set(d, static_cast<T>(bits)));
   2062 }
   2063 
   2064 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   2065 HWY_API V RotateRightSame(V v, int bits) {
   2066  using T = TFromV<V>;
   2067  const DFromV<decltype(v)> d;
   2068  return Ror(v, Set(d, static_cast<T>(bits)));
   2069 }
   2070 
   2071 // ------------------------------ BroadcastSignBit
   2072 
   2073 template <typename T, size_t N, HWY_IF_SIGNED(T)>
   2074 HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
   2075  return ShiftRight<sizeof(T) * 8 - 1>(v);
   2076 }
   2077 
   2078 // ------------------------------ Integer Abs
   2079 
   2080 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
   2081 template <size_t N>
   2082 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
   2083  return Vec128<int8_t, N>{__lsx_vabsd_b(v.raw, __lsx_vreplgr2vr_b(0))};
   2084 }
   2085 template <size_t N>
   2086 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
   2087  return Vec128<int16_t, N>{__lsx_vabsd_h(v.raw, __lsx_vreplgr2vr_b(0))};
   2088 }
   2089 template <size_t N>
   2090 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
   2091  return Vec128<int32_t, N>{__lsx_vabsd_w(v.raw, __lsx_vreplgr2vr_b(0))};
   2092 }
   2093 template <size_t N>
   2094 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
   2095  return Vec128<int64_t, N>{__lsx_vabsd_d(v.raw, __lsx_vreplgr2vr_b(0))};
   2096 }
   2097 
   2098 // ------------------------------ SaturatedAbs
   2099 
   2100 #ifdef HWY_NATIVE_SATURATED_ABS
   2101 #undef HWY_NATIVE_SATURATED_ABS
   2102 #else
   2103 #define HWY_NATIVE_SATURATED_ABS
   2104 #endif
   2105 
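// Per-size strategies: i8 takes the unsigned Min of v and its saturated
// negation (for -128 this is Min(128, 127) = 127); i16 uses Max against the
// saturated negation; i32 clamps |v| to LimitsMax; i64 relies on only
// Abs(LimitsMin) being negative, so adding its broadcast sign bit (-1)
// produces LimitsMax.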
   2106 template <class V, HWY_IF_I8(TFromV<V>)>
   2107 HWY_API V SaturatedAbs(V v) {
   2108  const DFromV<decltype(v)> d;
   2109  const RebindToUnsigned<decltype(d)> du;
   2110  return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v))));
   2111 }
   2112 template <class V, HWY_IF_I16(TFromV<V>)>
   2113 HWY_API V SaturatedAbs(V v) {
   2114  return Max(v, SaturatedSub(Zero(DFromV<V>()), v));
   2115 }
   2116 template <class V, HWY_IF_I32(TFromV<V>)>
   2117 HWY_API V SaturatedAbs(V v) {
   2118  const auto abs_v = Abs(v);
   2119  const DFromV<decltype(v)> d;
   2120  const RebindToUnsigned<decltype(d)> du;
   2121  return BitCast(d, Min(BitCast(du, abs_v),
   2122                        Set(du, static_cast<uint32_t>(LimitsMax<int32_t>()))));
   2123 }
   2124 template <class V, HWY_IF_I64(TFromV<V>)>
   2125 HWY_API V SaturatedAbs(V v) {
   2126  const auto abs_v = Abs(v);
   2127  return Add(abs_v, BroadcastSignBit(abs_v));
   2128 }
   2129 
   2130 // ------------------------------ IfNegativeThenElse
   2131 template <typename T, size_t N>
   2132 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   2133                                        Vec128<T, N> no) {
   2134  static_assert(IsSigned<T>(), "Only works for signed/float");
   2135  const DFromV<decltype(no)> d;
   2136  const RebindToSigned<decltype(d)> di;
   2137 
   2138  Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
   2139  return IfThenElse(m, yes, no);
   2140 }
   2141 
   2142 // ------------------------------ IfNegativeThenNegOrUndefIfZero
   2143 
   2144 #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
   2145 #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
   2146 #else
   2147 #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
   2148 #endif
   2149 
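// vsigncov(mask, v) yields -v, 0 or v for negative, zero or positive mask
// lanes; returning 0 is an acceptable result for the undefined-if-zero case.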
   2150 template <size_t N>
   2151 HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask,
   2152                                                         Vec128<int8_t, N> v) {
   2153  return Vec128<int8_t, N>{__lsx_vsigncov_b(mask.raw, v.raw)};
   2154 }
   2155 
   2156 template <size_t N>
   2157 HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero(
   2158    Vec128<int16_t, N> mask, Vec128<int16_t, N> v) {
   2159  return Vec128<int16_t, N>{__lsx_vsigncov_h(mask.raw, v.raw)};
   2160 }
   2161 
   2162 template <size_t N>
   2163 HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero(
   2164    Vec128<int32_t, N> mask, Vec128<int32_t, N> v) {
   2165  return Vec128<int32_t, N>{__lsx_vsigncov_w(mask.raw, v.raw)};
   2166 }
   2167 
   2168 template <size_t N>
   2169 HWY_API Vec128<int64_t, N> IfNegativeThenNegOrUndefIfZero(
   2170    Vec128<int64_t, N> mask, Vec128<int64_t, N> v) {
   2171  return Vec128<int64_t, N>{__lsx_vsigncov_d(mask.raw, v.raw)};
   2172 }
   2173 
   2174 // ------------------------------ ShiftLeftSame/ShiftRightSame
   2175 
   2176 template <typename T, size_t N>
   2177 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
   2178  return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits));
   2179 }
   2180 template <typename T, size_t N>
   2181 HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
   2182  return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits));
   2183 }
   2184 
   2185 // ------------------------------ Integer/Float Div
   2186 
   2187 #ifdef HWY_NATIVE_INT_DIV
   2188 #undef HWY_NATIVE_INT_DIV
   2189 #else
   2190 #define HWY_NATIVE_INT_DIV
   2191 #endif
   2192 
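// Implementation note (our reading of the GCC/Clang LoongArch asm interface):
// the "f" constraint allocates a floating-point register, which the LSX
// vector registers alias, and the "%w" operand modifier prints its $vr name.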
   2193 template <size_t N>
   2194 HWY_API Vec128<int8_t, N> operator/(const Vec128<int8_t, N> a,
   2195                                    const Vec128<int8_t, N> b) {
   2196  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2197  // or a[i] == LimitsMin<int8_t>() && b[i] == -1
   2198  __m128i raw_result;
   2199  __asm__("vdiv.b %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2200  return Vec128<int8_t, N>{raw_result};
   2201 }
   2202 
   2203 template <size_t N>
   2204 HWY_API Vec128<uint8_t, N> operator/(const Vec128<uint8_t, N> a,
   2205                                     const Vec128<uint8_t, N> b) {
   2206  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2207  __m128i raw_result;
   2208  __asm__("vdiv.bu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2209  return Vec128<uint8_t, N>{raw_result};
   2210 }
   2211 
   2212 template <size_t N>
   2213 HWY_API Vec128<int16_t, N> operator/(const Vec128<int16_t, N> a,
   2214                                     const Vec128<int16_t, N> b) {
   2215  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2216  // or a[i] == LimitsMin<int16_t>() && b[i] == -1
   2217  __m128i raw_result;
   2218  __asm__("vdiv.h %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2219  return Vec128<int16_t, N>{raw_result};
   2220 }
   2221 
   2222 template <size_t N>
   2223 HWY_API Vec128<uint16_t, N> operator/(const Vec128<uint16_t, N> a,
   2224                                      const Vec128<uint16_t, N> b) {
   2225  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2226  __m128i raw_result;
   2227  __asm__("vdiv.hu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2228  return Vec128<uint16_t, N>{raw_result};
   2229 }
   2230 
   2231 template <size_t N>
   2232 HWY_API Vec128<int32_t, N> operator/(const Vec128<int32_t, N> a,
   2233                                     const Vec128<int32_t, N> b) {
   2234  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2235  // or a[i] == LimitsMin<int32_t>() && b[i] == -1
   2236  __m128i raw_result;
   2237  __asm__("vdiv.w %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2238  return Vec128<int32_t, N>{raw_result};
   2239 }
   2240 
   2241 template <size_t N>
   2242 HWY_API Vec128<uint32_t, N> operator/(const Vec128<uint32_t, N> a,
   2243                                      const Vec128<uint32_t, N> b) {
   2244  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2245  __m128i raw_result;
   2246  __asm__("vdiv.wu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2247  return Vec128<uint32_t, N>{raw_result};
   2248 }
   2249 
   2250 template <size_t N>
   2251 HWY_API Vec128<int64_t, N> operator/(const Vec128<int64_t, N> a,
   2252                                     const Vec128<int64_t, N> b) {
   2253  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2254  // or a[i] == LimitsMin<int64_t>() && b[i] == -1
   2255  __m128i raw_result;
   2256  __asm__("vdiv.d %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2257  return Vec128<int64_t, N>{raw_result};
   2258 }
   2259 
   2260 template <size_t N>
   2261 HWY_API Vec128<uint64_t, N> operator/(const Vec128<uint64_t, N> a,
   2262                                      const Vec128<uint64_t, N> b) {
   2263  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2264  __m128i raw_result;
   2265  __asm__("vdiv.du %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2266  return Vec128<uint64_t, N>{raw_result};
   2267 }
   2268 
   2269 template <size_t N>
   2270 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
   2271                                   const Vec128<float, N> b) {
   2272  return Vec128<float, N>{__lsx_vfdiv_s(a.raw, b.raw)};
   2273 }
   2274 template <size_t N>
   2275 HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
   2276                                    const Vec128<double, N> b) {
   2277  return Vec128<double, N>{__lsx_vfdiv_d(a.raw, b.raw)};
   2278 }
   2279 
   2280 // ------------------------------ Integer Mod
   2281 
   2282 template <size_t N>
   2283 HWY_API Vec128<int8_t, N> operator%(const Vec128<int8_t, N> a,
   2284                                    const Vec128<int8_t, N> b) {
   2285  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2286  // or a[i] == LimitsMin<int8_t>() && b[i] == -1
   2287  __m128i raw_result;
   2288  __asm__("vmod.b %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2289  return Vec128<int8_t, N>{raw_result};
   2290 }
   2291 
   2292 template <size_t N>
   2293 HWY_API Vec128<uint8_t, N> operator%(const Vec128<uint8_t, N> a,
   2294                                     const Vec128<uint8_t, N> b) {
   2295  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2296  __m128i raw_result;
   2297  __asm__("vmod.bu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2298  return Vec128<uint8_t, N>{raw_result};
   2299 }
   2300 
   2301 template <size_t N>
   2302 HWY_API Vec128<int16_t, N> operator%(const Vec128<int16_t, N> a,
   2303                                     const Vec128<int16_t, N> b) {
   2304  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2305  // or a[i] == LimitsMin<int16_t>() && b[i] == -1
   2306  __m128i raw_result;
   2307  __asm__("vmod.h %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2308  return Vec128<int16_t, N>{raw_result};
   2309 }
   2310 
   2311 template <size_t N>
   2312 HWY_API Vec128<uint16_t, N> operator%(const Vec128<uint16_t, N> a,
   2313                                      const Vec128<uint16_t, N> b) {
   2314  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2315  __m128i raw_result;
   2316  __asm__("vmod.hu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2317  return Vec128<uint16_t, N>{raw_result};
   2318 }
   2319 
   2320 template <size_t N>
   2321 HWY_API Vec128<int32_t, N> operator%(const Vec128<int32_t, N> a,
   2322                                     const Vec128<int32_t, N> b) {
   2323  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2324  // or a[i] == LimitsMin<int32_t>() && b[i] == -1
   2325  __m128i raw_result;
   2326  __asm__("vmod.w %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2327  return Vec128<int32_t, N>{raw_result};
   2328 }
   2329 
   2330 template <size_t N>
   2331 HWY_API Vec128<uint32_t, N> operator%(const Vec128<uint32_t, N> a,
   2332                                      const Vec128<uint32_t, N> b) {
   2333  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2334  __m128i raw_result;
   2335  __asm__("vmod.wu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2336  return Vec128<uint32_t, N>{raw_result};
   2337 }
   2338 
   2339 template <size_t N>
   2340 HWY_API Vec128<int64_t, N> operator%(const Vec128<int64_t, N> a,
   2341                                     const Vec128<int64_t, N> b) {
   2342  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2343  // or a[i] == LimitsMin<int64_t>() && b[i] == -1
   2344  __m128i raw_result;
   2345  __asm__("vmod.d %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2346  return Vec128<int64_t, N>{raw_result};
   2347 }
   2348 
   2349 template <size_t N>
   2350 HWY_API Vec128<uint64_t, N> operator%(const Vec128<uint64_t, N> a,
   2351                                      const Vec128<uint64_t, N> b) {
   2352  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
   2353  __m128i raw_result;
   2354  __asm__("vmod.du %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
   2355  return Vec128<uint64_t, N>{raw_result};
   2356 }
   2357 
   2358 // ------------------------------ ApproximateReciprocal
   2359 
   2360 #ifdef HWY_NATIVE_F64_APPROX_RECIP
   2361 #undef HWY_NATIVE_F64_APPROX_RECIP
   2362 #else
   2363 #define HWY_NATIVE_F64_APPROX_RECIP
   2364 #endif
   2365 
   2366 template <size_t N>
   2367 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
   2368  return Vec128<float, N>{__lsx_vfrecip_s(v.raw)};
   2369 }
   2370 template <size_t N>
   2371 HWY_API Vec128<double, N> ApproximateReciprocal(const Vec128<double, N> v) {
   2372  return Vec128<double, N>{__lsx_vfrecip_d(v.raw)};
   2373 }
   2374 
   2375 // ------------------------------ Absolute value of difference
   2376 
   2377 #ifdef HWY_NATIVE_INTEGER_ABS_DIFF
   2378 #undef HWY_NATIVE_INTEGER_ABS_DIFF
   2379 #else
   2380 #define HWY_NATIVE_INTEGER_ABS_DIFF
   2381 #endif
   2382 
   2383 template <size_t N>
   2384 HWY_API Vec128<int8_t, N> AbsDiff(const Vec128<int8_t, N> a,
   2385                                  Vec128<int8_t, N> b) {
   2386  return Vec128<int8_t, N>{__lsx_vabsd_b(a.raw, b.raw)};
   2387 }
   2388 template <size_t N>
   2389 HWY_API Vec128<int16_t, N> AbsDiff(const Vec128<int16_t, N> a,
   2390                                   Vec128<int16_t, N> b) {
   2391  return Vec128<int16_t, N>{__lsx_vabsd_h(a.raw, b.raw)};
   2392 }
   2393 template <size_t N>
   2394 HWY_API Vec128<int32_t, N> AbsDiff(const Vec128<int32_t, N> a,
   2395                                   Vec128<int32_t, N> b) {
   2396  return Vec128<int32_t, N>{__lsx_vabsd_w(a.raw, b.raw)};
   2397 }
   2398 template <size_t N>
   2399 HWY_API Vec128<int64_t, N> AbsDiff(const Vec128<int64_t, N> a,
   2400                                   Vec128<int64_t, N> b) {
   2401  return Vec128<int64_t, N>{__lsx_vabsd_d(a.raw, b.raw)};
   2402 }
   2403 
   2404 template <size_t N>
   2405 HWY_API Vec128<uint8_t, N> AbsDiff(const Vec128<uint8_t, N> a,
   2406                                   Vec128<uint8_t, N> b) {
   2407  return Vec128<uint8_t, N>{__lsx_vabsd_bu(a.raw, b.raw)};
   2408 }
   2409 template <size_t N>
   2410 HWY_API Vec128<uint16_t, N> AbsDiff(const Vec128<uint16_t, N> a,
   2411                                    Vec128<uint16_t, N> b) {
   2412  return Vec128<uint16_t, N>{__lsx_vabsd_hu(a.raw, b.raw)};
   2413 }
   2414 template <size_t N>
   2415 HWY_API Vec128<uint32_t, N> AbsDiff(const Vec128<uint32_t, N> a,
   2416                                    Vec128<uint32_t, N> b) {
   2417  return Vec128<uint32_t, N>{__lsx_vabsd_wu(a.raw, b.raw)};
   2418 }
   2419 template <size_t N>
   2420 HWY_API Vec128<uint64_t, N> AbsDiff(const Vec128<uint64_t, N> a,
   2421                                    Vec128<uint64_t, N> b) {
   2422  return Vec128<uint64_t, N>{__lsx_vabsd_du(a.raw, b.raw)};
   2423 }
   2424 
   2425 // Generic for all vector lengths.
   2426 template <class V, HWY_IF_FLOAT_V(V)>
   2427 HWY_API V AbsDiff(V a, V b) {
   2428  return Abs(a - b);
   2429 }
   2430 
   2431 // ------------------------------ Integer/Float multiply-add
   2432 
   2433 #ifdef HWY_NATIVE_INT_FMA
   2434 #undef HWY_NATIVE_INT_FMA
   2435 #else
   2436 #define HWY_NATIVE_INT_FMA
   2437 #endif
   2438 
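// Note the LSX operand order: vmadd(a, b, c) computes a + b * c with the
// accumulator first, hence `add` is the first raw argument below.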
   2439 template <size_t N>
   2440 HWY_API Vec128<int8_t, N> MulAdd(Vec128<int8_t, N> mul, Vec128<int8_t, N> x,
   2441                                 Vec128<int8_t, N> add) {
   2442  return Vec128<int8_t, N>{__lsx_vmadd_b(add.raw, mul.raw, x.raw)};
   2443 }
   2444 template <size_t N>
   2445 HWY_API Vec128<int16_t, N> MulAdd(Vec128<int16_t, N> mul, Vec128<int16_t, N> x,
   2446                                  Vec128<int16_t, N> add) {
   2447  return Vec128<int16_t, N>{__lsx_vmadd_h(add.raw, mul.raw, x.raw)};
   2448 }
   2449 template <size_t N>
   2450 HWY_API Vec128<int32_t, N> MulAdd(Vec128<int32_t, N> mul, Vec128<int32_t, N> x,
   2451                                  Vec128<int32_t, N> add) {
   2452  return Vec128<int32_t, N>{__lsx_vmadd_w(add.raw, mul.raw, x.raw)};
   2453 }
   2454 template <size_t N>
   2455 HWY_API Vec128<int64_t, N> MulAdd(Vec128<int64_t, N> mul, Vec128<int64_t, N> x,
   2456                                  Vec128<int64_t, N> add) {
   2457  return Vec128<int64_t, N>{__lsx_vmadd_d(add.raw, mul.raw, x.raw)};
   2458 }
   2459 
   2460 template <size_t N>
   2461 HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
   2462                                Vec128<float, N> add) {
   2463  return Vec128<float, N>{__lsx_vfmadd_s(mul.raw, x.raw, add.raw)};
   2464 }
   2465 template <size_t N>
   2466 HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
   2467                                 Vec128<double, N> add) {
   2468  return Vec128<double, N>{__lsx_vfmadd_d(mul.raw, x.raw, add.raw)};
   2469 }
   2470 
// Unsigned
   2472 template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
   2473 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
   2474                            Vec128<T, N> add) {
   2475  return mul * x + add;
   2476 }
   2477 
   2478 // ------------------------------ Integer/Float NegMulAdd
   2479 
   2480 template <size_t N>
   2481 HWY_API Vec128<int8_t, N> NegMulAdd(Vec128<int8_t, N> mul, Vec128<int8_t, N> x,
   2482                                    Vec128<int8_t, N> add) {
   2483  return Vec128<int8_t, N>{__lsx_vmsub_b(add.raw, mul.raw, x.raw)};
   2484 }
   2485 template <size_t N>
   2486 HWY_API Vec128<int16_t, N> NegMulAdd(Vec128<int16_t, N> mul,
   2487                                     Vec128<int16_t, N> x,
   2488                                     Vec128<int16_t, N> add) {
   2489  return Vec128<int16_t, N>{__lsx_vmsub_h(add.raw, mul.raw, x.raw)};
   2490 }
   2491 template <size_t N>
   2492 HWY_API Vec128<int32_t, N> NegMulAdd(Vec128<int32_t, N> mul,
   2493                                     Vec128<int32_t, N> x,
                                     Vec128<int32_t, N> add) {
  return Vec128<int32_t, N>{__lsx_vmsub_w(add.raw, mul.raw, x.raw)};
   2496 }
   2497 template <size_t N>
   2498 HWY_API Vec128<int64_t, N> NegMulAdd(Vec128<int64_t, N> mul,
   2499                                     Vec128<int64_t, N> x,
                                     Vec128<int64_t, N> add) {
  return Vec128<int64_t, N>{__lsx_vmsub_d(add.raw, mul.raw, x.raw)};
   2502 }
   2503 
   2504 // Float/unsigned
   2505 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
   2506 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
   2507                               Vec128<T, N> add) {
   2508  return add - mul * x;
   2509 }
   2510 
   2511 // ------------------------------ Float MulSub
   2512 
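// vfmsub(a, b, c) computes a * b - c, hence the (x, mul, sub) operand order.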
   2513 // float
   2514 template <size_t N>
   2515 HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
   2516                                Vec128<float, N> sub) {
   2517  return Vec128<float, N>{__lsx_vfmsub_s(x.raw, mul.raw, sub.raw)};
   2518 }
   2519 template <size_t N>
   2520 HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x,
   2521                                 Vec128<double, N> sub) {
   2522  return Vec128<double, N>{__lsx_vfmsub_d(x.raw, mul.raw, sub.raw)};
   2523 }
   2524 
// Integer (signed and unsigned; no integer-specific overloads above)
   2526 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
   2527 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
   2528                            Vec128<T, N> sub) {
   2529  return mul * x - sub;
   2530 }
   2531 
   2532 // ------------------------------ Float NegMulSub
   2533 
// Float and integer
   2535 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
   2536 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
   2537                               Vec128<T, N> sub) {
   2538  return Neg(mul) * x - sub;
   2539 }
   2540 
   2541 // ------------------------------ Floating-point square root
   2542 
   2543 template <size_t N>
   2544 HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) {
   2545  return Vec128<float, N>{__lsx_vfsqrt_s(v.raw)};
   2546 }
   2547 template <size_t N>
   2548 HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) {
   2549  return Vec128<double, N>{__lsx_vfsqrt_d(v.raw)};
   2550 }
   2551 
   2552 // ------------------------------ ApproximateReciprocalSqrt
   2553 #ifdef HWY_NATIVE_F64_APPROX_RSQRT
   2554 #undef HWY_NATIVE_F64_APPROX_RSQRT
   2555 #else
   2556 #define HWY_NATIVE_F64_APPROX_RSQRT
   2557 #endif
   2558 
   2559 template <size_t N>
   2560 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
   2561  return Vec128<float, N>{__lsx_vfrsqrt_s(v.raw)};
   2562 }
   2563 template <size_t N>
   2564 HWY_API Vec128<double, N> ApproximateReciprocalSqrt(Vec128<double, N> v) {
   2565  return Vec128<double, N>{__lsx_vfrsqrt_d(v.raw)};
   2566 }
   2567 
   2568 // ------------------------------ Min
   2569 
   2570 template <size_t N>
   2571 HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
   2572  return Vec128<uint8_t, N>{__lsx_vmin_bu(a.raw, b.raw)};
   2573 }
   2574 template <size_t N>
   2575 HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
   2576  return Vec128<uint16_t, N>{__lsx_vmin_hu(a.raw, b.raw)};
   2577 }
   2578 template <size_t N>
   2579 HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
   2580  return Vec128<uint32_t, N>{__lsx_vmin_wu(a.raw, b.raw)};
   2581 }
   2582 template <size_t N>
   2583 HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
   2584  return Vec128<uint64_t, N>{__lsx_vmin_du(a.raw, b.raw)};
   2585 }
   2586 
   2587 template <size_t N>
   2588 HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
   2589  return Vec128<int8_t, N>{__lsx_vmin_b(a.raw, b.raw)};
   2590 }
   2591 template <size_t N>
   2592 HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
   2593  return Vec128<int16_t, N>{__lsx_vmin_h(a.raw, b.raw)};
   2594 }
   2595 template <size_t N>
   2596 HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
   2597  return Vec128<int32_t, N>{__lsx_vmin_w(a.raw, b.raw)};
   2598 }
   2599 template <size_t N>
   2600 HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
   2601  return Vec128<int64_t, N>{__lsx_vmin_d(a.raw, b.raw)};
   2602 }
   2603 
   2604 template <size_t N>
   2605 HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
   2606  return Vec128<float, N>{__lsx_vfmin_s(a.raw, b.raw)};
   2607 }
   2608 template <size_t N>
   2609 HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) {
   2610  return Vec128<double, N>{__lsx_vfmin_d(a.raw, b.raw)};
   2611 }
   2612 
   2613 // ------------------------------ Max
   2614 
   2615 template <size_t N>
   2616 HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
   2617  return Vec128<uint8_t, N>{__lsx_vmax_bu(a.raw, b.raw)};
   2618 }
   2619 template <size_t N>
   2620 HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
   2621  return Vec128<uint16_t, N>{__lsx_vmax_hu(a.raw, b.raw)};
   2622 }
   2623 template <size_t N>
   2624 HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
   2625  return Vec128<uint32_t, N>{__lsx_vmax_wu(a.raw, b.raw)};
   2626 }
   2627 template <size_t N>
   2628 HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
   2629  return Vec128<uint64_t, N>{__lsx_vmax_du(a.raw, b.raw)};
   2630 }
   2631 
   2632 template <size_t N>
   2633 HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
   2634  return Vec128<int8_t, N>{__lsx_vmax_b(a.raw, b.raw)};
   2635 }
   2636 template <size_t N>
   2637 HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
   2638  return Vec128<int16_t, N>{__lsx_vmax_h(a.raw, b.raw)};
   2639 }
   2640 template <size_t N>
   2641 HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
   2642  return Vec128<int32_t, N>{__lsx_vmax_w(a.raw, b.raw)};
   2643 }
   2644 template <size_t N>
   2645 HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
   2646  return Vec128<int64_t, N>{__lsx_vmax_d(a.raw, b.raw)};
   2647 }
   2648 
   2649 template <size_t N>
   2650 HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
   2651  return Vec128<float, N>{__lsx_vfmax_s(a.raw, b.raw)};
   2652 }
   2653 template <size_t N>
   2654 HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
   2655  return Vec128<double, N>{__lsx_vfmax_d(a.raw, b.raw)};
   2656 }
   2657 
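        // Editorial usage sketch (illustrative, not upstream code): Min and
        // Max compose into a clamp; Highway's generic Clamp() is equivalent.
        // The helper name is hypothetical.
        template <typename T, size_t N>
        Vec128<T, N> ExampleClamp(Vec128<T, N> v, Vec128<T, N> lo,
                                  Vec128<T, N> hi) {
         return Min(Max(v, lo), hi);
        }
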
   2658 // ------------------------------ MinMagnitude and MaxMagnitude
   2659 
   2660 #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
   2661 #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
   2662 #else
   2663 #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
   2664 #endif
   2665 
   2666 template <size_t N>
   2667 HWY_API Vec128<float, N> MinMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
   2668  return Vec128<float, N>{__lsx_vfmina_s(a.raw, b.raw)};
   2669 }
   2670 template <size_t N>
   2671 HWY_API Vec128<double, N> MinMagnitude(Vec128<double, N> a,
   2672                                       Vec128<double, N> b) {
   2673  return Vec128<double, N>{__lsx_vfmina_d(a.raw, b.raw)};
   2674 }
   2675 
   2676 template <size_t N>
   2677 HWY_API Vec128<float, N> MaxMagnitude(Vec128<float, N> a, Vec128<float, N> b) {
   2678  return Vec128<float, N>{__lsx_vfmaxa_s(a.raw, b.raw)};
   2679 }
   2680 template <size_t N>
   2681 HWY_API Vec128<double, N> MaxMagnitude(Vec128<double, N> a,
   2682                                       Vec128<double, N> b) {
   2683  return Vec128<double, N>{__lsx_vfmaxa_d(a.raw, b.raw)};
   2684 }
   2685 
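        // Editorial worked example: with lanes a = -8.0f and b = 5.0f,
        // MaxMagnitude returns -8.0f (larger |value|, sign preserved) and
        // MinMagnitude returns 5.0f; vfmina/vfmaxa compare absolute values
        // but return the unmodified operand.
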
   2686 // ------------------------------ Non-temporal stores
   2687 
   2688 // No dedicated non-temporal store on LSX; prefetch as a hint, then store.
   2689 
   2690 template <class D>
   2691 HWY_API void Stream(const VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
   2692  __builtin_prefetch(aligned, 1, 0);
   2693  Store(v, d, aligned);
   2694 }
   2695 
   2696 // ------------------------------ Scatter in generic_ops-inl.h
   2697 // ------------------------------ Gather in generic_ops-inl.h
   2698 
   2699 // ================================================== SWIZZLE (2)
   2700 
   2701 // ------------------------------ LowerHalf
   2702 
   2703 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   2704 HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
   2705  return VFromD<D>{v.raw};
   2706 }
   2707 template <typename T, size_t N>
   2708 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
   2709  return Vec128<T, N / 2>{v.raw};
   2710 }
   2711 
   2712 // ------------------------------ ShiftLeftBytes
   2713 
   2714 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   2715 HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
   2716  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
   2717  if (kBytes == 0) return v;
   2718  const RebindToUnsigned<decltype(d)> du;
   2719  return BitCast(
   2720      d, VFromD<decltype(du)>{__lsx_vbsll_v(BitCast(du, v).raw, kBytes)});
   2721 }
   2722 
   2723 // Generic for all vector lengths.
   2724 template <int kBytes, class V>
   2725 HWY_API V ShiftLeftBytes(const V v) {
   2726  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
   2727 }
   2728 
   2729 // ------------------------------ ShiftLeftLanes
   2730 
   2731 // Generic for all vector lengths.
   2732 template <int kLanes, class D>
   2733 HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) {
   2734  const Repartition<uint8_t, decltype(d)> d8;
   2735  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
   2736 }
   2737 
   2738 // Generic for all vector lengths.
   2739 template <int kLanes, class V>
   2740 HWY_API V ShiftLeftLanes(const V v) {
   2741  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
   2742 }
   2743 
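        // Editorial worked example: with i32 lanes {1, 2, 3, 4} (lane 0
        // first), ShiftLeftLanes<1> yields {0, 1, 2, 3}: lanes move to higher
        // indices and zeros shift in at lane 0.
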
   2744 // ------------------------------ ShiftRightBytes
   2745 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   2746 HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
   2747  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
   2748  if (kBytes == 0) return v;
   2749  const RebindToUnsigned<decltype(d)> du;
   2750  // For partial vectors, clear upper lanes so we shift in zeros.
   2751  if (d.MaxBytes() != 16) {
   2752    const Full128<TFromD<D>> dfull;
   2753    const VFromD<decltype(dfull)> vfull{v.raw};
   2754    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
   2755  }
   2756  return BitCast(
   2757      d, VFromD<decltype(du)>{__lsx_vbsrl_v(BitCast(du, v).raw, kBytes)});
   2758 }
   2759 
   2760 // ------------------------------ ShiftRightLanes
   2761 // Generic for all vector lengths.
   2762 template <int kLanes, class D>
   2763 HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) {
   2764  const Repartition<uint8_t, decltype(d)> d8;
   2765  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
   2766  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
   2767 }
   2768 
   2769 // ------------------------------ UpperHalf (ShiftRightBytes)
   2770 
   2771 template <class D, HWY_IF_V_SIZE_D(D, 8)>
   2772 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   2773  const Twice<RebindToUnsigned<decltype(d)>> dut;
   2774  using VUT = VFromD<decltype(dut)>;  // for float16_t
   2775  const VUT vut = BitCast(dut, v);
   2776  return BitCast(d, LowerHalf(VUT{__lsx_vilvh_d(vut.raw, vut.raw)}));
   2777 }
   2778 
   2779 // Partial
   2780 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
   2781 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   2782  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
   2783 }
   2784 
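        // Editorial usage sketch (illustrative, not upstream code): splitting
        // a full vector into halves and rejoining them. Combine is defined
        // later in this file; lookup is deferred because this hypothetical
        // helper is a template.
        template <typename T>
        Vec128<T> ExampleSplitAndRejoin(Vec128<T> v) {
         const Full128<T> d;
         const Half<decltype(d)> dh;
         const auto lo = LowerHalf(dh, v);
         const auto hi = UpperHalf(dh, v);
         return Combine(d, hi, lo);  // round-trips to the original v
        }
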
   2785 // ------------------------------ ExtractLane (UpperHalf)
   2786 
   2787 namespace detail {
   2788 
   2789 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   2790 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   2791  static_assert(kLane < N, "Lane index out of bounds");
   2792  return static_cast<T>(__lsx_vpickve2gr_b(v.raw, kLane) & 0xFF);
   2793 }
   2794 
   2795 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   2796 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   2797  static_assert(kLane < N, "Lane index out of bounds");
   2798  const DFromV<decltype(v)> d;
   2799  const RebindToUnsigned<decltype(d)> du;
   2800  const uint16_t lane = static_cast<uint16_t>(
   2801      __lsx_vpickve2gr_hu(BitCast(du, v).raw, kLane) & 0xFFFF);
   2802  return BitCastScalar<T>(lane);
   2803 }
   2804 
   2805 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
   2806 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   2807  static_assert(kLane < N, "Lane index out of bounds");
   2808  return static_cast<T>(__lsx_vpickve2gr_w(v.raw, kLane));
   2809 }
   2810 
   2811 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   2812 HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
   2813  static_assert(kLane < N, "Lane index out of bounds");
   2814  return static_cast<T>(__lsx_vpickve2gr_d(v.raw, kLane));
   2815 }
   2816 
   2817 template <size_t kLane, size_t N>
   2818 HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
   2819  float f32;
   2820  int32_t i32 = __lsx_vpickve2gr_w(reinterpret_cast<__m128i>(v.raw), kLane);
   2821  CopyBytes<4>(&i32, &f32);
   2822  return f32;
   2823 }
   2824 template <size_t kLane, size_t N>
   2825 HWY_INLINE double ExtractLane(const Vec128<double, N> v) {
   2826  double f64;
   2827  int64_t i64 = __lsx_vpickve2gr_d(reinterpret_cast<__m128i>(v.raw), kLane);
   2828  CopyBytes<8>(&i64, &f64);
   2829  return f64;
   2830 }
   2831 
   2832 }  // namespace detail
   2833 
   2834 template <typename T>
   2835 HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
   2836  HWY_DASSERT(i == 0);
   2837  (void)i;
   2838  return GetLane(v);
   2839 }
   2840 
   2841 template <typename T>
   2842 HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
   2843 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   2844  if (__builtin_constant_p(i)) {
   2845    switch (i) {
   2846      case 0:
   2847        return detail::ExtractLane<0>(v);
   2848      case 1:
   2849        return detail::ExtractLane<1>(v);
   2850    }
   2851  }
   2852 #endif
   2853  alignas(16) T lanes[2];
   2854  Store(v, DFromV<decltype(v)>(), lanes);
   2855  return lanes[i];
   2856 }
   2857 
   2858 template <typename T>
   2859 HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
   2860 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   2861  if (__builtin_constant_p(i)) {
   2862    switch (i) {
   2863      case 0:
   2864        return detail::ExtractLane<0>(v);
   2865      case 1:
   2866        return detail::ExtractLane<1>(v);
   2867      case 2:
   2868        return detail::ExtractLane<2>(v);
   2869      case 3:
   2870        return detail::ExtractLane<3>(v);
   2871    }
   2872  }
   2873 #endif
   2874  alignas(16) T lanes[4];
   2875  Store(v, DFromV<decltype(v)>(), lanes);
   2876  return lanes[i];
   2877 }
   2878 
   2879 template <typename T>
   2880 HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
   2881 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   2882  if (__builtin_constant_p(i)) {
   2883    switch (i) {
   2884      case 0:
   2885        return detail::ExtractLane<0>(v);
   2886      case 1:
   2887        return detail::ExtractLane<1>(v);
   2888      case 2:
   2889        return detail::ExtractLane<2>(v);
   2890      case 3:
   2891        return detail::ExtractLane<3>(v);
   2892      case 4:
   2893        return detail::ExtractLane<4>(v);
   2894      case 5:
   2895        return detail::ExtractLane<5>(v);
   2896      case 6:
   2897        return detail::ExtractLane<6>(v);
   2898      case 7:
   2899        return detail::ExtractLane<7>(v);
   2900    }
   2901  }
   2902 #endif
   2903  alignas(16) T lanes[8];
   2904  Store(v, DFromV<decltype(v)>(), lanes);
   2905  return lanes[i];
   2906 }
   2907 
   2908 template <typename T>
   2909 HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
   2910 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   2911  if (__builtin_constant_p(i)) {
   2912    switch (i) {
   2913      case 0:
   2914        return detail::ExtractLane<0>(v);
   2915      case 1:
   2916        return detail::ExtractLane<1>(v);
   2917      case 2:
   2918        return detail::ExtractLane<2>(v);
   2919      case 3:
   2920        return detail::ExtractLane<3>(v);
   2921      case 4:
   2922        return detail::ExtractLane<4>(v);
   2923      case 5:
   2924        return detail::ExtractLane<5>(v);
   2925      case 6:
   2926        return detail::ExtractLane<6>(v);
   2927      case 7:
   2928        return detail::ExtractLane<7>(v);
   2929      case 8:
   2930        return detail::ExtractLane<8>(v);
   2931      case 9:
   2932        return detail::ExtractLane<9>(v);
   2933      case 10:
   2934        return detail::ExtractLane<10>(v);
   2935      case 11:
   2936        return detail::ExtractLane<11>(v);
   2937      case 12:
   2938        return detail::ExtractLane<12>(v);
   2939      case 13:
   2940        return detail::ExtractLane<13>(v);
   2941      case 14:
   2942        return detail::ExtractLane<14>(v);
   2943      case 15:
   2944        return detail::ExtractLane<15>(v);
   2945    }
   2946  }
   2947 #endif
   2948  alignas(16) T lanes[16];
   2949  Store(v, DFromV<decltype(v)>(), lanes);
   2950  return lanes[i];
   2951 }
   2952 
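        // Editorial note: when the compiler proves `i` constant, the switches
        // above dispatch to a single vpickve2gr; otherwise the lanes take a
        // round trip through memory. Example (illustrative):
        //   const int32_t third = ExtractLane(v_i32, 2);
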
   2953 // ------------------------------ InsertLane (UpperHalf)
   2954 
   2955 namespace detail {
   2956 
   2957 template <class V>
   2958 HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) {
   2959  const DFromV<decltype(v)> d;
   2960 
   2961 #if HWY_TARGET <= HWY_AVX3
   2962  using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw);
   2963  const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)};
   2964 #else
   2965  const RebindToUnsigned<decltype(d)> du;
   2966  using TU = TFromD<decltype(du)>;
   2967  const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i)));
   2968 #endif
   2969 
   2970  return IfThenElse(mask, Set(d, t), v);
   2971 }
   2972 
   2973 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   2974 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   2975  static_assert(kLane < N, "Lane index out of bounds");
   2976  return Vec128<T, N>{__lsx_vinsgr2vr_b(v.raw, t, kLane)};
   2977 }
   2978 
   2979 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   2980 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   2981  static_assert(kLane < N, "Lane index out of bounds");
   2982  const DFromV<decltype(v)> d;
   2983  const RebindToUnsigned<decltype(d)> du;
   2984  const uint16_t bits = BitCastScalar<uint16_t>(t);
   2985  return BitCast(d, VFromD<decltype(du)>{
   2986                        __lsx_vinsgr2vr_h(BitCast(du, v).raw, bits, kLane)});
   2987 }
   2988 template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
   2989 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   2990  static_assert(kLane < N, "Lane index out of bounds");
   2991  return Vec128<T, N>{__lsx_vinsgr2vr_w(v.raw, t, kLane)};
   2992 }
   2993 template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)>
   2994 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
   2995  static_assert(kLane < N, "Lane index out of bounds");
   2996  return Vec128<T, N>{__lsx_vinsgr2vr_d(v.raw, t, kLane)};
   2997 }
   2998 
   2999 template <size_t kLane, size_t N>
   3000 HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
   3001  static_assert(kLane < N, "Lane index out of bounds");
   3002  const DFromV<decltype(v)> d;
   3003  int ti = BitCastScalar<int>(t);
   3004  RebindToUnsigned<decltype(d)> du;
   3005  return BitCast(d, VFromD<decltype(du)>{__lsx_vinsgr2vr_w(
   3006                        reinterpret_cast<__m128i>(v.raw), ti, kLane)});
   3007 }
   3008 
   3009 template <size_t kLane>
   3010 HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
   3011  static_assert(kLane < 2, "Lane index out of bounds");
   3012  const DFromV<decltype(v)> d;
   3013  long int ti = BitCastScalar<long int>(t);
   3014  RebindToUnsigned<decltype(d)> du;
   3015  return BitCast(d, VFromD<decltype(du)>{__lsx_vinsgr2vr_d(
   3016                        reinterpret_cast<__m128i>(v.raw), ti, kLane)});
   3017 }
   3018 
   3019 }  // namespace detail
   3020 
   3021 template <typename T>
   3022 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
   3023  HWY_DASSERT(i == 0);
   3024  (void)i;
   3025  return Set(DFromV<decltype(v)>(), t);
   3026 }
   3027 
   3028 template <typename T>
   3029 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
   3030 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3031  if (__builtin_constant_p(i)) {
   3032    switch (i) {
   3033      case 0:
   3034        return detail::InsertLane<0>(v, t);
   3035      case 1:
   3036        return detail::InsertLane<1>(v, t);
   3037    }
   3038  }
   3039 #endif
   3040  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   3041 }
   3042 
   3043 template <typename T>
   3044 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
   3045 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3046  if (__builtin_constant_p(i)) {
   3047    switch (i) {
   3048      case 0:
   3049        return detail::InsertLane<0>(v, t);
   3050      case 1:
   3051        return detail::InsertLane<1>(v, t);
   3052      case 2:
   3053        return detail::InsertLane<2>(v, t);
   3054      case 3:
   3055        return detail::InsertLane<3>(v, t);
   3056    }
   3057  }
   3058 #endif
   3059  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   3060 }
   3061 
   3062 template <typename T>
   3063 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
   3064 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3065  if (__builtin_constant_p(i)) {
   3066    switch (i) {
   3067      case 0:
   3068        return detail::InsertLane<0>(v, t);
   3069      case 1:
   3070        return detail::InsertLane<1>(v, t);
   3071      case 2:
   3072        return detail::InsertLane<2>(v, t);
   3073      case 3:
   3074        return detail::InsertLane<3>(v, t);
   3075      case 4:
   3076        return detail::InsertLane<4>(v, t);
   3077      case 5:
   3078        return detail::InsertLane<5>(v, t);
   3079      case 6:
   3080        return detail::InsertLane<6>(v, t);
   3081      case 7:
   3082        return detail::InsertLane<7>(v, t);
   3083    }
   3084  }
   3085 #endif
   3086  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   3087 }
   3088 
   3089 template <typename T>
   3090 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
   3091 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3092  if (__builtin_constant_p(i)) {
   3093    switch (i) {
   3094      case 0:
   3095        return detail::InsertLane<0>(v, t);
   3096      case 1:
   3097        return detail::InsertLane<1>(v, t);
   3098      case 2:
   3099        return detail::InsertLane<2>(v, t);
   3100      case 3:
   3101        return detail::InsertLane<3>(v, t);
   3102      case 4:
   3103        return detail::InsertLane<4>(v, t);
   3104      case 5:
   3105        return detail::InsertLane<5>(v, t);
   3106      case 6:
   3107        return detail::InsertLane<6>(v, t);
   3108      case 7:
   3109        return detail::InsertLane<7>(v, t);
   3110      case 8:
   3111        return detail::InsertLane<8>(v, t);
   3112      case 9:
   3113        return detail::InsertLane<9>(v, t);
   3114      case 10:
   3115        return detail::InsertLane<10>(v, t);
   3116      case 11:
   3117        return detail::InsertLane<11>(v, t);
   3118      case 12:
   3119        return detail::InsertLane<12>(v, t);
   3120      case 13:
   3121        return detail::InsertLane<13>(v, t);
   3122      case 14:
   3123        return detail::InsertLane<14>(v, t);
   3124      case 15:
   3125        return detail::InsertLane<15>(v, t);
   3126    }
   3127  }
   3128 #endif
   3129  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
   3130 }
   3131 
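        // Editorial note: likewise, a constant index lowers to one vinsgr2vr,
        // while a runtime index falls back to the broadcast-and-blend helper
        // above. Example (illustrative):
        //   v_i32 = InsertLane(v_i32, 3, int32_t{42});  // lane 3 := 42
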
   3132 // ------------------------------ CombineShiftRightBytes
   3133 template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
   3134 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
   3135  static_assert(0 < kBytes && kBytes < 16, "kBytes invalid");
   3136  return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
   3137 }
   3138 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   3139 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
   3140  constexpr size_t kSize = d.MaxBytes();
   3141  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
   3142 
   3143  const Twice<decltype(d)> dt;
   3144  return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw};
   3145 }
   3146 
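        // Editorial worked example: for full u8 vectors lo = {0..15} and
        // hi = {16..31}, CombineShiftRightBytes<4>(d, hi, lo) returns
        // {4..19}: the low 16 bytes of the 32-byte concatenation hi:lo,
        // shifted right by 4 bytes.
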
   3147 // ------------------------------ Broadcast/splat any lane
   3148 
   3149 template <int kLane, typename T, size_t N, HWY_IF_UI8(T)>
   3150 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
   3151  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   3152  return Vec128<T, N>{__lsx_vreplvei_b(v.raw, kLane)};
   3153 }
   3154 template <int kLane, typename T, size_t N, HWY_IF_UI16(T)>
   3155 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
   3156  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   3157  return Vec128<T, N>{__lsx_vreplvei_h(v.raw, kLane)};
   3158 }
   3159 template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
   3160 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
   3161  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   3162  const DFromV<decltype(v)> d;
   3163  return BitCast(d, Vec128<int32_t, N>{__lsx_vreplvei_w(
   3164                        reinterpret_cast<__m128i>(v.raw), kLane)});
   3165 }
   3166 template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   3167 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
   3168  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   3169  const DFromV<decltype(v)> d;
   3170  return BitCast(d, Vec128<int64_t, N>{__lsx_vreplvei_d(
   3171                        reinterpret_cast<__m128i>(v.raw), kLane)});
   3172 }
   3173 
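        // Editorial worked example: with i32 lanes {7, 8, 9, 10},
        // Broadcast<2>(v) yields {9, 9, 9, 9}.
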
   3174 // ------------------------------ TableLookupLanes (Shuffle01)
   3175 
   3176 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
   3177 template <typename T, size_t N = 16 / sizeof(T)>
   3178 struct Indices128 {
   3179  __m128i raw;
   3180 };
   3181 
   3182 namespace detail {
   3183 
   3184 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   3185 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
   3186    D d) {
   3187  const Repartition<uint8_t, decltype(d)> d8;
   3188  return Iota(d8, 0);
   3189 }
   3190 
   3191 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   3192 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
   3193    D d) {
   3194  const Repartition<uint8_t, decltype(d)> d8;
   3195  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
   3196      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
   3197  return Load(d8, kBroadcastLaneBytes);
   3198 }
   3199 
   3200 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   3201 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
   3202    D d) {
   3203  const Repartition<uint8_t, decltype(d)> d8;
   3204  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
   3205      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
   3206  return Load(d8, kBroadcastLaneBytes);
   3207 }
   3208 
   3209 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   3210 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
   3211    D d) {
   3212  const Repartition<uint8_t, decltype(d)> d8;
   3213  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
   3214      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
   3215  return Load(d8, kBroadcastLaneBytes);
   3216 }
   3217 
   3218 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   3219 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
   3220  const Repartition<uint8_t, decltype(d)> d8;
   3221  return Zero(d8);
   3222 }
   3223 
   3224 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   3225 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
   3226  const Repartition<uint8_t, decltype(d)> d8;
   3227  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
   3228      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
   3229  return Load(d8, kByteOffsets);
   3230 }
   3231 
   3232 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   3233 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
   3234  const Repartition<uint8_t, decltype(d)> d8;
   3235  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
   3236      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
   3237  return Load(d8, kByteOffsets);
   3238 }
   3239 
   3240 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   3241 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
   3242  const Repartition<uint8_t, decltype(d)> d8;
   3243  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
   3244      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
   3245  return Load(d8, kByteOffsets);
   3246 }
   3247 
   3248 }  // namespace detail
   3249 
   3250 template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
   3251 HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
   3252    D d, Vec128<TI, MaxLanes(D())> vec) {
   3253  using T = TFromD<D>;
   3254  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
   3255 #if HWY_IS_DEBUG_BUILD
   3256  const RebindToUnsigned<decltype(d)> du;
   3257  using TU = TFromD<decltype(du)>;
   3258  HWY_DASSERT(AllTrue(
   3259      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
   3260 #endif
   3261 
   3262  (void)d;
   3263  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, vec).raw};
   3264 }
   3265 
   3266 template <class D, typename TI,
   3267          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
   3268 HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
   3269    D d, Vec128<TI, MaxLanes(D())> vec) {
   3270  using T = TFromD<D>;
   3271  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
   3272 #if HWY_IS_DEBUG_BUILD
   3273  const RebindToUnsigned<decltype(d)> du;
   3274  using TU = TFromD<decltype(du)>;
   3275  HWY_DASSERT(AllTrue(
   3276      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
   3277 #endif
   3278 
   3279  const Repartition<uint8_t, decltype(d)> d8;
   3280  using V8 = VFromD<decltype(d8)>;
   3281 
   3282  // Broadcast each lane index to all bytes of its lane, then scale to byte indices
   3283  const V8 lane_indices = TableLookupBytes(
   3284      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
   3285  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
   3286  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
   3287  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
   3288  return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
   3289 }
   3290 
   3291 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
   3292 HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(D d,
   3293                                                             const TI* idx) {
   3294  const Rebind<TI, decltype(d)> di;
   3295  return IndicesFromVec(d, LoadU(di, idx));
   3296 }
   3297 
   3298 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
   3299 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
   3300  using TI = MakeSigned<T>;
   3301  const DFromV<decltype(v)> d;
   3302  const Rebind<TI, decltype(d)> di;
   3303  auto t1 = TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw});
   3304  return BitCast(d, t1);
   3305 }
   3306 
   3307 // Single lane: no change
   3308 template <typename T>
   3309 HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
   3310                                      Indices128<T, 1> /* idx */) {
   3311  return v;
   3312 }
   3313 
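        // Editorial usage sketch (illustrative, not upstream code): reversing
        // f32 lanes via a runtime index table, equivalent to Reverse(d, v)
        // below. The helper name is hypothetical.
        HWY_MAYBE_UNUSED static Vec128<float> ExampleReverseViaTable(
            Vec128<float> v) {
         const Full128<float> d;
         alignas(16) static constexpr int32_t kRev[4] = {3, 2, 1, 0};
         return TableLookupLanes(v, SetTableIndices(d, kRev));
        }
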
   3314 // ------------------------------ ReverseBlocks
   3315 
   3316 // Single block: no change
   3317 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   3318 HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
   3319  return v;
   3320 }
   3321 
   3322 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
   3323 
   3324 // Single lane: no change
   3325 template <class D, HWY_IF_LANES_D(D, 1)>
   3326 HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
   3327  return v;
   3328 }
   3329 // 32-bit x2: shuffle
   3330 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
   3331 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
   3332  return VFromD<D>{Shuffle2301(Vec128<TFromD<D>>{v.raw}).raw};
   3333 }
   3334 // 64-bit x2: shuffle
   3335 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   3336 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
   3337  return Shuffle01(v);
   3338 }
   3339 // 32-bit x4: shuffle
   3340 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   3341 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
   3342  return Shuffle0123(v);
   3343 }
   3344 
   3345 // 16-bit
   3346 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2),
   3347          HWY_IF_LANES_GT_D(D, 1)>
   3348 HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
   3349  const RebindToUnsigned<decltype(d)> du;
   3350  using VU = VFromD<decltype(du)>;
   3351  const VU vu = BitCast(du, v);
   3352  constexpr size_t kN = MaxLanes(d);
   3353  if (kN == 1) return v;
   3354  if (kN == 2) {
   3355    return BitCast(d, VU{__lsx_vshuf4i_h(vu.raw, 0x11)});
   3356  }
   3357  if (kN == 4) {
   3358    return BitCast(d, VU{__lsx_vshuf4i_h(vu.raw, 0x1B)});
   3359  }
   3360  const RebindToSigned<decltype(d)> di;
   3361  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
   3362      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
   3363  return BitCast(d, TableLookupBytes(v, shuffle));
   3364 }
   3365 
   3366 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1),
   3367          HWY_IF_LANES_GT_D(D, 1)>
   3368 HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
   3369  static constexpr int kN = static_cast<int>(MaxLanes(d));
   3370  if (kN == 1) return v;
   3371  alignas(16) static constexpr int8_t _tmp_data[] = {
   3372      kN - 1, kN - 2,  kN - 3,  kN - 4,  kN - 5,  kN - 6,  kN - 7,  kN - 8,
   3373      kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16};
   3374  return VFromD<D>{__lsx_vshuf_b(v.raw, v.raw, __lsx_vld(_tmp_data, 0))};
   3375 }
   3376 
   3377 // ------------------------------ Reverse2
   3378 
   3379 // Single lane: no change
   3380 template <class D, HWY_IF_LANES_D(D, 1)>
   3381 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
   3382  return v;
   3383 }
   3384 
   3385 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   3386 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
   3387  const RepartitionToWide<RebindToUnsigned<decltype(d)>> dw;
   3388  return BitCast(d, RotateRight<16>(BitCast(dw, v)));
   3389 }
   3390 
   3391 // Generic for all vector lengths.
   3392 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)>
   3393 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
   3394  return Shuffle2301(v);
   3395 }
   3396 
   3397 // Generic for all vector lengths.
   3398 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
   3399 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
   3400  return Shuffle01(v);
   3401 }
   3402 
   3403 // ------------------------------ Reverse4
   3404 
   3405 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   3406 HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> v) {
   3407  return VFromD<D>{__lsx_vshuf4i_h(v.raw, 0x1B)};
   3408 }
   3409 
   3410 // Generic for all vector lengths.
   3411 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   3412 HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
   3413  return Shuffle0123(v);
   3414 }
   3415 
   3416 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   3417 HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
   3418  HWY_ASSERT(0);  // don't have 4 u64 lanes
   3419 }
   3420 
   3421 // ------------------------------ Reverse8
   3422 
   3423 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   3424 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
   3425  const RepartitionToWide<decltype(d)> dw;
   3426  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
   3427 }
   3428 
   3429 template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
   3430          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
   3431 HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
   3432  HWY_ASSERT(0);  // don't have 8 lanes if lanes are wider than 16 bits
   3433 }
   3434 
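        // Editorial worked example: for u16 lanes {a,b,c,d,e,f,g,h}, Reverse4
        // yields {d,c,b,a,h,g,f,e} (each aligned group of 4 reversed) and
        // Reverse8 yields {h,g,f,e,d,c,b,a}.
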
   3435 // ------------------------------ InterleaveUpper (UpperHalf)
   3436 
   3437 // Full
   3438 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   3439 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   3440  return VFromD<D>{__lsx_vilvh_b(b.raw, a.raw)};
   3441 }
   3442 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   3443 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
   3444  return VFromD<D>{__lsx_vilvh_h(b.raw, a.raw)};
   3445 }
   3446 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   3447 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
   3448  const RebindToSigned<decltype(d)> di;
   3449  return BitCast(d, VFromD<decltype(di)>{
   3450                        __lsx_vilvh_w(BitCast(di, b).raw, BitCast(di, a).raw)});
   3451 }
   3452 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   3453 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
   3454  const RebindToSigned<decltype(d)> di;
   3455  return BitCast(d, VFromD<decltype(di)>{
   3456                        __lsx_vilvh_d(BitCast(di, b).raw, BitCast(di, a).raw)});
   3457 }
   3458 
   3459 // Partial
   3460 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   3461 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
   3462  const Half<decltype(d)> d2;
   3463  return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
   3464                         VFromD<D>{UpperHalf(d2, b).raw});
   3465 }
   3466 
   3467 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
   3468 
   3469 // Same as Interleave*, except that the return lanes are double-width integers;
   3470 // this is necessary because the single-lane scalar cannot return two values.
   3471 template <class V, class DW = RepartitionToWide<DFromV<V>>>
   3472 HWY_API VFromD<DW> ZipLower(V a, V b) {
   3473  return BitCast(DW(), InterleaveLower(a, b));
   3474 }
   3475 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   3476 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
   3477  return BitCast(dw, InterleaveLower(D(), a, b));
   3478 }
   3479 
   3480 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   3481 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
   3482  return BitCast(dw, InterleaveUpper(D(), a, b));
   3483 }
   3484 
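        // Editorial worked example (LSX is little-endian): for u8 vectors
        // a = {1, 2, ...} and b = {3, 4, ...}, ZipLower(a, b) is a u16 vector
        // with lane 0 = 0x0301 (a[0] in the low byte, b[0] in the high byte),
        // lane 1 = 0x0402, and so on.
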
   3485 // ================================================== CONVERT (1)
   3486 
   3487 // ------------------------------ PromoteTo unsigned
   3488 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
   3489 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
   3490  return VFromD<D>{__lsx_vsllwil_hu_bu(v.raw, 0)};
   3491 }
   3492 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
   3493 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
   3494  return VFromD<D>{__lsx_vsllwil_wu_hu(v.raw, 0)};
   3495 }
   3496 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
   3497 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
   3498  return VFromD<D>{__lsx_vsllwil_du_wu(v.raw, 0)};
   3499 }
   3500 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
   3501 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
   3502  const __m128i u16 = __lsx_vsllwil_hu_bu(v.raw, 0);
   3503  return VFromD<D>{__lsx_vsllwil_wu_hu(u16, 0)};
   3504 }
   3505 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
   3506 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
   3507  const Rebind<uint32_t, decltype(d)> du32;
   3508  return PromoteTo(d, PromoteTo(du32, v));
   3509 }
   3510 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
   3511 HWY_API VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<uint16_t, D>> v) {
   3512  const __m128i u32 = __lsx_vsllwil_wu_hu(v.raw, 0);
   3513  return VFromD<D>{__lsx_vsllwil_du_wu(u32, 0)};
   3514 }
   3515 
   3516 // Unsigned to signed: same plus cast.
   3517 template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
   3518          HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
   3519          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
   3520 HWY_API VFromD<D> PromoteTo(D di, V v) {
   3521  const RebindToUnsigned<decltype(di)> du;
   3522  return BitCast(di, PromoteTo(du, v));
   3523 }
   3524 
   3525 // Signed
   3526 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
   3527 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
   3528  return VFromD<D>{__lsx_vsllwil_h_b(v.raw, 0)};
   3529 }
   3530 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
   3531 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
   3532  return VFromD<D>{__lsx_vsllwil_w_h(v.raw, 0)};
   3533 }
   3534 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   3535 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
   3536  return VFromD<D>{__lsx_vsllwil_d_w(v.raw, 0)};
   3537 }
   3538 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
   3539 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
   3540  const __m128i i16 = __lsx_vsllwil_h_b(v.raw, 0);
   3541  return VFromD<D>{__lsx_vsllwil_w_h(i16, 0)};
   3542 }
   3543 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   3544 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
   3545  const Rebind<int32_t, decltype(d)> di32;
   3546  return PromoteTo(d, PromoteTo(di32, v));
   3547 }
   3548 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   3549 HWY_API VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<int16_t, D>> v) {
   3550  const __m128i i32 = __lsx_vsllwil_w_h(v.raw, 0);
   3551  return VFromD<D>{__lsx_vsllwil_d_w(i32, 0)};
   3552 }
   3553 
   3554 // ------------------------------ PromoteTo float
   3555 
   3556 #ifdef HWY_NATIVE_F16C
   3557 #undef HWY_NATIVE_F16C
   3558 #else
   3559 #define HWY_NATIVE_F16C
   3560 #endif
   3561 
   3562 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   3563 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<hwy::float16_t, D>> v) {
   3564  return VFromD<D>{__lsx_vfcvtl_s_h(v.raw)};
   3565 }
   3566 
   3567 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   3568 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
   3569  return VFromD<D>{__lsx_vfcvtl_d_s(v.raw)};
   3570 }
   3571 
   3572 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   3573 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
   3574  return VFromD<D>{__lsx_vffintl_d_w(v.raw)};
   3575 }
   3576 
   3577 template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
   3578 HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
   3579  const Rebind<int32_t, decltype(df64)> di32;
   3580  const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
   3581  return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
   3582                                                Set(df64, 4294967296.0),
   3583                                                Zero(df64));
   3584 }
   3585 
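        // Editorial note on the branchless u32->f64 correction above:
        // BitCasting u32 to i32 makes inputs >= 2^31 negative, and the
        // i32->f64 conversion then yields x - 2^32 (exactly, since f64
        // represents these integers exactly). Adding 2^32 back only where the
        // converted value is negative restores x; e.g. x = 0x80000000
        // converts to -2147483648.0, and adding 4294967296.0 gives
        // 2147483648.0.
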
   3586 template <class D, HWY_IF_F32_D(D)>
   3587 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<hwy::bfloat16_t, D>> v) {
   3588  const RebindToSigned<decltype(d)> di32;
   3589  const Rebind<uint16_t, decltype(d)> du16;
   3590  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
   3591 }
   3592 
   3593 // ------------------------------ Per4LaneBlockShuffle
   3594 
   3595 namespace detail {
   3596 
   3597 #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   3598 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   3599 #else
   3600 #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   3601 #endif
   3602 
   3603 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   3604 HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
   3605                                                const uint32_t x2,
   3606                                                const uint32_t x1,
   3607                                                const uint32_t x0) {
   3608  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
   3609  const GccU32RawVectType raw = {x0, x1, x2, x3};
   3610  return ResizeBitCast(d, Vec128<uint32_t>{reinterpret_cast<__m128i>(raw)});
   3611 }
   3612 
   3613 template <size_t kIdx3210, size_t kVectSize, class V,
   3614          HWY_IF_LANES_LE(kVectSize, 16)>
   3615 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
   3616                                  hwy::SizeTag<1> /*lane_size_tag*/,
   3617                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
   3618                                  V v) {
   3619  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
   3620  return V{__lsx_vshuf4i_b(v.raw, kShuffle)};
   3621 }
   3622 
   3623 template <size_t kIdx3210, size_t kVectSize, class V,
   3624          HWY_IF_LANES_LE(kVectSize, 16)>
   3625 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
   3626                                  hwy::SizeTag<2> /*lane_size_tag*/,
   3627                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
   3628                                  V v) {
   3629  const DFromV<decltype(v)> d;
   3630  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   3631  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
   3632  return BitCast(
   3633      d, VFromD<decltype(du)>{__lsx_vshuf4i_h(BitCast(du, v).raw, kShuffle)});
   3634 }
   3635 
   3636 template <size_t kIdx3210, class V>
   3637 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
   3638                                  hwy::SizeTag<4> /*lane_size_tag*/,
   3639                                  hwy::SizeTag<16> /*vect_size_tag*/, V v) {
   3640  const DFromV<decltype(v)> d;
   3641  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
   3642  const RebindToUnsigned<decltype(d)> du;
   3643  return BitCast(d, VFromD<decltype(du)>{__lsx_vshuf4i_w(
   3644                        reinterpret_cast<__m128i>(v.raw), kShuffle)});
   3645 }
   3646 
   3647 }  // namespace detail
   3648 
   3649 // ------------------------------ SlideUpLanes
   3650 
   3651 namespace detail {
   3652 
   3653 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
   3654 HWY_INLINE V SlideUpLanes(V v, size_t amt) {
   3655  const DFromV<decltype(v)> d;
   3656  const Full64<uint64_t> du64;
   3657  const auto vu64 = ResizeBitCast(du64, v);
   3658  return ResizeBitCast(
   3659      d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
   3660 }
   3661 
   3662 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   3663 HWY_INLINE V SlideUpLanes(V v, size_t amt) {
   3664  const DFromV<decltype(v)> d;
   3665  const Repartition<uint8_t, decltype(d)> du8;
   3666  const auto idx =
   3667      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
   3668  return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
   3669 }
   3670 
   3671 }  // namespace detail
   3672 
   3673 template <class D, HWY_IF_LANES_D(D, 1)>
   3674 HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
   3675  return v;
   3676 }
   3677 
   3678 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
   3679 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   3680 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3681  if (__builtin_constant_p(amt)) {
   3682    switch (amt) {
   3683      case 0:
   3684        return v;
   3685      case 1:
   3686        return ShiftLeftLanes<1>(d, v);
   3687    }
   3688  }
   3689 #else
   3690  (void)d;
   3691 #endif
   3692 
   3693  return detail::SlideUpLanes(v, amt);
   3694 }
   3695 
   3696 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
   3697 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   3698 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3699  if (__builtin_constant_p(amt)) {
   3700    switch (amt) {
   3701      case 0:
   3702        return v;
   3703      case 1:
   3704        return ShiftLeftLanes<1>(d, v);
   3705      case 2:
   3706        return ShiftLeftLanes<2>(d, v);
   3707      case 3:
   3708        return ShiftLeftLanes<3>(d, v);
   3709    }
   3710  }
   3711 #else
   3712  (void)d;
   3713 #endif
   3714 
   3715  return detail::SlideUpLanes(v, amt);
   3716 }
   3717 
   3718 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
   3719 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   3720 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3721  if (__builtin_constant_p(amt)) {
   3722    switch (amt) {
   3723      case 0:
   3724        return v;
   3725      case 1:
   3726        return ShiftLeftLanes<1>(d, v);
   3727      case 2:
   3728        return ShiftLeftLanes<2>(d, v);
   3729      case 3:
   3730        return ShiftLeftLanes<3>(d, v);
   3731      case 4:
   3732        return ShiftLeftLanes<4>(d, v);
   3733      case 5:
   3734        return ShiftLeftLanes<5>(d, v);
   3735      case 6:
   3736        return ShiftLeftLanes<6>(d, v);
   3737      case 7:
   3738        return ShiftLeftLanes<7>(d, v);
   3739    }
   3740  }
   3741 #else
   3742  (void)d;
   3743 #endif
   3744 
   3745  return detail::SlideUpLanes(v, amt);
   3746 }
   3747 
   3748 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
   3749 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   3750 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3751  if (__builtin_constant_p(amt)) {
   3752    switch (amt) {
   3753      case 0:
   3754        return v;
   3755      case 1:
   3756        return ShiftLeftLanes<1>(d, v);
   3757      case 2:
   3758        return ShiftLeftLanes<2>(d, v);
   3759      case 3:
   3760        return ShiftLeftLanes<3>(d, v);
   3761      case 4:
   3762        return ShiftLeftLanes<4>(d, v);
   3763      case 5:
   3764        return ShiftLeftLanes<5>(d, v);
   3765      case 6:
   3766        return ShiftLeftLanes<6>(d, v);
   3767      case 7:
   3768        return ShiftLeftLanes<7>(d, v);
   3769      case 8:
   3770        return ShiftLeftLanes<8>(d, v);
   3771      case 9:
   3772        return ShiftLeftLanes<9>(d, v);
   3773      case 10:
   3774        return ShiftLeftLanes<10>(d, v);
   3775      case 11:
   3776        return ShiftLeftLanes<11>(d, v);
   3777      case 12:
   3778        return ShiftLeftLanes<12>(d, v);
   3779      case 13:
   3780        return ShiftLeftLanes<13>(d, v);
   3781      case 14:
   3782        return ShiftLeftLanes<14>(d, v);
   3783      case 15:
   3784        return ShiftLeftLanes<15>(d, v);
   3785    }
   3786  }
   3787 #else
   3788  (void)d;
   3789 #endif
   3790 
   3791  return detail::SlideUpLanes(v, amt);
   3792 }
   3793 
   3794 // ------------------------------ SlideDownLanes
   3795 
   3796 namespace detail {
   3797 
   3798 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
   3799 HWY_INLINE V SlideDownLanes(V v, size_t amt) {
   3800  const DFromV<decltype(v)> d;
   3801  const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv;
   3802  return BitCast(d,
   3803                 ShiftRightSame(BitCast(dv, v),
   3804                                static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
   3805 }
   3806 
   3807 template <class V, HWY_IF_V_SIZE_V(V, 16)>
   3808 HWY_INLINE V SlideDownLanes(V v, size_t amt) {
   3809  const DFromV<decltype(v)> d;
   3810  const Repartition<int8_t, decltype(d)> di8;
   3811  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
   3812  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
   3813  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
   3814 }
   3815 
   3816 }  // namespace detail
   3817 
   3818 template <class D, HWY_IF_LANES_D(D, 1)>
   3819 HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
   3820  return v;
   3821 }
   3822 
   3823 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
   3824 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   3825 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3826  if (__builtin_constant_p(amt)) {
   3827    switch (amt) {
   3828      case 0:
   3829        return v;
   3830      case 1:
   3831        return ShiftRightLanes<1>(d, v);
   3832    }
   3833  }
   3834 #else
   3835  (void)d;
   3836 #endif
   3837 
   3838  return detail::SlideDownLanes(v, amt);
   3839 }
   3840 
   3841 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
   3842 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   3843 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3844  if (__builtin_constant_p(amt)) {
   3845    switch (amt) {
   3846      case 0:
   3847        return v;
   3848      case 1:
   3849        return ShiftRightLanes<1>(d, v);
   3850      case 2:
   3851        return ShiftRightLanes<2>(d, v);
   3852      case 3:
   3853        return ShiftRightLanes<3>(d, v);
   3854    }
   3855  }
   3856 #else
   3857  (void)d;
   3858 #endif
   3859 
   3860  return detail::SlideDownLanes(v, amt);
   3861 }
   3862 
   3863 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
   3864 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   3865 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3866  if (__builtin_constant_p(amt)) {
   3867    switch (amt) {
   3868      case 0:
   3869        return v;
   3870      case 1:
   3871        return ShiftRightLanes<1>(d, v);
   3872      case 2:
   3873        return ShiftRightLanes<2>(d, v);
   3874      case 3:
   3875        return ShiftRightLanes<3>(d, v);
   3876      case 4:
   3877        return ShiftRightLanes<4>(d, v);
   3878      case 5:
   3879        return ShiftRightLanes<5>(d, v);
   3880      case 6:
   3881        return ShiftRightLanes<6>(d, v);
   3882      case 7:
   3883        return ShiftRightLanes<7>(d, v);
   3884    }
   3885  }
   3886 #else
   3887  (void)d;
   3888 #endif
   3889 
   3890  return detail::SlideDownLanes(v, amt);
   3891 }
   3892 
   3893 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
   3894 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   3895 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
   3896  if (__builtin_constant_p(amt)) {
   3897    switch (amt) {
   3898      case 0:
   3899        return v;
   3900      case 1:
   3901        return ShiftRightLanes<1>(d, v);
   3902      case 2:
   3903        return ShiftRightLanes<2>(d, v);
   3904      case 3:
   3905        return ShiftRightLanes<3>(d, v);
   3906      case 4:
   3907        return ShiftRightLanes<4>(d, v);
   3908      case 5:
   3909        return ShiftRightLanes<5>(d, v);
   3910      case 6:
   3911        return ShiftRightLanes<6>(d, v);
   3912      case 7:
   3913        return ShiftRightLanes<7>(d, v);
   3914      case 8:
   3915        return ShiftRightLanes<8>(d, v);
   3916      case 9:
   3917        return ShiftRightLanes<9>(d, v);
   3918      case 10:
   3919        return ShiftRightLanes<10>(d, v);
   3920      case 11:
   3921        return ShiftRightLanes<11>(d, v);
   3922      case 12:
   3923        return ShiftRightLanes<12>(d, v);
   3924      case 13:
   3925        return ShiftRightLanes<13>(d, v);
   3926      case 14:
   3927        return ShiftRightLanes<14>(d, v);
   3928      case 15:
   3929        return ShiftRightLanes<15>(d, v);
   3930    }
   3931  }
   3932 #else
   3933  (void)d;
   3934 #endif
   3935 
   3936  return detail::SlideDownLanes(v, amt);
   3937 }
   3938 
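        // Editorial note: SlideUpLanes/SlideDownLanes handle runtime-variable
        // shift amounts; constant amounts take the ShiftLeftLanes /
        // ShiftRightLanes fast paths above. Example (illustrative):
        //   const auto up = SlideUpLanes(d, v, n);  // lane i := i >= n ? v[i - n] : 0
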
   3939 // ================================================== COMBINE
   3940 
   3941 // ------------------------------ Combine (InterleaveLower)
   3942 
   3943 // N = N/2 + N/2 (upper half undefined)
   3944 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
   3945 HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
   3946  const Half<decltype(d)> dh;
   3947  const RebindToUnsigned<decltype(dh)> duh;
   3948  // Treat half-width input as one lane, and expand to two lanes.
   3949  using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
   3950  const VU lo{BitCast(duh, lo_half).raw};
   3951  const VU hi{BitCast(duh, hi_half).raw};
   3952  return BitCast(d, InterleaveLower(lo, hi));
   3953 }
   3954 
   3955 // ------------------------------ ZeroExtendVector (Combine)
   3956 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   3957 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   3958  return Combine(d, Zero(Half<decltype(d)>()), lo);
   3959 }
   3960 
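        // Editorial worked example: with u32 halves lo = {0, 1} and
        // hi = {2, 3}, Combine(d, hi, lo) is {0, 1, 2, 3} and
        // ZeroExtendVector(d, lo) is {0, 1, 0, 0}.
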
   3961 // ------------------------------ Concat full (InterleaveLower)
   3962 
   3963 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
   3964 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   3965 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
   3966  const Repartition<uint64_t, decltype(d)> d64;
   3967  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
   3968 }
   3969 
   3970 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
   3971 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   3972 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
   3973  const Repartition<uint64_t, decltype(d)> d64;
   3974  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
   3975 }
   3976 
   3977 // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
   3978 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   3979 HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
   3980  return CombineShiftRightBytes<8>(d, hi, lo);
   3981 }
   3982 
   3983 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
   3984 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   3985 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
   3986  return BitCast(d, Vec128<uint8_t>{__lsx_vshuf4i_d(
   3987                        reinterpret_cast<__m128i>(lo.raw),
   3988                        reinterpret_cast<__m128i>(hi.raw), 0xC)});
   3989 }
   3990 
   3991 // ------------------------------ Concat partial (Combine, LowerHalf)
   3992 
   3993 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   3994 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
   3995  const Half<decltype(d)> d2;
   3996  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
   3997 }
   3998 
   3999 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   4000 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
   4001  const Half<decltype(d)> d2;
   4002  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
   4003 }
   4004 
   4005 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   4006 HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi,
   4007                                   const VFromD<D> lo) {
   4008  const Half<decltype(d)> d2;
   4009  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
   4010 }
   4011 
   4012 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   4013 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
   4014  const Half<decltype(d)> d2;
   4015  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
   4016 }
   4017 
   4018 // ------------------------------ ConcatOdd
   4019 
   4020 // 8-bit full
   4021 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   4022 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4023  return VFromD<D>{__lsx_vpickod_b(hi.raw, lo.raw)};
   4024 }
   4025 // 8-bit x8
   4026 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
   4027 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4028  __m128i _tmp = __lsx_vpickod_b(hi.raw, lo.raw);
   4029  return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)};
   4030 }
   4031 // 8-bit x4
   4032 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
   4033 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4034  __m128i _tmp = __lsx_vpickod_b(hi.raw, lo.raw);
   4035  return VFromD<D>{__lsx_vextrins_h(_tmp, _tmp, 0x14)};
   4036 }
   4037 
   4038 // 16-bit full
   4039 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   4040 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4041  return VFromD<D>{__lsx_vpickod_h(hi.raw, lo.raw)};
   4042 }
   4043 // 16-bit x4
   4044 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
   4045 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4046  __m128i _tmp = __lsx_vpickod_h(hi.raw, lo.raw);
   4047  return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)};
   4048 }
   4049 
   4050 // 32-bit full
   4051 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   4052 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   4053  return BitCast(
   4054      d, Vec128<uint8_t>{__lsx_vpickod_w(reinterpret_cast<__m128i>(hi.raw),
   4055                                         reinterpret_cast<__m128i>(lo.raw))});
   4056 }
   4057 
   4058 // Any T x2
   4059 template <class D, HWY_IF_LANES_D(D, 2)>
   4060 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   4061  return InterleaveUpper(d, lo, hi);
   4062 }
   4063 
   4064 // ------------------------------ ConcatEven
   4065 
   4066 // 8-bit full
   4067 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   4068 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4069  return VFromD<D>{__lsx_vpickev_b(hi.raw, lo.raw)};
   4070 }
   4071 // 8-bit x8
   4072 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
   4073 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4074  __m128i _tmp = __lsx_vpickev_b(hi.raw, lo.raw);
   4075  return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)};
   4076 }
   4077 // 8-bit x4
   4078 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)>
   4079 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4080  __m128i _tmp = __lsx_vpickev_b(hi.raw, lo.raw);
   4081  return VFromD<D>{__lsx_vextrins_h(_tmp, _tmp, 0x14)};
   4082 }
   4083 
   4084 // 16-bit full
   4085 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   4086 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4087  return VFromD<D>{__lsx_vpickev_h(hi.raw, lo.raw)};
   4088 }
   4089 // 16-bit x4
   4090 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
   4091 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   4092  __m128i _tmp = __lsx_vpickev_h(hi.raw, lo.raw);
   4093  return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)};
   4094 }
   4095 
   4096 // 32-bit full
   4097 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   4098 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   4099  return BitCast(
   4100      d, Vec128<uint8_t>{__lsx_vpickev_w(reinterpret_cast<__m128i>(hi.raw),
   4101                                         reinterpret_cast<__m128i>(lo.raw))});
   4102 }
   4103 
   4104 // Any T x2
   4105 template <class D, HWY_IF_LANES_D(D, 2)>
   4106 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   4107  return InterleaveLower(d, lo, hi);
   4108 }
   4109 
   4110 template <size_t N>
   4111 HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi,
   4112                                           Vec128<float16_t, N> lo) {
   4113  const DFromV<decltype(hi)> d;
   4114  const RebindToUnsigned<decltype(d)> du;
   4115  return BitCast(d, ConcatEven(du, BitCast(du, hi), BitCast(du, lo)));
   4116 }

   4117 // ------------------------------ DupEven (InterleaveLower)
   4118 
   4119 template <typename T>
   4120 HWY_API Vec128<T, 1> DupEven(const Vec128<T, 1> v) {
   4121  return v;
   4122 }
   4123 
   4124 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   4125 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
   4126  __m128i _tmp = __lsx_vpickev_b(v.raw, v.raw);
   4127  return Vec128<T, N>{__lsx_vilvl_b(_tmp, _tmp)};
   4128 }
   4129 
   4130 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   4131 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
   4132  const DFromV<decltype(v)> d;
   4133  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   4134  __m128i _tmp = __lsx_vpickev_h(BitCast(du, v).raw, BitCast(du, v).raw);
   4135  return BitCast(d, VFromD<decltype(du)>{__lsx_vilvl_h(_tmp, _tmp)});
   4136 }
   4137 
   4138 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
   4139 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
   4140  const DFromV<decltype(v)> d;
   4141  __m128i _tmp = detail::BitCastToInteger(v.raw);
   4142  __m128i _tmp1 = __lsx_vpickev_w(_tmp, _tmp);
   4143  return BitCast(d, Vec128<uint32_t, N>{__lsx_vilvl_w(_tmp1, _tmp1)});
   4144 }
   4145 
   4146 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   4147 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
   4148  return InterleaveLower(DFromV<decltype(v)>(), v, v);
   4149 }
   4150 
   4151 // ------------------------------ DupOdd (InterleaveUpper)
   4152 
   4153 template <typename T, HWY_IF_T_SIZE(T, 1)>
   4154 HWY_API Vec128<T, 1> DupOdd(Vec128<T, 1> v) {
   4155  return v;
   4156 }
   4157 
   4158 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
   4159 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
   4160  __m128i _tmp = __lsx_vpickod_b(v.raw, v.raw);
   4161  return Vec128<T, N>{__lsx_vilvl_b(_tmp, _tmp)};
   4162 }
   4163 
   4164 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
   4165 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
   4166  __m128i _tmp = __lsx_vpickod_h(v.raw, v.raw);
   4167  return Vec128<T, N>{__lsx_vilvl_h(_tmp, _tmp)};
   4168 }
   4169 
   4170 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
   4171 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
   4172  const DFromV<decltype(v)> d;
   4173  __m128i _tmp = detail::BitCastToInteger(v.raw);
   4174  __m128i _tmp1 = __lsx_vpickod_w(_tmp, _tmp);
   4175  return BitCast(d, Vec128<uint32_t, N>{__lsx_vilvl_w(_tmp1, _tmp1)});
   4176 }
   4177 
   4178 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   4179 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
   4180  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
   4181 }
   4182 
   4183 // ------------------------------ TwoTablesLookupLanes (DupEven)
   4184 
   4185 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
   4186 HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
   4187                                          Indices128<T, N> idx) {
   4188  const DFromV<decltype(a)> d;
   4189  const Twice<decltype(d)> dt;
   4190  const Repartition<uint8_t, decltype(dt)> dt_u8;
   4191 // TableLookupLanes currently requires table and index vectors to be the same
   4192 // size, though a half-length index vector would be sufficient here.
   4193 #if HWY_IS_MSAN
   4194  const Vec128<T, N> idx_vec{idx.raw};
   4195  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
   4196 #else
   4197  // We only keep the LowerHalf of the result, for which idx is valid.
   4198  const Indices128<T, N * 2> idx2{idx.raw};
   4199 #endif
   4200  return LowerHalf(
   4201      d, TableLookupBytes(Combine(dt, b, a),
   4202                          BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw})));
   4203 }
   4204 
   4205 template <typename T, HWY_IF_UI8(T)>
   4206 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
   4207                                       Indices128<T> idx) {
   4208  return Vec128<T>{__lsx_vshuf_b(b.raw, a.raw, idx.raw)};
   4209 }
   4210 
   4211 template <typename T, HWY_IF_T_SIZE_ONE_OF(T, ((1 << 2) | (1 << 4) | (1 << 8)))>
   4212 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
   4213                                       Indices128<T> idx) {
   4214  const DFromV<decltype(a)> d;
   4215  const Repartition<uint8_t, decltype(d)> du8;
   4216  return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b),
   4217                                         Indices128<uint8_t>{idx.raw}));
   4218 }
   4219 
   4220 // ------------------------------ OddEven
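// OddEven(a, b) takes the odd-indexed lanes from a and the even-indexed
// lanes from b. For 8/16/32-bit lanes, packod first copies a's odd lanes
// into both positions of each pair, then packev merges them with b's even
// lanes.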
   4221 
   4222 template <typename T, size_t N, HWY_IF_UI8(T)>
   4223 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   4224  __m128i t0 = __lsx_vpackod_b(a.raw, a.raw);
   4225  return Vec128<T, N>{__lsx_vpackev_b(t0, b.raw)};
   4226 }
   4227 template <typename T, size_t N, HWY_IF_UI16(T)>
   4228 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   4229  __m128i t0 = __lsx_vpackod_h(a.raw, a.raw);
   4230  return Vec128<T, N>{__lsx_vpackev_h(t0, b.raw)};
   4231 }
   4232 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
   4233 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   4234  const DFromV<decltype(a)> d;
   4235  const RebindToUnsigned<decltype(d)> du;
   4236  __m128i t0 = __lsx_vpackod_w(BitCast(du, a).raw, BitCast(du, a).raw);
   4237  return BitCast(d,
   4238                 VFromD<decltype(du)>{__lsx_vpackev_w(t0, BitCast(du, b).raw)});
   4239 }
   4240 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   4241 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
   4242  const DFromV<decltype(a)> d;
   4243  const RebindToUnsigned<decltype(d)> du;
   4244  return BitCast(d, VFromD<decltype(du)>{__lsx_vextrins_d(
   4245                        BitCast(du, b).raw, BitCast(du, a).raw, 0x11)});
   4246 }
   4247 
   4248 // -------------------------- InterleaveEven
   4249 
   4250 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   4251 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
   4252  return VFromD<D>{__lsx_vpackev_b(b.raw, a.raw)};
   4253 }
   4254 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   4255 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
   4256  return VFromD<D>{__lsx_vpackev_h(b.raw, a.raw)};
   4257 }
   4258 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   4259 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
   4260  const RebindToSigned<D> di;
   4261  return BitCast(d, VFromD<decltype(di)>{__lsx_vpackev_w(BitCast(di, b).raw,
   4262                                                         BitCast(di, a).raw)});
   4263 }
   4264 
   4265 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   4266 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
   4267  const RebindToSigned<D> di;
   4268  return BitCast(d, VFromD<decltype(di)>{__lsx_vpackev_d(BitCast(di, b).raw,
   4269                                                         BitCast(di, a).raw)});
   4270 }
   4271 
   4272 // -------------------------- InterleaveOdd
   4273 
   4274 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
   4275 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
   4276  return VFromD<D>{__lsx_vpackod_b(b.raw, a.raw)};
   4277 }
   4278 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
   4279 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
   4280  return VFromD<D>{__lsx_vpackod_h(b.raw, a.raw)};
   4281 }
   4282 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
   4283 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
   4284  const RebindToSigned<D> di;
   4285  return BitCast(d, VFromD<decltype(di)>{__lsx_vpackod_w(BitCast(di, b).raw,
   4286                                                         BitCast(di, a).raw)});
   4287 }
   4288 
   4289 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
   4290 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
   4291  const RebindToSigned<D> di;
   4292  return BitCast(d, VFromD<decltype(di)>{__lsx_vpackod_d(BitCast(di, b).raw,
   4293                                                         BitCast(di, a).raw)});
   4294 }
   4295 
   4296 // ------------------------------ OddEvenBlocks
   4297 
   4298 template <typename T, size_t N>
   4299 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
   4300  return even;
   4301 }
   4302 
   4303 // ------------------------------ SwapAdjacentBlocks
   4304 
   4305 template <typename T, size_t N>
   4306 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
   4307  return v;
   4308 }
   4309 
   4310 // ------------------------------ InterleaveEvenBlocks
   4311 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
   4312 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
   4313  return a;
   4314 }
   4315 // ------------------------------ InterleaveOddBlocks
   4316 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
   4317 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
   4318  return a;
   4319 }
   4320 
   4321 // ------------------------------ Shl
   4322 
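// Per-lane variable shifts. Note that the LSX vsll.* instructions interpret
// each shift count modulo the element width; Highway already requires
// bits[i] to be less than the number of lane bits, so this matches the Shl
// contract.
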
   4323 template <typename T, size_t N, HWY_IF_UI8(T)>
   4324 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
   4325  return Vec128<T, N>{__lsx_vsll_b(v.raw, bits.raw)};
   4326 }
   4327 
   4328 template <typename T, size_t N, HWY_IF_UI16(T)>
   4329 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
   4330  return Vec128<T, N>{__lsx_vsll_h(v.raw, bits.raw)};
   4331 }
   4332 
   4333 template <typename T, size_t N, HWY_IF_UI32(T)>
   4334 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
   4335  return Vec128<T, N>{__lsx_vsll_w(v.raw, bits.raw)};
   4336 }
   4337 
   4338 template <typename T, size_t N, HWY_IF_UI64(T)>
   4339 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
   4340  return Vec128<T, N>{__lsx_vsll_d(v.raw, bits.raw)};
   4341 }
   4342 
   4343 // ------------------------------ Shr
   4344 
   4345 namespace detail {
   4346 
   4347 template <size_t N>
   4348 HWY_API Vec128<uint8_t, N> Shr(Vec128<uint8_t, N> v, Vec128<uint8_t, N> bits) {
   4349  return Vec128<uint8_t, N>{__lsx_vsrl_b(v.raw, bits.raw)};
   4350 }
   4351 template <size_t N>
   4352 HWY_API Vec128<uint16_t, N> Shr(Vec128<uint16_t, N> v,
   4353                                Vec128<uint16_t, N> bits) {
   4354  return Vec128<uint16_t, N>{__lsx_vsrl_h(v.raw, bits.raw)};
   4355 }
   4356 template <size_t N>
   4357 HWY_API Vec128<uint32_t, N> Shr(Vec128<uint32_t, N> v,
   4358                                Vec128<uint32_t, N> bits) {
   4359  return Vec128<uint32_t, N>{__lsx_vsrl_w(v.raw, bits.raw)};
   4360 }
   4361 template <size_t N>
   4362 HWY_API Vec128<uint64_t, N> Shr(Vec128<uint64_t, N> v,
   4363                                Vec128<uint64_t, N> bits) {
   4364  return Vec128<uint64_t, N>{__lsx_vsrl_d(v.raw, bits.raw)};
   4365 }
   4366 
   4367 template <size_t N>
   4368 HWY_API Vec128<int8_t, N> Shr(Vec128<int8_t, N> v, Vec128<int8_t, N> bits) {
   4369  return Vec128<int8_t, N>{__lsx_vsra_b(v.raw, bits.raw)};
   4370 }
   4371 template <size_t N>
   4372 HWY_API Vec128<int16_t, N> Shr(Vec128<int16_t, N> v, Vec128<int16_t, N> bits) {
   4373  return Vec128<int16_t, N>{__lsx_vsra_h(v.raw, bits.raw)};
   4374 }
   4375 template <size_t N>
   4376 HWY_API Vec128<int32_t, N> Shr(Vec128<int32_t, N> v, Vec128<int32_t, N> bits) {
   4377  return Vec128<int32_t, N>{__lsx_vsra_w(v.raw, bits.raw)};
   4378 }
   4379 template <size_t N>
   4380 HWY_API Vec128<int64_t, N> Shr(Vec128<int64_t, N> v, Vec128<int64_t, N> bits) {
   4381  return Vec128<int64_t, N>{__lsx_vsra_d(v.raw, bits.raw)};
   4382 }
   4383 
   4384 }  // namespace detail
   4385 
   4386 template <typename T, size_t N>
   4387 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
   4388  return detail::Shr(v, bits);
   4389 }
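
// The detail::Shr overloads above pick a logical shift (vsrl.*) for unsigned
// lanes and an arithmetic shift (vsra.*) for signed lanes, so operator>>
// follows the semantics of >> on the underlying lane type.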
   4390 
   4391 // ================================================== CONVERT (2)
   4392 
   4393 // ------------------------------ PromoteEvenTo/PromoteOddTo
   4394 #include "hwy/ops/inside-inl.h"
   4395 
   4396 // Generic for all vector lengths.
   4397 template <class DF, HWY_IF_F32_D(DF),
   4398          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
   4399 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
   4400  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
   4401                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
   4402 }
   4403 
   4404 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
   4405          class V16 = VFromD<RepartitionToNarrow<D32>>>
   4406 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) {
   4407  __m128i _tmp = __lsx_vmulwev_w_h(a.raw, b.raw);
   4408  return VFromD<D32>{__lsx_vmaddwod_w_h(_tmp, a.raw, b.raw)};
   4409 }
   4410 
   4411 template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16),
   4412          class VU16 = VFromD<RepartitionToNarrow<DU32>>>
   4413 HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 /* tag */, VU16 a, VU16 b) {
   4414  __m128i _tmp = __lsx_vmulwev_w_hu(a.raw, b.raw);
   4415  return VFromD<DU32>{__lsx_vmaddwod_w_hu(_tmp, a.raw, b.raw)};
   4416 }
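
// In both integer overloads above, the vmulwev form widens and multiplies
// the even-indexed 16-bit lanes and the vmaddwod form accumulates the
// odd-indexed products, so each 32-bit lane of the result is
// a[2i]*b[2i] + a[2i+1]*b[2i+1].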
   4417 
   4418 // ------------------------------ ReorderWidenMulAccumulate
   4419 
   4420 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
   4421          class V16 = VFromD<RepartitionToNarrow<D32>>>
   4422 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b,
   4423                                              const VFromD<D32> sum0,
   4424                                              VFromD<D32>& /* sum1 */) {
   4425  return VFromD<D32>{__lsx_vmaddwev_w_h(
   4426      __lsx_vmaddwod_w_h(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
   4427 }
   4428 
   4429 template <class DU32, HWY_IF_U32_D(DU32),
   4430          class VU16 = VFromD<RepartitionToNarrow<DU32>>>
   4431 HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 /* tag */, VU16 a, VU16 b,
   4432                                               const VFromD<DU32> sum0,
   4433                                               VFromD<DU32>& /* sum1 */) {
   4434  return VFromD<DU32>{__lsx_vmaddwev_w_hu(
   4435      __lsx_vmaddwod_w_hu(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
   4436 }
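
// sum1 is unused above because vmaddwev/vmaddwod fold both pairwise products
// directly into sum0; the odd+even invariant needed by
// RearrangeToOddPlusEven therefore already holds (see below).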
   4437 
   4438 // ------------------------------ RearrangeToOddPlusEven
   4439 template <size_t N>
   4440 HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0,
   4441                                                  Vec128<int32_t, N> /*sum1*/) {
   4442  return sum0;  // invariant already holds
   4443 }
   4444 
   4445 template <size_t N>
   4446 HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
   4447    const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
   4448  return sum0;  // invariant already holds
   4449 }
   4450 
   4451 template <class VW>
   4452 HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
   4453  return Add(sum0, sum1);
   4454 }
   4455 
   4456 // ------------------------------ Demotions
   4457 
   4458 // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
   4459 // hwy::EnableIf<false>* = nullptr to avoid compiler errors. Because the
   4460 // condition depends on the template argument V, the always-false
   4461 // !hwy::IsSame<V, V>() causes SFINAE (substitution failure) rather than
   4462 // a hard error.
   4463 #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
   4464 #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
   4465  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr
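
// Redefined to an always-false condition so the generic U->I demotion
// fallbacks are never selected; this target provides native saturating
// narrowing (vssrlni.*) overloads for those cases below.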
   4466 
   4467 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
   4468 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
   4469  return VFromD<D>{__lsx_vssrani_b_h(v.raw, v.raw, 0)};
   4470 }
   4471 
   4472 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
   4473 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
   4474  return VFromD<D>{__lsx_vssrani_bu_h(v.raw, v.raw, 0)};
   4475 }
   4476 
   4477 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
   4478 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
   4479  return VFromD<D>{__lsx_vssrlni_b_h(v.raw, v.raw, 0)};
   4480 }
   4481 
   4482 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
   4483 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
   4484  return VFromD<D>{__lsx_vssrlni_bu_h(v.raw, v.raw, 0)};
   4485 }
   4486 
   4487 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
   4488 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
   4489  return VFromD<D>{__lsx_vssrani_h_w(v.raw, v.raw, 0)};
   4490 }
   4491 
   4492 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
   4493 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
   4494  return VFromD<D>{__lsx_vssrani_hu_w(v.raw, v.raw, 0)};
   4495 }
   4496 
   4497 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
   4498 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
   4499  return VFromD<D>{__lsx_vssrlni_h_w(v.raw, v.raw, 0)};
   4500 }
   4501 
   4502 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
   4503 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
   4504  return VFromD<D>{__lsx_vssrlni_hu_w(v.raw, v.raw, 0)};
   4505 }
   4506 
   4507 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
   4508 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
   4509  return VFromD<D>{__lsx_vssrani_w_d(v.raw, v.raw, 0)};
   4510 }
   4511 
   4512 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
   4513 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
   4514  return VFromD<D>{__lsx_vssrani_wu_d(v.raw, v.raw, 0)};
   4515 }
   4516 
   4517 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
   4518 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
   4519  return VFromD<D>{__lsx_vssrlni_w_d(v.raw, v.raw, 0)};
   4520 }
   4521 
   4522 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
   4523 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
   4524  return VFromD<D>{__lsx_vssrlni_wu_d(v.raw, v.raw, 0)};
   4525 }
   4526 
   4527 // UI->UI DemoteTo for the case where sizeof(TFromD<D>) <= sizeof(TFromV<V>)/4;
   4528 // generic for all vector lengths.
   4529 template <class DN, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN),
   4530          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
   4531          HWY_IF_T_SIZE_LE_D(DN, sizeof(TFromV<V>) / 4)>
   4532 HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
   4533  using T = TFromV<V>;
   4534  using TN = TFromD<DN>;
   4535 
   4536  using TDemoteTo =
   4537      MakeNarrow<If<IsSigned<T>() && IsSigned<TN>(), T, MakeUnsigned<T>>>;
   4538  return DemoteTo(dn, DemoteTo(Rebind<TDemoteTo, DN>(), v));
   4539 }
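
// For example, demoting int32_t to uint8_t first saturates to uint16_t (the
// unsigned intermediate is chosen because source and destination do not
// share signedness), then narrows uint16_t -> uint8_t, each step using the
// native overloads above.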
   4540 
   4541 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
   4542 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
   4543  return VFromD<D>{__lsx_vfcvt_h_s(v.raw, v.raw)};
   4544 }
   4545 
   4546 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
   4547 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
   4548  return VFromD<D>{__lsx_vfcvt_s_d(v.raw, v.raw)};
   4549 }
   4550 
   4551 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
   4552 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
   4553  return VFromD<D>{__lsx_vftintrz_w_d(
   4554      reinterpret_cast<__m128d>(__lsx_vreplgr2vr_w(0)), v.raw)};
   4555 }
   4556 
   4557 template <class D, HWY_IF_U32_D(D)>
   4558 HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
   4559  const Rebind<uint64_t, decltype(du32)> du64;
   4560  return DemoteTo(du32, ConvertTo(du64, v));
   4561 }
   4562 
   4563 template <class D, HWY_IF_F32_D(D)>
   4564 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
   4565  const Rebind<double, decltype(df32)> df64;
   4566  const RebindToUnsigned<decltype(df64)> du64;
   4567  const RebindToSigned<decltype(df32)> di32;
   4568  const RebindToUnsigned<decltype(df32)> du32;
   4569 
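 // Exact i64 -> f32 via f64: 27670116110564327424.0 is 2^64 + 2^63. The
 // xor-and-subtract with this magic constant turns the upper 52 bits of v
 // (sign included) into the exact f64 value hi52 * 2^12; the low 12 bits
 // are added back, and the inexact-sum adjustment below emulates
 // round-to-odd so the final f64 -> f32 demotion rounds correctly.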
   4570  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
   4571  const auto f64_hi52 =
   4572      Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
   4573  const auto f64_lo12 =
   4574      PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
   4575                                        Set(du32, uint32_t{0x00000FFF}))));
   4576 
   4577  const auto f64_sum = f64_hi52 + f64_lo12;
   4578  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
   4579 
   4580  const auto f64_sum_is_inexact =
   4581      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
   4582  const auto f64_bits_decrement =
   4583      And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
   4584          f64_sum_is_inexact);
   4585 
   4586  const auto adj_f64_val = BitCast(
   4587      df64,
   4588      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));
   4589 
   4590  return DemoteTo(df32, adj_f64_val);
   4591 }
   4592 
   4593 template <class D, HWY_IF_F32_D(D)>
   4594 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
   4595  const Rebind<double, decltype(df32)> df64;
   4596  const RebindToUnsigned<decltype(df64)> du64;
   4597  const RebindToSigned<decltype(df32)> di32;
   4598  const RebindToUnsigned<decltype(df32)> du32;
   4599 
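 // Same scheme as the int64_t overload above, with magic constant
 // 18446744073709551616.0 = 2^64 since no sign bit is involved.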
   4600  const auto k2p64 = Set(df64, 18446744073709551616.0);
   4601  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
   4602  const auto f64_lo12 =
   4603      PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
   4604                                        Set(du32, uint32_t{0x00000FFF}))));
   4605 
   4606  const auto f64_sum = f64_hi52 + f64_lo12;
   4607  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
   4608  const auto f64_sum_is_inexact =
   4609      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
   4610 
   4611  const auto adj_f64_val = BitCast(
   4612      df64,
   4613      Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
   4614         f64_sum_is_inexact));
   4615 
   4616  return DemoteTo(df32, adj_f64_val);
   4617 }
   4618 
   4619 // ------------------------------ ReorderDemote2To
   4620 
   4621 // ReorderDemote2To for 8-byte UI64->UI32, <= 4-byte UI32->UI16,
   4622 // and <= 4-byte UI16->UI8
   4623 template <class DN, class V,
   4624          HWY_IF_V_SIZE_LE_D(DN, ((sizeof(TFromD<DN>) <= 2 ? 4 : 8))),
   4625          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
   4626          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   4627          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   4628 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   4629  const DFromV<decltype(a)> d;
   4630  const Twice<decltype(d)> dt;
   4631  return DemoteTo(dn, Combine(dt, b, a));
   4632 }
   4633 
   4634 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
   4635 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
   4636                                   Vec128<int16_t> b) {
   4637  return VFromD<D>{__lsx_vssrani_b_h(b.raw, a.raw, 0)};
   4638 }
   4639 
   4640 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
   4641 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
   4642                                   Vec128<int16_t> b) {
   4643  return VFromD<D>{__lsx_vssrani_bu_h(b.raw, a.raw, 0)};
   4644 }
   4645 
   4646 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
   4647 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint16_t> a,
   4648                                   Vec128<uint16_t> b) {
   4649  return VFromD<D>{__lsx_vssrlni_b_h(b.raw, a.raw, 0)};
   4650 }
   4651 
   4652 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
   4653 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint16_t> a,
   4654                                   Vec128<uint16_t> b) {
   4655  return VFromD<D>{__lsx_vssrlni_bu_h(b.raw, a.raw, 0)};
   4656 }
   4657 
   4658 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
   4659 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
   4660                                   Vec128<int32_t> b) {
   4661  return VFromD<D>{__lsx_vssrani_h_w(b.raw, a.raw, 0)};
   4662 }
   4663 
   4664 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
   4665 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
   4666                                   Vec128<int32_t> b) {
   4667  return VFromD<D>{__lsx_vssrani_hu_w(b.raw, a.raw, 0)};
   4668 }
   4669 
   4670 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
   4671 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint32_t> a,
   4672                                   Vec128<uint32_t> b) {
   4673  return VFromD<D>{__lsx_vssrlni_h_w(b.raw, a.raw, 0)};
   4674 }
   4675 
   4676 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
   4677 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint32_t> a,
   4678                                   Vec128<uint32_t> b) {
   4679  return VFromD<D>{__lsx_vssrlni_hu_w(b.raw, a.raw, 0)};
   4680 }
   4681 
   4682 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
   4683 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int64_t> a,
   4684                                   Vec128<int64_t> b) {
   4685  return VFromD<D>{__lsx_vssrani_w_d(b.raw, a.raw, 0)};
   4686 }
   4687 
   4688 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
   4689 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int64_t> a,
   4690                                   Vec128<int64_t> b) {
   4691  return VFromD<D>{__lsx_vssrani_wu_d(b.raw, a.raw, 0)};
   4692 }
   4693 
   4694 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
   4695 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint64_t> a,
   4696                                   Vec128<uint64_t> b) {
   4697  return VFromD<D>{__lsx_vssrlni_w_d(b.raw, a.raw, 0)};
   4698 }
   4699 
   4700 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
   4701 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint64_t> a,
   4702                                   Vec128<uint64_t> b) {
   4703  return VFromD<D>{__lsx_vssrlni_wu_d(b.raw, a.raw, 0)};
   4704 }
   4705 
   4706 // 8-byte UI32->UI16 and UI16->UI8 ReorderDemote2To
   4707 template <class DN, class V, HWY_IF_V_SIZE_D(DN, 8),
   4708          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
   4709          HWY_IF_T_SIZE_LE_D(DN, 2), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   4710          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   4711 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   4712  const Twice<DFromV<V>> dt;
   4713  const Twice<decltype(dn)> dt_n;
   4714 
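 // After the full-width ReorderDemote2To below, the valid results sit in
 // 32-bit words 0 and 2; vshuf4i_w with imm 0x88 (lane pattern 0,2,0,2)
 // gathers them into the low 64 bits.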
   4715  const auto demote2_result =
   4716      ReorderDemote2To(dt_n, ResizeBitCast(dt, a), ResizeBitCast(dt, b));
   4717  return VFromD<DN>{__lsx_vshuf4i_w(demote2_result.raw, 0x88)};
   4718 }
   4719 
   4720 template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16),
   4721          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
   4722          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
   4723          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
   4724          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   4725 HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
   4726  return ReorderDemote2To(d, a, b);
   4727 }
   4728 
   4729 template <size_t N>
   4730 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
   4731  const DFromV<decltype(v)> du32;
   4732  const Rebind<uint8_t, decltype(du32)> du8;
   4733  return DemoteTo(du8, BitCast(du32, v));
   4734 }
   4735 
   4736 // ------------------------------ F32->UI64 PromoteTo
   4737 
   4738 // f32 -> i64
   4739 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   4740 HWY_API VFromD<D> PromoteTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
   4741  return VFromD<D>{__lsx_vftintrzl_l_s(v.raw)};
   4742 }
   4743 
   4744 // F32->U64 PromoteTo generic for all vector lengths
   4745 template <class D, HWY_IF_U64_D(D)>
   4746 HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
   4747  const RebindToFloat<decltype(du64)> df64;
   4748  return ConvertTo(du64, PromoteTo(df64, v));
   4749 }
   4750 
   4751 // ------------------------------ MulFixedPoint15
   4752 
   4753 template <size_t N>
   4754 HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
   4755                                           const Vec128<int16_t, N> b) {
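 // Computes the Q15 product (a*b + (1 << 14)) >> 15 with saturation: the
 // even/odd widening multiplies yield 32-bit products, vilvl/vilvh restore
 // lane order, and vssrarni_h_w narrows with rounding. E.g. a = 16384 (0.5)
 // and b = 8192 (0.25) give 134217728 >> 15 = 4096 (0.125).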
   4756  __m128i temp_ev = __lsx_vmulwev_w_h(a.raw, b.raw);
   4757  __m128i temp_od = __lsx_vmulwod_w_h(a.raw, b.raw);
   4758  __m128i temp1 = __lsx_vilvl_w(temp_od, temp_ev);
   4759  __m128i temp2 = __lsx_vilvh_w(temp_od, temp_ev);
   4760  return Vec128<int16_t, N>{__lsx_vssrarni_h_w(temp2, temp1, 15)};
   4761 }
   4762 
   4763 // ------------------------------ Truncations
   4764 
   4765 template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
   4766 HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) {
   4767  const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto;
   4768  return VFromD<DTo>{BitCast(dto, v).raw};
   4769 }
   4770 
   4771 template <class D, HWY_IF_U8_D(D)>
   4772 HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
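 // vextrins_b imm 0x18 inserts byte 8 (the low byte of the upper u64 lane)
 // at byte 1, so bytes 0..1 hold the truncated values of both lanes.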
   4773  return Vec16<uint8_t>{__lsx_vextrins_b(v.raw, v.raw, 0x18)};
   4774 }
   4775 
   4776 template <class D, HWY_IF_U16_D(D)>
   4777 HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
   4778  return Vec32<uint16_t>{__lsx_vextrins_h(v.raw, v.raw, 0x14)};
   4779 }
   4780 
   4781 template <class D, HWY_IF_U32_D(D)>
   4782 HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
   4783  return Vec64<uint32_t>{__lsx_vpickev_w(v.raw, v.raw)};
   4784 }
   4785 
   4786 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
   4787 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
   4788  __m128i v_ev = __lsx_vpickev_b(v.raw, v.raw);
   4789  return VFromD<D>{__lsx_vpickev_b(v_ev, v_ev)};
   4790 }
   4791 
   4792 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
   4793 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
   4794  return VFromD<D>{__lsx_vpickev_h(v.raw, v.raw)};
   4795 }
   4796 
   4797 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
   4798 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
   4799  return VFromD<D>{__lsx_vpickev_b(v.raw, v.raw)};
   4800 }
   4801 
   4802 // ------------------------------ int -> float ConvertTo
   4803 
   4804 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   4805 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
   4806  return VFromD<D>{__lsx_vffint_s_w(v.raw)};
   4807 }
   4808 
   4809 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
   4810 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
   4811  return VFromD<D>{__lsx_vffint_s_wu(v.raw)};
   4812 }
   4813 
   4814 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   4815 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
   4816  return VFromD<D>{__lsx_vffint_d_l(v.raw)};
   4817 }
   4818 
   4819 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
   4820 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
   4821  return VFromD<D>{__lsx_vffint_d_lu(v.raw)};
   4822 }
   4823 
   4824 // ------------------------------ float -> int ConvertTo
   4825 
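// The vftintrz.* forms used below round toward zero (truncate), as
// Highway's float -> int ConvertTo requires.
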
   4826 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
   4827 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
   4828  return VFromD<D>{__lsx_vftintrz_w_s(v.raw)};
   4829 }
   4830 
   4831 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
   4832 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
   4833  return VFromD<D>{__lsx_vftintrz_wu_s(v.raw)};
   4834 }
   4835 
   4836 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
   4837 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<double, D>> v) {
   4838  return VFromD<D>{__lsx_vftintrz_l_d(v.raw)};
   4839 }
   4840 
   4841 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
   4842 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<double, D>> v) {
   4843  return VFromD<D>{__lsx_vftintrz_lu_d(v.raw)};
   4844 }
   4845 
   4846 // ------------------------------ NearestInt (Round)
   4847 
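// vftintrne.* converts with round-to-nearest, ties-to-even, matching the
// behavior of Round() in the next section.
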
   4848 template <size_t N>
   4849 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
   4850  return Vec128<int32_t, N>{__lsx_vftintrne_w_s(v.raw)};
   4851 }
   4852 
   4853 template <size_t N>
   4854 HWY_API Vec128<int64_t, N> NearestInt(const Vec128<double, N> v) {
   4855  return Vec128<int64_t, N>{__lsx_vftintrne_l_d(v.raw)};
   4856 }
   4857 
   4858 template <class DI32, HWY_IF_I32_D(DI32)>
   4859 HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
   4860                                        VFromD<Rebind<double, DI32>> v) {
   4861  return DemoteTo(di32, NearestInt(v));
   4862 }
   4863 
   4864 // ------------------------------ Floating-point rounding
   4865 
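// Suffix legend for the vfrint* forms below: rne = to nearest (ties to
// even), rz = toward zero, rp = toward +inf (ceil), rm = toward -inf
// (floor).
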
   4866 template <size_t N>
   4867 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
   4868  return Vec128<float, N>{__lsx_vfrintrne_s(v.raw)};
   4869 }
   4870 template <size_t N>
   4871 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
   4872  return Vec128<double, N>{__lsx_vfrintrne_d(v.raw)};
   4873 }
   4874 template <size_t N>
   4875 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
   4876  return Vec128<float, N>{__lsx_vfrintrz_s(v.raw)};
   4877 }
   4878 template <size_t N>
   4879 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
   4880  return Vec128<double, N>{__lsx_vfrintrz_d(v.raw)};
   4881 }
   4882 template <size_t N>
   4883 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
   4884  return Vec128<float, N>{__lsx_vfrintrp_s(v.raw)};
   4885 }
   4886 template <size_t N>
   4887 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
   4888  return Vec128<double, N>{__lsx_vfrintrp_d(v.raw)};
   4889 }
   4890 // Toward -infinity, aka floor
   4891 template <size_t N>
   4892 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
   4893  return Vec128<float, N>{__lsx_vfrintrm_s(v.raw)};
   4894 }
   4895 template <size_t N>
   4896 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
   4897  return Vec128<double, N>{__lsx_vfrintrm_d(v.raw)};
   4898 }
   4899 
   4900 // ------------------------------ Floating-point classification
   4901 
   4902 // FIXME: disable gcc-14 tree-based loop optimizations to prevent
   4903 // 'HighwayTestGroup/HighwayTest.TestAllIsNaN/LSX' failures
   4904 #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
   4905 #pragma GCC push_options
   4906 #pragma GCC optimize("-fno-tree-loop-optimize")
   4907 #endif
   4908 
   4909 template <size_t N>
   4910 HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
   4911  return Mask128<float, N>{
   4912      reinterpret_cast<__m128>(__lsx_vfcmp_cune_s(v.raw, v.raw))};
   4913 }
   4914 
   4915 template <size_t N>
   4916 HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
   4917  return Mask128<double, N>{
   4918      reinterpret_cast<__m128d>(__lsx_vfcmp_cune_d(v.raw, v.raw))};
   4919 }
   4920 
   4921 #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
   4922 #pragma GCC pop_options
   4923 #endif
   4924 
   4925 #ifdef HWY_NATIVE_IS_EITHER_NAN
   4926 #undef HWY_NATIVE_IS_EITHER_NAN
   4927 #else
   4928 #define HWY_NATIVE_IS_EITHER_NAN
   4929 #endif
   4930 
   4931 template <size_t N>
   4932 HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) {
   4933  return Mask128<float, N>{
   4934      reinterpret_cast<__m128>(__lsx_vfcmp_cun_s(a.raw, b.raw))};
   4935 }
   4936 
   4937 template <size_t N>
   4938 HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a,
   4939                                       Vec128<double, N> b) {
   4940  __m128i _tmp = __lsx_vor_v(__lsx_vfcmp_cune_d(a.raw, a.raw),
   4941                             __lsx_vfcmp_cune_d(b.raw, b.raw));
   4942  return Mask128<double, N>{reinterpret_cast<__m128d>(_tmp)};
   4943 }
   4944 
   4945 #ifdef HWY_NATIVE_ISINF
   4946 #undef HWY_NATIVE_ISINF
   4947 #else
   4948 #define HWY_NATIVE_ISINF
   4949 #endif
   4950 
   4951 template <class V>
   4952 HWY_API MFromD<DFromV<V>> IsInf(V v) {
   4953  using T = TFromV<V>;
   4954 
   4955  static_assert(IsFloat<T>(), "Only for float");
   4956  using TU = MakeUnsigned<T>;
   4957  const DFromV<decltype(v)> d;
   4958  const RebindToUnsigned<decltype(d)> du;
   4959  const VFromD<decltype(du)> vu = BitCast(du, v);
   4960  // 'Shift left' to clear the sign bit, check for exponent=max and
   4961  // mantissa=0.
   4962  return RebindMask(
   4963      d,
   4964      Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
   4965 }
   4966 
   4967 // Returns whether normal/subnormal/zero.
   4968 template <class V>
   4969 HWY_API MFromD<DFromV<V>> IsFinite(V v) {
   4970  using T = TFromV<V>;
   4971 
   4972  static_assert(IsFloat<T>(), "Only for float");
   4973  using TU = MakeUnsigned<T>;
   4974  const DFromV<decltype(v)> d;
   4975  const RebindToUnsigned<decltype(d)> du;
   4976  const VFromD<decltype(du)> vu = BitCast(du, v);
   4977  // 'Shift left' to clear the sign bit, check for exponent<max.
   4978  return RebindMask(
   4979      d,
   4980      Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
   4981 }
   4982 
   4983 // ================================================== MISC
   4984 
   4985 // ------------------------------ LoadMaskBits (TestBit)
   4986 
   4987 namespace detail {
   4988 
   4989 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   4990 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
   4991  const RebindToUnsigned<decltype(d)> du;
   4992  // Easier than Set(), which would require a >8-bit type, which would not
   4993  // compile for T=uint8_t, N=1.
   4994  const VFromD<D> vbits{__lsx_vreplgr2vr_w(static_cast<int32_t>(bits))};
   4995 
   4996  // Replicate bytes 8x such that each byte contains the bit that governs it.
   4997  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
   4998                                                    1, 1, 1, 1, 1, 1, 1, 1};
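 // Example: bit 9 of `bits` governs lane 9: kRep8[9] = 1 selects the byte
 // holding bits[15:8], and kBit[9] = 2 then tests bit 1 within it.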
   4999  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
   5000 
   5001  alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
   5002                                                   1, 2, 4, 8, 16, 32, 64, 128};
   5003  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
   5004 }
   5005 
   5006 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   5007 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
   5008  const RebindToUnsigned<decltype(d)> du;
   5009  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
   5010  return RebindMask(
   5011      d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
   5012 }
   5013 
   5014 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   5015 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
   5016  const RebindToUnsigned<decltype(d)> du;
   5017  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
   5018  return RebindMask(
   5019      d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
   5020 }
   5021 
   5022 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   5023 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
   5024  const RebindToUnsigned<decltype(d)> du;
   5025  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
   5026  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
   5027 }
   5028 
   5029 }  // namespace detail
   5030 
   5031 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5032 HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
   5033  uint64_t mask_bits = 0;
   5034  CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits);
   5035  return detail::LoadMaskBits(d, mask_bits);
   5036 }
   5037 
   5038 // ------------------------------ Dup128MaskFromMaskBits
   5039 
   5040 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5041 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   5042  constexpr size_t kN = MaxLanes(d);
   5043  if (kN < 8) mask_bits &= (1u << kN) - 1;
   5044  return detail::LoadMaskBits(d, mask_bits);
   5045 }
   5046 
   5047 template <typename T>
   5048 struct CompressIsPartition {
   5049  enum { value = (sizeof(T) != 1) };
   5050 };
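
// For lane sizes larger than one byte, Compress is a partition: the
// unselected lanes are written after the selected ones. The 8-bit path does
// not guarantee this, hence value = 0 for sizeof(T) == 1.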
   5051 
   5052 // ------------------------------ BitsFromMask
   5053 
   5054 namespace detail {
   5055 
   5056 template <class D>
   5057 constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
   5058  return (d.MaxBytes() >= 16) ? mask_bits
   5059                              : mask_bits & ((1ull << d.MaxLanes()) - 1);
   5060 }
   5061 
   5062 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
   5063  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
   5064 }
   5065 
   5066 }  // namespace detail
   5067 
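// vmskltz.{b,h,w,d} packs each lane's sign (MSB) bit into the low bits of
// element 0, which is exactly Highway's mask-bit format; detail::OnlyActive
// then clears bits of lanes beyond MaxLanes for partial vectors.
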
   5068 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
   5069 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   5070  return detail::OnlyActive(
   5071      d, detail::U64FromInt(__lsx_vpickve2gr_w(__lsx_vmskltz_b(mask.raw), 0)));
   5072 }
   5073 
   5074 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
   5075 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   5076  return detail::OnlyActive(
   5077      d, detail::U64FromInt(__lsx_vpickve2gr_w(__lsx_vmskltz_h(mask.raw), 0)));
   5078 }
   5079 
   5080 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
   5081 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   5082  return detail::OnlyActive(
   5083      d, detail::U64FromInt(__lsx_vpickve2gr_w(
   5084             __lsx_vmskltz_w(reinterpret_cast<__m128i>(mask.raw)), 0)));
   5085 }
   5086 
   5087 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
   5088 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
   5089  return detail::OnlyActive(
   5090      d, detail::U64FromInt(__lsx_vpickve2gr_w(
   5091             __lsx_vmskltz_d(reinterpret_cast<__m128i>(mask.raw)), 0)));
   5092 }
   5093 
   5094 // ------------------------------ StoreMaskBits
   5095 // `bits` points to at least 8 writable bytes.
   5096 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5097 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
   5098  constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
   5099  const uint64_t mask_bits = BitsFromMask(d, mask);
   5100  CopyBytes<kNumBytes>(&mask_bits, bits);
   5101  return kNumBytes;
   5102 }
   5103 
   5104 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5105 HWY_API bool AllFalse(D d, MFromD<D> mask) {
   5106  return BitsFromMask(d, mask) == 0;
   5107 }
   5108 
   5109 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5110 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   5111  constexpr size_t kN = MaxLanes(d);
   5112  constexpr uint64_t kAllBits = (1ull << kN) - 1;
   5113  return BitsFromMask(d, mask) == kAllBits;
   5114 }
   5115 
   5116 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5117 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
   5118  return PopCount(BitsFromMask(d, mask));
   5119 }
   5120 
   5121 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5122 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
   5123  return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask));
   5124 }
   5125 
   5126 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5127 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
   5128  const uint64_t mask_bits = BitsFromMask(d, mask);
   5129  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
   5130 }
   5131 
   5132 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5133 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
   5134  return 31 - Num0BitsAboveMS1Bit_Nonzero32(
   5135                  static_cast<uint32_t>(BitsFromMask(d, mask)));
   5136 }
   5137 
   5138 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   5139 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
   5140  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   5141  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
   5142                   : -1;
   5143 }
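
// (Using 32-bit bit scans above is sufficient: a 128-bit vector has at most
// 16 mask bits.)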
   5144 
   5145 // ------------------------------ Compress, CompressBits
   5146 
   5147 namespace detail {
   5148 
   5149 // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
   5150 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   5151 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
   5152  HWY_DASSERT(mask_bits < 256);
   5153  const Rebind<uint8_t, decltype(d)> d8;
   5154  const Twice<decltype(d8)> d8t;
   5155  const RebindToUnsigned<decltype(d)> du;
   5156 
   5157  alignas(16) static constexpr uint8_t table[2048] = {
   5158      // PrintCompress16x8Tables
   5159      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
   5160      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
   5161      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
   5162      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
   5163      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0, 6,  2,  4,  8,  10, 12, 14,  //
   5164      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0, 2,  6,  4,  8,  10, 12, 14,  //
   5165      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0, 4,  6,  2,  8,  10, 12, 14,  //
   5166      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
   5167      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0, 8,  2,  4,  6,  10, 12, 14,  //
   5168      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0, 2,  8,  4,  6,  10, 12, 14,  //
   5169      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0, 4,  8,  2,  6,  10, 12, 14,  //
   5170      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0, 2,  4,  8,  6,  10, 12, 14,  //
   5171      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0, 6,  8,  2,  4,  10, 12, 14,  //
   5172      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0, 2,  6,  8,  4,  10, 12, 14,  //
   5173      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0, 4,  6,  8,  2,  10, 12, 14,  //
   5174      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
   5175      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0, 10, 2,  4,  6,  8,  12, 14,  //
   5176      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0, 2,  10, 4,  6,  8,  12, 14,  //
   5177      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0, 4,  10, 2,  6,  8,  12, 14,  //
   5178      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0, 2,  4,  10, 6,  8,  12, 14,  //
   5179      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0, 6,  10, 2,  4,  8,  12, 14,  //
   5180      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0, 2,  6,  10, 4,  8,  12, 14,  //
   5181      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0, 4,  6,  10, 2,  8,  12, 14,  //
   5182      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0, 2,  4,  6,  10, 8,  12, 14,  //
   5183      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0, 8,  10, 2,  4,  6,  12, 14,  //
   5184      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0, 2,  8,  10, 4,  6,  12, 14,  //
   5185      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0, 4,  8,  10, 2,  6,  12, 14,  //
   5186      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0, 2,  4,  8,  10, 6,  12, 14,  //
   5187      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0, 6,  8,  10, 2,  4,  12, 14,  //
   5188      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0, 2,  6,  8,  10, 4,  12, 14,  //
   5189      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0, 4,  6,  8,  10, 2,  12, 14,  //
   5190      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
   5191      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0, 12, 2,  4,  6,  8,  10, 14,  //
   5192      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0, 2,  12, 4,  6,  8,  10, 14,  //
   5193      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0, 4,  12, 2,  6,  8,  10, 14,  //
   5194      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0, 2,  4,  12, 6,  8,  10, 14,  //
   5195      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0, 6,  12, 2,  4,  8,  10, 14,  //
   5196      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0, 2,  6,  12, 4,  8,  10, 14,  //
   5197      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0, 4,  6,  12, 2,  8,  10, 14,  //
   5198      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0, 2,  4,  6,  12, 8,  10, 14,  //
   5199      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0, 8,  12, 2,  4,  6,  10, 14,  //
   5200      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0, 2,  8,  12, 4,  6,  10, 14,  //
   5201      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0, 4,  8,  12, 2,  6,  10, 14,  //
   5202      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0, 2,  4,  8,  12, 6,  10, 14,  //
   5203      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0, 6,  8,  12, 2,  4,  10, 14,  //
   5204      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0, 2,  6,  8,  12, 4,  10, 14,  //
   5205      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0, 4,  6,  8,  12, 2,  10, 14,  //
   5206      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0, 2,  4,  6,  8,  12, 10, 14,  //
   5207      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0, 10, 12, 2,  4,  6,  8,  14,  //
   5208      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0, 2,  10, 12, 4,  6,  8,  14,  //
   5209      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0, 4,  10, 12, 2,  6,  8,  14,  //
   5210      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0, 2,  4,  10, 12, 6,  8,  14,  //
   5211      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0, 6,  10, 12, 2,  4,  8,  14,  //
   5212      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0, 2,  6,  10, 12, 4,  8,  14,  //
   5213      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0, 4,  6,  10, 12, 2,  8,  14,  //
   5214      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0, 2,  4,  6,  10, 12, 8,  14,  //
   5215      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0, 8,  10, 12, 2,  4,  6,  14,  //
   5216      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0, 2,  8,  10, 12, 4,  6,  14,  //
   5217      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0, 4,  8,  10, 12, 2,  6,  14,  //
   5218      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0, 2,  4,  8,  10, 12, 6,  14,  //
   5219      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0, 6,  8,  10, 12, 2,  4,  14,  //
   5220      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0, 2,  6,  8,  10, 12, 4,  14,  //
   5221      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0, 4,  6,  8,  10, 12, 2,  14,  //
   5222      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
   5223      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0, 14, 2,  4,  6,  8,  10, 12,  //
   5224      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0, 2,  14, 4,  6,  8,  10, 12,  //
   5225      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0, 4,  14, 2,  6,  8,  10, 12,  //
   5226      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0, 2,  4,  14, 6,  8,  10, 12,  //
   5227      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0, 6,  14, 2,  4,  8,  10, 12,  //
   5228      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0, 2,  6,  14, 4,  8,  10, 12,  //
   5229      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0, 4,  6,  14, 2,  8,  10, 12,  //
   5230      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0, 2,  4,  6,  14, 8,  10, 12,  //
   5231      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0, 8,  14, 2,  4,  6,  10, 12,  //
   5232      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0, 2,  8,  14, 4,  6,  10, 12,  //
   5233      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0, 4,  8,  14, 2,  6,  10, 12,  //
   5234      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0, 2,  4,  8,  14, 6,  10, 12,  //
   5235      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0, 6,  8,  14, 2,  4,  10, 12,  //
   5236      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0, 2,  6,  8,  14, 4,  10, 12,  //
   5237      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0, 4,  6,  8,  14, 2,  10, 12,  //
   5238      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0, 2,  4,  6,  8,  14, 10, 12,  //
   5239      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0, 10, 14, 2,  4,  6,  8,  12,  //
   5240      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0, 2,  10, 14, 4,  6,  8,  12,  //
   5241      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0, 4,  10, 14, 2,  6,  8,  12,  //
   5242      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0, 2,  4,  10, 14, 6,  8,  12,  //
   5243      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0, 6,  10, 14, 2,  4,  8,  12,  //
   5244      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0, 2,  6,  10, 14, 4,  8,  12,  //
   5245      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0, 4,  6,  10, 14, 2,  8,  12,  //
   5246      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0, 2,  4,  6,  10, 14, 8,  12,  //
   5247      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0, 8,  10, 14, 2,  4,  6,  12,  //
   5248      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0, 2,  8,  10, 14, 4,  6,  12,  //
   5249      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0, 4,  8,  10, 14, 2,  6,  12,  //
   5250      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0, 2,  4,  8,  10, 14, 6,  12,  //
   5251      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0, 6,  8,  10, 14, 2,  4,  12,  //
   5252      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0, 2,  6,  8,  10, 14, 4,  12,  //
   5253      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0, 4,  6,  8,  10, 14, 2,  12,  //
   5254      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0, 2,  4,  6,  8,  10, 14, 12,  //
   5255      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0, 12, 14, 2,  4,  6,  8,  10,  //
   5256      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0, 2,  12, 14, 4,  6,  8,  10,  //
   5257      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0, 4,  12, 14, 2,  6,  8,  10,  //
   5258      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0, 2,  4,  12, 14, 6,  8,  10,  //
   5259      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0, 6,  12, 14, 2,  4,  8,  10,  //
   5260      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0, 2,  6,  12, 14, 4,  8,  10,  //
   5261      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0, 4,  6,  12, 14, 2,  8,  10,  //
   5262      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0, 2,  4,  6,  12, 14, 8,  10,  //
   5263      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0, 8,  12, 14, 2,  4,  6,  10,  //
   5264      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0, 2,  8,  12, 14, 4,  6,  10,  //
   5265      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0, 4,  8,  12, 14, 2,  6,  10,  //
   5266      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0, 2,  4,  8,  12, 14, 6,  10,  //
   5267      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0, 6,  8,  12, 14, 2,  4,  10,  //
   5268      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0, 2,  6,  8,  12, 14, 4,  10,  //
   5269      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0, 4,  6,  8,  12, 14, 2,  10,  //
   5270      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0, 2,  4,  6,  8,  12, 14, 10,  //
   5271      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0, 10, 12, 14, 2,  4,  6,  8,   //
   5272      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0, 2,  10, 12, 14, 4,  6,  8,   //
   5273      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0, 4,  10, 12, 14, 2,  6,  8,   //
   5274      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0, 2,  4,  10, 12, 14, 6,  8,   //
   5275      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0, 6,  10, 12, 14, 2,  4,  8,   //
   5276      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0, 2,  6,  10, 12, 14, 4,  8,   //
   5277      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0, 4,  6,  10, 12, 14, 2,  8,   //
   5278      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0, 2,  4,  6,  10, 12, 14, 8,   //
   5279      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0, 8,  10, 12, 14, 2,  4,  6,   //
   5280      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0, 2,  8,  10, 12, 14, 4,  6,   //
   5281      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0, 4,  8,  10, 12, 14, 2,  6,   //
   5282      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0, 2,  4,  8,  10, 12, 14, 6,   //
   5283      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0, 6,  8,  10, 12, 14, 2,  4,   //
   5284      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0, 2,  6,  8,  10, 12, 14, 4,   //
   5285      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0, 4,  6,  8,  10, 12, 14, 2,   //
   5286      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0, 2,  4,  6,  8,  10, 12, 14};
   5287 
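         // Each table row holds, for every output u16 lane, the byte index of
         // the source lane's low byte. ZipLower duplicates each index into a
         // u16 pair, and adding 0x0100 increments the high copy, yielding
         // (i, i+1) byte-index pairs for the later TableLookupBytes.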
   5288  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
   5289  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
   5290  return BitCast(d, pairs + Set(du, 0x0100));
   5291 }
   5292 
   5293 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   5294 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
   5295  HWY_DASSERT(mask_bits < 256);
   5296  const Rebind<uint8_t, decltype(d)> d8;
   5297  const Twice<decltype(d8)> d8t;
   5298  const RebindToUnsigned<decltype(d)> du;
   5299 
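         // Rows are indexed by mask_bits; lanes whose mask bit is set are
         // moved behind the remaining lanes. E.g. row 1 is
         // {2,4,6,8,10,12,14,0}: lane 0 goes last, lanes 1..7 shift down.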
   5300  alignas(16) static constexpr uint8_t table[2048] = {
   5301      // PrintCompressNot16x8Tables
   5302      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
   5303      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
   5304      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
   5305      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
   5306      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
   5307      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
   5308      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
   5309      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
   5310      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
   5311      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
   5312      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
   5313      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
   5314      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
   5315      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
   5316      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
   5317      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
   5318      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
   5319      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
   5320      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
   5321      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
   5322      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
   5323      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
   5324      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
   5325      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
   5326      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
   5327      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
   5328      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
   5329      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
   5330      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
   5331      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
   5332      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
   5333      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
   5334      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
   5335      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
   5336      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
   5337      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
   5338      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
   5339      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
   5340      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
   5341      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
   5342      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
   5343      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
   5344      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
   5345      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
   5346      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
   5347      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
   5348      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
   5349      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
   5350      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
   5351      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
   5352      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
   5353      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
   5354      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
   5355      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
   5356      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
   5357      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
   5358      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
   5359      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
   5360      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
   5361      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
   5362      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
   5363      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
   5364      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
   5365      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
   5366      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
   5367      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
   5368      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
   5369      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
   5370      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
   5371      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
   5372      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
   5373      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
   5374      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
   5375      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
   5376      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
   5377      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
   5378      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
   5379      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
   5380      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
   5381      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
   5382      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
   5383      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
   5384      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
   5385      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
   5386      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
   5387      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
   5388      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
   5389      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
   5390      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
   5391      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
   5392      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
   5393      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
   5394      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
   5395      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
   5396      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
   5397      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
   5398      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
   5399      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
   5400      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
   5401      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
   5402      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
   5403      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
   5404      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
   5405      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
   5406      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
   5407      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
   5408      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
   5409      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
   5410      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
   5411      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
   5412      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
   5413      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
   5414      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
   5415      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
   5416      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
   5417      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
   5418      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
   5419      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
   5420      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
   5421      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
   5422      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
   5423      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
   5424      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
   5425      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
   5426      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
   5427      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
   5428      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
   5429      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
   5430 
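         // Same even-byte-index expansion as in IndicesFromBits128 above.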
   5431  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
   5432  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
   5433  return BitCast(d, pairs + Set(du, 0x0100));
   5434 }
   5435 
   5436 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   5437 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
   5438  HWY_DASSERT(mask_bits < 16);
   5439 
   5440  // There are only 4 lanes, so we can afford to load the index vector directly.
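         // E.g. mask_bits = 5 (lanes 0 and 2) selects the row
         // {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}: lanes 0 and 2 first.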
   5441  alignas(16) static constexpr uint8_t u8_indices[256] = {
   5442      // PrintCompress32x4Tables
   5443      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   5444      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   5445      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
   5446      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   5447      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
   5448      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
   5449      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
   5450      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   5451      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
   5452      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
   5453      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
   5454      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
   5455      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
   5456      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
   5457      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
   5458      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
   5459 
   5460  const Repartition<uint8_t, decltype(d)> d8;
   5461  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
   5462 }
   5463 
   5464 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   5465 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
   5466  HWY_DASSERT(mask_bits < 16);
   5467 
   5468  // There are only 4 lanes, so we can afford to load the index vector directly.
   5469  alignas(16) static constexpr uint8_t u8_indices[256] = {
   5470      // PrintCompressNot32x4Tables
   5471      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
   5472      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
   5473      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
   5474      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
   5475      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
   5476      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
   5477      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
   5478      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   5479      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
   5480      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
   5481      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
   5482      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
   5483      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
   5484      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
   5485      12, 13, 14, 15};
   5486 
   5487  const Repartition<uint8_t, decltype(d)> d8;
   5488  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
   5489 }
   5490 
   5491 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   5492 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
   5493  HWY_DASSERT(mask_bits < 4);
   5494 
   5495  // There are only 2 lanes, so we can afford to load the index vector directly.
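         // E.g. mask_bits = 2 selects {8..15, 0..7}: lane 1 moves to front.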
   5496  alignas(16) static constexpr uint8_t u8_indices[64] = {
   5497      // PrintCompress64x2Tables
   5498      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   5499      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   5500      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
   5501      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
   5502 
   5503  const Repartition<uint8_t, decltype(d)> d8;
   5504  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
   5505 }
   5506 
   5507 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   5508 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
   5509  HWY_DASSERT(mask_bits < 4);
   5510 
   5511  // There are only 2 lanes, so we can afford to load the index vector directly.
   5512  alignas(16) static constexpr uint8_t u8_indices[64] = {
   5513      // PrintCompressNot64x2Tables
   5514      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   5515      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
   5516      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   5517      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
   5518 
   5519  const Repartition<uint8_t, decltype(d)> d8;
   5520  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
   5521 }
   5522 
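        // Expand mask_bits into per-byte shuffle indices, then gather the
        // selected lanes via TableLookupBytes.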
   5523 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
   5524 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
   5525  const DFromV<decltype(v)> d;
   5526  const RebindToUnsigned<decltype(d)> du;
   5527 
   5528  HWY_DASSERT(mask_bits < (1ull << N));
   5529  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
   5530  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
   5531 }
   5532 
   5533 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
   5534 HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
   5535  const DFromV<decltype(v)> d;
   5536  const RebindToUnsigned<decltype(d)> du;
   5537 
   5538  HWY_DASSERT(mask_bits < (1ull << N));
   5539  const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
   5540  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
   5541 }
   5542 
   5543 }  // namespace detail
   5544 
   5545 // Single lane: no-op
   5546 template <typename T>
   5547 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
   5548  return v;
   5549 }
   5550 
   5551 // Two lanes: conditional swap
   5552 template <typename T, HWY_IF_T_SIZE(T, 8)>
   5553 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
   5554  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
   5555  const DFromV<decltype(v)> d;
   5556  const Vec128<T> m = VecFromMask(d, mask);
   5557  const Vec128<T> maskL = DupEven(m);
   5558  const Vec128<T> maskH = DupOdd(m);
   5559  const Vec128<T> swap = AndNot(maskL, maskH);
   5560  return IfVecThenElse(swap, Shuffle01(v), v);
   5561 }
   5562 
   5563 // General case, 2 or 4 bytes
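        // E.g. for u32 lanes {1,2,3,4} and mask {1,0,1,0}, this returns
        // {1,3,2,4}: selected lanes come first; here the unselected lanes
        // happen to follow in order, though their values are
        // implementation-defined.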
   5564 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
   5565 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
   5566  const DFromV<decltype(v)> d;
   5567  return detail::CompressBits(v, BitsFromMask(d, mask));
   5568 }
   5569 
   5570 // ------------------------------ CompressNot
   5571 
   5572 // Single lane: no-op
   5573 template <typename T>
   5574 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
   5575  return v;
   5576 }
   5577 
   5578 // Two lanes: conditional swap
   5579 template <typename T, HWY_IF_T_SIZE(T, 8)>
   5580 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
   5581  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
   5582  const DFromV<decltype(v)> d;
   5583  const Vec128<T> m = VecFromMask(d, mask);
   5584  const Vec128<T> maskL = DupEven(m);
   5585  const Vec128<T> maskH = DupOdd(m);
   5586  const Vec128<T> swap = AndNot(maskH, maskL);
   5587  return IfVecThenElse(swap, Shuffle01(v), v);
   5588 }
   5589 
   5590 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
   5591 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
   5592  const DFromV<decltype(v)> d;
   5593  // For partial vectors, we cannot pull the Not() into the table because
   5594  // BitsFromMask clears the upper bits.
   5595  if (N < 16 / sizeof(T)) {
   5596    return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
   5597  }
   5598  return detail::CompressNotBits(v, BitsFromMask(d, mask));
   5599 }
   5600 
   5601 // ------------------------------ CompressBlocksNot
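        // Only one 16-byte block, so there is nothing to reorder: no-op.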
   5602 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
   5603                                           Mask128<uint64_t> /* m */) {
   5604  return v;
   5605 }
   5606 
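        // Loads ceil(N/8) bytes of packed mask bits and clears any bits at
        // index >= N before expanding them into shuffle indices.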
   5607 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
   5608 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
   5609                                  const uint8_t* HWY_RESTRICT bits) {
   5610  uint64_t mask_bits = 0;
   5611  constexpr size_t kNumBytes = (N + 7) / 8;
   5612  CopyBytes<kNumBytes>(bits, &mask_bits);
   5613  if (N < 8) {
   5614    mask_bits &= (1ull << N) - 1;
   5615  }
   5616 
   5617  return detail::CompressBits(v, mask_bits);
   5618 }
   5619 
   5620 // ------------------------------ CompressStore, CompressBitsStore
   5621 
   5622 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
   5623 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
   5624                             TFromD<D>* HWY_RESTRICT unaligned) {
   5625  const RebindToUnsigned<decltype(d)> du;
   5626 
   5627  const uint64_t mask_bits = BitsFromMask(d, m);
   5628  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
   5629  const size_t count = PopCount(mask_bits);
   5630 
   5631  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
   5632  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
   5633  StoreU(compressed, d, unaligned);
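          // Avoid MemorySanitizer false positives: mark the `count` stored
          // lanes as initialized.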
   5634  detail::MaybeUnpoison(unaligned, count);
   5635  return count;
   5636 }
   5637 
   5638 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
   5639 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
   5640                                    TFromD<D>* HWY_RESTRICT unaligned) {
   5641  const RebindToUnsigned<decltype(d)> du;
   5642 
   5643  const uint64_t mask_bits = BitsFromMask(d, m);
   5644  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
   5645  const size_t count = PopCount(mask_bits);
   5646 
   5647  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
   5648  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
   5649  BlendedStore(compressed, FirstN(d, count), d, unaligned);
   5650  detail::MaybeUnpoison(unaligned, count);
   5651  return count;
   5652 }
   5653 
   5654 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)>
   5655 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
   5656                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
   5657  const RebindToUnsigned<decltype(d)> du;
   5658 
   5659  uint64_t mask_bits = 0;
   5660  constexpr size_t kN = MaxLanes(d);
   5661  constexpr size_t kNumBytes = (kN + 7) / 8;
   5662  CopyBytes<kNumBytes>(bits, &mask_bits);
   5663  if (kN < 8) {
   5664    mask_bits &= (1ull << kN) - 1;
   5665  }
   5666  const size_t count = PopCount(mask_bits);
   5667 
   5668  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
   5669  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
   5670  StoreU(compressed, d, unaligned);
   5671 
   5672  detail::MaybeUnpoison(unaligned, count);
   5673  return count;
   5674 }
   5675 
   5676 // ------------------------------ StoreInterleaved2/3/4
   5677 
   5678 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
   5679 // generic_ops-inl.h.
   5680 
   5681 // ------------------------------ Additional mask logical operations
   5682 
   5683 template <class T>
   5684 HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
   5685  return mask;
   5686 }
   5687 template <class T>
   5688 HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
   5689  const FixedTag<T, 2> d;
   5690  const auto vmask = VecFromMask(d, mask);
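          // InterleaveLower copies lane 0 into lane 1, so the Or propagates
          // the first lane's mask upward.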
   5691  return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
   5692 }
   5693 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
   5694 HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
   5695  const Simd<T, N, 0> d;
   5696  const auto vmask = VecFromMask(d, mask);
   5697  const auto neg_vmask =
   5698      ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
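          // vmask | -vmask (over 64 bits) sets every bit at and above the
          // lowest set bit, i.e. all lanes at or after the first true lane.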
   5699  return MaskFromVec(Or(vmask, neg_vmask));
   5700 }
   5701 template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
   5702 HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
   5703  const Full128<T> d;
   5704  const Repartition<int64_t, decltype(d)> di64;
   5705 
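          // __lsx_vsub_q subtracts across the full 128 bits, so neg_vmask is
          // -vmask as one 128-bit integer; x | -x then sets every bit at and
          // above the lowest set bit.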
   5706  auto vmask = BitCast(di64, VecFromMask(d, mask));
   5707  VFromD<decltype(di64)> neg_vmask{__lsx_vsub_q(Zero(di64).raw, vmask.raw)};
   5708 
   5709  return MaskFromVec(BitCast(d, Or(vmask, neg_vmask)));
   5710 }
   5711 
   5712 template <class T, size_t N>
   5713 HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
   5714  return Not(SetAtOrAfterFirst(mask));
   5715 }
   5716 
   5717 template <class T>
   5718 HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
   5719  return mask;
   5720 }
   5721 template <class T>
   5722 HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
   5723  const FixedTag<T, 2> d;
   5724  const RebindToSigned<decltype(d)> di;
   5725 
   5726  const auto vmask = BitCast(di, VecFromMask(d, mask));
   5727  const auto zero = Zero(di);
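          // vmask2 is true in lane 1 only when lane 0's mask is clear, so the
          // And below keeps just the first set lane.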
   5728  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
   5729  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
   5730 }
   5731 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
   5732 HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
   5733  const Simd<T, N, 0> d;
   5734  const RebindToSigned<decltype(d)> di;
   5735 
   5736  const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
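          // vmask & -vmask (as a 64-bit integer) isolates bit 0 of the first
          // true lane; the per-lane Neg expands that bit to an all-ones lane.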
   5737  const auto only_first_vmask =
   5738      BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
   5739  return MaskFromVec(only_first_vmask);
   5740 }
   5741 template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
   5742 HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
   5743  const Full128<T> d;
   5744  const RebindToSigned<decltype(d)> di;
   5745 
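          // vmask & -vmask (as a 128-bit integer, via vsub.q) isolates bit 0
          // of the first true lane; the per-lane Neg expands it to all-ones.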
   5746  auto vmask = BitCast(di, VecFromMask(d, mask));
   5747  VFromD<decltype(di)> neg_vmask{__lsx_vsub_q(Zero(di).raw, vmask.raw)};
   5748 
   5749  return MaskFromVec(BitCast(d, Neg(And(vmask, neg_vmask))));
   5750 }
   5751 
   5752 template <class T>
   5753 HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
   5754  const FixedTag<T, 1> d;
   5755  const RebindToSigned<decltype(d)> di;
   5756  using TI = MakeSigned<T>;
   5757 
   5758  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
   5759 }
   5760 template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
   5761 HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
   5762  const Simd<T, N, 0> d;
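          // Shifting the mask up one lane turns "at or before first" into
          // "before first" of the shifted mask.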
   5763  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
   5764 }
   5765 
   5766 // ------------------------------ Reductions
   5767 #undef HWY_IF_SUM_OF_LANES_D
   5768 #define HWY_IF_SUM_OF_LANES_D(D)                                        \
   5769  HWY_IF_LANES_GT_D(D, 1),                                              \
   5770      hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() ||               \
   5771                    (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
   5772          nullptr
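        // The guard above steers u8 vectors with 8 or 16 lanes away from the
        // generic SumOfLanes so the SumsOf8-based overloads below are used.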
   5773 // ------------------------------ SumOfLanes
   5774 
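        // 8 u8 lanes: SumsOf8 totals them in one u64; truncate and broadcast.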
   5775 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
   5776 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
   5777  return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
   5778 }
   5779 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
   5780 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
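          // SumsOf8 yields two u64 partial sums; add them, then broadcast the
          // low byte of the total to all 16 lanes.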
   5781  const Repartition<uint64_t, decltype(d)> d64;
   5782  VFromD<decltype(d64)> sums = SumsOf8(v);
   5783  sums = SumOfLanes(d64, sums);
   5784  return Broadcast<0>(BitCast(d, sums));
   5785 }
   5786 
   5787 // ------------------------------ Lt128
   5788 
   5789 namespace detail {
   5790 
   5791 // Returns vector-mask for Lt128. Generic for all vector lengths.
   5792 template <class D, HWY_IF_U64_D(D)>
   5793 HWY_INLINE VFromD<D> Lt128Vec(const D d, VFromD<D> a, VFromD<D> b) {
   5794  // Truth table of Eq and Lt for Hi and Lo u64.
   5795  // (rows with (=H && cH) or (=L && cL) omitted: Eq and Lt cannot both hold)
   5796  // =H =L cH cL  | out = cH | (=H & cL)
   5797  //  0  0  0  0  |  0
   5798  //  0  0  0  1  |  0
   5799  //  0  0  1  0  |  1
   5800  //  0  0  1  1  |  1
   5801  //  0  1  0  0  |  0
   5802  //  0  1  0  1  |  0
   5803  //  0  1  1  0  |  1
   5804  //  1  0  0  0  |  0
   5805  //  1  0  0  1  |  1
   5806  //  1  1  0  0  |  0
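          // ltLX holds the Lo lanes' result shifted into the Hi position;
          // where the Hi halves are equal, that decides the outcome.
          // InterleaveUpper then broadcasts the Hi verdict to both lanes.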
   5807  const auto eqHL = Eq(a, b);
   5808  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
   5809  const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL);
   5810  const VFromD<D> vecHx = IfThenElse(eqHL, ltLX, ltHL);
   5811  return InterleaveUpper(d, vecHx, vecHx);
   5812 }
   5813 
   5814 // Returns vector-mask for Eq128. Generic for all vector lengths.
   5815 template <class D, HWY_IF_U64_D(D)>
   5816 HWY_INLINE VFromD<D> Eq128Vec(D d, VFromD<D> a, VFromD<D> b) {
   5817  const auto eqHL = VecFromMask(d, Eq(a, b));
   5818  const auto eqLH = Reverse2(d, eqHL);
   5819  return And(eqHL, eqLH);
   5820 }
   5821 
   5822 template <class D, HWY_IF_U64_D(D)>
   5823 HWY_INLINE VFromD<D> Ne128Vec(D d, VFromD<D> a, VFromD<D> b) {
   5824  const auto neHL = VecFromMask(d, Ne(a, b));
   5825  const auto neLH = Reverse2(d, neHL);
   5826  return Or(neHL, neLH);
   5827 }
   5828 
   5829 template <class D, HWY_IF_U64_D(D)>
   5830 HWY_INLINE VFromD<D> Lt128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
   5831  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
   5832  return InterleaveUpper(d, ltHL, ltHL);
   5833 }
   5834 
   5835 template <class D, HWY_IF_U64_D(D)>
   5836 HWY_INLINE VFromD<D> Eq128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
   5837  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
   5838  return InterleaveUpper(d, eqHL, eqHL);
   5839 }
   5840 
   5841 template <class D, HWY_IF_U64_D(D)>
   5842 HWY_INLINE VFromD<D> Ne128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
   5843  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
   5844  return InterleaveUpper(d, neHL, neHL);
   5845 }
   5846 
   5847 }  // namespace detail
   5848 
   5849 template <class D, HWY_IF_U64_D(D)>
   5850 HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
   5851  return MaskFromVec(detail::Lt128Vec(d, a, b));
   5852 }
   5853 
   5854 template <class D, HWY_IF_U64_D(D)>
   5855 HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
   5856  return MaskFromVec(detail::Eq128Vec(d, a, b));
   5857 }
   5858 
   5859 template <class D, HWY_IF_U64_D(D)>
   5860 HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
   5861  return MaskFromVec(detail::Ne128Vec(d, a, b));
   5862 }
   5863 
   5864 template <class D, HWY_IF_U64_D(D)>
   5865 HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
   5866  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
   5867 }
   5868 
   5869 template <class D, HWY_IF_U64_D(D)>
   5870 HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
   5871  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
   5872 }
   5873 
   5874 template <class D, HWY_IF_U64_D(D)>
   5875 HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
   5876  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
   5877 }
   5878 
   5879 // ------------------------------ Min128, Max128 (Lt128)
   5880 
   5881 // Avoids the extra MaskFromVec in Lt128.
   5882 template <class D, HWY_IF_U64_D(D)>
   5883 HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
   5884  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
   5885 }
   5886 
   5887 template <class D, HWY_IF_U64_D(D)>
   5888 HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
   5889  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
   5890 }
   5891 
   5892 template <class D, HWY_IF_U64_D(D)>
   5893 HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
   5894  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
   5895 }
   5896 
   5897 template <class D, HWY_IF_U64_D(D)>
   5898 HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
   5899  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
   5900 }
   5901 
   5902 // ------------------------------ LeadingZeroCount, TrailingZeroCount,
   5903 //                                HighestSetBitIndex
   5904 
   5905 #ifdef HWY_NATIVE_LEADING_ZERO_COUNT
   5906 #undef HWY_NATIVE_LEADING_ZERO_COUNT
   5907 #else
   5908 #define HWY_NATIVE_LEADING_ZERO_COUNT
   5909 #endif
   5910 
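        // LSX provides per-lane leading-zero counts directly (vclz.{b,h,w,d}).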
   5911 template <class V, HWY_IF_UI8_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
   5912 HWY_API V LeadingZeroCount(V v) {
   5913  return V{__lsx_vclz_b(v.raw)};
   5914 }
   5915 
   5916 template <class V, HWY_IF_UI16_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
   5917 HWY_API V LeadingZeroCount(V v) {
   5918  return V{__lsx_vclz_h(v.raw)};
   5919 }
   5920 
   5921 template <class V, HWY_IF_UI32_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
   5922 HWY_API V LeadingZeroCount(V v) {
   5923  return V{__lsx_vclz_w(v.raw)};
   5924 }
   5925 
   5926 template <class V, HWY_IF_UI64_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
   5927 HWY_API V LeadingZeroCount(V v) {
   5928  return V{__lsx_vclz_d(v.raw)};
   5929 }
   5930 
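        // Index of highest set bit = (bits per lane - 1) - LeadingZeroCount.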
   5931 template <class V, HWY_IF_V_SIZE_LE_V(V, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   5932 HWY_API V HighestSetBitIndex(V v) {
   5933  const DFromV<decltype(v)> d;
   5934  using T = TFromD<decltype(d)>;
   5935  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
   5936 }
   5937 
   5938 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   5939 HWY_API V TrailingZeroCount(V v) {
   5940  const DFromV<decltype(v)> d;
   5941  const RebindToSigned<decltype(d)> di;
   5942  using T = TFromD<decltype(d)>;
   5943 
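          // v & -v isolates the lowest set bit; its bit index equals the
          // trailing-zero count. All-zero lanes map to the full lane width.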
   5944  const auto lsb = And(v, BitCast(d, Neg(BitCast(di, v))));
   5945  return IfThenElse(Eq(v, Zero(d)), Set(d, T{sizeof(T) * 8}),
   5946                    HighestSetBitIndex(lsb));
   5947 }
   5948 
   5949 }  // namespace HWY_NAMESPACE
   5950 }  // namespace hwy
   5951 
   5952 HWY_AFTER_NAMESPACE();
   5953 
   5954 #undef HWY_LSX_IF_EMULATED_D