tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

loongarch_lasx-inl.h (179443B)


      1 // Copyright 2019 Google LLC
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // 256-bit LASX vectors and operations.
     16 // External include guard in highway.h - see comment there.
     17 
     18 #include "hwy/ops/loongarch_lsx-inl.h"
     19 #include "hwy/ops/shared-inl.h"
     20 
     21 #ifndef __loongarch_asx
     22 // If LASX is to be runtime dispatched (instead of in baseline), we need
     23 // to enable it *and* define __loongarch_asx or the intrinsic header will
     24 // fail to compile.
     25 //
     26 // For consistency, the same pattern as the lsxintrin.h handling in
     27 // loongarch_lsx-inl.h is used (instead of moving lasxintrin.h after
     28 // HWY_BEFORE_NAMESPACE).
     29 HWY_PUSH_ATTRIBUTES("lsx,lasx")
     30 #define __loongarch_asx
     31 #include <lasxintrin.h>
     32 #undef __loongarch_asx
     33 // Prevent "unused push_attribute" warning from Clang.
     34 HWY_MAYBE_UNUSED static void HWY_CONCAT(hwy_lasx_dummy, __COUNTER__) () {}
     35 HWY_POP_ATTRIBUTES
     36 #else
     37 #include <lasxintrin.h>
     38 #endif
     39 
     40 HWY_BEFORE_NAMESPACE();
     41 namespace hwy {
     42 namespace HWY_NAMESPACE {
namespace detail {

// Maps a lane type T to the GCC/Clang raw vector type that holds a 256-bit
// LASX register of that lane type. All integer lane types share __m256i;
// float and double use the dedicated __m256 / __m256d types.
template <typename T>
struct Raw256 {
 using type = __m256i;
};
template <>
struct Raw256<float> {
 using type = __m256;
};
template <>
struct Raw256<double> {
 using type = __m256d;
};

}  // namespace detail
     59 
// 256-bit vector of lanes of type T: a thin wrapper around the raw LASX
// register type. All operations on it are free functions in this header.
template <typename T>
class Vec256 {
 using Raw = typename detail::Raw256<T>::type;

public:
 using PrivateT = T;                                  // only for DFromV
 static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromV

 // Compound assignment. Only usable if there is a corresponding non-member
 // binary operator overload. For example, only f32 and f64 support division.
 HWY_INLINE Vec256& operator*=(const Vec256 other) {
   return *this = (*this * other);
 }
 HWY_INLINE Vec256& operator/=(const Vec256 other) {
   return *this = (*this / other);
 }
 HWY_INLINE Vec256& operator+=(const Vec256 other) {
   return *this = (*this + other);
 }
 HWY_INLINE Vec256& operator-=(const Vec256 other) {
   return *this = (*this - other);
 }
 HWY_INLINE Vec256& operator%=(const Vec256 other) {
   return *this = (*this % other);
 }
 HWY_INLINE Vec256& operator&=(const Vec256 other) {
   return *this = (*this & other);
 }
 HWY_INLINE Vec256& operator|=(const Vec256 other) {
   return *this = (*this | other);
 }
 HWY_INLINE Vec256& operator^=(const Vec256 other) {
   return *this = (*this ^ other);
 }

 // Underlying LASX register; public so the free functions can access it.
 Raw raw;
};
     97 
namespace detail {

// Raw register type backing Mask256<T>. LASX has no dedicated mask
// registers, so masks use the same raw vector type as Vec256<T>.
template <typename T>
using RawMask256 = typename Raw256<T>::type;

}  // namespace detail
    104 
// Mask for 256-bit vectors. Stored as a full-width vector whose lanes are
// all-ones (true) or all-zero (false), matching VecFromMask/MaskFromVec below.
template <typename T>
struct Mask256 {
 using Raw = typename detail::RawMask256<T>;

 using PrivateT = T;                                  // only for DFromM
 static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromM

 Raw raw;
};
    114 
// Descriptor (tag) type for a full 256-bit vector with lane type T.
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
    117 
    118 // ------------------------------ Zero
    119 
    120 // Cannot use VFromD here because it is defined in terms of Zero.
    121 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
    122 HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
    123  return Vec256<TFromD<D>>{__lasx_xvreplgr2vr_d(0)};
    124 }
    125 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
    126 HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
    127  return Vec256<bfloat16_t>{__lasx_xvreplgr2vr_d(0)};
    128 }
    129 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
    130 HWY_API Vec256<float16_t> Zero(D /* tag */) {
    131  return Vec256<float16_t>{__lasx_xvreplgr2vr_d(0)};
    132 }
    133 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
    134 HWY_API Vec256<float> Zero(D /* tag */) {
    135  return Vec256<float>{reinterpret_cast<__m256>(__lasx_xvreplgr2vr_d(0))};
    136 }
    137 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
    138 HWY_API Vec256<double> Zero(D /* tag */) {
    139  return Vec256<double>{reinterpret_cast<__m256d>(__lasx_xvreplgr2vr_d(0))};
    140 }
    141 
    142 // ------------------------------ BitCast
    143 
namespace detail {

// Funnels every raw register type into __m256i (bits unchanged).
HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
HWY_INLINE __m256i BitCastToInteger(__m256 v) {
 return reinterpret_cast<__m256i>(v);
}
HWY_INLINE __m256i BitCastToInteger(__m256d v) {
 return reinterpret_cast<__m256i>(v);
}

// Reinterprets any Vec256 as a vector of bytes (no bit changes).
template <typename T>
HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
 return Vec256<uint8_t>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger256 {
 HWY_INLINE __m256i operator()(__m256i v) { return v; }
};
template <>
struct BitCastFromInteger256<float> {
 HWY_INLINE __m256 operator()(__m256i v) {
   return reinterpret_cast<__m256>(v);
 }
};
template <>
struct BitCastFromInteger256<double> {
 HWY_INLINE __m256d operator()(__m256i v) {
   return reinterpret_cast<__m256d>(v);
 }
};

// Reinterprets a byte vector as the lane type of D.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, Vec256<uint8_t> v) {
 return VFromD<D>{BitCastFromInteger256<TFromD<D>>()(v.raw)};
}

}  // namespace detail
    183 
    184 template <class D, HWY_IF_V_SIZE_D(D, 32), typename FromT>
    185 HWY_API VFromD<D> BitCast(D d, Vec256<FromT> v) {
    186  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
    187 }
    188 
    189 // ------------------------------ Set
    190 
    191 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
    192 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    193  return VFromD<D>{__lasx_xvreplgr2vr_b(static_cast<char>(t))};  // NOLINT
    194 }
    195 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
    196 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    197  return VFromD<D>{__lasx_xvreplgr2vr_h(static_cast<short>(t))};  // NOLINT
    198 }
    199 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
    200 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    201  return VFromD<D>{__lasx_xvreplgr2vr_w(static_cast<int>(t))};
    202 }
    203 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
    204 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
    205  return VFromD<D>{__lasx_xvreplgr2vr_d(static_cast<long long>(t))};  // NOLINT
    206 }
    207 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
    208 HWY_API Vec256<float> Set(D /* tag */, float t) {
    209  return BitCast(D(), Vec256<int32_t>{__lasx_xvldrepl_w(&t, 0)});
    210 }
    211 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
    212 HWY_API Vec256<double> Set(D /* tag */, double t) {
    213  return BitCast(D(), Vec256<int64_t>{__lasx_xvldrepl_d(&t, 0)});
    214 }
    215 
    216 // ------------------------------ ResizeBitCast
    217 
    218 // 32-byte vector to 32-byte vector
    219 template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
    220          HWY_IF_V_SIZE_D(D, HWY_MAX_LANES_V(FromV) * sizeof(TFromV<FromV>))>
    221 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
    222  return BitCast(d, v);
    223 }
    224 
    225 // 32-byte vector to 16-byte vector
    226 template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
    227          HWY_IF_V_SIZE_D(D, 16)>
    228 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
    229  const DFromV<decltype(v)> d_from;
    230  const Half<decltype(d_from)> dh_from;
    231  return BitCast(d, LowerHalf(dh_from, v));
    232 }
    233 
    234 // 32-byte vector to <= 8-byte vector
    235 template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
    236          HWY_IF_V_SIZE_LE_D(D, 8)>
    237 HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
    238  return VFromD<D>{ResizeBitCast(Full128<TFromD<D>>(), v).raw};
    239 }
    240 
// <= 16-byte vector to 32-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
         HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
 // Raw GCC/Clang vector type used to shuttle the 128-bit source bits.
 typedef uint64_t GccRawU64M128Vec __attribute__((__vector_size__(16)));

 const GccRawU64M128Vec raw_v0 = reinterpret_cast<GccRawU64M128Vec>(v.raw);
#if HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
 // The upper 128 bits of the result carry no defined contents; marking the
 // value nondeterministic presumably lets Clang avoid materializing a
 // particular upper half — NOTE(review): confirm against upstream intent.
 const GccRawU64M128Vec raw_v1 = __builtin_nondeterministic_value(raw_v0);
#else
 const GccRawU64M128Vec raw_v1 = raw_v0;
#endif

 // Combine(hi, lo): the source occupies the lower 128 bits of the result.
 const Repartition<uint64_t, decltype(d)> du64;
 const Half<decltype(du64)> dh_u64;
 return BitCast(
     d,
     Combine(du64, VFromD<decltype(dh_u64)>{reinterpret_cast<__m128i>(raw_v1)},
             VFromD<decltype(dh_u64)>{reinterpret_cast<__m128i>(raw_v0)}));
}
    261 
    262 // ------------------------------ Dup128VecFromValues
    263 
// Returns a vector whose two 128-bit blocks each hold t0..t15 in order.
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                     TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                     TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                     TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                     TFromD<D> t11, TFromD<D> t12,
                                     TFromD<D> t13, TFromD<D> t14,
                                     TFromD<D> t15) {
 // Built via a native vector literal; the initializer repeats t0..t15 twice.
 typedef int8_t GccI8RawVectType __attribute__((__vector_size__(32)));
 GccI8RawVectType raw_i8_vec = {
     static_cast<char>(t0),  static_cast<char>(t1),  static_cast<char>(t2),
     static_cast<char>(t3),  static_cast<char>(t4),  static_cast<char>(t5),
     static_cast<char>(t6),  static_cast<char>(t7),  static_cast<char>(t8),
     static_cast<char>(t9),  static_cast<char>(t10), static_cast<char>(t11),
     static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
     static_cast<char>(t15), static_cast<char>(t0),  static_cast<char>(t1),
     static_cast<char>(t2),  static_cast<char>(t3),  static_cast<char>(t4),
     static_cast<char>(t5),  static_cast<char>(t6),  static_cast<char>(t7),
     static_cast<char>(t8),  static_cast<char>(t9),  static_cast<char>(t10),
     static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
     static_cast<char>(t14), static_cast<char>(t15)};
 return VFromD<D>{reinterpret_cast<__m256i>(raw_i8_vec)};
}
    287 
// Returns a vector whose two 128-bit blocks each hold t0..t7 in order.
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                     TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                     TFromD<D> t5, TFromD<D> t6,
                                     TFromD<D> t7) {
 // Built via a native vector literal; the initializer repeats t0..t7 twice.
 typedef int16_t GccI16RawVectType __attribute__((__vector_size__(32)));
 GccI16RawVectType raw_i16_vec = {
     static_cast<int16_t>(t0), static_cast<int16_t>(t1),
     static_cast<int16_t>(t2), static_cast<int16_t>(t3),
     static_cast<int16_t>(t4), static_cast<int16_t>(t5),
     static_cast<int16_t>(t6), static_cast<int16_t>(t7),
     static_cast<int16_t>(t0), static_cast<int16_t>(t1),
     static_cast<int16_t>(t2), static_cast<int16_t>(t3),
     static_cast<int16_t>(t4), static_cast<int16_t>(t5),
     static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
 return VFromD<D>{reinterpret_cast<__m256i>(raw_i16_vec)};
}
    305 
    306 template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
    307 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    308                                      TFromD<D> t2, TFromD<D> t3) {
    309  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(32)));
    310  GccI32RawVectType raw_i32_vec = {
    311      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
    312      static_cast<int32_t>(t2), static_cast<int32_t>(t3),
    313      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
    314      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
    315  return VFromD<D>{reinterpret_cast<__m256i>(raw_i32_vec)};
    316 }
    317 
    318 template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
    319 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
    320                                      TFromD<D> t2, TFromD<D> t3) {
    321  typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
    322  GccF32RawVectType raw_f32_vec = {t0, t1, t2, t3, t0, t1, t2, t3};
    323  return Vec256<float>{reinterpret_cast<__m256>(raw_f32_vec)};
    324 }
    325 
    326 template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
    327 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
    328  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(32)));
    329  const GccI64RawVectType raw_i64_vec = {
    330      static_cast<int64_t>(t0), static_cast<int64_t>(t1),
    331      static_cast<int64_t>(t0), static_cast<int64_t>(t1)};
    332  return VFromD<D>{reinterpret_cast<__m256i>(raw_i64_vec)};
    333 }
    334 
    335 template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
    336 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
    337  typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
    338  const GccF64RawVectType raw_f64_vec = {t0, t1, t0, t1};
    339  return VFromD<D>{reinterpret_cast<__m256d>(raw_f64_vec)};
    340 }
    341 
    342 // ------------------------------ And
    343 
    344 template <typename T>
    345 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
    346  const DFromV<decltype(a)> d;  // for float16_t
    347  const RebindToUnsigned<decltype(d)> du;
    348  return BitCast(d, VFromD<decltype(du)>{__lasx_xvand_v(BitCast(du, a).raw,
    349                                                        BitCast(du, b).raw)});
    350 }
    351 
    352 // ------------------------------ AndNot
    353 
    354 // Returns ~not_mask & mask.
    355 template <typename T>
    356 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
    357  const DFromV<decltype(mask)> d;  // for float16_t
    358  const RebindToUnsigned<decltype(d)> du;
    359  return BitCast(d, VFromD<decltype(du)>{__lasx_xvandn_v(
    360                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
    361 }
    362 
    363 // ------------------------------ Or
    364 
    365 template <typename T>
    366 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
    367  const DFromV<decltype(a)> d;  // for float16_t
    368  const RebindToUnsigned<decltype(d)> du;
    369  return BitCast(d, VFromD<decltype(du)>{
    370                        __lasx_xvor_v(BitCast(du, a).raw, BitCast(du, b).raw)});
    371 }
    372 
    373 // ------------------------------ Xor
    374 
    375 template <typename T>
    376 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
    377  const DFromV<decltype(a)> d;  // for float16_t
    378  const RebindToUnsigned<decltype(d)> du;
    379  return BitCast(d, VFromD<decltype(du)>{__lasx_xvxor_v(BitCast(du, a).raw,
    380                                                        BitCast(du, b).raw)});
    381 }
    382 
    383 // ------------------------------ Not
    384 template <typename T>
    385 HWY_API Vec256<T> Not(const Vec256<T> v) {
    386  const DFromV<decltype(v)> d;
    387  const RebindToUnsigned<decltype(d)> du;
    388  return BitCast(d, VFromD<decltype(du)>{__lasx_xvnor_v(BitCast(du, v).raw,
    389                                                        BitCast(du, v).raw)});
    390 }
    391 
    392 // ------------------------------ Xor3
    393 template <typename T>
    394 HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
    395  return Xor(x1, Xor(x2, x3));
    396 }
    397 
    398 // ------------------------------ Or3
    399 template <typename T>
    400 HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
    401  return Or(o1, Or(o2, o3));
    402 }
    403 
    404 // ------------------------------ OrAnd
    405 template <typename T>
    406 HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
    407  return Or(o, And(a1, a2));
    408 }
    409 
    410 // ------------------------------ IfVecThenElse
    411 template <typename T>
    412 HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
    413  return IfThenElse(MaskFromVec(mask), yes, no);
    414 }
    415 
// ------------------------------ Operator overloads (internal-only if float)

// Bitwise AND; forwards to And() so all lane types are handled there.
template <typename T>
HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
 return And(a, b);
}

// Bitwise OR.
template <typename T>
HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
 return Or(a, b);
}

// Bitwise XOR.
template <typename T>
HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
 return Xor(a, b);
}
    432 
// ------------------------------ PopulationCount

namespace detail {

// Per-lane bit count; the SizeTag selects the intrinsic matching the lane
// width (b/h/w/d = 8/16/32/64 bits).
template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
 return Vec256<T>{__lasx_xvpcnt_b(v.raw)};
}
template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
 return Vec256<T>{__lasx_xvpcnt_h(v.raw)};
}
template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
 return Vec256<T>{__lasx_xvpcnt_w(v.raw)};
}
template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
 return Vec256<T>{__lasx_xvpcnt_d(v.raw)};
}

}  // namespace detail
    455 
    456 template <typename T>
    457 HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
    458  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
    459 }
    460 
    461 // ------------------------------ Mask
    462 
    463 // Mask and Vec are the same (true = FF..FF).
    464 template <typename T>
    465 HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
    466  return Mask256<T>{v.raw};
    467 }
    468 
    469 template <typename T>
    470 HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
    471  return Vec256<T>{v.raw};
    472 }
    473 
    474 // ------------------------------ IfThenElse
    475 
    476 // mask ? yes : no
    477 template <typename T>
    478 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
    479  const DFromV<decltype(yes)> d;
    480  RebindToSigned<decltype(d)> di;
    481  return BitCast(d, VFromD<decltype(di)>{__lasx_xvbitsel_v(
    482                        BitCast(di, no).raw, BitCast(di, yes).raw,
    483                        RebindMask(di, mask).raw)});
    484 }
    485 
    486 // mask ? yes : 0
    487 template <typename T>
    488 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
    489  return yes & VecFromMask(mask);
    490 }
    491 
    492 // mask ? 0 : no
    493 template <typename T>
    494 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
    495  return AndNot(VecFromMask(mask), no);
    496 }
    497 
    498 template <typename T>
    499 HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
    500  static_assert(IsSigned<T>(), "Only for float");
    501  const DFromV<decltype(v)> d;
    502  const auto zero = Zero(d);
    503  return IfThenElse(v < zero, zero, v);
    504 }
    505 
    506 // ------------------------------ Mask logical
    507 
    508 template <typename T>
    509 HWY_API Mask256<T> Not(const Mask256<T> m) {
    510  const Full256<T> d;
    511  return MaskFromVec(Not(VecFromMask(d, m)));
    512 }
    513 
    514 template <typename T>
    515 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
    516  const Full256<T> d;
    517  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
    518 }
    519 
    520 template <typename T>
    521 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
    522  const Full256<T> d;
    523  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
    524 }
    525 
    526 template <typename T>
    527 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
    528  const Full256<T> d;
    529  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
    530 }
    531 
    532 template <typename T>
    533 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
    534  const Full256<T> d;
    535  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
    536 }
    537 
    538 template <typename T>
    539 HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
    540  const Full256<T> d;
    541  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
    542 }
    543 
    544 // ================================================== COMPARE
    545 
    546 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
    547 
    548 template <class DTo, HWY_IF_V_SIZE_D(DTo, 32), typename TFrom>
    549 HWY_API MFromD<DTo> RebindMask(DTo d_to, Mask256<TFrom> m) {
    550  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
    551  const Full256<TFrom> dfrom;
    552  return MaskFromVec(BitCast(d_to, VecFromMask(dfrom, m)));
    553 }
    554 
    555 template <typename T>
    556 HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
    557  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
    558  return (v & bit) == bit;
    559 }
    560 
// ------------------------------ Equality

// Integer equality: all-ones lane where equal, all-zero otherwise.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
 return Mask256<T>{__lasx_xvseq_b(a.raw, b.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
 return Mask256<T>{__lasx_xvseq_h(a.raw, b.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
 return Mask256<T>{__lasx_xvseq_w(a.raw, b.raw)};
}

template <typename T, HWY_IF_UI64(T)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
 return Mask256<T>{__lasx_xvseq_d(a.raw, b.raw)};
}

// Float equality: the ceq (quiet equal) compare yields an integer mask,
// which is rebound to the float lane type.
HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_ceq_s(a.raw, b.raw)});
}

HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_ceq_d(a.raw, b.raw)});
}
    594 
// ------------------------------ Inequality

// Non-float types: != is the complement of ==.
template <typename T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API Mask256<T> operator!=(Vec256<T> a, Vec256<T> b) {
 return Not(a == b);
}
// Float: uses the cne compare directly. NOTE(review): presumably cne is the
// quiet not-equal condition; confirm its NaN behavior matches the intended
// Ne semantics for NaN inputs.
HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cne_s(a.raw, b.raw)});
}
HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cne_d(a.raw, b.raw)});
}
    611 
// ------------------------------ Strict inequality

namespace detail {

// Each Gt(a, b) computes a > b by swapping the operands of the
// "set if less than" (xvslt) intrinsic: a > b  <=>  b < a.
HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
                          Vec256<int8_t> b) {
 return Mask256<int8_t>{__lasx_xvslt_b(b.raw, a.raw)};
}
HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
                           Vec256<int16_t> b) {
 return Mask256<int16_t>{__lasx_xvslt_h(b.raw, a.raw)};
}
HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
                           Vec256<int32_t> b) {
 return Mask256<int32_t>{__lasx_xvslt_w(b.raw, a.raw)};
}
HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
                           Vec256<int64_t> b) {
 return Mask256<int64_t>{__lasx_xvslt_d(b.raw, a.raw)};
}

// Unsigned comparisons use the *u intrinsic variants.
HWY_API Mask256<uint8_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint8_t> a,
                           Vec256<uint8_t> b) {
 return Mask256<uint8_t>{__lasx_xvslt_bu(b.raw, a.raw)};
}
HWY_API Mask256<uint16_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> a,
                            Vec256<uint16_t> b) {
 return Mask256<uint16_t>{__lasx_xvslt_hu(b.raw, a.raw)};
}
HWY_API Mask256<uint32_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> a,
                            Vec256<uint32_t> b) {
 return Mask256<uint32_t>{__lasx_xvslt_wu(b.raw, a.raw)};
}
HWY_API Mask256<uint64_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> a,
                            Vec256<uint64_t> b) {
 return Mask256<uint64_t>{__lasx_xvslt_du(b.raw, a.raw)};
}

// Float: clt (quiet less-than) with swapped operands, mask rebound to float.
HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
                         Vec256<float> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_clt_s(b.raw, a.raw)});
}
HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
                          Vec256<double> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_clt_d(b.raw, a.raw)});
}

}  // namespace detail
    664 
    665 template <typename T>
    666 HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
    667  return detail::Gt(hwy::TypeTag<T>(), a, b);
    668 }
    669 
    670 // ------------------------------ Weak inequality
    671 
namespace detail {

// Integer Ge(a, b): a >= b is the complement of b > a.
template <typename T>
HWY_INLINE Mask256<T> Ge(hwy::SignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
 return Not(b > a);
}

template <typename T>
HWY_INLINE Mask256<T> Ge(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
 return Not(b > a);
}

// Float Ge(a, b): cle (quiet less-or-equal) with swapped operands,
// i.e. a >= b  <=>  b <= a; the integer mask is rebound to the float type.
HWY_INLINE Mask256<float> Ge(hwy::FloatTag /*tag*/, Vec256<float> a,
                            Vec256<float> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cle_s(b.raw, a.raw)});
}
HWY_INLINE Mask256<double> Ge(hwy::FloatTag /*tag*/, Vec256<double> a,
                             Vec256<double> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cle_d(b.raw, a.raw)});
}

}  // namespace detail
    698 
    699 template <typename T>
    700 HWY_API Mask256<T> operator>=(Vec256<T> a, Vec256<T> b) {
    701  return detail::Ge(hwy::TypeTag<T>(), a, b);
    702 }
    703 
    704 // ------------------------------ Reversed comparisons
    705 
    706 template <typename T>
    707 HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
    708  return b > a;
    709 }
    710 
    711 template <typename T>
    712 HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
    713  return b >= a;
    714 }
    715 
// ------------------------------ Min (Gt, IfThenElse)

// Per-lane minimum; the intrinsic suffix selects lane width and signedness.

// Unsigned
HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
 return Vec256<uint8_t>{__lasx_xvmin_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
                            const Vec256<uint16_t> b) {
 return Vec256<uint16_t>{__lasx_xvmin_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
                            const Vec256<uint32_t> b) {
 return Vec256<uint32_t>{__lasx_xvmin_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
                            const Vec256<uint64_t> b) {
 return Vec256<uint64_t>{__lasx_xvmin_du(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
 return Vec256<int8_t>{__lasx_xvmin_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
 return Vec256<int16_t>{__lasx_xvmin_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
 return Vec256<int32_t>{__lasx_xvmin_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
 return Vec256<int64_t>{__lasx_xvmin_d(a.raw, b.raw)};
}

// Float. NaN handling follows the xvfmin intrinsic — TODO confirm it matches
// Highway's documented Min semantics for NaN inputs.
HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
 return Vec256<float>{__lasx_xvfmin_s(a.raw, b.raw)};
}
HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
 return Vec256<double>{__lasx_xvfmin_d(a.raw, b.raw)};
}
    756 
// ------------------------------ Max (Gt, IfThenElse)

// Lane-wise maximum; mirrors the Min overload set above.

// Unsigned
HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvmax_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvmax_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvmax_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvmax_du(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvmax_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvmax_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvmax_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvmax_d(a.raw, b.raw)};
}

// Float
// NOTE(review): xvfmax is presumed to return the non-NaN operand when exactly
// one input is NaN (IEEE maxNum semantics) - confirm against the LASX manual.
HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmax_s(a.raw, b.raw)};
}
HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmax_d(a.raw, b.raw)};
}
    797 
// ------------------------------ MinMagnitude and MaxMagnitude

// Selects the operand with the smaller/larger absolute value; LASX provides
// these directly via the "a" (absolute) variants of xvfmin/xvfmax.

HWY_API Vec256<float> MinMagnitude(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmina_s(a.raw, b.raw)};
}
HWY_API Vec256<double> MinMagnitude(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmina_d(a.raw, b.raw)};
}

HWY_API Vec256<float> MaxMagnitude(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmaxa_s(a.raw, b.raw)};
}
HWY_API Vec256<double> MaxMagnitude(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmaxa_d(a.raw, b.raw)};
}
    813 
    814 // ------------------------------ Iota
    815 
    816 namespace detail {
    817 
    818 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
    819 HWY_INLINE VFromD<D> Iota0(D /*d*/) {
    820  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(32)));
    821  const GccI8RawVectType raw_i8_vec = {
    822      static_cast<char>(0),  static_cast<char>(1),  static_cast<char>(2),
    823      static_cast<char>(3),  static_cast<char>(4),  static_cast<char>(5),
    824      static_cast<char>(6),  static_cast<char>(7),  static_cast<char>(8),
    825      static_cast<char>(9),  static_cast<char>(10), static_cast<char>(11),
    826      static_cast<char>(12), static_cast<char>(13), static_cast<char>(14),
    827      static_cast<char>(15), static_cast<char>(16), static_cast<char>(17),
    828      static_cast<char>(18), static_cast<char>(19), static_cast<char>(20),
    829      static_cast<char>(21), static_cast<char>(22), static_cast<char>(23),
    830      static_cast<char>(24), static_cast<char>(25), static_cast<char>(26),
    831      static_cast<char>(27), static_cast<char>(28), static_cast<char>(29),
    832      static_cast<char>(30), static_cast<char>(31)};
    833  return VFromD<D>{reinterpret_cast<__m256i>(raw_i8_vec)};
    834 }
    835 
// Returns {0, 1, ..., 15} for 16-bit lanes (signed or unsigned).
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(32)));
  const GccI16RawVectType raw_i16_vec = {
      static_cast<int16_t>(0),  static_cast<int16_t>(1),
      static_cast<int16_t>(2),  static_cast<int16_t>(3),
      static_cast<int16_t>(4),  static_cast<int16_t>(5),
      static_cast<int16_t>(6),  static_cast<int16_t>(7),
      static_cast<int16_t>(8),  static_cast<int16_t>(9),
      static_cast<int16_t>(10), static_cast<int16_t>(11),
      static_cast<int16_t>(12), static_cast<int16_t>(13),
      static_cast<int16_t>(14), static_cast<int16_t>(15)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i16_vec)};
}
    850 
// Returns {0, 1, ..., 7} for 32-bit integer lanes (signed or unsigned).
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(32)));
  const GccI32RawVectType raw_i32_vec = {
      static_cast<int32_t>(0), static_cast<int32_t>(1), static_cast<int32_t>(2),
      static_cast<int32_t>(3), static_cast<int32_t>(4), static_cast<int32_t>(5),
      static_cast<int32_t>(6), static_cast<int32_t>(7)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i32_vec)};
}
    860 
// Returns {0, 1, 2, 3} for 64-bit integer lanes (signed or unsigned).
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(32)));
  const GccI64RawVectType raw_i64_vec = {
      static_cast<int64_t>(0), static_cast<int64_t>(1), static_cast<int64_t>(2),
      static_cast<int64_t>(3)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i64_vec)};
}
    869 
// Returns {0.0f, 1.0f, ..., 7.0f} for float lanes.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
  const GccF32RawVectType raw_f32_vec = {0.0f, 1.0f, 2.0f, 3.0f,
                                         4.0f, 5.0f, 6.0f, 7.0f};
  return VFromD<D>{reinterpret_cast<__m256>(raw_f32_vec)};
}
    877 
// Returns {0.0, 1.0, 2.0, 3.0} for double lanes.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
  const GccF64RawVectType raw_f64_vec = {0.0, 1.0, 2.0, 3.0};
  return VFromD<D>{reinterpret_cast<__m256d>(raw_f64_vec)};
}
    884 
    885 }  // namespace detail
    886 
// Returns {first, first + 1, ...}: the lane index added to `first`, which is
// converted to the lane type (so e.g. a double `first` works for int lanes).
template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
}
    891 
    892 // ------------------------------ FirstN (Iota, Lt)
    893 
// Returns a mask with the first n lanes true and the rest false. n is clamped
// to the lane count, so the Iota0 < n comparison cannot overflow the signed
// lane type.
template <class D, HWY_IF_V_SIZE_D(D, 32), class M = MFromD<D>>
HWY_API M FirstN(const D d, size_t n) {
  constexpr size_t kN = MaxLanes(d);
  n = HWY_MIN(n, kN);
  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(n)));
}
    902 
    903 // ================================================== ARITHMETIC
    904 
    905 // ------------------------------ Addition
    906 
// Lane-wise addition. Two's-complement wrap-around addition is identical for
// signed and unsigned lanes, so both use the same xvadd_* intrinsic per width.

// Unsigned
HWY_API Vec256<uint8_t> operator+(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvadd_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator+(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvadd_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator+(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvadd_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator+(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvadd_d(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator+(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvadd_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator+(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvadd_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator+(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvadd_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator+(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvadd_d(a.raw, b.raw)};
}

HWY_API Vec256<float> operator+(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfadd_s(a.raw, b.raw)};
}
HWY_API Vec256<double> operator+(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfadd_d(a.raw, b.raw)};
}

// Named alias for operator+, for generic code.
template <typename T>
HWY_API Vec256<T> Add(Vec256<T> a, Vec256<T> b) {
  return a + b;
}
    946 
    947 // ------------------------------ Subtraction
    948 
// Unsigned
// Lane-wise subtraction; like operator+, signed and unsigned lanes share the
// same wrap-around xvsub_* intrinsic per width.
HWY_API Vec256<uint8_t> operator-(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvsub_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvsub_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator-(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvsub_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator-(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvsub_d(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator-(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvsub_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator-(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvsub_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator-(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvsub_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator-(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvsub_d(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> operator-(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfsub_s(a.raw, b.raw)};
}
HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfsub_d(a.raw, b.raw)};
}
    983 
// ------------------------------ SumsOf8
// Sums each group of 8 consecutive bytes into one 64-bit lane. Three rounds of
// pairwise widening horizontal add: 8->16, 16->32, 32->64 bits, each halving
// the lane count, so every u64/i64 lane holds the sum of its 8 source bytes.
HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
  v.raw = __lasx_xvhaddw_hu_bu(v.raw, v.raw);
  v.raw = __lasx_xvhaddw_wu_hu(v.raw, v.raw);
  return Vec256<uint64_t>{__lasx_xvhaddw_du_wu(v.raw, v.raw)};
}
HWY_API Vec256<int64_t> SumsOf8(Vec256<int8_t> v) {
  v.raw = __lasx_xvhaddw_h_b(v.raw, v.raw);
  v.raw = __lasx_xvhaddw_w_h(v.raw, v.raw);
  return Vec256<int64_t>{__lasx_xvhaddw_d_w(v.raw, v.raw)};
}
    995 
    996 // ------------------------------ SaturatedAdd
    997 
    998 // Returns a + b clamped to the destination range.
    999 
   1000 // Unsigned
   1001 HWY_API Vec256<uint8_t> SaturatedAdd(Vec256<uint8_t> a, Vec256<uint8_t> b) {
   1002  return Vec256<uint8_t>{__lasx_xvsadd_bu(a.raw, b.raw)};
   1003 }
   1004 HWY_API Vec256<uint16_t> SaturatedAdd(Vec256<uint16_t> a, Vec256<uint16_t> b) {
   1005  return Vec256<uint16_t>{__lasx_xvsadd_hu(a.raw, b.raw)};
   1006 }
   1007 HWY_API Vec256<uint32_t> SaturatedAdd(Vec256<uint32_t> a, Vec256<uint32_t> b) {
   1008  return Vec256<uint32_t>{__lasx_xvsadd_wu(a.raw, b.raw)};
   1009 }
   1010 HWY_API Vec256<uint64_t> SaturatedAdd(Vec256<uint64_t> a, Vec256<uint64_t> b) {
   1011  return Vec256<uint64_t>{__lasx_xvsadd_du(a.raw, b.raw)};
   1012 }
   1013 
   1014 // Signed
   1015 HWY_API Vec256<int8_t> SaturatedAdd(Vec256<int8_t> a, Vec256<int8_t> b) {
   1016  return Vec256<int8_t>{__lasx_xvsadd_b(a.raw, b.raw)};
   1017 }
   1018 HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
   1019  return Vec256<int16_t>{__lasx_xvsadd_h(a.raw, b.raw)};
   1020 }
   1021 HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
   1022  return Vec256<int32_t>{__lasx_xvsadd_w(a.raw, b.raw)};
   1023 }
   1024 HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
   1025  return Vec256<int64_t>{__lasx_xvsadd_d(a.raw, b.raw)};
   1026 }
   1027 
   1028 // ------------------------------ SaturatedSub
   1029 
   1030 // Returns a - b clamped to the destination range.
   1031 
   1032 // Unsigned
   1033 HWY_API Vec256<uint8_t> SaturatedSub(Vec256<uint8_t> a, Vec256<uint8_t> b) {
   1034  return Vec256<uint8_t>{__lasx_xvssub_bu(a.raw, b.raw)};
   1035 }
   1036 HWY_API Vec256<uint16_t> SaturatedSub(Vec256<uint16_t> a, Vec256<uint16_t> b) {
   1037  return Vec256<uint16_t>{__lasx_xvssub_hu(a.raw, b.raw)};
   1038 }
   1039 HWY_API Vec256<uint32_t> SaturatedSub(Vec256<uint32_t> a, Vec256<uint32_t> b) {
   1040  return Vec256<uint32_t>{__lasx_xvssub_wu(a.raw, b.raw)};
   1041 }
   1042 HWY_API Vec256<uint64_t> SaturatedSub(Vec256<uint64_t> a, Vec256<uint64_t> b) {
   1043  return Vec256<uint64_t>{__lasx_xvssub_du(a.raw, b.raw)};
   1044 }
   1045 
   1046 // Signed
   1047 HWY_API Vec256<int8_t> SaturatedSub(Vec256<int8_t> a, Vec256<int8_t> b) {
   1048  return Vec256<int8_t>{__lasx_xvssub_b(a.raw, b.raw)};
   1049 }
   1050 HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
   1051  return Vec256<int16_t>{__lasx_xvssub_h(a.raw, b.raw)};
   1052 }
   1053 HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
   1054  return Vec256<int32_t>{__lasx_xvssub_w(a.raw, b.raw)};
   1055 }
   1056 HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
   1057  return Vec256<int64_t>{__lasx_xvssub_d(a.raw, b.raw)};
   1058 }
   1059 
   1060 // ------------------------------ Average
   1061 
   1062 // Returns (a + b + 1) / 2
   1063 
// Signed and unsigned, per lane width
// Rounded average via xvavgr_* (the "r" variant rounds the halved sum up).
HWY_API Vec256<int8_t> AverageRound(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvavgr_b(a.raw, b.raw)};
}
HWY_API Vec256<uint8_t> AverageRound(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvavgr_bu(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> AverageRound(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvavgr_h(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> AverageRound(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvavgr_hu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> AverageRound(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvavgr_w(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> AverageRound(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvavgr_wu(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> AverageRound(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvavgr_d(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> AverageRound(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvavgr_du(a.raw, b.raw)};
}
   1089 
   1090 // ------------------------------ Abs (Sub)
   1091 
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
// Implemented as the absolute difference against a zero vector; there is no
// dedicated LASX absolute-value instruction for integer lanes.
HWY_API Vec256<int8_t> Abs(Vec256<int8_t> v) {
  return Vec256<int8_t>{__lasx_xvabsd_b(v.raw, __lasx_xvreplgr2vr_b(0))};
}
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
  return Vec256<int16_t>{__lasx_xvabsd_h(v.raw, __lasx_xvreplgr2vr_h(0))};
}
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvabsd_w(v.raw, __lasx_xvreplgr2vr_w(0))};
}
HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvabsd_d(v.raw, __lasx_xvreplgr2vr_d(0))};
}
   1105 
   1106 // ------------------------------ Integer AbsDiff
// |a - b| per lane, with signed (xvabsd_*) and unsigned (xvabsd_*u) variants.
HWY_API Vec256<int8_t> AbsDiff(const Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvabsd_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> AbsDiff(const Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvabsd_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> AbsDiff(const Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvabsd_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> AbsDiff(const Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvabsd_d(a.raw, b.raw)};
}

HWY_API Vec256<uint8_t> AbsDiff(const Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvabsd_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> AbsDiff(const Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvabsd_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> AbsDiff(const Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvabsd_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> AbsDiff(const Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvabsd_du(a.raw, b.raw)};
}
   1132 
   1133 // ------------------------------ Integer multiplication
   1134 
// Lane-wise multiplication returning the low half of the product; identical
// for signed and unsigned lanes, hence the shared xvmul_* intrinsics.

// Unsigned
HWY_API Vec256<uint8_t> operator*(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvmul_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvmul_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvmul_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvmul_d(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator*(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvmul_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvmul_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvmul_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvmul_d(a.raw, b.raw)};
}

// Returns the upper half of the double-wide product (xvmuh_*); here the
// signedness of the operands does change the result, so signed and unsigned
// intrinsics differ.
HWY_API Vec256<uint8_t> MulHigh(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvmuh_bu(a.raw, b.raw)};
}
HWY_API Vec256<int8_t> MulHigh(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvmuh_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvmuh_hu(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvmuh_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> MulHigh(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvmuh_wu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> MulHigh(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvmuh_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> MulHigh(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvmuh_du(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> MulHigh(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvmuh_d(a.raw, b.raw)};
}
   1187 
// Multiplies even lanes (0, 2 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
HWY_API Vec256<int16_t> MulEven(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int16_t>{__lasx_xvmulwev_h_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> MulEven(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint16_t>{__lasx_xvmulwev_h_bu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> MulEven(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int32_t>{__lasx_xvmulwev_w_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> MulEven(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint32_t>{__lasx_xvmulwev_w_hu(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int64_t>{__lasx_xvmulwev_d_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint64_t>{__lasx_xvmulwev_d_wu(a.raw, b.raw)};
}
// 64x64 -> 128-bit products; templated so the return type stays Vec256<T>.
template <typename T, HWY_IF_I64(T)>
HWY_API Vec256<T> MulEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwev_q_d(a.raw, b.raw)};
}
template <typename T, HWY_IF_U64(T)>
HWY_API Vec256<T> MulEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwev_q_du(a.raw, b.raw)};
}
   1216 
// Same as MulEven but for the odd lanes (1, 3 ..), via xvmulwod_*.
HWY_API Vec256<int16_t> MulOdd(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int16_t>{__lasx_xvmulwod_h_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> MulOdd(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint16_t>{__lasx_xvmulwod_h_bu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> MulOdd(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int32_t>{__lasx_xvmulwod_w_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> MulOdd(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint32_t>{__lasx_xvmulwod_w_hu(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> MulOdd(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int64_t>{__lasx_xvmulwod_d_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> MulOdd(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint64_t>{__lasx_xvmulwod_d_wu(a.raw, b.raw)};
}
// 64x64 -> 128-bit products of odd lanes; templated like MulEven above.
template <typename T, HWY_IF_I64(T)>
HWY_API Vec256<T> MulOdd(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwod_q_d(a.raw, b.raw)};
}
template <typename T, HWY_IF_U64(T)>
HWY_API Vec256<T> MulOdd(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwod_q_du(a.raw, b.raw)};
}
   1243 
   1244 template <typename T, HWY_IF_I16(T)>
   1245 HWY_API Vec256<T> MulFixedPoint15(Vec256<T> a, Vec256<T> b) {
   1246  const auto i32_ev = MulEven(a, b);
   1247  const auto i32_od = MulOdd(a, b);
   1248  const auto i64_lo = InterleaveLower(i32_ev, i32_od);
   1249  const auto i64_hi = InterleaveUpper(Full256<int32_t>(), i32_ev, i32_od);
   1250  return Vec256<T>{__lasx_xvssrarni_h_w(i64_hi.raw, i64_lo.raw, 15)};
   1251 }
   1252 
   1253 // ------------------------------ Integer division
   1254 
// Lane-wise integer division. The hardware instruction is well-defined for
// all inputs, but expressing the division in C++ would let the compiler
// assume the scalar UB cases cannot occur; the asm hides the operation so no
// such assumption is made.
// NOTE(review): the "f" constraint presumably allocates the LASX $xr
// registers via the overlapping FP register file, and %u prints the vector
// register name - confirm against the GCC LoongArch constraint docs.
HWY_API Vec256<int8_t> operator/(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int8_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.b %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int8_t>{raw_result};
}

HWY_API Vec256<uint8_t> operator/(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.bu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint8_t>{raw_result};
}

HWY_API Vec256<int16_t> operator/(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int16_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.h %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int16_t>{raw_result};
}

HWY_API Vec256<uint16_t> operator/(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.hu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint16_t>{raw_result};
}

HWY_API Vec256<int32_t> operator/(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int32_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.w %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int32_t>{raw_result};
}

HWY_API Vec256<uint32_t> operator/(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.wu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint32_t>{raw_result};
}

HWY_API Vec256<int64_t> operator/(const Vec256<int64_t> a,
                                  const Vec256<int64_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int64_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.d %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int64_t>{raw_result};
}

HWY_API Vec256<uint64_t> operator/(const Vec256<uint64_t> a,
                                   const Vec256<uint64_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.du %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint64_t>{raw_result};
}
   1322 
   1323 // ------------------------------ Integer modulo
   1324 
// Lane-wise integer modulo; same inline-asm rationale as operator/ above:
// the hardware xvmod instruction is defined for all inputs, while the C++
// expression would carry scalar UB assumptions.
HWY_API Vec256<int8_t> operator%(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int8_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvmod.b %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int8_t>{raw_result};
}

HWY_API Vec256<uint8_t> operator%(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvmod.bu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint8_t>{raw_result};
}

HWY_API Vec256<int16_t> operator%(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int16_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvmod.h %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int16_t>{raw_result};
}

HWY_API Vec256<uint16_t> operator%(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvmod.hu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint16_t>{raw_result};
}

HWY_API Vec256<int32_t> operator%(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int32_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvmod.w %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int32_t>{raw_result};
}

HWY_API Vec256<uint32_t> operator%(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvmod.wu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint32_t>{raw_result};
}

HWY_API Vec256<int64_t> operator%(const Vec256<int64_t> a,
                                  const Vec256<int64_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int64_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvmod.d %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int64_t>{raw_result};
}

HWY_API Vec256<uint64_t> operator%(const Vec256<uint64_t> a,
                                   const Vec256<uint64_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvmod.du %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint64_t>{raw_result};
}
   1392 
   1393 // ------------------------------ ShiftLeft (Compile-time constant shifts)
   1394 
// Shift left by a compile-time constant, using the immediate-operand xvslli_*
// forms; identical for signed and unsigned lanes.
template <int kBits, typename T, HWY_IF_UI8(T)>
HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
  return Vec256<T>{__lasx_xvslli_b(v.raw, kBits)};
}

template <int kBits, typename T, HWY_IF_UI16(T)>
HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
  return Vec256<T>{__lasx_xvslli_h(v.raw, kBits)};
}

template <int kBits, typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
  return Vec256<T>{__lasx_xvslli_w(v.raw, kBits)};
}

template <int kBits, typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
  return Vec256<T>{__lasx_xvslli_d(v.raw, kBits)};
}
   1414 
   1415 // ------------------------------ ShiftRight (Compile-time constant shifts)
   1416 
// Shift right by a compile-time constant: logical (xvsrli_*) for unsigned
// lanes, arithmetic / sign-extending (xvsrai_*) for signed lanes.
template <int kBits>
HWY_API Vec256<uint8_t> ShiftRight(Vec256<uint8_t> v) {
  return Vec256<uint8_t>{__lasx_xvsrli_b(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<uint16_t> ShiftRight(Vec256<uint16_t> v) {
  return Vec256<uint16_t>{__lasx_xvsrli_h(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<uint32_t> ShiftRight(Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvsrli_w(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<uint64_t> ShiftRight(Vec256<uint64_t> v) {
  return Vec256<uint64_t>{__lasx_xvsrli_d(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {
  return Vec256<int8_t>{__lasx_xvsrai_b(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int16_t> ShiftRight(Vec256<int16_t> v) {
  return Vec256<int16_t>{__lasx_xvsrai_h(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int32_t> ShiftRight(Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvsrai_w(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int64_t> ShiftRight(Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvsrai_d(v.raw, kBits)};
}
   1456 
   1457 // ------------------------------ RoundingShiftRight
   1458 
   1459 template <int kBits>
   1460 HWY_API Vec256<int8_t> RoundingShiftRight(Vec256<int8_t> v) {
   1461  return Vec256<int8_t>{__lasx_xvsrari_b(v.raw, kBits)};
   1462 }
   1463 template <int kBits>
   1464 HWY_API Vec256<int16_t> RoundingShiftRight(Vec256<int16_t> v) {
   1465  return Vec256<int16_t>{__lasx_xvsrari_h(v.raw, kBits)};
   1466 }
   1467 template <int kBits>
   1468 HWY_API Vec256<int32_t> RoundingShiftRight(Vec256<int32_t> v) {
   1469  return Vec256<int32_t>{__lasx_xvsrari_w(v.raw, kBits)};
   1470 }
   1471 template <int kBits>
   1472 HWY_API Vec256<int64_t> RoundingShiftRight(Vec256<int64_t> v) {
   1473  return Vec256<int64_t>{__lasx_xvsrari_d(v.raw, kBits)};
   1474 }
   1475 
   1476 template <int kBits>
   1477 HWY_API Vec256<uint8_t> RoundingShiftRight(Vec256<uint8_t> v) {
   1478  return Vec256<uint8_t>{__lasx_xvsrlri_b(v.raw, kBits)};
   1479 }
   1480 template <int kBits>
   1481 HWY_API Vec256<uint16_t> RoundingShiftRight(Vec256<uint16_t> v) {
   1482  return Vec256<uint16_t>{__lasx_xvsrlri_h(v.raw, kBits)};
   1483 }
   1484 template <int kBits>
   1485 HWY_API Vec256<uint32_t> RoundingShiftRight(Vec256<uint32_t> v) {
   1486  return Vec256<uint32_t>{__lasx_xvsrlri_w(v.raw, kBits)};
   1487 }
   1488 template <int kBits>
   1489 HWY_API Vec256<uint64_t> RoundingShiftRight(Vec256<uint64_t> v) {
   1490  return Vec256<uint64_t>{__lasx_xvsrlri_d(v.raw, kBits)};
   1491 }
   1492 // ------------------------------ RoundingShr
   1493 
   1494 HWY_API Vec256<uint8_t> RoundingShr(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
   1495  return Vec256<uint8_t>{__lasx_xvsrlr_b(v.raw, bits.raw)};
   1496 }
   1497 HWY_API Vec256<uint16_t> RoundingShr(Vec256<uint16_t> v,
   1498                                     Vec256<uint16_t> bits) {
   1499  return Vec256<uint16_t>{__lasx_xvsrlr_h(v.raw, bits.raw)};
   1500 }
   1501 HWY_API Vec256<uint32_t> RoundingShr(Vec256<uint32_t> v,
   1502                                     Vec256<uint32_t> bits) {
   1503  return Vec256<uint32_t>{__lasx_xvsrlr_w(v.raw, bits.raw)};
   1504 }
   1505 HWY_API Vec256<uint64_t> RoundingShr(Vec256<uint64_t> v,
   1506                                     Vec256<uint64_t> bits) {
   1507  return Vec256<uint64_t>{__lasx_xvsrlr_d(v.raw, bits.raw)};
   1508 }
   1509 
   1510 HWY_API Vec256<int8_t> RoundingShr(Vec256<int8_t> v, Vec256<int8_t> bits) {
   1511  return Vec256<int8_t>{__lasx_xvsrar_b(v.raw, bits.raw)};
   1512 }
   1513 HWY_API Vec256<int16_t> RoundingShr(Vec256<int16_t> v, Vec256<int16_t> bits) {
   1514  return Vec256<int16_t>{__lasx_xvsrar_h(v.raw, bits.raw)};
   1515 }
   1516 HWY_API Vec256<int32_t> RoundingShr(Vec256<int32_t> v, Vec256<int32_t> bits) {
   1517  return Vec256<int32_t>{__lasx_xvsrar_w(v.raw, bits.raw)};
   1518 }
   1519 HWY_API Vec256<int64_t> RoundingShr(Vec256<int64_t> v, Vec256<int64_t> bits) {
   1520  return Vec256<int64_t>{__lasx_xvsrar_d(v.raw, bits.raw)};
   1521 }
   1522 
   1523 // ------------------------------ RoundingShiftRightSame (RoundingShr)
   1524 
   1525 template <typename T>
   1526 HWY_API Vec256<T> RoundingShiftRightSame(const Vec256<T> v, int bits) {
   1527  return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
   1528 }
   1529 
   1530 // ------------------------------ RotateRight (Compile-time constant shifts)
   1531 
   1532 template <int kBits, typename T, HWY_IF_UI8(T)>
   1533 HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
   1534  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
   1535  if (kBits == 0) return v;
   1536  return Vec256<T>{__lasx_xvrotri_b(v.raw, kBits)};
   1537 }
   1538 
   1539 template <int kBits, typename T, HWY_IF_UI16(T)>
   1540 HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
   1541  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
   1542  if (kBits == 0) return v;
   1543  return Vec256<T>{__lasx_xvrotri_h(v.raw, kBits)};
   1544 }
   1545 
   1546 template <int kBits, typename T, HWY_IF_UI32(T)>
   1547 HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
   1548  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
   1549  if (kBits == 0) return v;
   1550  return Vec256<T>{__lasx_xvrotri_w(v.raw, kBits)};
   1551 }
   1552 
   1553 template <int kBits, typename T, HWY_IF_UI64(T)>
   1554 HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
   1555  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
   1556  if (kBits == 0) return v;
   1557  return Vec256<T>{__lasx_xvrotri_d(v.raw, kBits)};
   1558 }
   1559 
   1560 // ------------------------------ Rol/Ror
   1561 template <class T, HWY_IF_UI8(T)>
   1562 HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
   1563  return Vec256<T>{__lasx_xvrotr_b(a.raw, b.raw)};
   1564 }
   1565 
   1566 template <class T, HWY_IF_UI16(T)>
   1567 HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
   1568  return Vec256<T>{__lasx_xvrotr_h(a.raw, b.raw)};
   1569 }
   1570 
   1571 template <class T, HWY_IF_UI32(T)>
   1572 HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
   1573  return Vec256<T>{__lasx_xvrotr_w(a.raw, b.raw)};
   1574 }
   1575 
   1576 template <class T, HWY_IF_UI64(T)>
   1577 HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
   1578  return Vec256<T>{__lasx_xvrotr_d(a.raw, b.raw)};
   1579 }
   1580 
   1581 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
   1582 
   1583 HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
   1584  return Vec256<int8_t>{__lasx_xvsrai_b(v.raw, 7)};
   1585 }
   1586 
   1587 HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
   1588  return Vec256<int16_t>{__lasx_xvsrai_h(v.raw, 15)};
   1589 }
   1590 
   1591 HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
   1592  return Vec256<int32_t>{__lasx_xvsrai_w(v.raw, 31)};
   1593 }
   1594 
   1595 HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
   1596  return Vec256<int64_t>{__lasx_xvsrai_d(v.raw, 63)};
   1597 }
   1598 
   1599 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
   1600 template <typename T>
   1601 HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
   1602  static_assert(IsSigned<T>(), "Only works for signed/float");
   1603  const DFromV<decltype(v)> d;
   1604  const RebindToSigned<decltype(d)> di;
   1605  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
   1606  return IfThenElse(mask, yes, no);
   1607 }
   1608 
   1609 // ------------------------------ IfNegativeThenNegOrUndefIfZero
   1610 
   1611 HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
   1612                                                      Vec256<int8_t> v) {
   1613  return Vec256<int8_t>{__lasx_xvsigncov_b(mask.raw, v.raw)};
   1614 }
   1615 
   1616 HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
   1617                                                       Vec256<int16_t> v) {
   1618  return Vec256<int16_t>{__lasx_xvsigncov_h(mask.raw, v.raw)};
   1619 }
   1620 
   1621 HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
   1622                                                       Vec256<int32_t> v) {
   1623  return Vec256<int32_t>{__lasx_xvsigncov_w(mask.raw, v.raw)};
   1624 }
   1625 
   1626 HWY_API Vec256<int64_t> IfNegativeThenNegOrUndefIfZero(Vec256<int64_t> mask,
   1627                                                       Vec256<int64_t> v) {
   1628  return Vec256<int64_t>{__lasx_xvsigncov_d(mask.raw, v.raw)};
   1629 }
   1630 
   1631 // ------------------------------ ShiftLeftSame
   1632 
   1633 template <typename T>
   1634 HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
   1635  return Shl(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
   1636 }
   1637 
   1638 // ------------------------------ ShiftRightSame (BroadcastSignBit)
   1639 
   1640 HWY_API Vec256<uint8_t> ShiftRightSame(const Vec256<uint8_t> v,
   1641                                       const int bits) {
   1642  return Vec256<uint8_t>{__lasx_xvsrl_b(v.raw, __lasx_xvreplgr2vr_b(bits))};
   1643 }
   1644 
   1645 HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
   1646                                        const int bits) {
   1647  return Vec256<uint16_t>{__lasx_xvsrl_h(v.raw, __lasx_xvreplgr2vr_h(bits))};
   1648 }
   1649 
   1650 HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
   1651                                        const int bits) {
   1652  return Vec256<uint32_t>{__lasx_xvsrl_w(v.raw, __lasx_xvreplgr2vr_w(bits))};
   1653 }
   1654 
   1655 HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
   1656                                        const int bits) {
   1657  return Vec256<uint64_t>{__lasx_xvsrl_d(v.raw, __lasx_xvreplgr2vr_d(bits))};
   1658 }
   1659 
   1660 HWY_API Vec256<int8_t> ShiftRightSame(const Vec256<int8_t> v, const int bits) {
   1661  return Vec256<int8_t>{__lasx_xvsra_b(v.raw, __lasx_xvreplgr2vr_b(bits))};
   1662 }
   1663 
   1664 HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
   1665                                       const int bits) {
   1666  return Vec256<int16_t>{__lasx_xvsra_h(v.raw, __lasx_xvreplgr2vr_h(bits))};
   1667 }
   1668 
   1669 HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
   1670                                       const int bits) {
   1671  return Vec256<int32_t>{__lasx_xvsra_w(v.raw, __lasx_xvreplgr2vr_w(bits))};
   1672 }
   1673 
   1674 HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
   1675                                       const int bits) {
   1676  return Vec256<int64_t>{__lasx_xvsra_d(v.raw, __lasx_xvreplgr2vr_d(bits))};
   1677 }
   1678 
// ------------------------------ Neg (Xor, Sub)

namespace detail {

// Floating-point negation: flip only the sign bit.
template <typename T>
HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) {
 const DFromV<decltype(v)> d;
 return Xor(v, SignBit(d));
}

// Special (non-arithmetic storage) types: likewise flip the sign bit.
template <typename T>
HWY_INLINE Vec256<T> Neg(hwy::SpecialTag /*tag*/, const Vec256<T> v) {
 const DFromV<decltype(v)> d;
 return Xor(v, SignBit(d));
}

// Not floating-point
template <typename T, HWY_IF_UI8(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
 return Vec256<T>{__lasx_xvneg_b(v.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
 return Vec256<T>{__lasx_xvneg_h(v.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
 return Vec256<T>{__lasx_xvneg_w(v.raw)};
}

template <typename T, HWY_IF_UI64(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
 return Vec256<T>{__lasx_xvneg_d(v.raw)};
}

}  // namespace detail

// Lane-wise negation; dispatches on the type category of T (float/special
// types XOR the sign bit, signed integers use the xvneg instructions).
template <typename T>
HWY_API Vec256<T> Neg(const Vec256<T> v) {
 return detail::Neg(hwy::TypeTag<T>(), v);
}
   1722 
   1723 // ------------------------------ Floating-point mul / div
   1724 
   1725 HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
   1726  return Vec256<float>{__lasx_xvfmul_s(a.raw, b.raw)};
   1727 }
   1728 HWY_API Vec256<double> operator*(Vec256<double> a, Vec256<double> b) {
   1729  return Vec256<double>{__lasx_xvfmul_d(a.raw, b.raw)};
   1730 }
   1731 
   1732 HWY_API Vec256<float> operator/(Vec256<float> a, Vec256<float> b) {
   1733  return Vec256<float>{__lasx_xvfdiv_s(a.raw, b.raw)};
   1734 }
   1735 HWY_API Vec256<double> operator/(Vec256<double> a, Vec256<double> b) {
   1736  return Vec256<double>{__lasx_xvfdiv_d(a.raw, b.raw)};
   1737 }
   1738 
   1739 // Approximate reciprocal
   1740 
   1741 HWY_API Vec256<float> ApproximateReciprocal(Vec256<float> v) {
   1742  return Vec256<float>{__lasx_xvfrecip_s(v.raw)};
   1743 }
   1744 
   1745 HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
   1746  return Vec256<double>{__lasx_xvfrecip_d(v.raw)};
   1747 }
   1748 
// Integer multiply-add variants

// MulAdd(mul, x, add) = mul * x + add. The xvmadd intrinsic takes the
// accumulator as its first operand, hence add.raw is passed first.
// signed
HWY_API Vec256<int8_t> MulAdd(Vec256<int8_t> mul, Vec256<int8_t> x,
                             Vec256<int8_t> add) {
 return Vec256<int8_t>{__lasx_xvmadd_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int16_t> MulAdd(Vec256<int16_t> mul, Vec256<int16_t> x,
                              Vec256<int16_t> add) {
 return Vec256<int16_t>{__lasx_xvmadd_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int32_t> MulAdd(Vec256<int32_t> mul, Vec256<int32_t> x,
                              Vec256<int32_t> add) {
 return Vec256<int32_t>{__lasx_xvmadd_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int64_t> MulAdd(Vec256<int64_t> mul, Vec256<int64_t> x,
                              Vec256<int64_t> add) {
 return Vec256<int64_t>{__lasx_xvmadd_d(add.raw, mul.raw, x.raw)};
}

// unsigned: reuses the same intrinsics; the low-half wraparound result of
// multiply-add is bit-identical for signed and unsigned operands.
HWY_API Vec256<uint8_t> MulAdd(Vec256<uint8_t> mul, Vec256<uint8_t> x,
                              Vec256<uint8_t> add) {
 return Vec256<uint8_t>{__lasx_xvmadd_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint16_t> MulAdd(Vec256<uint16_t> mul, Vec256<uint16_t> x,
                               Vec256<uint16_t> add) {
 return Vec256<uint16_t>{__lasx_xvmadd_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint32_t> MulAdd(Vec256<uint32_t> mul, Vec256<uint32_t> x,
                               Vec256<uint32_t> add) {
 return Vec256<uint32_t>{__lasx_xvmadd_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint64_t> MulAdd(Vec256<uint64_t> mul, Vec256<uint64_t> x,
                               Vec256<uint64_t> add) {
 return Vec256<uint64_t>{__lasx_xvmadd_d(add.raw, mul.raw, x.raw)};
}

// NegMulAdd(mul, x, add) = add - mul * x, via xvmsub (accumulator first).
// signed
HWY_API Vec256<int8_t> NegMulAdd(Vec256<int8_t> mul, Vec256<int8_t> x,
                                Vec256<int8_t> add) {
 return Vec256<int8_t>{__lasx_xvmsub_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int16_t> NegMulAdd(Vec256<int16_t> mul, Vec256<int16_t> x,
                                 Vec256<int16_t> add) {
 return Vec256<int16_t>{__lasx_xvmsub_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int32_t> NegMulAdd(Vec256<int32_t> mul, Vec256<int32_t> x,
                                 Vec256<int32_t> add) {
 return Vec256<int32_t>{__lasx_xvmsub_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int64_t> NegMulAdd(Vec256<int64_t> mul, Vec256<int64_t> x,
                                 Vec256<int64_t> add) {
 return Vec256<int64_t>{__lasx_xvmsub_d(add.raw, mul.raw, x.raw)};
}

// unsigned
HWY_API Vec256<uint8_t> NegMulAdd(Vec256<uint8_t> mul, Vec256<uint8_t> x,
                                 Vec256<uint8_t> add) {
 return Vec256<uint8_t>{__lasx_xvmsub_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint16_t> NegMulAdd(Vec256<uint16_t> mul, Vec256<uint16_t> x,
                                  Vec256<uint16_t> add) {
 return Vec256<uint16_t>{__lasx_xvmsub_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint32_t> NegMulAdd(Vec256<uint32_t> mul, Vec256<uint32_t> x,
                                  Vec256<uint32_t> add) {
 return Vec256<uint32_t>{__lasx_xvmsub_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint64_t> NegMulAdd(Vec256<uint64_t> mul, Vec256<uint64_t> x,
                                  Vec256<uint64_t> add) {
 return Vec256<uint64_t>{__lasx_xvmsub_d(add.raw, mul.raw, x.raw)};
}
   1822 
   1823 // ------------------------------ Floating-point multiply-add variants
   1824 
   1825 HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
   1826                             Vec256<float> add) {
   1827  return Vec256<float>{__lasx_xvfmadd_s(mul.raw, x.raw, add.raw)};
   1828 }
   1829 HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x,
   1830                              Vec256<double> add) {
   1831  return Vec256<double>{__lasx_xvfmadd_d(mul.raw, x.raw, add.raw)};
   1832 }
   1833 
   1834 HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x,
   1835                                Vec256<float> add) {
   1836  return add - mul * x;
   1837 }
   1838 HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x,
   1839                                 Vec256<double> add) {
   1840  return add - mul * x;
   1841 }
   1842 
   1843 HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
   1844                             Vec256<float> sub) {
   1845  return Vec256<float>{__lasx_xvfmsub_s(mul.raw, x.raw, sub.raw)};
   1846 }
   1847 HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x,
   1848                              Vec256<double> sub) {
   1849  return Vec256<double>{__lasx_xvfmsub_d(mul.raw, x.raw, sub.raw)};
   1850 }
   1851 
   1852 HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x,
   1853                                Vec256<float> sub) {
   1854  return Vec256<float>{__lasx_xvfnmadd_s(mul.raw, x.raw, sub.raw)};
   1855 }
   1856 HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
   1857                                 Vec256<double> sub) {
   1858  return Vec256<double>{__lasx_xvfnmadd_d(mul.raw, x.raw, sub.raw)};
   1859 }
   1860 
   1861 // ------------------------------ MulAddSub(Float)
   1862 
   1863 template <typename T, HWY_IF_FLOAT3264(T)>
   1864 HWY_API Vec256<T> MulAddSub(Vec256<T> mul, Vec256<T> x, Vec256<T> sub_or_add) {
   1865  return OddEven(MulAdd(mul, x, sub_or_add), MulSub(mul, x, sub_or_add));
   1866 }
   1867 
   1868 // ------------------------------ Floating-point square root
   1869 
   1870 // Full precision square root
   1871 HWY_API Vec256<float> Sqrt(Vec256<float> v) {
   1872  return Vec256<float>{__lasx_xvfsqrt_s(v.raw)};
   1873 }
   1874 
   1875 HWY_API Vec256<double> Sqrt(Vec256<double> v) {
   1876  return Vec256<double>{__lasx_xvfsqrt_d(v.raw)};
   1877 }
   1878 
   1879 // Approximate reciprocal square root
   1880 HWY_API Vec256<float> ApproximateReciprocalSqrt(Vec256<float> v) {
   1881  return Vec256<float>{__lasx_xvfrsqrt_s(v.raw)};
   1882 }
   1883 
   1884 HWY_API Vec256<double> ApproximateReciprocalSqrt(Vec256<double> v) {
   1885  return Vec256<double>{__lasx_xvfrsqrt_d(v.raw)};
   1886 }
   1887 
   1888 // ------------------------------ Floating-point rounding
   1889 
   1890 // Toward nearest integer, tie to even
   1891 HWY_API Vec256<float> Round(Vec256<float> v) {
   1892  return Vec256<float>{__lasx_xvfrintrne_s(v.raw)};
   1893 }
   1894 
   1895 HWY_API Vec256<double> Round(Vec256<double> v) {
   1896  return Vec256<double>{__lasx_xvfrintrne_d(v.raw)};
   1897 }
   1898 
   1899 // Toward zero, aka truncate
   1900 HWY_API Vec256<float> Trunc(Vec256<float> v) {
   1901  return Vec256<float>{__lasx_xvfrintrz_s(v.raw)};
   1902 }
   1903 
   1904 HWY_API Vec256<double> Trunc(Vec256<double> v) {
   1905  return Vec256<double>{__lasx_xvfrintrz_d(v.raw)};
   1906 }
   1907 
   1908 // Toward +infinity, aka ceiling
   1909 HWY_API Vec256<float> Ceil(Vec256<float> v) {
   1910  return Vec256<float>{__lasx_xvfrintrp_s(v.raw)};
   1911 }
   1912 
   1913 HWY_API Vec256<double> Ceil(Vec256<double> v) {
   1914  return Vec256<double>{__lasx_xvfrintrp_d(v.raw)};
   1915 }
   1916 
   1917 // Toward -infinity, aka floor
   1918 HWY_API Vec256<float> Floor(Vec256<float> v) {
   1919  return Vec256<float>{__lasx_xvfrintrm_s(v.raw)};
   1920 }
   1921 
   1922 HWY_API Vec256<double> Floor(Vec256<double> v) {
   1923  return Vec256<double>{__lasx_xvfrintrm_d(v.raw)};
   1924 }
   1925 
// ------------------------------ Floating-point classification

// FIXME: disable gcc-14 tree-based loop optimizations to prevent
// 'HighwayTestGroup/HighwayTest.TestAllIsNaN/LASX' failures
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#pragma GCC push_options
#pragma GCC optimize("-fno-tree-loop-optimize")
#endif

// True for NaN lanes: cune (unordered-or-unequal) comparison of v with
// itself holds only when v is NaN.
HWY_API Mask256<float> IsNaN(Vec256<float> v) {
 const DFromV<decltype(v)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d,
                   MFromD<decltype(di)>{__lasx_xvfcmp_cune_s(v.raw, v.raw)});
}

HWY_API Mask256<double> IsNaN(Vec256<double> v) {
 const DFromV<decltype(v)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d,
                   MFromD<decltype(di)>{__lasx_xvfcmp_cune_d(v.raw, v.raw)});
}

#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#pragma GCC pop_options
#endif

// True where a or b (or both) is NaN, via the cun (unordered) compare.
HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cun_s(a.raw, b.raw)});
}

HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
 const DFromV<decltype(a)> d;
 const RebindToSigned<decltype(d)> di;
 return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cun_d(a.raw, b.raw)});
}
   1964 
   1965 // ================================================== MEMORY
   1966 
   1967 // ------------------------------ Load
   1968 
   1969 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1970 HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
   1971  const RebindToSigned<D> di;
   1972  return BitCast(D(), VFromD<decltype(di)>{__lasx_xvld(aligned, 0)});
   1973 }
   1974 
   1975 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1976 HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
   1977  const RebindToSigned<D> di;
   1978  return BitCast(D(), VFromD<decltype(di)>{__lasx_xvld(p, 0)});
   1979 }
   1980 
   1981 // ------------------------------ MaskedLoad
   1982 
   1983 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1984 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
   1985                             const TFromD<D>* HWY_RESTRICT p) {
   1986  return IfThenElseZero(m, LoadU(d, p));
   1987 }
   1988 
   1989 // ------------------------------ LoadDup128
   1990 
   1991 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1992 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
   1993  VFromD<Half<D>> vec_tmp;
   1994  vec_tmp = Load(Half<D>(), p);
   1995  return Combine(d, vec_tmp, vec_tmp);
   1996 }
   1997 
// ------------------------------ Store

// Stores all 32 bytes to aligned memory.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
 __lasx_xvst(v.raw, aligned, 0);
}

// Stores to possibly-unaligned memory; uses the same xvst instruction, which
// does not require alignment.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
 __lasx_xvst(v.raw, p, 0);
}
   2009 
   2010 // ------------------------------ BlendedStore
   2011 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   2012 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
   2013                          TFromD<D>* HWY_RESTRICT p) {
   2014  const RebindToUnsigned<decltype(d)> du;
   2015  const auto blended =
   2016      IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
   2017  StoreU(BitCast(d, blended), d, p);
   2018 }
   2019 
   2020 // ================================================== SWIZZLE
   2021 // ------------------------------ LowerHalf
   2022 
   2023 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   2024 HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
   2025 #if HWY_HAS_BUILTIN(__builtin_shufflevector)
   2026  typedef uint32_t U32RawVectType __attribute__((__vector_size__(32)));
   2027  return VFromD<D>{reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(
   2028      __builtin_shufflevector(reinterpret_cast<U32RawVectType>(v.raw),
   2029                              reinterpret_cast<U32RawVectType>(v.raw), 0, 1, 2,
   2030                              3))};
   2031 #else
   2032  const RebindToUnsigned<D> du;
   2033  const Twice<decltype(du)> dut;
   2034  alignas(32) __m128i vec_tmp[2];
   2035  __m256i vec_result = BitCast(dut, v).raw;
   2036  CopyBytes<32>(&vec_result, vec_tmp);
   2037  return BitCast(D(), VFromD<decltype(du)>{vec_tmp[0]});
   2038 #endif
   2039 }
   2040 
   2041 template <typename T>
   2042 HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
   2043  const Full128<T> dh;
   2044  return LowerHalf(dh, v);
   2045 }
   2046 
   2047 // ------------------------------ UpperHalf
   2048 
   2049 template <class D, HWY_IF_V_SIZE_D(D, 16)>
   2050 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
   2051 #if HWY_HAS_BUILTIN(__builtin_shufflevector)
   2052  (void)d;
   2053  typedef uint32_t U32RawVectType __attribute__((__vector_size__(32)));
   2054  return VFromD<D>{reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(
   2055      __builtin_shufflevector(reinterpret_cast<U32RawVectType>(v.raw),
   2056                              reinterpret_cast<U32RawVectType>(v.raw), 4, 5, 6,
   2057                              7))};
   2058 #else
   2059  const RebindToUnsigned<decltype(d)> du;
   2060  const Twice<decltype(du)> dut;
   2061  alignas(32) __m128i vec_tmp[2];
   2062  __m256i vec_result = BitCast(dut, v).raw;
   2063  CopyBytes<32>(&vec_result, vec_tmp);
   2064  return BitCast(d, VFromD<decltype(du)>{vec_tmp[1]});
   2065 #endif
   2066 }
   2067 
// ------------------------------ ExtractLane (Store)
// Returns lane i of v. i is a run-time index in [0, Lanes(d)).
template <typename T>
HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
 const DFromV<decltype(v)> d;
 HWY_DASSERT(i < Lanes(d));

#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
 constexpr size_t kLanesPerBlock = 16 / sizeof(T);
 // If the compiler can prove i lies in the lower 128-bit block, extract from
 // the lower half and skip the store/reload below.
 if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) {
   return ExtractLane(LowerHalf(Half<decltype(d)>(), v), i);
 }
#endif

 // General case: store all lanes to the stack and index the array.
 alignas(32) T lanes[32 / sizeof(T)];
 Store(v, d, lanes);
 return lanes[i];
}

// ------------------------------ InsertLane (Store)
// Returns a copy of v with lane i replaced by t.
template <typename T>
HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
}

// ------------------------------ GetLane (LowerHalf)
// Returns lane 0 of v.
template <typename T>
HWY_API T GetLane(const Vec256<T> v) {
 return GetLane(LowerHalf(v));
}

// ------------------------------ ExtractBlock (LowerHalf, UpperHalf)

// Returns the kBlockIdx-th 128-bit block (0 = lower, 1 = upper).
template <int kBlockIdx, class T>
HWY_API Vec128<T> ExtractBlock(Vec256<T> v) {
 static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
 const Half<DFromV<decltype(v)>> dh;
 return (kBlockIdx == 0) ? LowerHalf(dh, v) : UpperHalf(dh, v);
}
   2106 
// ------------------------------ ZeroExtendVector

// Returns a 256-bit vector whose lower half is lo and upper half is zero.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
#if HWY_HAS_BUILTIN(__builtin_shufflevector)
 // Concatenate lo (u32 elements 0..3) with a zero vector (elements 4..7).
 typedef uint32_t U32RawVectType __attribute__((__vector_size__(16)));
 U32RawVectType zero = {0, 0, 0, 0};
 return VFromD<D>{reinterpret_cast<typename detail::Raw256<TFromD<D>>::type>(
     __builtin_shufflevector(reinterpret_cast<U32RawVectType>(lo.raw), zero, 0,
                             1, 2, 3, 4, 5, 6, 7))};
#else
 return Combine(D(), Zero(Half<D>()), lo);
#endif
}

// ------------------------------ ZeroExtendResizeBitCast

namespace detail {

// Zero-extends a 64-bit vector to 256 bits via two ZeroExtendVector steps
// (64 -> 128 -> 256), then bitcasts to the destination lane type.
template <class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
   hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<32> /* to_size_tag */,
   DTo d_to, DFrom d_from, VFromD<DFrom> v) {
 const Twice<decltype(d_from)> dt_from;
 const Twice<decltype(dt_from)> dq_from;
 return BitCast(d_to, ZeroExtendVector(dq_from, ZeroExtendVector(dt_from, v)));
}

}  // namespace detail

// ------------------------------ Combine

// Concatenates two 128-bit vectors: lo becomes the lower half, hi the upper.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
#if HWY_HAS_BUILTIN(__builtin_shufflevector)
 (void)d;
 typedef uint32_t U32RawVectType __attribute__((__vector_size__(16)));
 return VFromD<D>{reinterpret_cast<typename detail::Raw256<TFromD<D>>::type>(
     __builtin_shufflevector(reinterpret_cast<U32RawVectType>(lo.raw),
                             reinterpret_cast<U32RawVectType>(hi.raw), 0, 1, 2,
                             3, 4, 5, 6, 7))};
#else
 // Fallback: write both halves to memory, reload as one 256-bit vector.
 const RebindToUnsigned<decltype(d)> du;
 const Half<decltype(du)> du128;
 alignas(32) __m128i vec_tmp[2];
 __m256i vec_result;
 vec_tmp[0] = BitCast(du128, lo).raw;
 vec_tmp[1] = BitCast(du128, hi).raw;
 CopyBytes<32>(vec_tmp, &vec_result);
 return BitCast(d, VFromD<decltype(du)>{vec_result});
#endif
}
   2159 
   2160 // ------------------------------ ShiftLeftBytes
   2161 template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
   2162 HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
   2163  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
   2164  if (kBytes == 0) return v;
   2165  const RebindToUnsigned<decltype(d)> du;
   2166  return BitCast(
   2167      d, VFromD<decltype(du)>{__lasx_xvbsll_v(BitCast(du, v).raw, kBytes)});
   2168 }
   2169 
   2170 // ------------------------------ ShiftRightBytes
// Shifts each 128-bit block of v right by kBytes bytes, shifting in zeros.
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  const RebindToUnsigned<decltype(d)> du;  // byte shift is lane-type agnostic
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvbsrl_v(BitCast(du, v).raw, kBytes)});
}
   2179 
   2180 // ------------------------------ CombineShiftRightBytes
// Within each 128-bit block, returns the lower 16 bytes of the concatenation
// {hi, lo} shifted right by kBytes bytes.
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  // The two shifts produce disjoint byte ranges, so Or merges them.
  return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
}
   2185 
   2186 // ------------------------------ Broadcast
   2187 
// Broadcasts lane kLane of each 128-bit block to all lanes of that block.

template <int kLane, class T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec256<T>{__lasx_xvreplve_b(v.raw, kLane)};
}

template <int kLane, typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // for float16_t/bfloat16_t
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvreplve_h(BitCast(du, v).raw, kLane)});
}

template <int kLane, typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<T>{__lasx_xvreplve_w(v.raw, kLane)};
}

template <int kLane, typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec256<T>{__lasx_xvreplve_d(v.raw, kLane)};
}

// Float overloads go through the unsigned domain because xvreplve operates on
// integer vectors.
template <int kLane>
HWY_API Vec256<float> Broadcast(Vec256<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvreplve_w(BitCast(du, v).raw, kLane)});
}

template <int kLane>
HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvreplve_d(BitCast(du, v).raw, kLane)});
}
   2232 
   2233 // ------------------------------ BroadcastBlock
   2234 
   2235 template <int kBlockIdx, class T>
   2236 HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
   2237  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
   2238  const DFromV<decltype(v)> d;
   2239  return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
   2240                          : ConcatUpperUpper(d, v, v);
   2241 }
   2242 
   2243 // ------------------------------ BroadcastLane
   2244 
   2245 namespace detail {
   2246 
// Broadcasts lane 0 of the whole 256-bit vector to all lanes; xvreplve0
// replicates element 0 across both 128-bit blocks in one instruction.
template <class T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  return Vec256<T>{__lasx_xvreplve0_b(v.raw)};
}

template <class T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d,
                 VFromD<decltype(du)>{__lasx_xvreplve0_h(BitCast(du, v).raw)});
}

template <class T, HWY_IF_UI32(T)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  return Vec256<T>{__lasx_xvreplve0_w(v.raw)};
}

template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  return Vec256<T>{__lasx_xvreplve0_d(v.raw)};
}

// Float overloads route through unsigned because xvreplve0 is integer-only.
HWY_INLINE Vec256<float> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                       Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d,
                 VFromD<decltype(du)>{__lasx_xvreplve0_w(BitCast(du, v).raw)});
}

HWY_INLINE Vec256<double> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                        Vec256<double> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d,
                 VFromD<decltype(du)>{__lasx_xvreplve0_d(BitCast(du, v).raw)});
}

// General case (kLaneIdx != 0): first broadcast the 128-bit block containing
// the lane, then broadcast the lane within each block.
template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<kLaneIdx> /* lane_idx_tag */,
                                   Vec256<T> v) {
  constexpr size_t kLanesPerBlock = 16 / sizeof(T);
  constexpr int kBlockIdx = static_cast<int>(kLaneIdx / kLanesPerBlock);
  constexpr int kLaneInBlkIdx =
      static_cast<int>(kLaneIdx) & (kLanesPerBlock - 1);
  return Broadcast<kLaneInBlkIdx>(BroadcastBlock<kBlockIdx>(v));
}
   2299 }  // namespace detail
   2300 
// Broadcasts lane kLaneIdx (of the full 256-bit vector) to all lanes.
// Dispatches on the lane index via SizeTag to the detail overloads above.
template <int kLaneIdx, class T>
HWY_API Vec256<T> BroadcastLane(Vec256<T> v) {
  static_assert(kLaneIdx >= 0, "Invalid lane");
  return detail::BroadcastLane(hwy::SizeTag<static_cast<size_t>(kLaneIdx)>(),
                               v);
}
   2307 
   2308 // ------------------------------ Hard-coded shuffles
   2309 
   2310 // Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
   2311 // least-significant). Shuffle0321 rotates four-lane blocks one lane to the
   2312 // right (the previous least-significant lane is now most-significant =>
   2313 // 47650321). These could also be implemented via CombineShiftRightBytes but
   2314 // the shuffle_abcd notation is more convenient.
   2315 
   2316 // Swap 32-bit halves in 64-bit halves.
// Swaps adjacent 32-bit lanes within each 64-bit pair (imm 0xb1 = lane order
// 2,3,0,1 per 128-bit block).
template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) {
  return Vec256<T>{__lasx_xvshuf4i_w(v.raw, 0xb1)};
}
HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // xvshuf4i_w is integer-only
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0xb1)});
}
   2327 
   2328 // Used by generic_ops-inl.h
   2329 namespace detail {
   2330 
// Two-operand 32-bit shuffles: xvpermi_w selects lanes from both a and b
// according to the immediate, per 128-bit block.

template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> ShuffleTwo2301(const Vec256<T> a, const Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0xb1)});
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> ShuffleTwo1230(const Vec256<T> a, const Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0x6c)});
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> ShuffleTwo3012(const Vec256<T> a, const Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0xc6)});
}
   2352 
   2353 }  // namespace detail
   2354 
   2355 // Swap 64-bit halves
// Swaps the 64-bit halves of each 128-bit block (imm 0x4e = lane order
// 1,0,3,2 for 32-bit lanes, equivalently swapping 64-bit pairs).
HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // xvshuf4i_w is integer-only
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x4e)});
}
// Swaps the two 64-bit lanes of each 128-bit block; same 0x4e shuffle as
// Shuffle1032, just viewed as 64-bit lanes.
HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
  return Vec256<uint64_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x4e)});
}
   2380 
   2381 // Rotate right 32 bits
// Rotates 32-bit lanes right by one within each 128-bit block
// (imm 0x39 = lane order 1,2,3,0).
HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x39)};
}
HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x39)};
}
HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // xvshuf4i_w is integer-only
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x39)});
}
// Rotates 32-bit lanes left by one within each 128-bit block
// (imm 0x93 = lane order 3,0,1,2).
HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x93)};
}
HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x93)};
}
HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x93)});
}
   2407 
   2408 // Reverse
// Reverses the four 32-bit lanes within each 128-bit block
// (imm 0x1b = lane order 3,2,1,0).
HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x1B)};
}
HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x1B)};
}
HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // xvshuf4i_w is integer-only
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x1b)});
}
   2421 
   2422 // ------------------------------ TableLookupLanes
   2423 
   2424 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
// Wrapper for a raw vector of lane indices; the element type T only records
// the lane type the indices refer to.
template <typename T>
struct Indices256 {
  __m256i raw;
};
   2429 
// Wraps a vector of indices for TableLookupLanes. In debug builds, verifies
// all indices are in [0, 2 * Lanes(d)); the factor 2 allows the same indices
// to be used with TwoTablesLookupLanes.
template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
HWY_API Indices256<TFromD<D>> IndicesFromVec(D /* tag */, Vec256<TI> vec) {
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Full256<TI> di;
  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(2 * Lanes(di))))));
#endif
  return Indices256<TFromD<D>>{vec.raw};
}
   2440 
// Loads lane indices from unaligned memory and wraps them for
// TableLookupLanes.
template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
HWY_API Indices256<TFromD<D>> SetTableIndices(D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}
   2446 
// Full-vector lane gather: result[i] = v[idx[i]], with indices allowed to
// cross 128-bit block boundaries.

// 8-bit: xvshuf_b only selects within each 128-bit block pair, so first
// replicate the low block into `a` and the high block into `b`; the index's
// block-select bit then picks the correct source.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  const DFromV<decltype(v)> d;
  const auto a = ConcatLowerLower(d, v, v);
  const auto b = ConcatUpperUpper(d, v, v);
  return Vec256<T>{__lasx_xvshuf_b(b.raw, a.raw, idx.raw)};
}

// 16-bit: same block-replication trick as the 8-bit case.
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  const auto a = ConcatLowerLower(d, v, v);
  const auto b = ConcatUpperUpper(d, v, v);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvshuf_h(
                        idx.raw, BitCast(du, b).raw, BitCast(du, a).raw)});
}

// 32-bit: xvperm_w gathers across the whole 256-bit vector directly.
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Vec256<int32_t>{__lasx_xvperm_w(BitCast(di, v).raw, idx.raw)});
}
   2472 
   2473 template <typename T, HWY_IF_T_SIZE(T, 8)>
   2474 HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
   2475  using TI = MakeSigned<T>;
   2476  const DFromV<decltype(v)> d;
   2477  const RebindToSigned<decltype(d)> di64;
   2478  const Repartition<int32_t, decltype(d)> di32;
   2479  // Replicate 64-bit index into upper 32 bits
   2480  const Vec256<TI> dup{__lasx_xvpackev_w(idx.raw, idx.raw)};
   2481  // For each idx64 i, idx32 are 2*i and 2*i+1.
   2482  const Vec256<TI> idx32 = dup + dup + Set(di64, int64_t(1) << 32);
   2483  return BitCast(
   2484      d, TableLookupLanes(BitCast(di32, v), Indices256<int32_t>{idx32.raw}));
   2485 }
   2486 
   2487 template <typename T, HWY_IF_T_SIZE(T, 1)>
   2488 HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
   2489                                       Indices256<T> idx) {
   2490  const auto idx2 = Indices256<T>{__lasx_xvandi_b(idx.raw, 31)};
   2491  const Vec256<T> idx_vec{idx.raw};
   2492  const auto sel_hi_mask = ShiftLeft<2>(idx_vec);
   2493  const auto mask0or1 = __lasx_xvslti_b(sel_hi_mask.raw, 0);
   2494  const auto lo_lookup_result = TableLookupLanes(a, idx);
   2495  const auto hi_lookup_result = TableLookupLanes(b, idx2);
   2496  return IfThenElse(Mask256<T>{mask0or1}, hi_lookup_result, lo_lookup_result);
   2497 }
   2498 
   2499 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
   2500 HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
   2501                                       Indices256<T> idx) {
   2502  const DFromV<decltype(a)> d;
   2503  const RebindToSigned<decltype(d)> di;
   2504  const Vec256<TFromD<decltype(di)>> idx_vec{idx.raw};
   2505  constexpr int shift_count = 8 * sizeof(T) - 6 + CeilLog2(sizeof(T));
   2506  const auto sel_hi_mask = BitCast(di, ShiftLeft<shift_count>(idx_vec));
   2507  const auto lo_lookup_result = BitCast(di, TableLookupLanes(a, idx));
   2508  const auto hi_lookup_result = BitCast(di, TableLookupLanes(b, idx));
   2509  return BitCast(
   2510      d, IfNegativeThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
   2511 }
   2512 
   2513 // ------------------------------ SwapAdjacentBlocks
   2514 
// Swaps the two 128-bit blocks of v (imm 0x01 selects source lane 1 for the
// low block and lane 0 for the high block).
template <typename T>
HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // xvpermi_q is integer-only
  return BitCast(d, Vec256<uint8_t>{__lasx_xvpermi_q(
                        BitCast(du, v).raw, BitCast(du, v).raw, 0x01)});
}
   2522 
   2523 // ------------------------------ InterleaveEvenBlocks (ConcatLowerLower)
// Returns {b.lo, a.lo}: the even (lower) blocks of a and b, a first.
template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
  return ConcatLowerLower(d, b, a);
}

// ------------------------------ InterleaveOddBlocks (ConcatUpperUpper)
// Returns {b.hi, a.hi}: the odd (upper) blocks of a and b, a first.
template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
HWY_API V InterleaveOddBlocks(D d, V a, V b) {
  return ConcatUpperUpper(d, b, a);
}
   2534 
   2535 // ------------------------------ Reverse (RotateRight)
   2536 
// Reverses the order of all lanes in the 256-bit vector.

// 32-bit lanes: full-vector gather with a precomputed descending index table.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

// 64-bit lanes: xvpermi_d with imm 0x1b yields lane order 3,2,1,0.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvpermi_d(BitCast(du, v).raw, 0x1b)});
}

// 16-bit lanes: descending index table.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  alignas(32) static constexpr int16_t kReverse[16] = {
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

// 8-bit lanes: descending index table.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  alignas(32) static constexpr TFromD<D> kReverse[32] = {
      31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
      15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}
   2564 
   2565 // ------------------------------ Reverse4 (SwapAdjacentBlocks)
   2566 
// Reverses each group of four 16-bit lanes (imm 0x1b = order 3,2,1,0 within
// every four lanes).
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_h(BitCast(du, v).raw, 0x1b)});
}
   2573 
   2574 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
   2575 HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
   2576  const RebindToUnsigned<D> du;
   2577  return BitCast(
   2578      D(), VFromD<decltype(du)>{__lasx_xvpermi_d(BitCast(du, v).raw, 0x1b)});
   2579 }
   2580 
   2581 // ------------------------------ Reverse8
   2582 
// Reverses each group of eight lanes.

// 16-bit lanes: a byte shuffle whose 16-bit constants encode, per output
// lane, the two source byte positions of the mirrored 16-bit lane.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
  return BitCast(d, TableLookupBytes(v, shuffle));
}

// 32-bit lanes: eight lanes fill the whole vector, so this equals Reverse.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  return Reverse(d, v);
}

// 64-bit lanes: only four lanes exist, so groups of eight are impossible.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D> /* v */) {
  HWY_ASSERT(0);
}
   2600 
   2601 // ------------------------------ InterleaveLower
   2602 
   2603 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
   2604 // the least-significant lane) and "b". To concatenate two half-width integers
   2605 // into one, use ZipLower/Upper instead (also works with scalar).
   2606 
// Interleaves lanes from the lower halves of each 128-bit block of a and b;
// a provides the even (less significant) output lanes.

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvilvl_b(b.raw, a.raw)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;  // for float16_t
  return BitCast(d,
                 VU{__lasx_xvilvl_h(BitCast(du, b).raw, BitCast(du, a).raw)});
}
template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvilvl_w(b.raw, a.raw)};
}
template <typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvilvl_d(b.raw, a.raw)};
}

// Float overloads go through unsigned because xvilvl is integer-only.
HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
  const Full256<uint32_t> du;
  const Full256<float> df;
  return BitCast(df, Vec256<uint32_t>{__lasx_xvilvl_w(BitCast(du, b).raw,
                                                      BitCast(du, a).raw)});
}
HWY_API Vec256<double> InterleaveLower(Vec256<double> a, Vec256<double> b) {
  const Full256<uint64_t> du;
  const Full256<double> df;
  return BitCast(df, Vec256<uint64_t>{__lasx_xvilvl_d(BitCast(du, b).raw,
                                                      BitCast(du, a).raw)});
}
   2640 
   2641 // ------------------------------ InterleaveUpper
   2642 
// Interleaves lanes from the upper halves of each 128-bit block of a and b;
// a provides the even (less significant) output lanes.

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvilvh_b(b.raw, a.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;  // for float16_t
  return BitCast(d,
                 VU{__lasx_xvilvh_h(BitCast(du, b).raw, BitCast(du, a).raw)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvilvh_w(b.raw, a.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvilvh_d(b.raw, a.raw)};
}

// Float overloads go through unsigned because xvilvh is integer-only.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<D> du;
  return BitCast(D(), VFromD<decltype(du)>{__lasx_xvilvh_w(
                          BitCast(du, b).raw, BitCast(du, a).raw)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<D> du;
  return BitCast(D(), VFromD<decltype(du)>{__lasx_xvilvh_d(
                          BitCast(du, b).raw, BitCast(du, a).raw)});
}
   2679 
   2680 // ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
   2681 
   2682 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
// Block concatenations via xvpermi_q: the immediate's low nibble selects the
// result's low block and the high nibble its high block, from the four blocks
// {lo.lo, lo.hi, hi.lo, hi.hi} numbered 0-3.

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x20)});
}

// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x21)});
}

// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x30)});
}

// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x31)});
}
   2713 
   2714 // ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower)
   2715 template <int kBlockIdx, class T>
   2716 HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
   2717  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
   2718 
   2719  const DFromV<decltype(v)> d;
   2720  const auto vec_to_insert = ResizeBitCast(d, blk_to_insert);
   2721  return (kBlockIdx == 0) ? ConcatUpperLower(d, v, vec_to_insert)
   2722                          : ConcatLowerLower(d, vec_to_insert, v);
   2723 }
   2724 
   2725 // ------------------------------ ConcatOdd
   2726 
// Concatenates the odd lanes of lo (lower half of result) and hi (upper
// half). xvpickod gathers odd lanes per 128-bit block, yielding the block
// order {lo0, hi0, lo1, hi1}; xvpermi_d with 0xd8 (64-bit lane order
// 0,2,1,3) restores ascending order {lo, hi}.

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i od = __lasx_xvpickod_b(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  __m256i od = __lasx_xvpickod_h(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i od = __lasx_xvpickod_w(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // xvpickod is integer-only
  __m256i od = __lasx_xvpickod_w(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i od = __lasx_xvpickod_d(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
}
   2758 
   2759 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
   2760 HWY_API Vec256<double> ConcatOdd(D d, Vec256<double> hi, Vec256<double> lo) {
   2761  const RebindToUnsigned<decltype(d)> du;
   2762  __m256i od = __lasx_xvpickod_d(BitCast(du, hi).raw, BitCast(du, lo).raw);
   2763  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
   2764 }
   2765 
   2766 // ------------------------------ ConcatEven
   2767 
// Concatenates the even lanes of lo (lower half of result) and hi (upper
// half). Same structure as ConcatOdd, using xvpickev for even lanes.

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i ev = __lasx_xvpickev_b(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  __m256i ev = __lasx_xvpickev_h(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i ev = __lasx_xvpickev_w(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // xvpickev is integer-only
  __m256i ev = __lasx_xvpickev_w(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i ev = __lasx_xvpickev_d(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
}
   2799 
   2800 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
   2801 HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
   2802  const RebindToUnsigned<decltype(d)> du;
   2803  __m256i ev = __lasx_xvpickev_d(BitCast(du, hi).raw, BitCast(du, lo).raw);
   2804  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
   2805 }
   2806 
   2807 // ------------------------------ InterleaveWholeLower
   2808 
// Interleaves the lower halves of the whole vectors a and b (not per-block):
// the per-block interleaves produce all pairs, and taking the lower blocks of
// both results yields the interleave of a.lo and b.lo.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
}

// ------------------------------ InterleaveWholeUpper

// Interleaves the upper halves of the whole vectors a and b; analogous to
// InterleaveWholeLower but keeps the upper blocks.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
}
   2820 
   2821 // ------------------------------ DupEven (InterleaveLower)
   2822 
// Duplicates each even-indexed lane into the following odd lane; xvpackev
// with both operands equal to v copies even lanes into both positions.

template <typename T, HWY_IF_UI8(T)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackev_b(v.raw, v.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackev_h(v.raw, v.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackev_w(v.raw, v.raw)};
}

HWY_API Vec256<float> DupEven(Vec256<float> v) {
  const Full256<uint32_t> du;  // xvpackev is integer-only
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec256<uint32_t>{__lasx_xvpackev_w(BitCast(du, v).raw,
                                                       BitCast(du, v).raw)});
}

// 64-bit lanes: InterleaveLower(v, v) duplicates lane 0 of each block.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> DupEven(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}
   2850 
   2851 // ------------------------------ DupOdd (InterleaveUpper)
   2852 
// Duplicates each odd-indexed lane into the preceding even lane; xvpackod
// with both operands equal to v copies odd lanes into both positions.

template <typename T, HWY_IF_UI8(T)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackod_b(v.raw, v.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackod_h(v.raw, v.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackod_w(v.raw, v.raw)};
}

HWY_API Vec256<float> DupOdd(Vec256<float> v) {
  const Full256<uint32_t> du;  // xvpackod is integer-only
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec256<uint32_t>{__lasx_xvpackod_w(BitCast(du, v).raw,
                                                       BitCast(du, v).raw)});
}

// 64-bit lanes: InterleaveUpper(v, v) duplicates lane 1 of each block.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return InterleaveUpper(d, v, v);
}
   2880 
   2881 // ------------------------------ OddEven
   2882 
// Returns a vector whose odd-indexed lanes come from `a` and whose
// even-indexed lanes come from `b`.

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  // c holds a's odd lanes in its even positions; xvpackev then places b's
  // even lanes at even result positions and c's (= a's odd) at odd ones.
  __m256i c = __lasx_xvpackod_b(a.raw, a.raw);
  return Vec256<T>{__lasx_xvpackev_b(c, b.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  __m256i c = __lasx_xvpackod_h(a.raw, a.raw);
  return Vec256<T>{__lasx_xvpackev_h(c, b.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  __m256i c = __lasx_xvpackod_w(a.raw, a.raw);
  return Vec256<T>{__lasx_xvpackev_w(c, b.raw)};
}

template <typename T, HWY_IF_UI64(T)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  // Per 128-bit block: insert a's upper 64-bit lane into b (imm 0x11).
  return Vec256<T>{__lasx_xvextrins_d(b.raw, a.raw, 0x11)};
}

HWY_API Vec256<float> OddEven(Vec256<float> a, Vec256<float> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  __m256i c = __lasx_xvpackod_w(BitCast(du, a).raw, BitCast(du, a).raw);
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvpackev_w(c, BitCast(du, b).raw)});
}

HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvextrins_d(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0x11)});
}
   2920 
   2921 // -------------------------- InterleaveEven
// Interleaves the even lanes of a and b: {a[0], b[0], a[2], b[2], ...}.
// Note the swapped intrinsic operand order: the first xvpackev operand
// supplies the odd result lanes.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackev_b(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackev_h(b.raw, a.raw)};
}
// 32-bit lanes may be float: go through the unsigned type for the intrinsic.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpackev_w(
                        BitCast(du, b).raw, BitCast(du, a).raw)});
}
// 64-bit lanes: one pair per block, identical to InterleaveLower.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}
   2940 
   2941 // -------------------------- InterleaveOdd
// Interleaves the odd lanes of a and b: {a[1], b[1], a[3], b[3], ...}.
// As with InterleaveEven, the first xvpackod operand supplies the odd
// result lanes, hence (b, a).
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackod_b(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackod_h(b.raw, a.raw)};
}
// 32-bit lanes may be float: go through the unsigned type for the intrinsic.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpackod_w(
                        BitCast(du, b).raw, BitCast(du, a).raw)});
}
// 64-bit lanes: one pair per block, identical to InterleaveUpper.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  return InterleaveUpper(d, a, b);
}
   2960 
   2961 // ------------------------------ OddEvenBlocks
   2962 
   2963 template <typename T>
   2964 Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
   2965  const DFromV<decltype(odd)> d;
   2966  const RebindToUnsigned<decltype(d)> du;
   2967  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
   2968                        BitCast(du, odd).raw, BitCast(du, even).raw, 0x30)});
   2969 }
   2970 
   2971 // ------------------------------ ReverseBlocks (SwapAdjacentBlocks)
   2972 
// Reverses the order of the 128-bit blocks. With exactly two blocks this is
// the same as swapping adjacent blocks.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {
  return SwapAdjacentBlocks(v);
}
   2977 
   2978 // ------------------------------ TableLookupBytes (ZeroExtendVector)
   2979 
   2980 // Both full
// Both full: per-128-bit-block byte table lookup. xvshuf_b selects from the
// concatenation of its first two operands; passing `bytes` twice makes both
// halves the same table, so indices behave as for a single 16-byte table
// per block.
template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
  const DFromV<decltype(from)> d;
  return BitCast(d, Vec256<uint8_t>{__lasx_xvshuf_b(
                        BitCast(Full256<uint8_t>(), bytes).raw,
                        BitCast(Full256<uint8_t>(), bytes).raw,
                        BitCast(Full256<uint8_t>(), from).raw)});
}

// Partial index vector: widen the indices to 256 bits, look up, then
// truncate the result back to the caller's width.
template <typename T, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes, Vec128<TI, NI> from) {
  const Full256<TI> di;
  const Half<decltype(di)> dih;
  // First expand to full 128, then 256.
  const auto from_256 = ZeroExtendVector(di, Vec128<TI>{from.raw});
  const auto tbl_full = TableLookupBytes(bytes, from_256);
  // Shrink to 128, then partial.
  return Vec128<TI, NI>{LowerHalf(dih, tbl_full).raw};
}

// Partial table vector: zero-extend the table to 256 bits and reuse the
// full-width overload.
template <typename T, size_t N, typename TI>
HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, Vec256<TI> from) {
  const Full256<T> d;
  // First expand to full 128, then 256.
  const auto bytes_256 = ZeroExtendVector(d, Vec128<T>{bytes.raw});
  return TableLookupBytes(bytes_256, from);
}
   3010 
   3011 // ------------------------------ Per4LaneBlockShuffle
   3012 
namespace detail {

// Builds a vector whose two 128-bit blocks each contain {x0, x1, x2, x3},
// for use as per-block shuffle indices.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  alignas(32) uint32_t rawU32[8] = {x0, x1, x2, x3, x0, x1, x2, x3};
  return BitCast(d, Vec256<uint32_t>{__lasx_xvld(rawU32, 0)});
}

// 32-bit lanes, integer: kIdx3210 packs four 2-bit lane indices
// (idx3..idx0); shuffle each 128-bit block by those indices.
template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<4> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  V idx =
      Per4LaneBlkShufDupSet4xU32(d, (kIdx3210 >> 6) & 3, (kIdx3210 >> 4) & 3,
                                 (kIdx3210 >> 2) & 3, kIdx3210 & 3);
  return V{__lasx_xvshuf_w(idx.raw, v.raw, v.raw)};
}

// 32-bit lanes, float: same as above but via the unsigned type, since the
// shuffle intrinsic takes integer vectors.
template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<4> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto idx =
      Per4LaneBlkShufDupSet4xU32(du, (kIdx3210 >> 6) & 3, (kIdx3210 >> 4) & 3,
                                 (kIdx3210 >> 2) & 3, kIdx3210 & 3);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvshuf_w(
                        idx.raw, BitCast(du, v).raw, BitCast(du, v).raw)});
}

// 64-bit lanes, idx pattern 0x44 ({1,0,1,0}): both halves become the lower
// block, i.e. ConcatLowerLower(v, v).
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/,
                                  hwy::SizeTag<8> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatLowerLower(d, v, v);
}

// 64-bit lanes, idx pattern 0xEE ({3,2,3,2}): both halves become the upper
// block, i.e. ConcatUpperUpper(v, v).
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/,
                                  hwy::SizeTag<8> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatUpperUpper(d, v, v);
}

// 64-bit lanes, generic pattern: xvpermi_d permutes the four 64-bit lanes
// of the whole vector by the 8-bit immediate.
template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<8> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;

  const VU vu = BitCast(du, v);
  return BitCast(
      d, VU{__lasx_xvpermi_d(vu.raw, static_cast<int>(kIdx3210 & 0xFF))});
}

}  // namespace detail
   3078 
   3079 // ------------------------------ SlideUpLanes
   3080 
namespace detail {

// Slides lanes up by the runtime amount `amt` via a whole-vector lane
// gather. Lanes below `amt` are zeroed.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> TableLookupSlideUpLanes(D d, VFromD<D> v, size_t amt) {
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;
  // idx[i] = i - amt (mod 2^bits). For i < amt the subtraction wraps, so
  // idx differs from masked_idx and the lane is zeroed by IfThenElseZero.
  const auto idx = Iota(du, static_cast<TU>(size_t{0} - amt));
  const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
  return BitCast(
      d, IfThenElseZero(
             idx == masked_idx,
             TableLookupLanes(BitCast(du, v), IndicesFromVec(du, masked_idx))));
}

}  // namespace detail
   3096 
   3097 template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
   3098 HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
   3099  static_assert(0 <= kBlocks && kBlocks <= 1,
   3100                "kBlocks must be between 0 and 1");
   3101  return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v;
   3102 }
   3103 
// Shifts v up by a runtime number of lanes `amt`, shifting in zeros.
// When the compiler can prove `amt` constant, byte-shift fast paths are
// used; otherwise falls back to a lane gather.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  if (__builtin_constant_p(amt)) {
    // v_lo supplies the bytes that cross from the lower into the upper
    // block; its lower block is zero so shifted-in bytes are zero.
    const auto v_lo = ConcatLowerLower(d, v, Zero(d));
    switch (amt * sizeof(TFromD<D>)) {
      case 0:
        return v;
      case 1:
        return CombineShiftRightBytes<15>(d, v, v_lo);
      case 2:
        return CombineShiftRightBytes<14>(d, v, v_lo);
      case 3:
        return CombineShiftRightBytes<13>(d, v, v_lo);
      case 4:
        return CombineShiftRightBytes<12>(d, v, v_lo);
      case 5:
        return CombineShiftRightBytes<11>(d, v, v_lo);
      case 6:
        return CombineShiftRightBytes<10>(d, v, v_lo);
      case 7:
        return CombineShiftRightBytes<9>(d, v, v_lo);
      case 8:
        return CombineShiftRightBytes<8>(d, v, v_lo);
      case 9:
        return CombineShiftRightBytes<7>(d, v, v_lo);
      case 10:
        return CombineShiftRightBytes<6>(d, v, v_lo);
      case 11:
        return CombineShiftRightBytes<5>(d, v, v_lo);
      case 12:
        return CombineShiftRightBytes<4>(d, v, v_lo);
      case 13:
        return CombineShiftRightBytes<3>(d, v, v_lo);
      case 14:
        return CombineShiftRightBytes<2>(d, v, v_lo);
      case 15:
        return CombineShiftRightBytes<1>(d, v, v_lo);
    }
  }

  // Shifts of at least one whole block: slide the lower half and place it
  // in the upper block, zeroing the lower block.
  if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
    const Half<decltype(d)> dh;
    return Combine(d, SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock),
                   Zero(dh));
  }
#endif

  return detail::TableLookupSlideUpLanes(d, v, amt);
}
   3155 
   3156 // ------------------------------ Slide1Up
   3157 
   3158 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
   3159 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
   3160  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
   3161  return CombineShiftRightBytes<15>(d, v, v_lo);
   3162 }
   3163 
   3164 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
   3165 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
   3166  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
   3167  return CombineShiftRightBytes<14>(d, v, v_lo);
   3168 }
   3169 
   3170 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
   3171 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
   3172  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
   3173  return CombineShiftRightBytes<12>(d, v, v_lo);
   3174 }
   3175 
   3176 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
   3177 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
   3178  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
   3179  return CombineShiftRightBytes<8>(d, v, v_lo);
   3180 }
   3181 
   3182 // ------------------------------ SlideDownLanes
   3183 
namespace detail {

// Slides lanes down by the runtime amount `amt` via a whole-vector lane
// gather. Lanes shifted in from beyond the end are zeroed.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> TableLookupSlideDownLanes(D d, VFromD<D> v, size_t amt) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  // idx[i] = i + amt. Indices past the lane count differ from masked_idx
  // (which wraps), so those lanes are zeroed by IfThenElseZero.
  const auto idx = Iota(du, static_cast<TU>(amt));
  const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
  return IfThenElseZero(RebindMask(d, idx == masked_idx),
                        TableLookupLanes(v, IndicesFromVec(d, masked_idx)));
}

}  // namespace detail
   3197 
   3198 template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
   3199 HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
   3200  static_assert(0 <= kBlocks && kBlocks <= 1,
   3201                "kBlocks must be between 0 and 1");
   3202  const Half<decltype(d)> dh;
   3203  return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v;
   3204 }
   3205 
// Shifts v down by a runtime number of lanes `amt`, shifting in zeros.
// When the compiler can prove `amt` constant, byte-shift fast paths are
// used; otherwise falls back to a lane gather.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  const Half<decltype(d)> dh;
  if (__builtin_constant_p(amt)) {
    // v_hi supplies the bytes that cross from the upper into the lower
    // block; its upper block is zero so shifted-in bytes are zero.
    const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
    switch (amt * sizeof(TFromD<D>)) {
      case 0:
        return v;
      case 1:
        return CombineShiftRightBytes<1>(d, v_hi, v);
      case 2:
        return CombineShiftRightBytes<2>(d, v_hi, v);
      case 3:
        return CombineShiftRightBytes<3>(d, v_hi, v);
      case 4:
        return CombineShiftRightBytes<4>(d, v_hi, v);
      case 5:
        return CombineShiftRightBytes<5>(d, v_hi, v);
      case 6:
        return CombineShiftRightBytes<6>(d, v_hi, v);
      case 7:
        return CombineShiftRightBytes<7>(d, v_hi, v);
      case 8:
        return CombineShiftRightBytes<8>(d, v_hi, v);
      case 9:
        return CombineShiftRightBytes<9>(d, v_hi, v);
      case 10:
        return CombineShiftRightBytes<10>(d, v_hi, v);
      case 11:
        return CombineShiftRightBytes<11>(d, v_hi, v);
      case 12:
        return CombineShiftRightBytes<12>(d, v_hi, v);
      case 13:
        return CombineShiftRightBytes<13>(d, v_hi, v);
      case 14:
        return CombineShiftRightBytes<14>(d, v_hi, v);
      case 15:
        return CombineShiftRightBytes<15>(d, v_hi, v);
    }
  }

  // Shifts of at least one whole block: slide the upper half down and
  // zero-extend it back to the full width.
  if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
    return ZeroExtendVector(
        d, SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock));
  }
#endif

  return detail::TableLookupSlideDownLanes(d, v, amt);
}
   3257 
   3258 // ------------------------------ Slide1Down
   3259 
   3260 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
   3261 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
   3262  const Half<decltype(d)> dh;
   3263  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
   3264  return CombineShiftRightBytes<1>(d, v_hi, v);
   3265 }
   3266 
   3267 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
   3268 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
   3269  const Half<decltype(d)> dh;
   3270  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
   3271  return CombineShiftRightBytes<2>(d, v_hi, v);
   3272 }
   3273 
   3274 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
   3275 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
   3276  const Half<decltype(d)> dh;
   3277  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
   3278  return CombineShiftRightBytes<4>(d, v_hi, v);
   3279 }
   3280 
   3281 template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
   3282 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
   3283  const Half<decltype(d)> dh;
   3284  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
   3285  return CombineShiftRightBytes<8>(d, v_hi, v);
   3286 }
   3287 
   3288 // ------------------------------ Shl (Mul, ZipLower)
namespace detail {

// Per-lane variable left shift, one overload per unsigned lane width.

HWY_INLINE Vec256<uint8_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint8_t> v,
                               Vec256<uint8_t> bits) {
  return Vec256<uint8_t>{__lasx_xvsll_b(v.raw, bits.raw)};
}

HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
                                Vec256<uint16_t> bits) {
  return Vec256<uint16_t>{__lasx_xvsll_h(v.raw, bits.raw)};
}

HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v,
                                Vec256<uint32_t> bits) {
  return Vec256<uint32_t>{__lasx_xvsll_w(v.raw, bits.raw)};
}

HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v,
                                Vec256<uint64_t> bits) {
  return Vec256<uint64_t>{__lasx_xvsll_d(v.raw, bits.raw)};
}

// Signed lanes: delegate to the unsigned overloads via BitCast.
template <typename T>
HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) {
  // Signed left shifts are the same as unsigned.
  const Full256<T> di;
  const Full256<MakeUnsigned<T>> du;
  return BitCast(di,
                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
}

}  // namespace detail
   3321 
// Per-lane variable left shift; dispatches on signedness via TypeTag.
template <typename T>
HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
}
   3326 
   3327 // ------------------------------ Shr (MulHigh, IfThenElse, Not)
   3328 
// Per-lane variable right shift: logical (xvsrl_*) for unsigned lanes,
// arithmetic (xvsra_*, replicating the sign bit) for signed lanes.

HWY_API Vec256<uint8_t> operator>>(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
  return Vec256<uint8_t>{__lasx_xvsrl_b(v.raw, bits.raw)};
}

HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
  return Vec256<uint16_t>{__lasx_xvsrl_h(v.raw, bits.raw)};
}

HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) {
  return Vec256<uint32_t>{__lasx_xvsrl_w(v.raw, bits.raw)};
}

HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) {
  return Vec256<uint64_t>{__lasx_xvsrl_d(v.raw, bits.raw)};
}

HWY_API Vec256<int8_t> operator>>(Vec256<int8_t> v, Vec256<int8_t> bits) {
  return Vec256<int8_t>{__lasx_xvsra_b(v.raw, bits.raw)};
}

HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
  return Vec256<int16_t>{__lasx_xvsra_h(v.raw, bits.raw)};
}

HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) {
  return Vec256<int32_t>{__lasx_xvsra_w(v.raw, bits.raw)};
}

HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
  return Vec256<int64_t>{__lasx_xvsra_d(v.raw, bits.raw)};
}
   3360 
   3361 // ------------------------------ WidenMulPairwiseAdd
   3362 
// Widens each 16-bit lane pair to 32 bits and sums the pairwise products:
// result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]. The even products are computed
// first, then the odd products are accumulated onto them.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
                                      Vec256<int16_t> b) {
  __m256i ev = __lasx_xvmulwev_w_h(b.raw, a.raw);
  return VFromD<D>{__lasx_xvmaddwod_w_h(ev, b.raw, a.raw)};
}

// Unsigned variant of the above.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<uint16_t> a,
                                      Vec256<uint16_t> b) {
  __m256i ev = __lasx_xvmulwev_w_hu(b.raw, a.raw);
  return VFromD<D>{__lasx_xvmaddwod_w_hu(ev, b.raw, a.raw)};
}
   3376 
   3377 // ------------------------------ ReorderWidenMulAccumulate
   3378 
// Accumulates widened pairwise products onto sum0:
// sum0[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1]. A single accumulator suffices
// on LASX (widening multiply-add handles even and odd lanes directly), so
// sum1 is untouched and RearrangeToOddPlusEven below is the identity.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderWidenMulAccumulate(D /*tag*/, Vec256<int16_t> a,
                                            Vec256<int16_t> b,
                                            const VFromD<D> sum0,
                                            VFromD<D>& /*sum1*/) {
  return VFromD<D>{__lasx_xvmaddwev_w_h(
      __lasx_xvmaddwod_w_h(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
}

// Unsigned variant of the above.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderWidenMulAccumulate(D /*tag*/, Vec256<uint16_t> a,
                                            Vec256<uint16_t> b,
                                            const VFromD<D> sum0,
                                            VFromD<D>& /*sum1*/) {
  return VFromD<D>{__lasx_xvmaddwev_w_hu(
      __lasx_xvmaddwod_w_hu(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
}
   3396 
   3397 // ------------------------------ RearrangeToOddPlusEven
// ReorderWidenMulAccumulate above already produces odd+even sums in sum0,
// so no rearrangement is needed.
HWY_API Vec256<int32_t> RearrangeToOddPlusEven(const Vec256<int32_t> sum0,
                                               Vec256<int32_t> /*sum1*/) {
  return sum0;  // invariant already holds
}

HWY_API Vec256<uint32_t> RearrangeToOddPlusEven(const Vec256<uint32_t> sum0,
                                                Vec256<uint32_t> /*sum1*/) {
  return sum0;  // invariant already holds
}
   3407 
   3408 // ================================================== CONVERT
   3409 
   3410 // ------------------------------ Promotions (part w/ narrow lanes -> full)
   3411 
// f16x8 -> f32x8. xvfcvtl_s_h widens the low four f16 of each 128-bit
// block, so the permute (imm 0xd8) first distributes the eight inputs so
// that each block holds the four lanes it will convert.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<hwy::float16_t> v) {
  const Repartition<hwy::float16_t, D> df16;
  const auto from_128 = ZeroExtendVector(df16, v);
  const VFromD<decltype(df16)> f16_concat{__lasx_xvpermi_d(from_128.raw, 0xd8)};
  return VFromD<D>{__lasx_xvfcvtl_s_h(f16_concat.raw)};
}
   3419 
// f32x4 -> f64x4. xvfcvtl_d_s widens the low two floats of each 128-bit
// block; the permute (imm 0xd8) distributes the four inputs accordingly.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<float> v) {
  const Repartition<float, D> df;
  const RebindToSigned<decltype(df)> di;
  const auto from_128 = ZeroExtendVector(df, v);
  const auto f32_concat = BitCast(
      df, Vec256<uint32_t>{__lasx_xvpermi_d(BitCast(di, from_128).raw, 0xd8)});
  return VFromD<D>{__lasx_xvfcvtl_d_s(f32_concat.raw)};
}
   3429 
// f32x4 -> i64x4 with truncation toward zero (xvftintrzl converts the low
// two floats of each 128-bit block); same lane distribution as above.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /*di64*/, Vec128<float> v) {
  const Repartition<float, D> df;
  const RebindToSigned<decltype(df)> di;
  const auto from_128 = ZeroExtendVector(df, v);
  const auto f32_concat = BitCast(
      df, Vec256<uint32_t>{__lasx_xvpermi_d(BitCast(di, from_128).raw, 0xd8)});
  return VFromD<D>{__lasx_xvftintrzl_l_s(f32_concat.raw)};
}
   3439 
   3440 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
   3441 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int32_t> v) {
   3442  alignas(32) __m128i vec_tmp[2];
   3443  __m256i vec_temp;
   3444  vec_tmp[0] = v.raw;
   3445  CopyBytes<32>(vec_tmp, &vec_temp);
   3446  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3447  vec_temp = __lasx_xvsllwil_d_w(vec_temp, 0);
   3448  return VFromD<D>{__lasx_xvffint_d_l(vec_temp)};
   3449 }
   3450 
   3451 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
   3452 HWY_API Vec256<double> PromoteTo(D /* tag */, Vec128<uint32_t> v) {
   3453  alignas(32) __m128i vec_tmp[2];
   3454  __m256i vec_temp;
   3455  vec_tmp[0] = v.raw;
   3456  CopyBytes<32>(vec_tmp, &vec_temp);
   3457  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3458  vec_temp = __lasx_xvsllwil_du_wu(vec_temp, 0);
   3459  return VFromD<D>{__lasx_xvffint_d_lu(vec_temp)};
   3460 }
   3461 
   3462 // Unsigned: zero-extend.
   3463 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
   3464 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint8_t> v) {
   3465  alignas(32) __m128i vec_tmp[2];
   3466  __m256i vec_temp;
   3467  vec_tmp[0] = v.raw;
   3468  CopyBytes<32>(vec_tmp, &vec_temp);
   3469  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3470  return VFromD<D>{__lasx_xvsllwil_hu_bu(vec_temp, 0)};
   3471 }
   3472 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
   3473 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint8_t, 8> v) {
   3474  alignas(32) __m128i vec_tmp[2];
   3475  __m256i vec_temp;
   3476  vec_tmp[0] = v.raw;
   3477  CopyBytes<32>(vec_tmp, &vec_temp);
   3478  vec_temp = __lasx_xvsllwil_hu_bu(vec_temp, 0);
   3479  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3480  return VFromD<D>{__lasx_xvsllwil_wu_hu(vec_temp, 0)};
   3481 }
   3482 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
   3483 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint16_t> v) {
   3484  alignas(32) __m128i vec_tmp[2];
   3485  __m256i vec_temp;
   3486  vec_tmp[0] = v.raw;
   3487  CopyBytes<32>(vec_tmp, &vec_temp);
   3488  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3489  return VFromD<D>{__lasx_xvsllwil_wu_hu(vec_temp, 0)};
   3490 }
   3491 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
   3492 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint32_t> v) {
   3493  alignas(32) __m128i vec_tmp[2];
   3494  __m256i vec_temp;
   3495  vec_tmp[0] = v.raw;
   3496  CopyBytes<32>(vec_tmp, &vec_temp);
   3497  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3498  return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
   3499 }
   3500 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
   3501 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<uint16_t> v) {
   3502  alignas(32) __m128i vec_tmp[2];
   3503  __m256i vec_temp;
   3504  vec_tmp[0] = v.raw;
   3505  CopyBytes<32>(vec_tmp, &vec_temp);
   3506  vec_temp = __lasx_xvsllwil_wu_hu(vec_temp, 0);
   3507  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3508  return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
   3509 }
   3510 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
   3511 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<uint8_t> v) {
   3512  alignas(32) __m128i vec_tmp[2];
   3513  __m256i vec_temp;
   3514  vec_tmp[0] = v.raw;
   3515  CopyBytes<32>(vec_tmp, &vec_temp);
   3516  vec_temp = __lasx_xvsllwil_hu_bu(vec_temp, 0);
   3517  vec_temp = __lasx_xvsllwil_wu_hu(vec_temp, 0);
   3518  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3519  return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
   3520 }
   3521 
   3522 // Signed: replicate sign bit.
   3523 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
   3524 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int8_t> v) {
   3525  alignas(32) __m128i vec_tmp[2];
   3526  __m256i vec_temp;
   3527  vec_tmp[0] = v.raw;
   3528  CopyBytes<32>(vec_tmp, &vec_temp);
   3529  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3530  return VFromD<D>{__lasx_xvsllwil_h_b(vec_temp, 0)};
   3531 }
   3532 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
   3533 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int8_t, 8> v) {
   3534  alignas(32) __m128i vec_tmp[2];
   3535  __m256i vec_temp;
   3536  vec_tmp[0] = v.raw;
   3537  CopyBytes<32>(vec_tmp, &vec_temp);
   3538  vec_temp = __lasx_xvsllwil_h_b(vec_temp, 0);
   3539  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3540  return VFromD<D>{__lasx_xvsllwil_w_h(vec_temp, 0)};
   3541 }
   3542 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
   3543 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int16_t> v) {
   3544  alignas(32) __m128i vec_tmp[2];
   3545  __m256i vec_temp;
   3546  vec_tmp[0] = v.raw;
   3547  CopyBytes<32>(vec_tmp, &vec_temp);
   3548  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3549  return VFromD<D>{__lasx_xvsllwil_w_h(vec_temp, 0)};
   3550 }
   3551 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
   3552 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int32_t> v) {
   3553  alignas(32) __m128i vec_tmp[2];
   3554  __m256i vec_temp;
   3555  vec_tmp[0] = v.raw;
   3556  CopyBytes<32>(vec_tmp, &vec_temp);
   3557  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3558  return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
   3559 }
   3560 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
   3561 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<int16_t> v) {
   3562  alignas(32) __m128i vec_tmp[2];
   3563  __m256i vec_temp;
   3564  vec_tmp[0] = v.raw;
   3565  CopyBytes<32>(vec_tmp, &vec_temp);
   3566  vec_temp = __lasx_xvsllwil_w_h(vec_temp, 0);
   3567  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3568  return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
   3569 }
   3570 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
   3571 HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
   3572  alignas(32) __m128i vec_tmp[2];
   3573  __m256i vec_temp;
   3574  vec_tmp[0] = v.raw;
   3575  CopyBytes<32>(vec_tmp, &vec_temp);
   3576  vec_temp = __lasx_xvsllwil_h_b(vec_temp, 0);
   3577  vec_temp = __lasx_xvsllwil_w_h(vec_temp, 0);
   3578  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
   3579  return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
   3580 }
   3581 
   3582 // ------------------------------ PromoteEvenTo/PromoteOddTo
   3583 namespace detail {
   3584 
   3585 // I32->I64 PromoteEvenTo/PromoteOddTo
   3586 
// Sign-extends the even-indexed i32 lanes to i64.
template <class D, HWY_IF_LANES_D(D, 4)>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::SignedTag /*from_type_tag*/, D d_to,
                                   Vec256<int32_t> v) {
  // Little-endian: each i64 is built from an even i32 lane (low half, taken
  // from v) plus that lane's sign bits (high half, from
  // DupEven(BroadcastSignBit)).
  return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
}
   3594 
// Sign-extends the odd-indexed i32 lanes to i64.
template <class D, HWY_IF_LANES_D(D, 4)>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::SignedTag /*from_type_tag*/, D d_to,
                                  Vec256<int32_t> v) {
  // DupOdd moves each odd lane into the even (low) slot; the matching odd
  // slot receives its sign bits, forming a sign-extended i64.
  return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
}
   3602 
   3603 }  // namespace detail
   3604 
   3605 // ------------------------------ Demotions (full -> part w/ narrow lanes)
   3606 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int16_t> a,
                                   Vec256<int16_t> b) {
  // i16 -> i8 with signed saturation; shift amount 0 makes xvssrani a pure
  // saturating pack. Output is block-interleaved ("Reorder" order);
  // OrderedDemote2To below permutes lanes to linearize it.
  return VFromD<D>{__lasx_xvssrani_b_h(b.raw, a.raw, 0)};
}
   3612 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int16_t> a,
                                   Vec256<int16_t> b) {
  // i16 -> u8 with unsigned saturation (negative inputs clamp to 0).
  return VFromD<D>{__lasx_xvssrani_bu_h(b.raw, a.raw, 0)};
}
   3618 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint16_t> a,
                                   Vec256<uint16_t> b) {
  // u16 -> i8: logical-shift variant (xvssrlni) treats inputs as unsigned,
  // saturating to the i8 range.
  return VFromD<D>{__lasx_xvssrlni_b_h(b.raw, a.raw, 0)};
}
   3624 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint16_t> a,
                                   Vec256<uint16_t> b) {
  // u16 -> u8 with unsigned saturation.
  return VFromD<D>{__lasx_xvssrlni_bu_h(b.raw, a.raw, 0)};
}
   3630 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int32_t> a,
                                   Vec256<int32_t> b) {
  // i32 -> i16 with signed saturation; block-interleaved output order.
  return VFromD<D>{__lasx_xvssrani_h_w(b.raw, a.raw, 0)};
}
   3636 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int32_t> a,
                                   Vec256<int32_t> b) {
  // i32 -> u16 with unsigned saturation (negative inputs clamp to 0).
  return VFromD<D>{__lasx_xvssrani_hu_w(b.raw, a.raw, 0)};
}
   3642 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint32_t> a,
                                   Vec256<uint32_t> b) {
  // u32 -> i16: unsigned-source (logical) pack saturating to the i16 range.
  return VFromD<D>{__lasx_xvssrlni_h_w(b.raw, a.raw, 0)};
}
   3648 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint32_t> a,
                                   Vec256<uint32_t> b) {
  // u32 -> u16 with unsigned saturation.
  return VFromD<D>{__lasx_xvssrlni_hu_w(b.raw, a.raw, 0)};
}
   3654 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int64_t> a,
                                   Vec256<int64_t> b) {
  // i64 -> i32 with signed saturation; block-interleaved output order.
  return VFromD<D>{__lasx_xvssrani_w_d(b.raw, a.raw, 0)};
}
   3660 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int64_t> a,
                                   Vec256<int64_t> b) {
  // i64 -> u32 with unsigned saturation (negative inputs clamp to 0).
  return VFromD<D>{__lasx_xvssrani_wu_d(b.raw, a.raw, 0)};
}
   3666 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint64_t> a,
                                   Vec256<uint64_t> b) {
  // u64 -> i32: unsigned-source (logical) pack saturating to the i32 range.
  return VFromD<D>{__lasx_xvssrlni_w_d(b.raw, a.raw, 0)};
}
   3672 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint64_t> a,
                                   Vec256<uint64_t> b) {
  // u64 -> u32 with unsigned saturation.
  return VFromD<D>{__lasx_xvssrlni_wu_d(b.raw, a.raw, 0)};
}
   3678 
template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
          HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  // ReorderDemote2To leaves lanes block-interleaved; the 0xd8 permute of
  // 64-bit lanes (order 0,2,1,3) restores linear lane order.
  return VFromD<D>{__lasx_xvpermi_d(ReorderDemote2To(d, a, b).raw, 0xd8)};
}
   3686 
template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D),
          HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>))>
HWY_API VFromD<D> DemoteTo(D d, V v) {
  // Single-vector demotion: reuse the two-input OrderedDemote2To with v in
  // both slots and keep only the lower (valid) half.
  return LowerHalf(OrderedDemote2To(Twice<decltype(d)>(), v, v));
}
   3694 
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<float> v) {
  const Full256<int16_t> di;
  // xvfcvt_h_s converts per 128-bit block, leaving the two result quads
  // block-interleaved.
  const Vec256<hwy::float16_t> f16_blocks{__lasx_xvfcvt_h_s(v.raw, v.raw)};
  // Permute 64-bit lanes (0xd8) to concatenate the valid halves, then return
  // the lower 128 bits.
  const auto f16_concat =
      BitCast(Twice<D>(), VFromD<decltype(di)>{__lasx_xvpermi_d(
                              BitCast(di, f16_blocks).raw, 0xd8)});
  return LowerHalf(f16_concat);
}
   3704 
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
  const Full256<int32_t> di;
  // Per-block f64 -> f32 conversion, then the same permute-and-take-lower
  // pattern as the f16 overload above.
  const Vec256<float> f32_blocks{__lasx_xvfcvt_s_d(v.raw, v.raw)};
  const auto f32_concat =
      BitCast(Twice<D>(), VFromD<decltype(di)>{__lasx_xvpermi_d(
                              BitCast(di, f32_blocks).raw, 0xd8)});
  return LowerHalf(f32_concat);
}
   3714 
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> DemoteTo(D dn, Vec256<double> v) {
  // f64 -> i32 with truncation toward zero ("rz" suffix); results are
  // per-block, so permute 64-bit lanes to gather them before taking the
  // lower half.
  const __m256i i32_blocks = __lasx_xvftintrz_w_d(v.raw, v.raw);
  return LowerHalf(dn, VFromD<Twice<D>>{__lasx_xvpermi_d(i32_blocks, 0xd8)});
}
   3720 
// For already range-limited input [0, 255].
HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
  const Full256<uint32_t> d32;
  const Full64<uint8_t> d8;
  // Shuffle control: indices 0x00,0x04,0x08,0x0C pick the low byte of each
  // u32 within a 128-bit block; indices >= 0x10 select from the first
  // (zero) operand of xvshuf_b, zeroing those bytes.
  alignas(32) static constexpr uint32_t k8From32[8] = {
      0x0C080400u, 0x13121110u, 0, 0, 0x13121110u, 0x0C080400u, 0, 0};
  // Place first four bytes in lo[0], remaining 4 in hi[1].
  const auto quad = VFromD<decltype(d32)>{
      __lasx_xvshuf_b(Zero(d32).raw, v.raw, Load(d32, k8From32).raw)};
  // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
  const auto lo = LowerHalf(quad);
  const auto hi = UpperHalf(Half<decltype(d32)>(), quad);
  return BitCast(d8, LowerHalf(lo | hi));
}
   3735 
   3736 // ------------------------------ Truncations
   3737 
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  const Full256<uint8_t> d8;
  // Gather the lowest byte of each u64 (byte offsets 0, 8, 16, 24) into the
  // first four lanes; remaining kMap entries are zero and unused.
  alignas(32) static constexpr uint8_t kMap[32] = {0, 8, 16, 24};
  const auto i8 = TableLookupLanes(BitCast(d8, v), SetTableIndices(d8, kMap));
  // Only the lowest 4 bytes are meaningful.
  return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{i8.raw})));
}
   3745 
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  // Take the even 32-bit words per block, permute 64-bit lanes to linearize,
  // then take the even 16-bit halves; the low 64 bits hold the result.
  const __m256i i32_blocks = __lasx_xvpickev_w(v.raw, v.raw);
  const __m256i i32_concat = __lasx_xvpermi_d(i32_blocks, 0xd8);
  const __m256i i16 = __lasx_xvpickev_h(i32_concat, i32_concat);
  return LowerHalf(LowerHalf(Vec256<uint16_t>{i16}));
}
   3753 
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  const Full256<uint32_t> d32;
  // Select the low u32 of each u64 (even 32-bit lanes) across the whole
  // vector; the table repeats so only the lower half is meaningful.
  alignas(32) static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
  const auto v32 =
      TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
  return LowerHalf(Vec256<uint32_t>{v32.raw});
}
   3762 
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  const Full256<uint8_t> d8;
  // Lowest byte of each u32: byte offsets 0,4,...,28. Remaining entries are
  // zero and their output lanes are discarded below.
  alignas(32) static constexpr uint8_t kEven[32] = {0,  4,  8,  12,
                                                    16, 20, 24, 28};
  const auto i8 = TableLookupLanes(BitCast(d8, v), SetTableIndices(d8, kEven));
  return LowerHalf(LowerHalf(Vec256<uint8_t>{i8.raw}));
}
   3771 
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  // Even 16-bit halves per block, then permute 64-bit lanes to concatenate
  // the two valid halves into the lower 128 bits.
  const __m256i i16_blocks = __lasx_xvpickev_h(v.raw, v.raw);
  const __m256i i16_concat = __lasx_xvpermi_d(i16_blocks, 0xd8);
  return LowerHalf(Vec256<uint16_t>{i16_concat});
}
   3778 
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint16_t> v) {
  // Even bytes per block, then permute to concatenate into the lower half.
  const __m256i i8_blocks = __lasx_xvpickev_b(v.raw, v.raw);
  const __m256i i8_concat = __lasx_xvpermi_d(i8_blocks, 0xd8);
  return LowerHalf(Vec256<uint8_t>{i8_concat});
}
   3785 
   3786 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
   3787 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, Vec256<int32_t> v) {
  // i32 -> f32 lanewise conversion.
  return VFromD<D>{__lasx_xvffint_s_w(v.raw)};
}
   3792 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /*df*/, Vec256<uint32_t> v) {
  // u32 -> f32 lanewise conversion.
  return VFromD<D>{__lasx_xvffint_s_wu(v.raw)};
}
   3797 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<int64_t> v) {
  // i64 -> f64 lanewise conversion.
  return VFromD<D>{__lasx_xvffint_d_l(v.raw)};
}
   3802 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<uint64_t> v) {
  // u64 -> f64 lanewise conversion.
  return VFromD<D>{__lasx_xvffint_d_lu(v.raw)};
}
   3807 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D /*d*/, Vec256<float> v) {
  // f32 -> i32, rounding toward zero (truncation; "rz" suffix).
  return VFromD<D>{__lasx_xvftintrz_w_s(v.raw)};
}
   3812 
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
HWY_API VFromD<D> ConvertTo(D /*di*/, Vec256<double> v) {
  // f64 -> i64, rounding toward zero (truncation).
  return VFromD<D>{__lasx_xvftintrz_l_d(v.raw)};
}
   3817 
template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  // f32 -> u32, rounding toward zero (truncation).
  return VFromD<DU>{__lasx_xvftintrz_wu_s(v.raw)};
}
   3822 
template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  // f64 -> u64, rounding toward zero (truncation).
  return VFromD<DU>{__lasx_xvftintrz_lu_d(v.raw)};
}
   3827 
   3828 template <typename T, HWY_IF_FLOAT3264(T)>
   3829 HWY_API Vec256<MakeSigned<T>> NearestInt(const Vec256<T> v) {
   3830  return ConvertTo(Full256<MakeSigned<T>>(), Round(v));
   3831 }
   3832 
   3833 // ------------------------------ LoadMaskBits (TestBit)
   3834 
   3835 namespace detail {
   3836 
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint32_t, decltype(d)> du32;
  // Broadcast the 32 mask bits to every u32 lane.
  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  const Repartition<uint64_t, decltype(d)> du64;
  alignas(32) static constexpr uint64_t kRep8[4] = {
      0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
      0x0303030303030303ull};
  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));

  // Bit n of each replicated byte selects lane 8*k+n.
  const VFromD<decltype(du)> bit = Dup128VecFromValues(
      du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
  return RebindMask(d, TestBit(rep8, bit));
}
   3855 
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  // One unique bit per 16-bit lane; TestBit against the broadcast mask.
  alignas(32) static constexpr uint16_t kBit[16] = {
      1,     2,     4,     8,     16,     32,     64,     128,
      0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}
   3866 
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  // One unique bit per 32-bit lane (8 lanes).
  alignas(32) static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}
   3875 
   3876 template <typename T, HWY_IF_T_SIZE(T, 8)>
   3877 HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
   3878  const Full256<T> d;
   3879  const RebindToUnsigned<decltype(d)> du;
   3880  alignas(32) static constexpr uint64_t kBit[8] = {1, 2, 4, 8};
   3881  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
   3882 }
   3883 
   3884 }  // namespace detail
   3885 
// `p` points to at least 8 readable bytes, not all of which need be valid.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t kN = MaxLanes(d);
  // One bit per lane, rounded up to whole bytes.
  constexpr size_t kNumBytes = (kN + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  // Clear bits beyond the lane count (the last byte may contain garbage).
  if (kN < 8) {
    mask_bits &= (1ull << kN) - 1;
  }

  return detail::LoadMaskBits256<TFromD<D>>(mask_bits);
}
   3901 
   3902 // ------------------------------ BitsFromMask
   3903 
   3904 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
   3905 HWY_API uint64_t BitsFromMask(D /*tag*/, MFromD<D> mask) {
   3906  const auto sign_bits = __lasx_xvmskltz_b(mask.raw);
   3907  return static_cast<uint32_t>(__lasx_xvpickve2gr_w(sign_bits, 0) |
   3908                               (__lasx_xvpickve2gr_w(sign_bits, 4) << 16));
   3909 }
   3910 
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToSigned<decltype(d)> di;
  const auto vec_mask = VecFromMask(mask);
  // Take the odd (high) byte of each 16-bit lane, which carries the sign,
  // then permute 64-bit lanes (0xd8) so all 16 relevant bytes occupy the
  // low 128 bits before extracting their sign bits with xvmskltz_b.
  const auto sign_bits =
      __lasx_xvpickod_b(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
  const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
  const auto sign_last = __lasx_xvmskltz_b(sign_shuf);
  return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
}
   3921 
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToSigned<decltype(d)> di;
  const auto vec_mask = VecFromMask(mask);
  // Same pattern as the 2-byte overload, one level up: gather the odd
  // (sign-carrying) 16-bit halves of each 32-bit lane into the low 128 bits
  // and extract 8 sign bits via xvmskltz_h.
  const auto sign_bits =
      __lasx_xvpickod_h(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
  const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
  const auto sign_last = __lasx_xvmskltz_h(sign_shuf);
  return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
}
   3932 
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToSigned<decltype(d)> di;
  const auto vec_mask = VecFromMask(mask);
  // Gather the odd (sign-carrying) 32-bit words of each 64-bit lane into
  // the low 128 bits and extract 4 sign bits via xvmskltz_w.
  const auto sign_bits =
      __lasx_xvpickod_w(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
  const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
  const auto sign_last = __lasx_xvmskltz_w(sign_shuf);
  return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
}
   3943 
   3944 // ------------------------------ StoreMaskBits
   3945 // `p` points to at least 8 writable bytes.
   3946 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3947 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
   3948  constexpr size_t N = MaxLanes(d);
   3949  constexpr size_t kNumBytes = (N + 7) / 8;
   3950 
   3951  const uint64_t mask_bits = BitsFromMask(d, mask);
   3952  CopyBytes<kNumBytes>(&mask_bits, bits);
   3953  return kNumBytes;
   3954 }
   3955 
   3956 // ------------------------------ Mask testing
   3957 
   3958 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3959 HWY_API bool AllFalse(D d, MFromD<D> mask) {
   3960  return BitsFromMask(d, mask) == 0;
   3961 }
   3962 
   3963 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3964 HWY_API bool AllTrue(D d, MFromD<D> mask) {
   3965  constexpr size_t kN = MaxLanes(d);
   3966  constexpr uint64_t kAllBits = (1ull << kN) - 1;
   3967  return BitsFromMask(d, mask) == kAllBits;
   3968 }
   3969 
   3970 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3971 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
   3972  return PopCount(BitsFromMask(d, mask));
   3973 }
   3974 
   3975 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3976 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
   3977  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   3978  return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
   3979 }
   3980 
   3981 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3982 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
   3983  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   3984  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
   3985 }
   3986 
   3987 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3988 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
   3989  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   3990  return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
   3991 }
   3992 
   3993 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   3994 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
   3995  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
   3996  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
   3997                   : -1;
   3998 }
   3999 
   4000 // ------------------------------ Compress, CompressBits
   4001 
   4002 namespace detail {
   4003 
// Returns a vector of 8 lane indices that moves the mask=true lanes of a
// 32-bit vector to the front (identity for the remaining lanes).
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) {
  const Full256<uint32_t> d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
  // of SetTableIndices would require 8 KiB, a large part of L1D. We instead
  // compress each index into 4 bits, for a total of 1 KiB.
  alignas(16) static constexpr uint32_t packed_array[256] = {
      // PrintCompress32x8Tables
      0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
      0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
      0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
      0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
      0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
      0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
      0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
      0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
      0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
      0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
      0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
      0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
      0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
      0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
      0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
      0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
      0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
      0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
      0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
      0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
      0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
      0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
      0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
      0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
      0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
      0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
      0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
      0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
      0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
      0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
      0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
      0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
      0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
      0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
      0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
      0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
      0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
      0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
      0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
      0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
      0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
      0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
      0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};

  // No need to mask because __lasx_xvperm_w ignores bits 3..31.
  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) static constexpr uint32_t shifts[8] = {0,  4,  8,  12,
                                                     16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}
   4063 
// 64-bit variant: lane-pair indices (each 64-bit lane is moved as a pair of
// 32-bit halves, hence the "Pair" table and index values >= 8).
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec256<uint64_t> IndicesFromBits256(uint64_t mask_bits) {
  const Full256<uint64_t> d64;

  // For 64-bit, there are only 4 lanes, so we can afford to load the
  // entire index vector directly.
  alignas(32) static constexpr uint64_t u64_indices[64] = {
      // PrintCompress64x4PairTables
      0,  1,  2, 3, 8, 1,  2,  3, 9, 0,  2,  3, 8, 9, 2,  3,
      10, 0,  1, 3, 8, 10, 1,  3, 9, 10, 0,  3, 8, 9, 10, 3,
      11, 0,  1, 2, 8, 11, 1,  2, 9, 11, 0,  2, 8, 9, 11, 2,
      10, 11, 0, 1, 8, 10, 11, 1, 9, 10, 11, 0, 8, 9, 10, 11};
  return Load(d64, u64_indices + 4 * mask_bits);
}
   4078 
// As IndicesFromBits256, but moves the mask=false lanes to the front
// (used by CompressNot).
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) {
  const Full256<uint32_t> d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
  // of SetTableIndices would require 8 KiB, a large part of L1D. We instead
  // compress each index into 4 bits, for a total of 1 KiB.
  alignas(16) static constexpr uint32_t packed_array[256] = {
      // PrintCompressNot32x8Tables
      0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
      0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
      0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
      0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
      0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
      0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
      0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
      0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
      0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
      0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
      0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
      0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
      0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
      0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
      0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
      0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
      0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
      0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
      0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
      0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
      0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
      0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
      0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
      0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
      0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
      0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
      0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
      0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
      0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
      0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
      0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
      0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
      0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
      0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
      0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
      0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
      0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
      0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
      0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
      0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
      0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
      0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
      0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};

  // No need to mask because <__lasx_xvperm_w> ignores bits 3..31.
  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
  const Vec256<uint32_t> packed = Set(d32, packed_array[mask_bits]);
  alignas(32) static constexpr uint32_t shifts[8] = {0,  4,  8,  12,
                                                     16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}
   4138 
// 64-bit variant of IndicesFromNotBits256 (mask=false lanes to the front).
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec256<uint64_t> IndicesFromNotBits256(uint64_t mask_bits) {
  const Full256<uint64_t> d64;

  // For 64-bit, there are only 4 lanes, so we can afford to load
  // the entire index vector directly.
  alignas(32) static constexpr uint64_t u64_indices[64] = {
      // PrintCompressNot64x4PairTables
      8, 9, 10, 11, 9, 10, 11, 0, 8, 10, 11, 1, 10, 11, 0, 1,
      8, 9, 11, 2,  9, 11, 0,  2, 8, 11, 1,  2, 11, 0,  1, 2,
      8, 9, 10, 3,  9, 10, 0,  3, 8, 10, 1,  3, 10, 0,  1, 3,
      8, 9, 2,  3,  9, 0,  2,  3, 8, 1,  2,  3, 0,  1,  2, 3};
  return Load(d64, u64_indices + 4 * mask_bits);
}
   4153 
// Compress for 4/8-byte lanes: a single table-driven permute.
template <typename T, HWY_IF_NOT_T_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  const Indices256<TFromD<decltype(di)>> indices{
      IndicesFromBits256<T>(mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(di, v), indices));
}
   4164 
// LUTs are infeasible for 2^16 possible masks, so splice together two
// half-vector Compress.
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
  const Half<decltype(du)> duh;
  const auto half0 = LowerHalf(duh, vu16);
  const auto half1 = UpperHalf(duh, vu16);

  // 8 mask bits per 128-bit half.
  const uint64_t mask_bits0 = mask_bits & 0xFF;
  const uint64_t mask_bits1 = mask_bits >> 8;
  const auto compressed0 = detail::CompressBits(half0, mask_bits0);
  const auto compressed1 = detail::CompressBits(half1, mask_bits1);

  alignas(32) uint16_t all_true[16] = {};
  // Store mask=true lanes, left to right.
  const size_t num_true0 = PopCount(mask_bits0);
  Store(compressed0, duh, all_true);
  StoreU(compressed1, duh, all_true + num_true0);

  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
    // Store mask=false lanes, right to left. The second vector fills the upper
    // half with right-aligned false lanes. The first vector is shifted
    // rightwards to overwrite the true lanes of the second.
    alignas(32) uint16_t all_false[16] = {};
    const size_t num_true1 = PopCount(mask_bits1);
    Store(compressed1, duh, all_false + 8);
    StoreU(compressed0, duh, all_false + num_true1);

    // Blend: true lanes come first, then the (reversed-order) false lanes.
    const auto mask = FirstN(du, num_true0 + num_true1);
    return BitCast(d,
                   IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
  } else {
    // Only care about the mask=true lanes.
    return BitCast(d, Load(du, all_true));
  }
}
   4204 
   4205 template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
   4206 HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
   4207  const DFromV<decltype(v)> d;
   4208  const RebindToSigned<decltype(d)> di;
   4209 
   4210  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
   4211  const Indices256<TFromD<decltype(di)>> indices{
   4212      IndicesFromNotBits256<T>(mask_bits).raw};
   4213  return BitCast(d, TableLookupLanes(BitCast(di, v), indices));
   4214 }
   4215 
   4216 // LUTs are infeasible for 2^16 possible masks, so splice together two
   4217 // half-vector Compress.
   4218 template <typename T, HWY_IF_T_SIZE(T, 2)>
   4219 HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
   4220  // Compress ensures only the lower 16 bits are set, so flip those.
   4221  return Compress(v, mask_bits ^ 0xFFFF);
   4222 }
   4223 
   4224 }  // namespace detail
   4225 
   4226 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
   4227 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
   4228  const DFromV<decltype(v)> d;
   4229  return detail::Compress(v, BitsFromMask(d, m));
   4230 }
   4231 
   4232 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
   4233 HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
   4234  const DFromV<decltype(v)> d;
   4235  return detail::CompressNot(v, BitsFromMask(d, m));
   4236 }
   4237 
   4238 HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
   4239                                           Mask256<uint64_t> mask) {
   4240  return CompressNot(v, mask);
   4241 }
   4242 
   4243 template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
   4244 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
   4245  constexpr size_t N = 32 / sizeof(T);
   4246  constexpr size_t kNumBytes = (N + 7) / 8;
   4247 
   4248  uint64_t mask_bits = 0;
   4249  CopyBytes<kNumBytes>(bits, &mask_bits);
   4250 
   4251  if (N < 8) {
   4252    mask_bits &= (1ull << N) - 1;
   4253  }
   4254 
   4255  return detail::Compress(v, mask_bits);
   4256 }
   4257 
   4258 // ------------------------------ CompressStore, CompressBitsStore
   4259 
   4260 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
   4261 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
   4262                             TFromD<D>* HWY_RESTRICT unaligned) {
   4263  const uint64_t mask_bits = BitsFromMask(d, m);
   4264  const size_t count = PopCount(mask_bits);
   4265  StoreU(detail::Compress(v, mask_bits), d, unaligned);
   4266  detail::MaybeUnpoison(unaligned, count);
   4267  return count;
   4268 }
   4269 
// Compresses v by m and stores only the `count` valid lanes, leaving the rest
// of the destination untouched. 32/64-bit lanes only.
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = BitsFromMask(d, m);
  const size_t count = PopCount(mask_bits);
  using TU = MakeUnsigned<TFromD<D>>;

  const RebindToUnsigned<decltype(d)> du;
  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  // LUT entries are nibble indices; bit 3 of a nibble appears to flag the
  // lanes holding valid compressed results (TODO confirm against the
  // IndicesFromBits256 tables). TableLookupLanes ignores that bit.
  const Vec256<TU> idx_mask = detail::IndicesFromBits256<TFromD<D>>(mask_bits);
  // Shift nibble MSB into MSB
  const auto shiftVal = sizeof(TU) == 4 ? 28 : 60;
  const Mask256<TU> mask32or64 = MaskFromVec(ShiftLeft<shiftVal>(idx_mask));
  // xvslti with 0 broadcasts the sign bit: all-ones where the flag was set.
  const Mask256<TU> masku{sizeof(TU) == 4 ? __lasx_xvslti_w(mask32or64.raw, 0)
                                          : __lasx_xvslti_d(mask32or64.raw, 0)};
  const MFromD<D> mask = RebindMask(d, masku);
  const VFromD<D> compressed = BitCast(
      d, TableLookupLanes(BitCast(du, v), Indices256<TU>{idx_mask.raw}));

  // Store only the flagged lanes; remaining destination lanes are preserved.
  BlendedStore(compressed, mask, d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}
   4294 
   4295 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
   4296 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
   4297                                    TFromD<D>* HWY_RESTRICT unaligned) {
   4298  const uint64_t mask_bits = BitsFromMask(d, m);
   4299  const size_t count = PopCount(mask_bits);
   4300  const VFromD<D> compressed = detail::Compress(v, mask_bits);
   4301  BlendedStore(compressed, FirstN(d, count), d, unaligned);
   4302  return count;
   4303 }
   4304 
   4305 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
   4306 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
   4307                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
   4308  constexpr size_t N = MaxLanes(d);
   4309  constexpr size_t kNumBytes = (N + 7) / 8;
   4310 
   4311  uint64_t mask_bits = 0;
   4312  CopyBytes<kNumBytes>(bits, &mask_bits);
   4313 
   4314  if (N < 8) {
   4315    mask_bits &= (1ull << N) - 1;
   4316  }
   4317  const size_t count = PopCount(mask_bits);
   4318 
   4319  StoreU(detail::Compress(v, mask_bits), d, unaligned);
   4320  detail::MaybeUnpoison(unaligned, count);
   4321  return count;
   4322 }
   4323 
   4324 // ------------------------------ Dup128MaskFromMaskBits
   4325 
   4326 // Generic for all vector lengths >= 32 bytes
   4327 template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
   4328 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   4329  const Half<decltype(d)> dh;
   4330  const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
   4331  return CombineMasks(d, mh, mh);
   4332 }
   4333 
   4334 // ------------------------------ Expand
   4335 
// 8-bit lanes: expands the first CountTrue(mask) lanes of v into the
// mask=true positions.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const DFromV<decltype(v)> d;
  // LUTs are infeasible for so many mask combinations, so Combine two
  // half-vector Expand.
  const Half<decltype(d)> dh;
  const uint64_t mask_bits = BitsFromMask(d, mask);
  constexpr size_t N = 32 / sizeof(T);
  // Number of true lanes in the lower half = number of source lanes consumed
  // by the lower-half Expand.
  const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
  const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
  const Vec128<T> expandL = Expand(LowerHalf(v), maskL);

  // The upper-half Expand must read source lanes starting at index countL,
  // which is not expressible as a register shift; go through memory.
  alignas(32) T lanes[N];
  Store(v, d, lanes);
  const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
  const Vec128<T> expandH = Expand(LoadU(dh, lanes + countL), maskH);
  return Combine(d, expandH, expandL);
}
   4354 
// 16-bit lanes: expands the first CountTrue(mask) lanes of v into the
// mask=true positions.
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const Full256<T> d;
  // LUTs are infeasible for 2^16 possible masks, so splice together two
  // half-vector Expand.
  const Half<decltype(d)> dh;
  const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
  const Vec128<T> expandL = Expand(LowerHalf(v), maskL);

  // The upper-half Expand must resume reading source lanes where the lower
  // half stopped, i.e. after CountTrue(dh, maskL) lanes; go through memory.
  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);
  const Vec128<T> vH = LoadU(dh, lanes + CountTrue(dh, maskL));
  const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
  const Vec128<T> expandH = Expand(vH, maskH);
  return Combine(d, expandH, expandL);
}
   4371 
// 32-bit lanes: one LUT entry per 8-bit mask; each entry packs eight 4-bit
// source-lane indices (0xf marks a lane that will be zeroed afterwards).
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const uint64_t mask_bits = BitsFromMask(d, mask);
  alignas(16) constexpr uint32_t packed_array[256] = {
      // PrintExpand32x8Nibble.
      0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0,
      0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10,
      0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0,
      0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
      0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0,
      0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10,
      0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0,
      0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
      0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0,
      0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10,
      0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0,
      0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
      0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0,
      0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10,
      0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0,
      0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
      0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0,
      0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10,
      0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0,
      0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
      0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0,
      0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10,
      0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0,
      0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
      0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0,
      0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10,
      0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0,
      0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
      0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0,
      0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10,
      0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0,
      0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
      0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0,
      0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10,
      0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0,
      0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
      0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0,
      0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10,
      0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0,
      0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
      0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0,
      0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10,
      0x543210ff, 0x654321f0, 0x6543210f, 0x76543210,
  };

  // For lane i, shift the i-th 4-bit index down to bits [0, 3).
  const Vec256<uint32_t> packed = Set(du, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec.
  const Indices256<uint32_t> indices{(packed >> Load(du, shifts)).raw};
  const Vec256<uint32_t> expand = TableLookupLanes(BitCast(du, v), indices);
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
}
   4433 
// 64-bit lanes: one LUT entry per 4-bit mask; each entry packs four 4-bit
// source-lane indices (0xf marks a lane that will be zeroed afterwards).
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const uint64_t mask_bits = BitsFromMask(d, mask);

  alignas(16) constexpr uint64_t packed_array[16] = {
      // PrintExpand64x4Nibble.
      0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
      0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
      0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2).
  const Vec256<uint64_t> packed = Set(du, packed_array[mask_bits]);
  // Only the first four shift amounts are loaded (du has 4 lanes).
  alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  // 64-bit TableLookupLanes on LASX requires IndicesFromVec, which checks
  // bounds, so clear the upper bits.
  const Vec256<uint64_t> masked = And(packed >> Load(du, shifts), Set(du, 3));
  const Indices256<uint64_t> indices = IndicesFromVec(du, masked);
  const Vec256<uint64_t> expand = TableLookupLanes(BitCast(du, v), indices);
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
}
   4457 
   4458 // ------------------------------ LoadExpand
   4459 
   4460 template <class D, HWY_IF_V_SIZE_D(D, 32),
   4461          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
   4462 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
   4463                             const TFromD<D>* HWY_RESTRICT unaligned) {
   4464  return Expand(LoadU(d, unaligned), mask);
   4465 }
   4466 
   4467 template <class D, HWY_IF_V_SIZE_D(D, 32),
   4468          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
   4469 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
   4470                             const TFromD<D>* HWY_RESTRICT unaligned) {
   4471  return Expand(LoadU(d, unaligned), mask);
   4472 }
   4473 
   4474 // ------------------------------ LoadInterleaved3/4
   4475 
   4476 // Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
   4477 
   4478 namespace detail {
   4479 // Input:
   4480 // 1 0 (<- first block of unaligned)
   4481 // 3 2
   4482 // 5 4
   4483 // Output:
   4484 // 3 0
   4485 // 4 1
   4486 // 5 2
   4487 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   4488 HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   4489                                   VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
   4490  constexpr size_t N = MaxLanes(d);
   4491  const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);  // 1 0
   4492  const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
   4493  const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
   4494 
   4495  A = ConcatUpperLower(d, v32, v10);
   4496  B = ConcatLowerUpper(d, v54, v10);
   4497  C = ConcatUpperLower(d, v54, v32);
   4498 }
   4499 
   4500 // Input (128-bit blocks):
   4501 // 1 0 (first block of unaligned)
   4502 // 3 2
   4503 // 5 4
   4504 // 7 6
   4505 // Output:
   4506 // 4 0 (LSB of vA)
   4507 // 5 1
   4508 // 6 2
   4509 // 7 3
   4510 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   4511 HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   4512                                   VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
   4513                                   VFromD<D>& vD) {
   4514  constexpr size_t N = MaxLanes(d);
   4515  const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);
   4516  const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
   4517  const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
   4518  const VFromD<D> v76 = LoadU(d, unaligned + 3 * N);
   4519 
   4520  vA = ConcatLowerLower(d, v54, v10);
   4521  vB = ConcatUpperUpper(d, v54, v10);
   4522  vC = ConcatLowerLower(d, v76, v32);
   4523  vD = ConcatUpperUpper(d, v76, v32);
   4524 }
   4525 }  // namespace detail
   4526 
   4527 // ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
   4528 
   4529 // Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
   4530 
   4531 namespace detail {
   4532 // Input (128-bit blocks):
   4533 // 2 0 (LSB of i)
   4534 // 3 1
   4535 // Output:
   4536 // 1 0
   4537 // 3 2
   4538 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   4539 HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
   4540                                    TFromD<D>* HWY_RESTRICT unaligned) {
   4541  constexpr size_t N = MaxLanes(d);
   4542  const auto out0 = ConcatLowerLower(d, j, i);
   4543  const auto out1 = ConcatUpperUpper(d, j, i);
   4544  StoreU(out0, d, unaligned + 0 * N);
   4545  StoreU(out1, d, unaligned + 1 * N);
   4546 }
   4547 
   4548 // Input (128-bit blocks):
   4549 // 3 0 (LSB of i)
   4550 // 4 1
   4551 // 5 2
   4552 // Output:
   4553 // 1 0
   4554 // 3 2
   4555 // 5 4
   4556 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   4557 HWY_API void StoreTransposedBlocks3(VFromD<D> i, VFromD<D> j, VFromD<D> k, D d,
   4558                                    TFromD<D>* HWY_RESTRICT unaligned) {
   4559  constexpr size_t N = MaxLanes(d);
   4560  const auto out0 = ConcatLowerLower(d, j, i);
   4561  const auto out1 = ConcatUpperLower(d, i, k);
   4562  const auto out2 = ConcatUpperUpper(d, k, j);
   4563  StoreU(out0, d, unaligned + 0 * N);
   4564  StoreU(out1, d, unaligned + 1 * N);
   4565  StoreU(out2, d, unaligned + 2 * N);
   4566 }
   4567 
   4568 // Input (128-bit blocks):
   4569 // 4 0 (LSB of i)
   4570 // 5 1
   4571 // 6 2
   4572 // 7 3
   4573 // Output:
   4574 // 1 0
   4575 // 3 2
   4576 // 5 4
   4577 // 7 6
   4578 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   4579 HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
   4580                                    VFromD<D> l, D d,
   4581                                    TFromD<D>* HWY_RESTRICT unaligned) {
   4582  constexpr size_t N = MaxLanes(d);
   4583  // Write lower halves, then upper.
   4584  const auto out0 = ConcatLowerLower(d, j, i);
   4585  const auto out1 = ConcatLowerLower(d, l, k);
   4586  StoreU(out0, d, unaligned + 0 * N);
   4587  StoreU(out1, d, unaligned + 1 * N);
   4588  const auto out2 = ConcatUpperUpper(d, j, i);
   4589  const auto out3 = ConcatUpperUpper(d, l, k);
   4590  StoreU(out2, d, unaligned + 2 * N);
   4591  StoreU(out3, d, unaligned + 3 * N);
   4592 }
   4593 }  // namespace detail
   4594 
   4595 // ------------------------------ Additional mask logical operations
   4596 
   4597 namespace detail {
   4598 
// Returns the two's-complement negation of v viewed as a single 256-bit
// integer (not per-lane).
template <class T>
static HWY_INLINE HWY_MAYBE_UNUSED Vec256<T> LasxI256Neg(Vec256<T> v) {
  const Full256<T> d;
  const Repartition<uint64_t, decltype(d)> du64;

  const auto vu64 = BitCast(du64, v);
  const auto vu64_zero = Zero(du64);
  // Per 128-bit half: all-ones iff that half is nonzero. Used below as the
  // borrow (-1) propagated from the lower into the upper half.
  const auto i128_ne_zero = VecFromMask(du64, Ne128(du64, vu64, vu64_zero));
  // Negate each 128-bit half independently (xvsub.q subtracts 128-bit units).
  const VFromD<decltype(du64)> i128_neg_result{
      __lasx_xvsub_q(vu64_zero.raw, vu64.raw)};
  // If the lower half was nonzero, its negation borrows from the upper half:
  // ConcatLowerLower places the lower half's nonzero indicator into the upper
  // half position, where adding all-ones (-1) applies the borrow.
  const VFromD<decltype(du64)> i256_neg_result_as_u64{
      __lasx_xvadd_q(i128_neg_result.raw,
                     ConcatLowerLower(du64, i128_ne_zero, vu64_zero).raw)};

  return BitCast(d, i256_neg_result_as_u64);
}
   4615 
   4616 }  // namespace detail
   4617 
   4618 template <class T>
   4619 HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
   4620  const Full256<T> d;
   4621  return Or(mask, MaskFromVec(detail::LasxI256Neg(VecFromMask(d, mask))));
   4622 }
   4623 
   4624 template <class T>
   4625 HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
   4626  return Not(SetAtOrAfterFirst(mask));
   4627 }
   4628 
template <class T>
HWY_API Mask256<T> SetOnlyFirst(Mask256<T> mask) {
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = BitCast(di, VecFromMask(d, mask));
  // vmask & -vmask (256-bit negation) isolates the lowest set bit, i.e.
  // bit 0 of the first true lane (mask lanes are all-ones).
  const auto neg_vmask = detail::LasxI256Neg(vmask);

  // Per-lane Neg turns that lane's value 1 into all-ones; other lanes remain
  // zero, so only the first true lane survives.
  return MaskFromVec(BitCast(d, Neg(And(vmask, neg_vmask))));
}
   4639 
template <class T>
HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
  const Full256<T> d;
  constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;

  const auto vmask = VecFromMask(d, mask);
  // Shift the mask up by one lane so the first true lane is also counted as
  // "before first". CombineShiftRightBytes operates per 128-bit block;
  // vmask_lo (lower block of vmask above zero) supplies the lane that crosses
  // the block boundary.
  const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d));
  return SetBeforeFirst(
      MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>(
          d, vmask, vmask_lo)));
}
   4651 
   4652 // ------------------------------ LeadingZeroCount
   4653 
// Per-lane count of leading zero bits, one overload per lane width, each
// mapping directly to the corresponding LASX count-leading-zeros instruction.
template <class V, HWY_IF_UI8(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_b(v.raw)};
}
template <class V, HWY_IF_UI16(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_h(v.raw)};
}
template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_w(v.raw)};
}
template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_d(v.raw)};
}
   4670 
   4671 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_V_SIZE_V(V, 32)>
   4672 HWY_API V HighestSetBitIndex(V v) {
   4673  const DFromV<decltype(v)> d;
   4674  using T = TFromD<decltype(d)>;
   4675  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
   4676 }
   4677 
   4678 // NOLINTNEXTLINE(google-readability-namespace-comments)
   4679 }  // namespace HWY_NAMESPACE
   4680 }  // namespace hwy
   4681 HWY_AFTER_NAMESPACE();