tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

wasm_256-inl.h (80865B)


      1 // Copyright 2021 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // 256-bit WASM vectors and operations. Experimental.
     17 // External include guard in highway.h - see comment there.
     18 
     19 // For half-width vectors. Already includes base.h and shared-inl.h.
     20 #include "hwy/ops/wasm_128-inl.h"
     21 
     22 HWY_BEFORE_NAMESPACE();
     23 namespace hwy {
     24 namespace HWY_NAMESPACE {
     25 
// 256-bit vector, emulated on WASM as a pair of 128-bit vectors.
// v0 holds the lower 16 bytes (lanes [0, N/2)), v1 the upper 16 bytes —
// see Load/Store below, which read/write v0 at the lower address.
template <typename T>
class Vec256 {
 public:
  using PrivateT = T;                                  // only for DFromV
  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator%=(const Vec256 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  // Lower and upper 128-bit halves, respectively.
  Vec128<T> v0;
  Vec128<T> v1;
};
     62 
// 256-bit mask, emulated as a pair of 128-bit masks; m0 corresponds to the
// lower half (Vec256::v0) and m1 to the upper half (Vec256::v1).
template <typename T>
struct Mask256 {
  using PrivateT = T;                                  // only for DFromM
  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromM

  Mask128<T> m0;  // mask for the lower 128-bit half
  Mask128<T> m1;  // mask for the upper 128-bit half
};
     71 
     72 // ------------------------------ Zero
     73 
     74 // Avoid VFromD here because it is defined in terms of Zero.
     75 template <class D, HWY_IF_V_SIZE_D(D, 32)>
     76 HWY_API Vec256<TFromD<D>> Zero(D d) {
     77  const Half<decltype(d)> dh;
     78  Vec256<TFromD<D>> ret;
     79  ret.v0 = ret.v1 = Zero(dh);
     80  return ret;
     81 }
     82 
     83 // ------------------------------ BitCast
     84 template <class D, typename TFrom>
     85 HWY_API VFromD<D> BitCast(D d, Vec256<TFrom> v) {
     86  const Half<decltype(d)> dh;
     87  VFromD<D> ret;
     88  ret.v0 = BitCast(dh, v.v0);
     89  ret.v1 = BitCast(dh, v.v1);
     90  return ret;
     91 }
     92 
     93 // ------------------------------ ResizeBitCast
     94 
     95 // 32-byte vector to 32-byte vector: Same as BitCast
     96 template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32),
     97          HWY_IF_V_SIZE_D(D, 32)>
     98 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
     99  return BitCast(d, v);
    100 }
    101 
    102 // <= 16-byte vector to 32-byte vector
    103 template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
    104          HWY_IF_V_SIZE_D(D, 32)>
    105 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
    106  const Half<decltype(d)> dh;
    107  VFromD<D> ret;
    108  ret.v0 = ResizeBitCast(dh, v);
    109  ret.v1 = Zero(dh);
    110  return ret;
    111 }
    112 
    113 // 32-byte vector to <= 16-byte vector
    114 template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32),
    115          HWY_IF_V_SIZE_LE_D(D, 16)>
    116 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
    117  return ResizeBitCast(d, v.v0);
    118 }
    119 
    120 // ------------------------------ Set
    121 template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
    122 HWY_API VFromD<D> Set(D d, const T2 t) {
    123  const Half<decltype(d)> dh;
    124  VFromD<D> ret;
    125  ret.v0 = ret.v1 = Set(dh, static_cast<TFromD<D>>(t));
    126  return ret;
    127 }
    128 
    129 // Undefined, Iota defined in wasm_128.
    130 
    131 // ------------------------------ Dup128VecFromValues
    132 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
    133 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
    134                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    135                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
    136                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
    137                                      TFromD<D> t11, TFromD<D> t12,
    138                                      TFromD<D> t13, TFromD<D> t14,
    139                                      TFromD<D> t15) {
    140  const Half<decltype(d)> dh;
    141  VFromD<D> ret;
    142  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t8,
    143                                        t9, t10, t11, t12, t13, t14, t15);
    144  return ret;
    145 }
    146 
    147 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
    148 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
    149                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
    150                                      TFromD<D> t5, TFromD<D> t6,
    151                                      TFromD<D> t7) {
    152  const Half<decltype(d)> dh;
    153  VFromD<D> ret;
    154  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7);
    155  return ret;
    156 }
    157 
    158 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
    159 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
    160                                      TFromD<D> t2, TFromD<D> t3) {
    161  const Half<decltype(d)> dh;
    162  VFromD<D> ret;
    163  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3);
    164  return ret;
    165 }
    166 
    167 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
    168 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
    169  const Half<decltype(d)> dh;
    170  VFromD<D> ret;
    171  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1);
    172  return ret;
    173 }
    174 
    175 // ================================================== ARITHMETIC
    176 
    177 template <typename T>
    178 HWY_API Vec256<T> operator+(Vec256<T> a, const Vec256<T> b) {
    179  a.v0 += b.v0;
    180  a.v1 += b.v1;
    181  return a;
    182 }
    183 
    184 template <typename T>
    185 HWY_API Vec256<T> operator-(Vec256<T> a, const Vec256<T> b) {
    186  a.v0 -= b.v0;
    187  a.v1 -= b.v1;
    188  return a;
    189 }
    190 
    191 // ------------------------------ SumsOf8
    192 HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
    193  Vec256<uint64_t> ret;
    194  ret.v0 = SumsOf8(v.v0);
    195  ret.v1 = SumsOf8(v.v1);
    196  return ret;
    197 }
    198 
    199 HWY_API Vec256<int64_t> SumsOf8(const Vec256<int8_t> v) {
    200  Vec256<int64_t> ret;
    201  ret.v0 = SumsOf8(v.v0);
    202  ret.v1 = SumsOf8(v.v1);
    203  return ret;
    204 }
    205 
    206 template <typename T>
    207 HWY_API Vec256<T> SaturatedAdd(Vec256<T> a, const Vec256<T> b) {
    208  a.v0 = SaturatedAdd(a.v0, b.v0);
    209  a.v1 = SaturatedAdd(a.v1, b.v1);
    210  return a;
    211 }
    212 
    213 template <typename T>
    214 HWY_API Vec256<T> SaturatedSub(Vec256<T> a, const Vec256<T> b) {
    215  a.v0 = SaturatedSub(a.v0, b.v0);
    216  a.v1 = SaturatedSub(a.v1, b.v1);
    217  return a;
    218 }
    219 
    220 template <typename T, HWY_IF_UNSIGNED(T),
    221          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
    222 HWY_API Vec256<T> AverageRound(Vec256<T> a, const Vec256<T> b) {
    223  a.v0 = AverageRound(a.v0, b.v0);
    224  a.v1 = AverageRound(a.v1, b.v1);
    225  return a;
    226 }
    227 
    228 template <typename T>
    229 HWY_API Vec256<T> Abs(Vec256<T> v) {
    230  v.v0 = Abs(v.v0);
    231  v.v1 = Abs(v.v1);
    232  return v;
    233 }
    234 
    235 // ------------------------------ Shift lanes by constant #bits
    236 
    237 template <int kBits, typename T>
    238 HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
    239  v.v0 = ShiftLeft<kBits>(v.v0);
    240  v.v1 = ShiftLeft<kBits>(v.v1);
    241  return v;
    242 }
    243 
    244 template <int kBits, typename T>
    245 HWY_API Vec256<T> ShiftRight(Vec256<T> v) {
    246  v.v0 = ShiftRight<kBits>(v.v0);
    247  v.v1 = ShiftRight<kBits>(v.v1);
    248  return v;
    249 }
    250 
// ------------------------------ RotateRight (ShiftRight, Or)

// Rotates each lane right by kBits: the bits shifted out on the right
// reappear on the left.
template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  // Use an unsigned reinterpretation for the right shift so sign bits are not
  // replicated for signed T (logical rather than arithmetic shift).
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // HWY_MIN clamps the left-shift amount: when kBits == 0 the expression
  // kSizeInBits - kBits would be a full-width (invalid) shift; that case
  // returns early above, but the clamp keeps the instantiation well-formed.
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
    264 
    265 // ------------------------------ Shift lanes by same variable #bits
    266 
    267 template <typename T>
    268 HWY_API Vec256<T> ShiftLeftSame(Vec256<T> v, const int bits) {
    269  v.v0 = ShiftLeftSame(v.v0, bits);
    270  v.v1 = ShiftLeftSame(v.v1, bits);
    271  return v;
    272 }
    273 
    274 template <typename T>
    275 HWY_API Vec256<T> ShiftRightSame(Vec256<T> v, const int bits) {
    276  v.v0 = ShiftRightSame(v.v0, bits);
    277  v.v1 = ShiftRightSame(v.v1, bits);
    278  return v;
    279 }
    280 
    281 // ------------------------------ Min, Max
    282 template <typename T>
    283 HWY_API Vec256<T> Min(Vec256<T> a, const Vec256<T> b) {
    284  a.v0 = Min(a.v0, b.v0);
    285  a.v1 = Min(a.v1, b.v1);
    286  return a;
    287 }
    288 
    289 template <typename T>
    290 HWY_API Vec256<T> Max(Vec256<T> a, const Vec256<T> b) {
    291  a.v0 = Max(a.v0, b.v0);
    292  a.v1 = Max(a.v1, b.v1);
    293  return a;
    294 }
    295 // ------------------------------ Integer multiplication
    296 
    297 template <typename T>
    298 HWY_API Vec256<T> operator*(Vec256<T> a, const Vec256<T> b) {
    299  a.v0 *= b.v0;
    300  a.v1 *= b.v1;
    301  return a;
    302 }
    303 
    304 template <typename T>
    305 HWY_API Vec256<T> MulHigh(Vec256<T> a, const Vec256<T> b) {
    306  a.v0 = MulHigh(a.v0, b.v0);
    307  a.v1 = MulHigh(a.v1, b.v1);
    308  return a;
    309 }
    310 
    311 template <typename T>
    312 HWY_API Vec256<T> MulFixedPoint15(Vec256<T> a, const Vec256<T> b) {
    313  a.v0 = MulFixedPoint15(a.v0, b.v0);
    314  a.v1 = MulFixedPoint15(a.v1, b.v1);
    315  return a;
    316 }
    317 
    318 // Cannot use MakeWide because that returns uint128_t for uint64_t, but we want
    319 // uint64_t.
    320 template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
    321          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
    322 HWY_API Vec256<MakeWide<T>> MulEven(Vec256<T> a, const Vec256<T> b) {
    323  Vec256<MakeWide<T>> ret;
    324  ret.v0 = MulEven(a.v0, b.v0);
    325  ret.v1 = MulEven(a.v1, b.v1);
    326  return ret;
    327 }
    328 template <class T, HWY_IF_UI64(T)>
    329 HWY_API Vec256<T> MulEven(Vec256<T> a, const Vec256<T> b) {
    330  Vec256<T> ret;
    331  ret.v0 = MulEven(a.v0, b.v0);
    332  ret.v1 = MulEven(a.v1, b.v1);
    333  return ret;
    334 }
    335 
    336 template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
    337          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
    338 HWY_API Vec256<MakeWide<T>> MulOdd(Vec256<T> a, const Vec256<T> b) {
    339  Vec256<MakeWide<T>> ret;
    340  ret.v0 = MulOdd(a.v0, b.v0);
    341  ret.v1 = MulOdd(a.v1, b.v1);
    342  return ret;
    343 }
    344 template <class T, HWY_IF_UI64(T)>
    345 HWY_API Vec256<T> MulOdd(Vec256<T> a, const Vec256<T> b) {
    346  Vec256<T> ret;
    347  ret.v0 = MulOdd(a.v0, b.v0);
    348  ret.v1 = MulOdd(a.v1, b.v1);
    349  return ret;
    350 }
    351 
    352 // ------------------------------ Negate
    353 template <typename T>
    354 HWY_API Vec256<T> Neg(Vec256<T> v) {
    355  v.v0 = Neg(v.v0);
    356  v.v1 = Neg(v.v1);
    357  return v;
    358 }
    359 
    360 // ------------------------------ AbsDiff
    361 // generic_ops takes care of integer T.
    362 template <typename T, HWY_IF_FLOAT(T)>
    363 HWY_API Vec256<T> AbsDiff(const Vec256<T> a, const Vec256<T> b) {
    364  return Abs(a - b);
    365 }
    366 
    367 // ------------------------------ Floating-point division
    368 // generic_ops takes care of integer T.
    369 template <typename T, HWY_IF_FLOAT(T)>
    370 HWY_API Vec256<T> operator/(Vec256<T> a, const Vec256<T> b) {
    371  a.v0 /= b.v0;
    372  a.v1 /= b.v1;
    373  return a;
    374 }
    375 
    376 // ------------------------------ Floating-point multiply-add variants
    377 
    378 template <class T, HWY_IF_FLOAT3264(T)>
    379 HWY_API Vec256<T> MulAdd(Vec256<T> mul, Vec256<T> x, Vec256<T> add) {
    380  mul.v0 = MulAdd(mul.v0, x.v0, add.v0);
    381  mul.v1 = MulAdd(mul.v1, x.v1, add.v1);
    382  return mul;
    383 }
    384 
    385 template <class T, HWY_IF_FLOAT3264(T)>
    386 HWY_API Vec256<T> NegMulAdd(Vec256<T> mul, Vec256<T> x, Vec256<T> add) {
    387  mul.v0 = NegMulAdd(mul.v0, x.v0, add.v0);
    388  mul.v1 = NegMulAdd(mul.v1, x.v1, add.v1);
    389  return mul;
    390 }
    391 
    392 template <class T, HWY_IF_FLOAT3264(T)>
    393 HWY_API Vec256<T> MulSub(Vec256<T> mul, Vec256<T> x, Vec256<T> sub) {
    394  mul.v0 = MulSub(mul.v0, x.v0, sub.v0);
    395  mul.v1 = MulSub(mul.v1, x.v1, sub.v1);
    396  return mul;
    397 }
    398 
    399 template <class T, HWY_IF_FLOAT3264(T)>
    400 HWY_API Vec256<T> NegMulSub(Vec256<T> mul, Vec256<T> x, Vec256<T> sub) {
    401  mul.v0 = NegMulSub(mul.v0, x.v0, sub.v0);
    402  mul.v1 = NegMulSub(mul.v1, x.v1, sub.v1);
    403  return mul;
    404 }
    405 
    406 // ------------------------------ Floating-point square root
    407 
    408 template <typename T>
    409 HWY_API Vec256<T> Sqrt(Vec256<T> v) {
    410  v.v0 = Sqrt(v.v0);
    411  v.v1 = Sqrt(v.v1);
    412  return v;
    413 }
    414 
    415 // ------------------------------ Floating-point rounding
    416 
    417 // Toward nearest integer, ties to even
    418 template <class T, HWY_IF_FLOAT3264(T)>
    419 HWY_API Vec256<T> Round(Vec256<T> v) {
    420  v.v0 = Round(v.v0);
    421  v.v1 = Round(v.v1);
    422  return v;
    423 }
    424 
    425 // Toward zero, aka truncate
    426 template <class T, HWY_IF_FLOAT3264(T)>
    427 HWY_API Vec256<T> Trunc(Vec256<T> v) {
    428  v.v0 = Trunc(v.v0);
    429  v.v1 = Trunc(v.v1);
    430  return v;
    431 }
    432 
    433 // Toward +infinity, aka ceiling
    434 template <class T, HWY_IF_FLOAT3264(T)>
    435 HWY_API Vec256<T> Ceil(Vec256<T> v) {
    436  v.v0 = Ceil(v.v0);
    437  v.v1 = Ceil(v.v1);
    438  return v;
    439 }
    440 
    441 // Toward -infinity, aka floor
    442 template <class T, HWY_IF_FLOAT3264(T)>
    443 HWY_API Vec256<T> Floor(Vec256<T> v) {
    444  v.v0 = Floor(v.v0);
    445  v.v1 = Floor(v.v1);
    446  return v;
    447 }
    448 
// ------------------------------ Floating-point classification

// True for NaN lanes: NaN is the only value that compares unequal to itself.
template <typename T>
HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
  return v != v;
}

// True for +/- infinity lanes.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsInf(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' (via Add, which doubles the value) to clear the sign bit,
  // then check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero (i.e. neither NaN nor infinity).
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
    479 
// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// Reinterprets a mask over TFrom lanes as a mask over TTo lanes of the same
// size, by copying the raw 128-bit mask representations of both halves.
template <class DTo, typename TFrom, typename TTo = TFromD<DTo>>
HWY_API MFromD<DTo> RebindMask(DTo /*tag*/, Mask256<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return MFromD<DTo>{Mask128<TTo>{m.m0.raw}, Mask128<TTo>{m.m1.raw}};
}
    489 
    490 template <typename T>
    491 HWY_API Mask256<T> TestBit(Vec256<T> v, Vec256<T> bit) {
    492  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
    493  return (v & bit) == bit;
    494 }
    495 
    496 template <typename T>
    497 HWY_API Mask256<T> operator==(Vec256<T> a, const Vec256<T> b) {
    498  Mask256<T> m;
    499  m.m0 = operator==(a.v0, b.v0);
    500  m.m1 = operator==(a.v1, b.v1);
    501  return m;
    502 }
    503 
    504 template <typename T>
    505 HWY_API Mask256<T> operator!=(Vec256<T> a, const Vec256<T> b) {
    506  Mask256<T> m;
    507  m.m0 = operator!=(a.v0, b.v0);
    508  m.m1 = operator!=(a.v1, b.v1);
    509  return m;
    510 }
    511 
    512 template <typename T>
    513 HWY_API Mask256<T> operator<(Vec256<T> a, const Vec256<T> b) {
    514  Mask256<T> m;
    515  m.m0 = operator<(a.v0, b.v0);
    516  m.m1 = operator<(a.v1, b.v1);
    517  return m;
    518 }
    519 
    520 template <typename T>
    521 HWY_API Mask256<T> operator>(Vec256<T> a, const Vec256<T> b) {
    522  Mask256<T> m;
    523  m.m0 = operator>(a.v0, b.v0);
    524  m.m1 = operator>(a.v1, b.v1);
    525  return m;
    526 }
    527 
    528 template <typename T>
    529 HWY_API Mask256<T> operator<=(Vec256<T> a, const Vec256<T> b) {
    530  Mask256<T> m;
    531  m.m0 = operator<=(a.v0, b.v0);
    532  m.m1 = operator<=(a.v1, b.v1);
    533  return m;
    534 }
    535 
    536 template <typename T>
    537 HWY_API Mask256<T> operator>=(Vec256<T> a, const Vec256<T> b) {
    538  Mask256<T> m;
    539  m.m0 = operator>=(a.v0, b.v0);
    540  m.m1 = operator>=(a.v1, b.v1);
    541  return m;
    542 }
    543 
    544 // ------------------------------ FirstN (Iota, Lt)
    545 
    546 template <class D, HWY_IF_V_SIZE_D(D, 32)>
    547 HWY_API MFromD<D> FirstN(const D d, size_t num) {
    548  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
    549  using TI = TFromD<decltype(di)>;
    550  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num)));
    551 }
    552 
    553 // ================================================== LOGICAL
    554 
    555 template <typename T>
    556 HWY_API Vec256<T> Not(Vec256<T> v) {
    557  v.v0 = Not(v.v0);
    558  v.v1 = Not(v.v1);
    559  return v;
    560 }
    561 
    562 template <typename T>
    563 HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
    564  a.v0 = And(a.v0, b.v0);
    565  a.v1 = And(a.v1, b.v1);
    566  return a;
    567 }
    568 
    569 template <typename T>
    570 HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
    571  not_mask.v0 = AndNot(not_mask.v0, mask.v0);
    572  not_mask.v1 = AndNot(not_mask.v1, mask.v1);
    573  return not_mask;
    574 }
    575 
    576 template <typename T>
    577 HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
    578  a.v0 = Or(a.v0, b.v0);
    579  a.v1 = Or(a.v1, b.v1);
    580  return a;
    581 }
    582 
    583 template <typename T>
    584 HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
    585  a.v0 = Xor(a.v0, b.v0);
    586  a.v1 = Xor(a.v1, b.v1);
    587  return a;
    588 }
    589 
    590 template <typename T>
    591 HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
    592  return Xor(x1, Xor(x2, x3));
    593 }
    594 
    595 template <typename T>
    596 HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
    597  return Or(o1, Or(o2, o3));
    598 }
    599 
    600 template <typename T>
    601 HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
    602  return Or(o, And(a1, a2));
    603 }
    604 
    605 template <typename T>
    606 HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
    607  return IfThenElse(MaskFromVec(mask), yes, no);
    608 }
    609 
// ------------------------------ Operator overloads (internal-only if float)

// Forwards to And; provided so generic code can use infix syntax.
template <typename T>
HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
  return And(a, b);
}

// Forwards to Or.
template <typename T>
HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
  return Or(a, b);
}

// Forwards to Xor.
template <typename T>
HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
  return Xor(a, b);
}
    626 
// ------------------------------ CopySign

// Returns a value with the magnitude of `magn` and the sign of `sign`.
template <typename T>
HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  // Select the sign bit from `sign` and all other bits from `magn`.
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs

// Like CopySign, but `abs` is known to be non-negative, so OR-ing in the
// sign bit suffices (no need to clear it first).
template <typename T>
HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(sign)> d;
  return OrAnd(abs, SignBit(d), sign);
}
    642 
    643 // ------------------------------ Mask
    644 
    645 // Mask and Vec are the same (true = FF..FF).
    646 template <typename T>
    647 HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
    648  Mask256<T> m;
    649  m.m0 = MaskFromVec(v.v0);
    650  m.m1 = MaskFromVec(v.v1);
    651  return m;
    652 }
    653 
    654 template <class D, typename T = TFromD<D>>
    655 HWY_API Vec256<T> VecFromMask(D d, Mask256<T> m) {
    656  const Half<decltype(d)> dh;
    657  Vec256<T> v;
    658  v.v0 = VecFromMask(dh, m.m0);
    659  v.v1 = VecFromMask(dh, m.m1);
    660  return v;
    661 }
    662 
    663 template <class D, HWY_IF_V_SIZE_D(D, 32)>
    664 HWY_API uint64_t BitsFromMask(D d, MFromD<D> m) {
    665  const Half<decltype(d)> dh;
    666  const uint64_t lo = BitsFromMask(dh, m.m0);
    667  const uint64_t hi = BitsFromMask(dh, m.m1);
    668  return (hi << Lanes(dh)) | lo;
    669 }
    670 
    671 // mask ? yes : no
    672 template <typename T>
    673 HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
    674  yes.v0 = IfThenElse(mask.m0, yes.v0, no.v0);
    675  yes.v1 = IfThenElse(mask.m1, yes.v1, no.v1);
    676  return yes;
    677 }
    678 
    679 // mask ? yes : 0
    680 template <typename T>
    681 HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
    682  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
    683 }
    684 
    685 // mask ? 0 : no
    686 template <typename T>
    687 HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
    688  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
    689 }
    690 
    691 template <typename T>
    692 HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
    693  v.v0 = IfNegativeThenElse(v.v0, yes.v0, no.v0);
    694  v.v1 = IfNegativeThenElse(v.v1, yes.v1, no.v1);
    695  return v;
    696 }
    697 
    698 // ------------------------------ Mask logical
    699 
    700 template <typename T>
    701 HWY_API Mask256<T> Not(const Mask256<T> m) {
    702  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
    703 }
    704 
    705 template <typename T>
    706 HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
    707  const Full256<T> d;
    708  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
    709 }
    710 
    711 template <typename T>
    712 HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
    713  const Full256<T> d;
    714  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
    715 }
    716 
    717 template <typename T>
    718 HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
    719  const Full256<T> d;
    720  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
    721 }
    722 
    723 template <typename T>
    724 HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
    725  const Full256<T> d;
    726  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
    727 }
    728 
    729 template <typename T>
    730 HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
    731  const Full256<T> d;
    732  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
    733 }
    734 
    735 // ------------------------------ Shl (BroadcastSignBit, IfThenElse)
    736 template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
    737 HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
    738  v.v0 = operator<<(v.v0, bits.v0);
    739  v.v1 = operator<<(v.v1, bits.v1);
    740  return v;
    741 }
    742 
    743 // ------------------------------ Shr (BroadcastSignBit, IfThenElse)
    744 template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
    745 HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
    746  v.v0 = operator>>(v.v0, bits.v0);
    747  v.v1 = operator>>(v.v1, bits.v1);
    748  return v;
    749 }
    750 
// ------------------------------ BroadcastSignBit (compare, VecFromMask)

// Replicates the sign bit into every bit of each lane (arithmetic shift by
// lane width - 1).
template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec256<T> BroadcastSignBit(const Vec256<T> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
// i8 specialization: implemented via comparison against zero instead of a
// 7-bit shift.
HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}
    761 
    762 // ================================================== MEMORY
    763 
    764 // ------------------------------ Load
    765 
    766 template <class D, HWY_IF_V_SIZE_D(D, 32)>
    767 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
    768  const Half<decltype(d)> dh;
    769  VFromD<D> ret;
    770  ret.v0 = Load(dh, aligned);
    771  ret.v1 = Load(dh, aligned + Lanes(dh));
    772  return ret;
    773 }
    774 
    775 template <class D, typename T = TFromD<D>>
    776 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D d, const T* HWY_RESTRICT aligned) {
    777  return IfThenElseZero(m, Load(d, aligned));
    778 }
    779 
    780 template <class D, typename T = TFromD<D>>
    781 HWY_API Vec256<T> MaskedLoadOr(Vec256<T> v, Mask256<T> m, D d,
    782                               const T* HWY_RESTRICT aligned) {
    783  return IfThenElse(m, Load(d, aligned), v);
    784 }
    785 
    786 // LoadU == Load.
    787 template <class D, HWY_IF_V_SIZE_D(D, 32)>
    788 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
    789  return Load(d, p);
    790 }
    791 
    792 template <class D, HWY_IF_V_SIZE_D(D, 32)>
    793 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
    794  const Half<decltype(d)> dh;
    795  VFromD<D> ret;
    796  ret.v0 = ret.v1 = Load(dh, p);
    797  return ret;
    798 }
    799 
    800 // ------------------------------ Store
    801 
    802 template <class D, typename T = TFromD<D>>
    803 HWY_API void Store(Vec256<T> v, D d, T* HWY_RESTRICT aligned) {
    804  const Half<decltype(d)> dh;
    805  Store(v.v0, dh, aligned);
    806  Store(v.v1, dh, aligned + Lanes(dh));
    807 }
    808 
    809 // StoreU == Store.
    810 template <class D, typename T = TFromD<D>>
    811 HWY_API void StoreU(Vec256<T> v, D d, T* HWY_RESTRICT p) {
    812  Store(v, d, p);
    813 }
    814 
    815 template <class D, typename T = TFromD<D>>
    816 HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D d, T* HWY_RESTRICT p) {
    817  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
    818 }
    819 
    820 // ------------------------------ Stream
    821 template <class D, typename T = TFromD<D>>
    822 HWY_API void Stream(Vec256<T> v, D d, T* HWY_RESTRICT aligned) {
    823  // Same as aligned stores.
    824  Store(v, d, aligned);
    825 }
    826 
    827 // ------------------------------ Scatter, Gather defined in wasm_128
    828 
    829 // ================================================== SWIZZLE
    830 
    831 // ------------------------------ ExtractLane
    832 template <typename T>
    833 HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
    834  alignas(32) T lanes[32 / sizeof(T)];
    835  Store(v, DFromV<decltype(v)>(), lanes);
    836  return lanes[i];
    837 }
    838 
    839 // ------------------------------ InsertLane
    840 template <typename T>
    841 HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
    842  DFromV<decltype(v)> d;
    843  alignas(32) T lanes[32 / sizeof(T)];
    844  Store(v, d, lanes);
    845  lanes[i] = t;
    846  return Load(d, lanes);
    847 }
    848 
    849 // ------------------------------ ExtractBlock
    850 template <int kBlockIdx, class T>
    851 HWY_API Vec128<T> ExtractBlock(Vec256<T> v) {
    852  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
    853  return (kBlockIdx == 0) ? v.v0 : v.v1;
    854 }
    855 
    856 // ------------------------------ InsertBlock
    857 template <int kBlockIdx, class T>
    858 HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
    859  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
    860  Vec256<T> result;
    861  if (kBlockIdx == 0) {
    862    result.v0 = blk_to_insert;
    863    result.v1 = v.v1;
    864  } else {
    865    result.v0 = v.v0;
    866    result.v1 = blk_to_insert;
    867  }
    868  return result;
    869 }
    870 
    871 // ------------------------------ BroadcastBlock
    872 template <int kBlockIdx, class T>
    873 HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
    874  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
    875  Vec256<T> result;
    876  result.v0 = result.v1 = (kBlockIdx == 0 ? v.v0 : v.v1);
    877  return result;
    878 }
    879 
    880 // ------------------------------ LowerHalf
    881 
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> LowerHalf(D /* tag */, Vec256<T> v) {
 return v.v0;  // lower 128-bit block
}
    886 
// Tag-free overload; equivalent to LowerHalf(Half<DFromV<...>>(), v).
template <typename T>
HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
 return v.v0;
}
    891 
    892 // ------------------------------ GetLane (LowerHalf)
// Returns lane 0, which lives in the lower 128-bit block.
template <typename T>
HWY_API T GetLane(const Vec256<T> v) {
 return GetLane(LowerHalf(v));
}
    897 
    898 // ------------------------------ ShiftLeftBytes
    899 
    900 template <int kBytes, class D, typename T = TFromD<D>>
    901 HWY_API Vec256<T> ShiftLeftBytes(D d, Vec256<T> v) {
    902  const Half<decltype(d)> dh;
    903  v.v0 = ShiftLeftBytes<kBytes>(dh, v.v0);
    904  v.v1 = ShiftLeftBytes<kBytes>(dh, v.v1);
    905  return v;
    906 }
    907 
// Tag-free convenience overload.
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(Vec256<T> v) {
 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}
    912 
    913 // ------------------------------ ShiftLeftLanes
    914 
    915 template <int kLanes, class D, typename T = TFromD<D>>
    916 HWY_API Vec256<T> ShiftLeftLanes(D d, const Vec256<T> v) {
    917  const Repartition<uint8_t, decltype(d)> d8;
    918  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
    919 }
    920 
// Tag-free convenience overload.
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}
    925 
    926 // ------------------------------ ShiftRightBytes
    927 template <int kBytes, class D, typename T = TFromD<D>>
    928 HWY_API Vec256<T> ShiftRightBytes(D d, Vec256<T> v) {
    929  const Half<decltype(d)> dh;
    930  v.v0 = ShiftRightBytes<kBytes>(dh, v.v0);
    931  v.v1 = ShiftRightBytes<kBytes>(dh, v.v1);
    932  return v;
    933 }
    934 
    935 // ------------------------------ ShiftRightLanes
    936 template <int kLanes, class D, typename T = TFromD<D>>
    937 HWY_API Vec256<T> ShiftRightLanes(D d, const Vec256<T> v) {
    938  const Repartition<uint8_t, decltype(d)> d8;
    939  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
    940 }
    941 
    942 // ------------------------------ UpperHalf (ShiftRightBytes)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> UpperHalf(D /* tag */, const Vec256<T> v) {
 return v.v1;  // upper 128-bit block
}
    947 
    948 // ------------------------------ CombineShiftRightBytes
    949 
    950 template <int kBytes, class D, typename T = TFromD<D>>
    951 HWY_API Vec256<T> CombineShiftRightBytes(D d, Vec256<T> hi, Vec256<T> lo) {
    952  const Half<decltype(d)> dh;
    953  hi.v0 = CombineShiftRightBytes<kBytes>(dh, hi.v0, lo.v0);
    954  hi.v1 = CombineShiftRightBytes<kBytes>(dh, hi.v1, lo.v1);
    955  return hi;
    956 }
    957 
    958 // ------------------------------ Broadcast/splat any lane
    959 
    960 template <int kLane, typename T>
    961 HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
    962  Vec256<T> ret;
    963  ret.v0 = Broadcast<kLane>(v.v0);
    964  ret.v1 = Broadcast<kLane>(v.v1);
    965  return ret;
    966 }
    967 
// Splats lane kLane (indexed across the whole 256-bit vector) to all lanes.
template <int kLane, typename T>
HWY_API Vec256<T> BroadcastLane(const Vec256<T> v) {
 constexpr int kLanesPerBlock = static_cast<int>(16 / sizeof(T));
 static_assert(0 <= kLane && kLane < kLanesPerBlock * 2, "Invalid lane");
 // Index of the lane within its 128-bit block.
 constexpr int kLaneInBlkIdx = kLane & (kLanesPerBlock - 1);
 Vec256<T> ret;
 // Pick the block containing kLane, then splat within it to both halves.
 ret.v0 = ret.v1 =
     Broadcast<kLaneInBlkIdx>(kLane >= kLanesPerBlock ? v.v1 : v.v0);
 return ret;
}
    978 
    979 // ------------------------------ TableLookupBytes
    980 
    981 // Both full
    982 template <typename T, typename TI>
    983 HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes, Vec256<TI> from) {
    984  from.v0 = TableLookupBytes(bytes.v0, from.v0);
    985  from.v1 = TableLookupBytes(bytes.v1, from.v1);
    986  return from;
    987 }
    988 
    989 // Partial index vector
// Partial (<=128-bit) index vector with a full 256-bit table.
template <typename T, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes,
                                       const Vec128<TI, NI> from) {
 // First expand to full 128, then 256.
 const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
 const auto tbl_full = TableLookupBytes(bytes, from_256);
 // Shrink to 128, then partial.
 return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
}
    999 
   1000 // Partial table vector
// Partial (<=128-bit) table with a full 256-bit index vector.
template <typename T, size_t N, typename TI>
HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, const Vec256<TI> from) {
 // First expand to full 128, then 256.
 const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
 return TableLookupBytes(bytes_256, from);
}
   1007 
   1008 // Partial both are handled by wasm_128.
   1009 
// Returns 0 for out-of-bounds indices (those with the high bit set).
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
 // wasm out-of-bounds policy already zeros, so TableLookupBytes is fine.
 return TableLookupBytes(bytes, from);
}
   1015 
   1016 // ------------------------------ Hard-coded shuffles
   1017 
   1018 template <typename T>
   1019 HWY_API Vec256<T> Shuffle01(Vec256<T> v) {
   1020  v.v0 = Shuffle01(v.v0);
   1021  v.v1 = Shuffle01(v.v1);
   1022  return v;
   1023 }
   1024 
   1025 template <typename T>
   1026 HWY_API Vec256<T> Shuffle2301(Vec256<T> v) {
   1027  v.v0 = Shuffle2301(v.v0);
   1028  v.v1 = Shuffle2301(v.v1);
   1029  return v;
   1030 }
   1031 
   1032 template <typename T>
   1033 HWY_API Vec256<T> Shuffle1032(Vec256<T> v) {
   1034  v.v0 = Shuffle1032(v.v0);
   1035  v.v1 = Shuffle1032(v.v1);
   1036  return v;
   1037 }
   1038 
   1039 template <typename T>
   1040 HWY_API Vec256<T> Shuffle0321(Vec256<T> v) {
   1041  v.v0 = Shuffle0321(v.v0);
   1042  v.v1 = Shuffle0321(v.v1);
   1043  return v;
   1044 }
   1045 
   1046 template <typename T>
   1047 HWY_API Vec256<T> Shuffle2103(Vec256<T> v) {
   1048  v.v0 = Shuffle2103(v.v0);
   1049  v.v1 = Shuffle2103(v.v1);
   1050  return v;
   1051 }
   1052 
   1053 template <typename T>
   1054 HWY_API Vec256<T> Shuffle0123(Vec256<T> v) {
   1055  v.v0 = Shuffle0123(v.v0);
   1056  v.v1 = Shuffle0123(v.v1);
   1057  return v;
   1058 }
   1059 
   1060 // Used by generic_ops-inl.h
   1061 namespace detail {
   1062 
   1063 template <typename T, HWY_IF_T_SIZE(T, 4)>
   1064 HWY_API Vec256<T> ShuffleTwo2301(Vec256<T> a, const Vec256<T> b) {
   1065  a.v0 = ShuffleTwo2301(a.v0, b.v0);
   1066  a.v1 = ShuffleTwo2301(a.v1, b.v1);
   1067  return a;
   1068 }
   1069 template <typename T, HWY_IF_T_SIZE(T, 4)>
   1070 HWY_API Vec256<T> ShuffleTwo1230(Vec256<T> a, const Vec256<T> b) {
   1071  a.v0 = ShuffleTwo1230(a.v0, b.v0);
   1072  a.v1 = ShuffleTwo1230(a.v1, b.v1);
   1073  return a;
   1074 }
   1075 template <typename T, HWY_IF_T_SIZE(T, 4)>
   1076 HWY_API Vec256<T> ShuffleTwo3012(Vec256<T> a, const Vec256<T> b) {
   1077  a.v0 = ShuffleTwo3012(a.v0, b.v0);
   1078  a.v1 = ShuffleTwo3012(a.v1, b.v1);
   1079  return a;
   1080 }
   1081 
   1082 }  // namespace detail
   1083 
   1084 // ------------------------------ TableLookupLanes
   1085 
   1086 // Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices256 {
 __v128_u i0;  // raw lane indices for the lower 128-bit block of the result
 __v128_u i1;  // raw lane indices for the upper 128-bit block of the result
};
   1092 
   1093 template <class D, typename T = TFromD<D>, typename TI>
   1094 HWY_API Indices256<T> IndicesFromVec(D /* tag */, Vec256<TI> vec) {
   1095  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
   1096  Indices256<T> ret;
   1097  ret.i0 = vec.v0.raw;
   1098  ret.i1 = vec.v1.raw;
   1099  return ret;
   1100 }
   1101 
// Loads indices from memory (unaligned OK) and wraps them into Indices256.
template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
HWY_API Indices256<TFromD<D>> SetTableIndices(D d, const TI* idx) {
 const Rebind<TI, decltype(d)> di;
 return IndicesFromVec(d, LoadU(di, idx));
}
   1107 
   1108 template <typename T>
   1109 HWY_API Vec256<T> TableLookupLanes(const Vec256<T> v, Indices256<T> idx) {
   1110  const DFromV<decltype(v)> d;
   1111  const Half<decltype(d)> dh;
   1112  const auto idx_i0 = IndicesFromVec(dh, Vec128<T>{idx.i0});
   1113  const auto idx_i1 = IndicesFromVec(dh, Vec128<T>{idx.i1});
   1114 
   1115  Vec256<T> result;
   1116  result.v0 = TwoTablesLookupLanes(v.v0, v.v1, idx_i0);
   1117  result.v1 = TwoTablesLookupLanes(v.v0, v.v1, idx_i1);
   1118  return result;
   1119 }
   1120 
   1121 template <typename T>
   1122 HWY_API Vec256<T> TableLookupLanesOr0(Vec256<T> v, Indices256<T> idx) {
   1123  // The out of bounds behavior will already zero lanes.
   1124  return TableLookupLanesOr0(v, idx);
   1125 }
   1126 
// Looks up lanes from the 64-lane-byte concatenation {a, b}: indices in
// [0, 32/sizeof(T)) select from a, larger indices select from b.
template <typename T>
HWY_API Vec256<T> TwoTablesLookupLanes(const Vec256<T> a, const Vec256<T> b,
                                      Indices256<T> idx) {
 const DFromV<decltype(a)> d;
 const Half<decltype(d)> dh;
 const RebindToUnsigned<decltype(d)> du;
 using TU = MakeUnsigned<T>;
 // Number of lanes in one 256-bit vector (= one table).
 constexpr size_t kLanesPerVect = 32 / sizeof(TU);

 Vec256<TU> vi;
 vi.v0 = Vec128<TU>{idx.i0};
 vi.v1 = Vec128<TU>{idx.i1};
 // Index within the selected table.
 const auto vmod = vi & Set(du, TU{kLanesPerVect - 1});
 // Lane selects from table a iff its index was already < kLanesPerVect
 // (i.e. masking did not change it).
 const auto is_lo = RebindMask(d, vi == vmod);

 const auto idx_i0 = IndicesFromVec(dh, vmod.v0);
 const auto idx_i1 = IndicesFromVec(dh, vmod.v1);

 // Gather from both tables, then blend per-lane using is_lo.
 Vec256<T> result_lo;
 Vec256<T> result_hi;
 result_lo.v0 = TwoTablesLookupLanes(a.v0, a.v1, idx_i0);
 result_lo.v1 = TwoTablesLookupLanes(a.v0, a.v1, idx_i1);
 result_hi.v0 = TwoTablesLookupLanes(b.v0, b.v1, idx_i0);
 result_hi.v1 = TwoTablesLookupLanes(b.v0, b.v1, idx_i1);
 return IfThenElse(is_lo, result_lo, result_hi);
}
   1153 
   1154 // ------------------------------ Reverse
   1155 template <class D, typename T = TFromD<D>>
   1156 HWY_API Vec256<T> Reverse(D d, const Vec256<T> v) {
   1157  const Half<decltype(d)> dh;
   1158  Vec256<T> ret;
   1159  ret.v1 = Reverse(dh, v.v0);  // note reversed v1 member order
   1160  ret.v0 = Reverse(dh, v.v1);
   1161  return ret;
   1162 }
   1163 
   1164 // ------------------------------ Reverse2
   1165 template <class D, typename T = TFromD<D>>
   1166 HWY_API Vec256<T> Reverse2(D d, Vec256<T> v) {
   1167  const Half<decltype(d)> dh;
   1168  v.v0 = Reverse2(dh, v.v0);
   1169  v.v1 = Reverse2(dh, v.v1);
   1170  return v;
   1171 }
   1172 
   1173 // ------------------------------ Reverse4
   1174 
   1175 // Each block has only 2 lanes, so swap blocks and their lanes.
   1176 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
   1177 HWY_API Vec256<T> Reverse4(D d, const Vec256<T> v) {
   1178  const Half<decltype(d)> dh;
   1179  Vec256<T> ret;
   1180  ret.v0 = Reverse2(dh, v.v1);  // swapped
   1181  ret.v1 = Reverse2(dh, v.v0);
   1182  return ret;
   1183 }
   1184 
   1185 template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 8)>
   1186 HWY_API Vec256<T> Reverse4(D d, Vec256<T> v) {
   1187  const Half<decltype(d)> dh;
   1188  v.v0 = Reverse4(dh, v.v0);
   1189  v.v1 = Reverse4(dh, v.v1);
   1190  return v;
   1191 }
   1192 
   1193 // ------------------------------ Reverse8
   1194 
// A 256-bit vector has only 4 u64 lanes, so groups of 8 cannot exist.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> Reverse8(D /* tag */, Vec256<T> /* v */) {
 HWY_ASSERT(0);  // don't have 8 u64 lanes
}
   1199 
   1200 // Each block has only 4 lanes, so swap blocks and their lanes.
   1201 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
   1202 HWY_API Vec256<T> Reverse8(D d, const Vec256<T> v) {
   1203  const Half<decltype(d)> dh;
   1204  Vec256<T> ret;
   1205  ret.v0 = Reverse4(dh, v.v1);  // swapped
   1206  ret.v1 = Reverse4(dh, v.v0);
   1207  return ret;
   1208 }
   1209 
   1210 template <class D, typename T = TFromD<D>,
   1211          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
   1212 HWY_API Vec256<T> Reverse8(D d, Vec256<T> v) {
   1213  const Half<decltype(d)> dh;
   1214  v.v0 = Reverse8(dh, v.v0);
   1215  v.v1 = Reverse8(dh, v.v1);
   1216  return v;
   1217 }
   1218 
   1219 // ------------------------------ InterleaveLower
   1220 
   1221 template <typename T>
   1222 HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
   1223  a.v0 = InterleaveLower(a.v0, b.v0);
   1224  a.v1 = InterleaveLower(a.v1, b.v1);
   1225  return a;
   1226 }
   1227 
   1228 // wasm_128 already defines a template with D, V, V args.
   1229 
   1230 // ------------------------------ InterleaveUpper (UpperHalf)
   1231 
   1232 template <class D, typename T = TFromD<D>>
   1233 HWY_API Vec256<T> InterleaveUpper(D d, Vec256<T> a, Vec256<T> b) {
   1234  const Half<decltype(d)> dh;
   1235  a.v0 = InterleaveUpper(dh, a.v0, b.v0);
   1236  a.v1 = InterleaveUpper(dh, a.v1, b.v1);
   1237  return a;
   1238 }
   1239 
   1240 // ------------------------------ InterleaveWholeLower
   1241 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1242 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
   1243  const Half<decltype(d)> dh;
   1244  VFromD<D> ret;
   1245  ret.v0 = InterleaveLower(a.v0, b.v0);
   1246  ret.v1 = InterleaveUpper(dh, a.v0, b.v0);
   1247  return ret;
   1248 }
   1249 
   1250 // ------------------------------ InterleaveWholeUpper
   1251 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1252 HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   1253  const Half<decltype(d)> dh;
   1254  VFromD<D> ret;
   1255  ret.v0 = InterleaveLower(a.v1, b.v1);
   1256  ret.v1 = InterleaveUpper(dh, a.v1, b.v1);
   1257  return ret;
   1258 }
   1259 
   1260 // ------------------------------ ZipLower/ZipUpper defined in wasm_128
   1261 
   1262 // ================================================== COMBINE
   1263 
   1264 // ------------------------------ Combine (InterleaveLower)
   1265 template <class D, typename T = TFromD<D>>
   1266 HWY_API Vec256<T> Combine(D /* d */, Vec128<T> hi, Vec128<T> lo) {
   1267  Vec256<T> ret;
   1268  ret.v1 = hi;
   1269  ret.v0 = lo;
   1270  return ret;
   1271 }
   1272 
   1273 // ------------------------------ ZeroExtendVector (Combine)
// Widens lo to 256 bits; the upper block is zero.
template <class D, typename T = TFromD<D>>
HWY_API Vec256<T> ZeroExtendVector(D d, Vec128<T> lo) {
 const Half<decltype(d)> dh;
 return Combine(d, Zero(dh), lo);
}
   1279 
   1280 // ------------------------------ ZeroExtendResizeBitCast
   1281 
   1282 namespace detail {
   1283 
   1284 template <size_t kFromVectSize, class DTo, class DFrom,
   1285          HWY_IF_LANES_LE(kFromVectSize, 8)>
   1286 HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
   1287    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
   1288    hwy::SizeTag<32> /* to_size_tag */, DTo d_to, DFrom d_from,
   1289    VFromD<DFrom> v) {
   1290  const Half<decltype(d_to)> dh_to;
   1291  return ZeroExtendVector(d_to, ZeroExtendResizeBitCast(dh_to, d_from, v));
   1292 }
   1293 
   1294 }  // namespace detail
   1295 
   1296 // ------------------------------ ConcatLowerLower
   1297 template <class D, typename T = TFromD<D>>
   1298 HWY_API Vec256<T> ConcatLowerLower(D /* tag */, Vec256<T> hi, Vec256<T> lo) {
   1299  Vec256<T> ret;
   1300  ret.v1 = hi.v0;
   1301  ret.v0 = lo.v0;
   1302  return ret;
   1303 }
   1304 
   1305 // ------------------------------ ConcatUpperUpper
   1306 template <class D, typename T = TFromD<D>>
   1307 HWY_API Vec256<T> ConcatUpperUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) {
   1308  Vec256<T> ret;
   1309  ret.v1 = hi.v1;
   1310  ret.v0 = lo.v1;
   1311  return ret;
   1312 }
   1313 
   1314 // ------------------------------ ConcatLowerUpper
   1315 template <class D, typename T = TFromD<D>>
   1316 HWY_API Vec256<T> ConcatLowerUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) {
   1317  Vec256<T> ret;
   1318  ret.v1 = hi.v0;
   1319  ret.v0 = lo.v1;
   1320  return ret;
   1321 }
   1322 
   1323 // ------------------------------ ConcatUpperLower
   1324 template <class D, typename T = TFromD<D>>
   1325 HWY_API Vec256<T> ConcatUpperLower(D /* tag */, Vec256<T> hi, Vec256<T> lo) {
   1326  Vec256<T> ret;
   1327  ret.v1 = hi.v1;
   1328  ret.v0 = lo.v0;
   1329  return ret;
   1330 }
   1331 
   1332 // ------------------------------ ConcatOdd
   1333 template <class D, typename T = TFromD<D>>
   1334 HWY_API Vec256<T> ConcatOdd(D d, Vec256<T> hi, Vec256<T> lo) {
   1335  const Half<decltype(d)> dh;
   1336  Vec256<T> ret;
   1337  ret.v0 = ConcatOdd(dh, lo.v1, lo.v0);
   1338  ret.v1 = ConcatOdd(dh, hi.v1, hi.v0);
   1339  return ret;
   1340 }
   1341 
   1342 // ------------------------------ ConcatEven
   1343 template <class D, typename T = TFromD<D>>
   1344 HWY_API Vec256<T> ConcatEven(D d, Vec256<T> hi, Vec256<T> lo) {
   1345  const Half<decltype(d)> dh;
   1346  Vec256<T> ret;
   1347  ret.v0 = ConcatEven(dh, lo.v1, lo.v0);
   1348  ret.v1 = ConcatEven(dh, hi.v1, hi.v0);
   1349  return ret;
   1350 }
   1351 
   1352 // ------------------------------ DupEven
   1353 template <typename T>
   1354 HWY_API Vec256<T> DupEven(Vec256<T> v) {
   1355  v.v0 = DupEven(v.v0);
   1356  v.v1 = DupEven(v.v1);
   1357  return v;
   1358 }
   1359 
   1360 // ------------------------------ DupOdd
   1361 template <typename T>
   1362 HWY_API Vec256<T> DupOdd(Vec256<T> v) {
   1363  v.v0 = DupOdd(v.v0);
   1364  v.v1 = DupOdd(v.v1);
   1365  return v;
   1366 }
   1367 
   1368 // ------------------------------ OddEven
   1369 template <typename T>
   1370 HWY_API Vec256<T> OddEven(Vec256<T> a, const Vec256<T> b) {
   1371  a.v0 = OddEven(a.v0, b.v0);
   1372  a.v1 = OddEven(a.v1, b.v1);
   1373  return a;
   1374 }
   1375 
   1376 // ------------------------------ InterleaveEven
   1377 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1378 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
   1379  const Half<decltype(d)> dh;
   1380  a.v0 = InterleaveEven(dh, a.v0, b.v0);
   1381  a.v1 = InterleaveEven(dh, a.v1, b.v1);
   1382  return a;
   1383 }
   1384 
   1385 // ------------------------------ InterleaveOdd
   1386 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   1387 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
   1388  const Half<decltype(d)> dh;
   1389  a.v0 = InterleaveOdd(dh, a.v0, b.v0);
   1390  a.v1 = InterleaveOdd(dh, a.v1, b.v1);
   1391  return a;
   1392 }
   1393 
   1394 // ------------------------------ OddEvenBlocks
   1395 template <typename T>
   1396 HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
   1397  odd.v0 = even.v0;
   1398  return odd;
   1399 }
   1400 
   1401 // ------------------------------ SwapAdjacentBlocks
   1402 template <typename T>
   1403 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
   1404  Vec256<T> ret;
   1405  ret.v0 = v.v1;  // swapped order
   1406  ret.v1 = v.v0;
   1407  return ret;
   1408 }
   1409 
   1410 // ------------------------------ InterleaveEvenBlocks
   1411 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
   1412 HWY_API V InterleaveEvenBlocks(D, V a, V b) {
   1413  V ret;
   1414  ret.v0 = a.v0;
   1415  ret.v1 = b.v0;
   1416  return ret;
   1417 }
   1418 // ------------------------------ InterleaveOddBlocks
   1419 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
   1420 HWY_API V InterleaveOddBlocks(D, V a, V b) {
   1421  V ret;
   1422  ret.v0 = a.v1;
   1423  ret.v1 = b.v1;
   1424  return ret;
   1425 }
   1426 
   1427 // ------------------------------ ReverseBlocks
template <class D, typename T = TFromD<D>>
HWY_API Vec256<T> ReverseBlocks(D /* tag */, const Vec256<T> v) {
 return SwapAdjacentBlocks(v);  // 2 blocks, so Swap = Reverse
}
   1432 
   1433 // ------------------------------ Per4LaneBlockShuffle
   1434 namespace detail {
   1435 
// Shuffles each aligned group of 4 byte lanes according to kIdx3210, which
// packs four 2-bit lane indices (idx0 in the low bits). Each 128-bit block
// is handled independently; the +4/+8/+12 offsets repeat the pattern for
// the four groups within a block.
template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                 hwy::SizeTag<1> /*lane_size_tag*/,
                                 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
 const DFromV<decltype(v)> d;
 const Half<decltype(d)> dh;
 using VH = VFromD<decltype(dh)>;

 // Unpack the four 2-bit lane indices.
 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);

 V ret;
 ret.v0 = VH{wasm_i8x16_shuffle(
     v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4,
     kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
     kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
 ret.v1 = VH{wasm_i8x16_shuffle(
     v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4,
     kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
     kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
 return ret;
}
   1460 
// 16-bit lanes: shuffles each aligned group of 4 lanes per kIdx3210; two
// groups per 128-bit block (+4 offsets), blocks handled independently.
template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                 hwy::SizeTag<2> /*lane_size_tag*/,
                                 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
 const DFromV<decltype(v)> d;
 const Half<decltype(d)> dh;
 using VH = VFromD<decltype(dh)>;

 // Unpack the four 2-bit lane indices.
 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);

 V ret;
 ret.v0 = VH{wasm_i16x8_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3,
                                kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
 ret.v1 = VH{wasm_i16x8_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3,
                                kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
 return ret;
}
   1481 
// 32-bit lanes: one group of 4 per 128-bit block; shuffle each block.
template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                 hwy::SizeTag<4> /*lane_size_tag*/,
                                 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
 const DFromV<decltype(v)> d;
 const Half<decltype(d)> dh;
 using VH = VFromD<decltype(dh)>;

 // Unpack the four 2-bit lane indices.
 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);

 V ret;
 ret.v0 =
     VH{wasm_i32x4_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
 ret.v1 =
     VH{wasm_i32x4_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
 return ret;
}
   1502 
// 64-bit lanes: the single group of 4 spans both blocks, so both input
// blocks feed each half of the result (cross-block i64x2 shuffle).
template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                 hwy::SizeTag<8> /*lane_size_tag*/,
                                 hwy::SizeTag<32> /*vect_size_tag*/, V v) {
 const DFromV<decltype(v)> d;
 const Half<decltype(d)> dh;
 using VH = VFromD<decltype(dh)>;

 // Unpack the four 2-bit lane indices.
 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);

 V ret;
 ret.v0 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx0, kIdx1)};
 ret.v1 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx2, kIdx3)};
 return ret;
}
   1521 
   1522 }  // namespace detail
   1523 
   1524 // ------------------------------ SlideUpBlocks
// Shifts v up by kBlocks 128-bit blocks, filling the bottom with zero.
template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
 static_assert(0 <= kBlocks && kBlocks <= 1,
               "kBlocks must be between 0 and 1");
 // kBlocks == 1: result = {0, v.lower}; kBlocks == 0: unchanged.
 return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v;
}
   1531 
   1532 // ------------------------------ SlideDownBlocks
// Shifts v down by kBlocks 128-bit blocks, filling the top with zero.
template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
 static_assert(0 <= kBlocks && kBlocks <= 1,
               "kBlocks must be between 0 and 1");
 const Half<decltype(d)> dh;
 // kBlocks == 1: result = {v.upper, 0}; kBlocks == 0: unchanged.
 return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v;
}
   1540 
   1541 // ------------------------------ SlideUpLanes
   1542 
// Shifts v up by `amt` lanes (runtime value), shifting in zeros at lane 0.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
 const Half<decltype(d)> dh;
 const RebindToUnsigned<decltype(d)> du;
 const RebindToUnsigned<decltype(dh)> dh_u;
 const auto vu = BitCast(du, v);
 VFromD<D> ret;

#if !HWY_IS_DEBUG_BUILD
 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
 // Fast path: if the compiler can prove amt is a constant smaller than one
 // block, emit fixed byte shifts. Case label = byte shift amount; the lower
 // block shifts in zeros, the upper block pulls bytes up from the lower.
 if (__builtin_constant_p(amt) && amt < kLanesPerBlock) {
   switch (amt * sizeof(TFromD<D>)) {
     case 0:
       return v;
     case 1:
       ret.v0 = BitCast(dh, ShiftLeftBytes<1>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0));
       return ret;
     case 2:
       ret.v0 = BitCast(dh, ShiftLeftBytes<2>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0));
       return ret;
     case 3:
       ret.v0 = BitCast(dh, ShiftLeftBytes<3>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0));
       return ret;
     case 4:
       ret.v0 = BitCast(dh, ShiftLeftBytes<4>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0));
       return ret;
     case 5:
       ret.v0 = BitCast(dh, ShiftLeftBytes<5>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0));
       return ret;
     case 6:
       ret.v0 = BitCast(dh, ShiftLeftBytes<6>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0));
       return ret;
     case 7:
       ret.v0 = BitCast(dh, ShiftLeftBytes<7>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0));
       return ret;
     case 8:
       ret.v0 = BitCast(dh, ShiftLeftBytes<8>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0));
       return ret;
     case 9:
       ret.v0 = BitCast(dh, ShiftLeftBytes<9>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0));
       return ret;
     case 10:
       ret.v0 = BitCast(dh, ShiftLeftBytes<10>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0));
       return ret;
     case 11:
       ret.v0 = BitCast(dh, ShiftLeftBytes<11>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0));
       return ret;
     case 12:
       ret.v0 = BitCast(dh, ShiftLeftBytes<12>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0));
       return ret;
     case 13:
       ret.v0 = BitCast(dh, ShiftLeftBytes<13>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0));
       return ret;
     case 14:
       ret.v0 = BitCast(dh, ShiftLeftBytes<14>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0));
       return ret;
     case 15:
       ret.v0 = BitCast(dh, ShiftLeftBytes<15>(dh_u, vu.v0));
       ret.v1 = BitCast(dh, CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0));
       return ret;
   }
 }

 // amt provably >= one block: lower block is all zero and the upper block
 // is the lower input block slid by the remainder.
 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
   ret.v0 = Zero(dh);
   ret.v1 = SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock);
   return ret;
 }
#endif

 // General path: byte-index swizzle. lo_byte_idx[b] = b - amt*sizeof(T);
 // negative indices (high bit set) produce zero via TableLookupBytesOr0.
 const Repartition<uint8_t, decltype(d)> du8;
 const RebindToSigned<decltype(du8)> di8;
 const Half<decltype(di8)> dh_i8;

 const auto lo_byte_idx = BitCast(
     di8,
     Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromD<D>))));

 // Upper-block bytes whose source index is > 15 must come from vu.v1
 // (index rebased by -16); indices in [0, 15] come from vu.v0 below.
 const auto hi_byte_idx =
     UpperHalf(dh_i8, lo_byte_idx) - Set(dh_i8, int8_t{16});
 const auto hi_sel_mask =
     UpperHalf(dh_i8, lo_byte_idx) > Set(dh_i8, int8_t{15});

 ret = BitCast(d,
               TableLookupBytesOr0(ConcatLowerLower(du, vu, vu), lo_byte_idx));
 ret.v1 =
     BitCast(dh, IfThenElse(hi_sel_mask,
                            TableLookupBytes(UpperHalf(dh_u, vu), hi_byte_idx),
                            BitCast(dh_i8, ret.v1)));
 return ret;
}
   1648 
   1649 // ------------------------------ Slide1Up
// Shifts v up by exactly one lane; lane 0 becomes zero.
template <typename D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
 VFromD<D> ret;
 const Half<decltype(d)> dh;
 // Byte shift that moves the top lane of v.v0 into the bottom of v.v1.
 constexpr int kShrByteAmt = static_cast<int>(16 - sizeof(TFromD<D>));
 ret.v0 = ShiftLeftLanes<1>(dh, v.v0);
 ret.v1 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0);
 return ret;
}
   1659 
   1660 // ------------------------------ SlideDownLanes
   1661 
// Slides the lanes of v down (towards lane 0) by `amt` lanes, shifting in
// zeroes at the top. v0 is the lower 128-bit half, v1 the upper half.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  const Half<decltype(d)> dh;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToUnsigned<decltype(dh)> dh_u;
  VFromD<D> ret;

  const auto vu = BitCast(du, v);

#if !HWY_IS_DEBUG_BUILD
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  // Fast path: when `amt` is a compile-time constant smaller than one
  // 128-bit block's worth of lanes, use fixed-amount byte shifts instead of
  // the general table-lookup path below. Guarded out of debug builds so the
  // slow path also gets test coverage.
  if (__builtin_constant_p(amt) && amt < kLanesPerBlock) {
    switch (amt * sizeof(TFromD<D>)) {
      case 0:
        return v;
      case 1:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<1>(dh_u, vu.v1));
        return ret;
      case 2:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<2>(dh_u, vu.v1));
        return ret;
      case 3:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<3>(dh_u, vu.v1));
        return ret;
      case 4:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<4>(dh_u, vu.v1));
        return ret;
      case 5:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<5>(dh_u, vu.v1));
        return ret;
      case 6:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<6>(dh_u, vu.v1));
        return ret;
      case 7:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<7>(dh_u, vu.v1));
        return ret;
      case 8:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<8>(dh_u, vu.v1));
        return ret;
      case 9:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<9>(dh_u, vu.v1));
        return ret;
      case 10:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<10>(dh_u, vu.v1));
        return ret;
      case 11:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<11>(dh_u, vu.v1));
        return ret;
      case 12:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<12>(dh_u, vu.v1));
        return ret;
      case 13:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<13>(dh_u, vu.v1));
        return ret;
      case 14:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<14>(dh_u, vu.v1));
        return ret;
      case 15:
        ret.v0 = BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0));
        ret.v1 = BitCast(dh, ShiftRightBytes<15>(dh_u, vu.v1));
        return ret;
    }
  }

  // Constant shift of at least one full block: the upper half of the result
  // is zero and the lower half is v's (recursively slid) upper half.
  if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
    ret.v0 = SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock);
    ret.v1 = Zero(dh);
    return ret;
  }
#endif

  // General path for a runtime `amt`: per-byte table lookups.
  const Repartition<uint8_t, decltype(d)> du8;
  const Half<decltype(du8)> dh_u8;

  // For each result byte, the index of its source byte within the 256-bit v.
  const auto lo_byte_idx =
      Iota(du8, static_cast<uint8_t>(amt * sizeof(TFromD<D>)));
  const auto u8_16 = Set(du8, uint8_t{16});
  // Same indices re-based onto the upper 128-bit block.
  const auto hi_byte_idx = lo_byte_idx - u8_16;

  // Result bytes of the lower half whose source lies in v's lower block.
  const auto lo_sel_mask =
      LowerHalf(dh_u8, lo_byte_idx) < LowerHalf(dh_u8, u8_16);
  // Gather bytes sourced from v's upper block; indices >= 16 (past the end
  // of v) yield zero via IfThenElseZero.
  ret = BitCast(d, IfThenElseZero(hi_byte_idx < u8_16,
                                  TableLookupBytes(ConcatUpperUpper(du, vu, vu),
                                                   hi_byte_idx)));
  // Blend in the lower-half result bytes sourced from v's lower block.
  ret.v0 =
      BitCast(dh, IfThenElse(lo_sel_mask,
                             TableLookupBytes(LowerHalf(dh_u, vu),
                                              LowerHalf(dh_u8, lo_byte_idx)),
                             BitCast(dh_u8, LowerHalf(dh, ret))));
  return ret;
}
   1767 
   1768 // ------------------------------ Slide1Down
   1769 template <typename D, HWY_IF_V_SIZE_D(D, 32)>
   1770 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
   1771  VFromD<D> ret;
   1772  const Half<decltype(d)> dh;
   1773  constexpr int kShrByteAmt = static_cast<int>(sizeof(TFromD<D>));
   1774  ret.v0 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0);
   1775  ret.v1 = ShiftRightBytes<kShrByteAmt>(dh, v.v1);
   1776  return ret;
   1777 }
   1778 
   1779 // ================================================== CONVERT
   1780 
   1781 // ------------------------------ PromoteTo
   1782 
// 2x promotion: widens each lane to twice its size. The lower and upper
// halves of v become the two 128-bit halves of the result.
template <class D, HWY_IF_V_SIZE_D(D, 32), typename TN,
          HWY_IF_T_SIZE_D(D, sizeof(TN) * 2)>
HWY_API VFromD<D> PromoteTo(D d, Vec128<TN> v) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  // PromoteLowerTo is defined later in generic_ops-inl.h.
  ret.v0 = PromoteTo(dh, LowerHalf(v));
  ret.v1 = PromoteUpperTo(dh, v);
  return ret;
}

// 4x promotion: 8-bit to 32-bit or 16-bit to 64-bit
template <class DW, HWY_IF_V_SIZE_D(DW, 32),
          HWY_IF_T_SIZE_ONE_OF_D(DW, (1 << 4) | (1 << 8)),
          HWY_IF_NOT_FLOAT_D(DW), typename TN,
          HWY_IF_T_SIZE_D(DW, sizeof(TN) * 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TN)>
HWY_API Vec256<TFromD<DW>> PromoteTo(DW d, Vec64<TN> v) {
  const Half<decltype(d)> dh;
  // 16-bit lanes for UI8->UI32, 32-bit lanes for UI16->UI64
  const Rebind<MakeWide<TN>, decltype(d)> d2;
  // First widen 2x, then widen each half of the intermediate once more.
  const auto v_2x = PromoteTo(d2, v);
  Vec256<TFromD<DW>> ret;
  // PromoteLowerTo is defined later in generic_ops-inl.h.
  ret.v0 = PromoteTo(dh, LowerHalf(v_2x));
  ret.v1 = PromoteUpperTo(dh, v_2x);
  return ret;
}

// 8x promotion: 8-bit to 64-bit
template <class DW, HWY_IF_V_SIZE_D(DW, 32), HWY_IF_T_SIZE_D(DW, 8),
          HWY_IF_NOT_FLOAT_D(DW), typename TN, HWY_IF_T_SIZE(TN, 1)>
HWY_API Vec256<TFromD<DW>> PromoteTo(DW d, Vec32<TN> v) {
  const Half<decltype(d)> dh;
  const Repartition<MakeWide<MakeWide<TN>>, decltype(dh)> d4;  // 32-bit lanes
  // Widen 4x to 32-bit first, then 2x more to 64-bit per half.
  const auto v32 = PromoteTo(d4, v);
  Vec256<TFromD<DW>> ret;
  // PromoteLowerTo is defined later in generic_ops-inl.h.
  ret.v0 = PromoteTo(dh, LowerHalf(v32));
  ret.v1 = PromoteUpperTo(dh, v32);
  return ret;
}
   1824 
   1825 // ------------------------------ PromoteUpperTo
   1826 
   1827 // Not native, but still define this here because wasm_128 toggles
   1828 // HWY_NATIVE_PROMOTE_UPPER_TO.
// Promotes the upper half of v to the wider lane type of D.
template <class D, class T>
HWY_API VFromD<D> PromoteUpperTo(D d, Vec256<T> v) {
  // Lanes(d) may differ from Lanes(DFromV<decltype(v)>()). Use the lane type
  // from v because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<T, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}
   1836 
   1837 // ------------------------------ DemoteTo
   1838 
// i32x8 -> u16x8 with unsigned saturation.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) {
  return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.v0.raw, v.v1.raw)};
}

// i32x8 -> i16x8 with signed saturation.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) {
  return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw)};
}

// i32x8 -> u8x8 via an i16 intermediate; both narrowings saturate.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) {
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw);
  // Passing `intermediate` twice duplicates the bytes; only the lower 8 of
  // the Vec64 result are meaningful.
  return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}

// i16x16 -> u8x16 with unsigned saturation.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) {
  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.v0.raw, v.v1.raw)};
}

// i32x8 -> i8x8 via an i16 intermediate; both narrowings saturate.
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) {
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw);
  return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
}

// i16x16 -> i8x16 with signed saturation.
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) {
  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.v0.raw, v.v1.raw)};
}

// f64x4 -> i32x4, truncating with saturation.
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> DemoteTo(D di, Vec256<double> v) {
  // Each trunc_sat produces two valid lanes (the "_zero" suffix zeroes the
  // upper two); Combine packs them into one 128-bit vector.
  const Vec64<int32_t> lo{wasm_i32x4_trunc_sat_f64x2_zero(v.v0.raw)};
  const Vec64<int32_t> hi{wasm_i32x4_trunc_sat_f64x2_zero(v.v1.raw)};
  return Combine(di, hi, lo);
}

// f64x4 -> u32x4, truncating with saturation.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> DemoteTo(D di, Vec256<double> v) {
  const Vec64<uint32_t> lo{wasm_u32x4_trunc_sat_f64x2_zero(v.v0.raw)};
  const Vec64<uint32_t> hi{wasm_u32x4_trunc_sat_f64x2_zero(v.v1.raw)};
  return Combine(di, hi, lo);
}

// i64x4 -> f32x4 via the 128-bit DemoteTo per half.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> DemoteTo(D df, Vec256<int64_t> v) {
  const Vec64<float> lo = DemoteTo(Full64<float>(), v.v0);
  const Vec64<float> hi = DemoteTo(Full64<float>(), v.v1);
  return Combine(df, hi, lo);
}

// u64x4 -> f32x4 via the 128-bit DemoteTo per half.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> DemoteTo(D df, Vec256<uint64_t> v) {
  const Vec64<float> lo = DemoteTo(Full64<float>(), v.v0);
  const Vec64<float> hi = DemoteTo(Full64<float>(), v.v1);
  return Combine(df, hi, lo);
}

// f32x8 -> f16x8 via the 128-bit DemoteTo per half.
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> DemoteTo(D d16, Vec256<float> v) {
  const Half<decltype(d16)> d16h;
  const Vec64<float16_t> lo = DemoteTo(d16h, v.v0);
  const Vec64<float16_t> hi = DemoteTo(d16h, v.v1);
  return Combine(d16, hi, lo);
}

// f64x4 -> f32x4 via the 128-bit DemoteTo per half.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> DemoteTo(D df32, Vec256<double> v) {
  const Half<decltype(df32)> df32h;
  const Vec64<float> lo = DemoteTo(df32h, v.v0);
  const Vec64<float> hi = DemoteTo(df32h, v.v1);
  return Combine(df32, hi, lo);
}

// For already range-limited input [0, 255].
HWY_API Vec64<uint8_t> U8FromU32(Vec256<uint32_t> v) {
  const Full64<uint8_t> du8;
  const Full256<int32_t> di32;  // no unsigned DemoteTo
  return DemoteTo(du8, BitCast(di32, v));
}
   1921 
   1922 // ------------------------------ Truncations
   1923 
// u64x4 -> u8x4: keeps the low byte of each 64-bit lane. The shuffle pattern
// repeats; only the low 4 bytes of the result are meaningful.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  return Vec32<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24, 0,
                                           8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
                                           24)};
}

// u64x4 -> u16x4: keeps the low 2 bytes of each 64-bit lane (pattern
// repeated; only the low 8 bytes are meaningful).
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  return Vec64<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9, 16,
                                            17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
                                            25)};
}

// u64x4 -> u32x4: keeps the low 4 bytes of each 64-bit lane.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  return Vec128<uint32_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3, 8,
                                             9, 10, 11, 16, 17, 18, 19, 24, 25,
                                             26, 27)};
}

// u32x8 -> u8x8: keeps the low byte of each 32-bit lane (pattern repeated;
// only the low 8 bytes are meaningful).
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  return Vec64<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12, 16,
                                           20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
                                           28)};
}

// u32x8 -> u16x8: keeps the low 2 bytes of each 32-bit lane.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  return Vec128<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5, 8,
                                             9, 12, 13, 16, 17, 20, 21, 24, 25,
                                             28, 29)};
}

// u16x16 -> u8x16: keeps the low byte of each 16-bit lane.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> TruncateTo(D /* tag */, Vec256<uint16_t> v) {
  return Vec128<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6, 8,
                                            10, 12, 14, 16, 18, 20, 22, 24, 26,
                                            28, 30)};
}
   1965 
   1966 // ------------------------------ ReorderDemote2To
   1967 template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32),
   1968          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), HWY_IF_SIGNED_V(V),
   1969          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
   1970          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
   1971 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   1972  const Half<decltype(dn)> dnh;
   1973  VFromD<DN> demoted;
   1974  demoted.v0 = DemoteTo(dnh, a);
   1975  demoted.v1 = DemoteTo(dnh, b);
   1976  return demoted;
   1977 }
   1978 
   1979 template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32), HWY_IF_UNSIGNED_D(DN),
   1980          HWY_IF_UNSIGNED_V(V),
   1981          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
   1982          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
   1983 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   1984  const Half<decltype(dn)> dnh;
   1985  VFromD<DN> demoted;
   1986  demoted.v0 = DemoteTo(dnh, a);
   1987  demoted.v1 = DemoteTo(dnh, b);
   1988  return demoted;
   1989 }
   1990 
   1991 // ------------------------------ Convert i32 <=> f32 (Round)
   1992 
   1993 template <class DTo, typename TFrom, typename TTo = TFromD<DTo>>
   1994 HWY_API Vec256<TTo> ConvertTo(DTo d, const Vec256<TFrom> v) {
   1995  const Half<decltype(d)> dh;
   1996  Vec256<TTo> ret;
   1997  ret.v0 = ConvertTo(dh, v.v0);
   1998  ret.v1 = ConvertTo(dh, v.v1);
   1999  return ret;
   2000 }
   2001 
   2002 template <typename T, HWY_IF_FLOAT3264(T)>
   2003 HWY_API Vec256<MakeSigned<T>> NearestInt(const Vec256<T> v) {
   2004  return ConvertTo(Full256<MakeSigned<T>>(), Round(v));
   2005 }
   2006 
   2007 // ================================================== MISC
   2008 
   2009 // ------------------------------ LoadMaskBits (TestBit)
   2010 
   2011 // `p` points to at least 8 readable bytes, not all of which need be valid.
// Loads a bit-packed mask for 4/8-byte lanes: both halves' bits fit in
// bits[0].
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = LoadMaskBits(dh, bits);
  // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
  // Both halves fit in one byte's worth of mask bits.
  constexpr size_t kBitsPerHalf = 16 / sizeof(TFromD<D>);
  // Aggregate init zero-fills the remaining 7 bytes of the temporary buffer.
  const uint8_t bits_upper[8] = {static_cast<uint8_t>(bits[0] >> kBitsPerHalf)};
  ret.m1 = LoadMaskBits(dh, bits_upper);
  return ret;
}

// Loads a bit-packed mask for 1/2-byte lanes: each half has at least 8 mask
// bits, i.e. at least one whole byte.
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = LoadMaskBits(dh, bits);
  constexpr size_t kLanesPerHalf = 16 / sizeof(TFromD<D>);
  constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
  static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
  ret.m1 = LoadMaskBits(dh, bits + kBytesPerHalf);
  return ret;
}

// Broadcasts the same mask bits into each 128-bit block.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = ret.m1 = Dup128MaskFromMaskBits(dh, mask_bits);
  return ret;
}
   2046 
   2047 // ------------------------------ Mask
   2048 
   2049 // `p` points to at least 8 writable bytes.
// Stores the mask as packed bits for 4/8-byte lanes; both halves' bits are
// merged into bits[0]. Returns the number of bytes written (1).
template <class D, typename T = TFromD<D>,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
  const Half<decltype(d)> dh;
  // Write m0's bits, stash them, then overwrite with m1's bits and merge.
  StoreMaskBits(dh, mask.m0, bits);
  const uint8_t lo = bits[0];
  StoreMaskBits(dh, mask.m1, bits);
  // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
  // Both halves fit in one byte's worth of mask bits.
  constexpr size_t kBitsPerHalf = 16 / sizeof(T);
  bits[0] = static_cast<uint8_t>(lo | (bits[0] << kBitsPerHalf));
  return (kBitsPerHalf * 2 + 7) / 8;
}

// 1/2-byte lanes: each half occupies whole bytes, so store them contiguously.
// Returns the number of bytes written.
template <class D, typename T = TFromD<D>,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
  const Half<decltype(d)> dh;
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
  static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
  StoreMaskBits(dh, mask.m0, bits);
  StoreMaskBits(dh, mask.m1, bits + kBytesPerHalf);
  return kBytesPerHalf * 2;
}
   2075 
   2076 template <class D, typename T = TFromD<D>>
   2077 HWY_API size_t CountTrue(D d, const Mask256<T> m) {
   2078  const Half<decltype(d)> dh;
   2079  return CountTrue(dh, m.m0) + CountTrue(dh, m.m1);
   2080 }
   2081 
   2082 template <class D, typename T = TFromD<D>>
   2083 HWY_API bool AllFalse(D d, const Mask256<T> m) {
   2084  const Half<decltype(d)> dh;
   2085  return AllFalse(dh, m.m0) && AllFalse(dh, m.m1);
   2086 }
   2087 
   2088 template <class D, typename T = TFromD<D>>
   2089 HWY_API bool AllTrue(D d, const Mask256<T> m) {
   2090  const Half<decltype(d)> dh;
   2091  return AllTrue(dh, m.m0) && AllTrue(dh, m.m1);
   2092 }
   2093 
// Index of the first set lane; caller guarantees the mask is non-empty.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownFirstTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t lo = FindFirstTrue(dh, mask.m0);  // not known
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  // If the lower half is empty, the hit must be in the upper half.
  return lo >= 0 ? static_cast<size_t>(lo)
                 : kLanesPerHalf + FindKnownFirstTrue(dh, mask.m1);
}

// Index of the first set lane, or -1 if the mask is empty.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindFirstTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t lo = FindFirstTrue(dh, mask.m0);
  constexpr int kLanesPerHalf = 16 / sizeof(T);
  if (lo >= 0) return lo;

  const intptr_t hi = FindFirstTrue(dh, mask.m1);
  // Offset upper-half hits by the half size; -1 (not found) passes through.
  return hi + (hi >= 0 ? kLanesPerHalf : 0);
}

// Index of the last set lane; caller guarantees the mask is non-empty.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownLastTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t hi = FindLastTrue(dh, mask.m1);  // not known
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  return hi >= 0 ? kLanesPerHalf + static_cast<size_t>(hi)
                 : FindKnownLastTrue(dh, mask.m0);
}

// Index of the last set lane, or -1 if the mask is empty.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindLastTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  constexpr int kLanesPerHalf = 16 / sizeof(T);
  const intptr_t hi = FindLastTrue(dh, mask.m1);
  return hi >= 0 ? kLanesPerHalf + hi : FindLastTrue(dh, mask.m0);
}
   2130 
   2131 // ------------------------------ CompressStore
   2132 template <class D, typename T = TFromD<D>>
   2133 HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, D d,
   2134                             T* HWY_RESTRICT unaligned) {
   2135  const Half<decltype(d)> dh;
   2136  const size_t count = CompressStore(v.v0, mask.m0, dh, unaligned);
   2137  const size_t count2 = CompressStore(v.v1, mask.m1, dh, unaligned + count);
   2138  return count + count2;
   2139 }
   2140 
   2141 // ------------------------------ CompressBlendedStore
   2142 template <class D, typename T = TFromD<D>>
   2143 HWY_API size_t CompressBlendedStore(Vec256<T> v, const Mask256<T> m, D d,
   2144                                    T* HWY_RESTRICT unaligned) {
   2145  const Half<decltype(d)> dh;
   2146  const size_t count = CompressBlendedStore(v.v0, m.m0, dh, unaligned);
   2147  const size_t count2 = CompressBlendedStore(v.v1, m.m1, dh, unaligned + count);
   2148  return count + count2;
   2149 }
   2150 
   2151 // ------------------------------ CompressBitsStore
   2152 
   2153 template <class D, typename T = TFromD<D>>
   2154 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
   2155                                 D d, T* HWY_RESTRICT unaligned) {
   2156  const Mask256<T> m = LoadMaskBits(d, bits);
   2157  return CompressStore(v, m, d, unaligned);
   2158 }
   2159 
   2160 // ------------------------------ Compress
// Gathers the lanes selected by mask into the low lanes of the result.
// Lanes past the selected count are zero because the buffer is
// zero-initialized.
template <typename T>
HWY_API Vec256<T> Compress(const Vec256<T> v, const Mask256<T> mask) {
  const DFromV<decltype(v)> d;
  alignas(32) T lanes[32 / sizeof(T)] = {};
  (void)CompressStore(v, mask, d, lanes);
  return Load(d, lanes);
}

// ------------------------------ CompressNot
// Compress with the mask inverted: keeps lanes where mask is false.
template <typename T>
HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
  return Compress(v, Not(mask));
}

// ------------------------------ CompressBlocksNot
// CompressNot on whole 128-bit blocks; mask lanes are uniform per block.
HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  const Full128<uint64_t> dh;
  // Because the non-selected (mask=1) blocks are undefined, we can return the
  // input unless mask = 01, in which case we must bring down the upper block.
  return AllTrue(dh, AndNot(mask.m1, mask.m0)) ? SwapAdjacentBlocks(v) : v;
}

// ------------------------------ CompressBits
// Compress with the mask supplied as packed bits.
template <typename T>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
  const Mask256<T> m = LoadMaskBits(DFromV<decltype(v)>(), bits);
  return Compress(v, m);
}
   2190 
   2191 // ------------------------------ Expand
// Scatters the low lanes of v into the positions selected by mask.
template <typename T>
HWY_API Vec256<T> Expand(const Vec256<T> v, const Mask256<T> mask) {
  Vec256<T> ret;
  const Full256<T> d;
  const Half<decltype(d)> dh;
  // Spill v to memory so the upper half can read its source lanes from a
  // runtime-variable offset.
  alignas(32) T lanes[32 / sizeof(T)] = {};
  Store(v, d, lanes);
  ret.v0 = Expand(v.v0, mask.m0);
  // The upper half consumes source lanes starting right after the
  // CountTrue(mask.m0) lanes consumed by the lower half.
  ret.v1 = Expand(LoadU(dh, lanes + CountTrue(dh, mask.m0)), mask.m1);
  return ret;
}
   2203 
   2204 // ------------------------------ LoadExpand
   2205 template <class D, HWY_IF_V_SIZE_D(D, 32)>
   2206 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
   2207                             const TFromD<D>* HWY_RESTRICT unaligned) {
   2208  return Expand(LoadU(d, unaligned), mask);
   2209 }
   2210 
   2211 // ------------------------------ LoadInterleaved3/4
   2212 
   2213 // Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
   2214 
   2215 namespace detail {
   2216 
   2217 // Input:
   2218 // 1 0 (<- first block of unaligned)
   2219 // 3 2
   2220 // 5 4
   2221 // Output:
   2222 // 3 0
   2223 // 4 1
   2224 // 5 2
template <class D, typename T = TFromD<D>>
HWY_API void LoadTransposedBlocks3(D d, const T* HWY_RESTRICT unaligned,
                                   Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
  // Load three consecutive vectors = six 128-bit blocks 0..5.
  const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d));
  const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d));
  const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d));

  // Recombine blocks (MSB|LSB): A = 3|0, B = 4|1, C = 5|2.
  A = ConcatUpperLower(d, v32, v10);
  B = ConcatLowerUpper(d, v54, v10);
  C = ConcatUpperLower(d, v54, v32);
}
   2236 
   2237 // Input (128-bit blocks):
   2238 // 1 0 (first block of unaligned)
   2239 // 3 2
   2240 // 5 4
   2241 // 7 6
   2242 // Output:
   2243 // 4 0 (LSB of A)
   2244 // 5 1
   2245 // 6 2
   2246 // 7 3
template <class D, typename T = TFromD<D>>
HWY_API void LoadTransposedBlocks4(D d, const T* HWY_RESTRICT unaligned,
                                   Vec256<T>& vA, Vec256<T>& vB, Vec256<T>& vC,
                                   Vec256<T>& vD) {
  // Load four consecutive vectors = eight 128-bit blocks 0..7.
  const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d));
  const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d));
  const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d));
  const Vec256<T> v76 = LoadU(d, unaligned + 3 * MaxLanes(d));

  // Recombine blocks (MSB|LSB): vA = 4|0, vB = 5|1, vC = 6|2, vD = 7|3.
  vA = ConcatLowerLower(d, v54, v10);
  vB = ConcatUpperUpper(d, v54, v10);
  vC = ConcatLowerLower(d, v76, v32);
  vD = ConcatUpperUpper(d, v76, v32);
}
   2261 
   2262 }  // namespace detail
   2263 
   2264 // ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
   2265 
   2266 // Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
   2267 
   2268 namespace detail {
   2269 
   2270 // Input (128-bit blocks):
   2271 // 2 0 (LSB of i)
   2272 // 3 1
   2273 // Output:
   2274 // 1 0
   2275 // 3 2
   2276 template <class D, typename T = TFromD<D>>
   2277 HWY_API void StoreTransposedBlocks2(Vec256<T> i, Vec256<T> j, D d,
   2278                                    T* HWY_RESTRICT unaligned) {
   2279  const Vec256<T> out0 = ConcatLowerLower(d, j, i);
   2280  const Vec256<T> out1 = ConcatUpperUpper(d, j, i);
   2281  StoreU(out0, d, unaligned + 0 * MaxLanes(d));
   2282  StoreU(out1, d, unaligned + 1 * MaxLanes(d));
   2283 }
   2284 
   2285 // Input (128-bit blocks):
   2286 // 3 0 (LSB of i)
   2287 // 4 1
   2288 // 5 2
   2289 // Output:
   2290 // 1 0
   2291 // 3 2
   2292 // 5 4
template <class D, typename T = TFromD<D>>
HWY_API void StoreTransposedBlocks3(Vec256<T> i, Vec256<T> j, Vec256<T> k, D d,
                                    T* HWY_RESTRICT unaligned) {
  // Inputs (MSB|LSB): i = 3|0, j = 4|1, k = 5|2. Emit 1|0, 3|2, 5|4.
  const Vec256<T> out0 = ConcatLowerLower(d, j, i);  // 1|0
  const Vec256<T> out1 = ConcatUpperLower(d, i, k);  // 3|2
  const Vec256<T> out2 = ConcatUpperUpper(d, k, j);  // 5|4
  StoreU(out0, d, unaligned + 0 * MaxLanes(d));
  StoreU(out1, d, unaligned + 1 * MaxLanes(d));
  StoreU(out2, d, unaligned + 2 * MaxLanes(d));
}
   2303 
   2304 // Input (128-bit blocks):
   2305 // 4 0 (LSB of i)
   2306 // 5 1
   2307 // 6 2
   2308 // 7 3
   2309 // Output:
   2310 // 1 0
   2311 // 3 2
   2312 // 5 4
   2313 // 7 6
template <class D, typename T = TFromD<D>>
HWY_API void StoreTransposedBlocks4(Vec256<T> i, Vec256<T> j, Vec256<T> k,
                                    Vec256<T> l, D d,
                                    T* HWY_RESTRICT unaligned) {
  // Write lower halves, then upper.
  const Vec256<T> out0 = ConcatLowerLower(d, j, i);  // 1|0
  const Vec256<T> out1 = ConcatLowerLower(d, l, k);  // 3|2
  StoreU(out0, d, unaligned + 0 * MaxLanes(d));
  StoreU(out1, d, unaligned + 1 * MaxLanes(d));
  const Vec256<T> out2 = ConcatUpperUpper(d, j, i);  // 5|4
  const Vec256<T> out3 = ConcatUpperUpper(d, l, k);  // 7|6
  StoreU(out2, d, unaligned + 2 * MaxLanes(d));
  StoreU(out3, d, unaligned + 3 * MaxLanes(d));
}
   2328 
   2329 }  // namespace detail
   2330 
   2331 // ------------------------------ Additional mask logical operations
   2332 
// Sets all lanes at or after the first set lane of `mask`.
template <class T>
HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
  const Full256<T> d;
  const Half<decltype(d)> dh;
  const Repartition<int64_t, decltype(dh)> dh_i64;

  Mask256<T> result;
  result.m0 = SetAtOrAfterFirst(mask.m0);
  result.m1 = SetAtOrAfterFirst(mask.m1);

  // Copy the sign bit of the lower 128-bit half to the upper 128-bit half
  // so that a hit in the lower half sets the entire upper half.
  const auto vmask_lo = BitCast(dh_i64, VecFromMask(dh, result.m0));
  result.m1 =
      Or(result.m1, MaskFromVec(BitCast(dh, BroadcastSignBit(InterleaveUpper(
                                                dh_i64, vmask_lo, vmask_lo)))));

  return result;
}

// Sets all lanes strictly before the first set lane: the complement of
// SetAtOrAfterFirst.
template <class T>
HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
  return Not(SetAtOrAfterFirst(mask));
}

// Keeps only the first set lane of `mask`; all other lanes are cleared.
template <class T>
HWY_API Mask256<T> SetOnlyFirst(Mask256<T> mask) {
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;
  const Repartition<int64_t, decltype(d)> di64;
  const Half<decltype(di64)> dh_i64;

  const auto zero = Zero(di64);
  const auto vmask = BitCast(di64, VecFromMask(d, mask));

  // Build vmask2: all-ones in each i64 lane up to and including the first
  // non-zero i64 lane of vmask, zero afterwards.
  const auto vmask_eq_0 = VecFromMask(di64, vmask == zero);
  auto vmask2_lo = LowerHalf(dh_i64, vmask_eq_0);
  auto vmask2_hi = UpperHalf(dh_i64, vmask_eq_0);

  vmask2_lo = And(vmask2_lo, InterleaveLower(vmask2_lo, vmask2_lo));
  vmask2_hi = And(ConcatLowerUpper(dh_i64, vmask2_hi, vmask2_lo),
                  InterleaveUpper(dh_i64, vmask2_lo, vmask2_lo));
  vmask2_lo = InterleaveLower(Set(dh_i64, int64_t{-1}), vmask2_lo);

  const auto vmask2 = Combine(di64, vmask2_hi, vmask2_lo);
  // x & -x isolates the lowest set bit per i64 lane; Neg then sign-extends
  // it upward within the lane, covering all lanes at or after the first hit.
  const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
  // AND with vmask2 removes lanes beyond the i64 lane holding the first bit.
  return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
}

// Sets all lanes at or before the first set lane of `mask`.
template <class T>
HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
  const Full256<T> d;
  constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;

  // Shift the mask up by one lane (zero shifted in), then take the
  // complement of "at or after first" of the shifted mask.
  const auto vmask = VecFromMask(d, mask);
  const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d));
  return SetBeforeFirst(
      MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>(
          d, vmask, vmask_lo)));
}
   2392 
   2393 // ------------------------------ WidenMulPairwiseAdd
// Widening multiply of adjacent lane pairs with addition, per 128-bit half.
template <class D32, typename T16, typename T32 = TFromD<D32>>
HWY_API Vec256<T32> WidenMulPairwiseAdd(D32 d32, Vec256<T16> a, Vec256<T16> b) {
  const Half<decltype(d32)> d32h;
  Vec256<T32> result;
  result.v0 = WidenMulPairwiseAdd(d32h, a.v0, b.v0);
  result.v1 = WidenMulPairwiseAdd(d32h, a.v1, b.v1);
  return result;
}

// ------------------------------ ReorderWidenMulAccumulate
// Accumulates widened products into sum0 (returned) and sum1 (updated in
// place via reference), per 128-bit half.
template <class D32, typename T16, typename T32 = TFromD<D32>>
HWY_API Vec256<T32> ReorderWidenMulAccumulate(D32 d32, Vec256<T16> a,
                                              Vec256<T16> b, Vec256<T32> sum0,
                                              Vec256<T32>& sum1) {
  const Half<decltype(d32)> d32h;
  sum0.v0 = ReorderWidenMulAccumulate(d32h, a.v0, b.v0, sum0.v0, sum1.v0);
  sum0.v1 = ReorderWidenMulAccumulate(d32h, a.v1, b.v1, sum0.v1, sum1.v1);
  return sum0;
}

// ------------------------------ RearrangeToOddPlusEven
// Combines the two accumulators from ReorderWidenMulAccumulate, per half.
template <typename TW>
HWY_API Vec256<TW> RearrangeToOddPlusEven(Vec256<TW> sum0, Vec256<TW> sum1) {
  sum0.v0 = RearrangeToOddPlusEven(sum0.v0, sum1.v0);
  sum0.v1 = RearrangeToOddPlusEven(sum0.v1, sum1.v1);
  return sum0;
}
   2421 
   2422 // ------------------------------ Reductions in generic_ops
   2423 
   2424 // ------------------------------ Lt128
   2425 
   2426 template <class D, typename T = TFromD<D>>
   2427 HWY_INLINE Mask256<T> Lt128(D d, Vec256<T> a, Vec256<T> b) {
   2428  const Half<decltype(d)> dh;
   2429  Mask256<T> ret;
   2430  ret.m0 = Lt128(dh, a.v0, b.v0);
   2431  ret.m1 = Lt128(dh, a.v1, b.v1);
   2432  return ret;
   2433 }
   2434 
   2435 template <class D, typename T = TFromD<D>>
   2436 HWY_INLINE Mask256<T> Lt128Upper(D d, Vec256<T> a, Vec256<T> b) {
   2437  const Half<decltype(d)> dh;
   2438  Mask256<T> ret;
   2439  ret.m0 = Lt128Upper(dh, a.v0, b.v0);
   2440  ret.m1 = Lt128Upper(dh, a.v1, b.v1);
   2441  return ret;
   2442 }
   2443 
   2444 template <class D, typename T = TFromD<D>>
   2445 HWY_INLINE Mask256<T> Eq128(D d, Vec256<T> a, Vec256<T> b) {
   2446  const Half<decltype(d)> dh;
   2447  Mask256<T> ret;
   2448  ret.m0 = Eq128(dh, a.v0, b.v0);
   2449  ret.m1 = Eq128(dh, a.v1, b.v1);
   2450  return ret;
   2451 }
   2452 
   2453 template <class D, typename T = TFromD<D>>
   2454 HWY_INLINE Mask256<T> Eq128Upper(D d, Vec256<T> a, Vec256<T> b) {
   2455  const Half<decltype(d)> dh;
   2456  Mask256<T> ret;
   2457  ret.m0 = Eq128Upper(dh, a.v0, b.v0);
   2458  ret.m1 = Eq128Upper(dh, a.v1, b.v1);
   2459  return ret;
   2460 }
   2461 
   2462 template <class D, typename T = TFromD<D>>
   2463 HWY_INLINE Mask256<T> Ne128(D d, Vec256<T> a, Vec256<T> b) {
   2464  const Half<decltype(d)> dh;
   2465  Mask256<T> ret;
   2466  ret.m0 = Ne128(dh, a.v0, b.v0);
   2467  ret.m1 = Ne128(dh, a.v1, b.v1);
   2468  return ret;
   2469 }
   2470 
   2471 template <class D, typename T = TFromD<D>>
   2472 HWY_INLINE Mask256<T> Ne128Upper(D d, Vec256<T> a, Vec256<T> b) {
   2473  const Half<decltype(d)> dh;
   2474  Mask256<T> ret;
   2475  ret.m0 = Ne128Upper(dh, a.v0, b.v0);
   2476  ret.m1 = Ne128Upper(dh, a.v1, b.v1);
   2477  return ret;
   2478 }
   2479 
   2480 template <class D, typename T = TFromD<D>>
   2481 HWY_INLINE Vec256<T> Min128(D d, Vec256<T> a, Vec256<T> b) {
   2482  const Half<decltype(d)> dh;
   2483  Vec256<T> ret;
   2484  ret.v0 = Min128(dh, a.v0, b.v0);
   2485  ret.v1 = Min128(dh, a.v1, b.v1);
   2486  return ret;
   2487 }
   2488 
   2489 template <class D, typename T = TFromD<D>>
   2490 HWY_INLINE Vec256<T> Max128(D d, Vec256<T> a, Vec256<T> b) {
   2491  const Half<decltype(d)> dh;
   2492  Vec256<T> ret;
   2493  ret.v0 = Max128(dh, a.v0, b.v0);
   2494  ret.v1 = Max128(dh, a.v1, b.v1);
   2495  return ret;
   2496 }
   2497 
   2498 template <class D, typename T = TFromD<D>>
   2499 HWY_INLINE Vec256<T> Min128Upper(D d, Vec256<T> a, Vec256<T> b) {
   2500  const Half<decltype(d)> dh;
   2501  Vec256<T> ret;
   2502  ret.v0 = Min128Upper(dh, a.v0, b.v0);
   2503  ret.v1 = Min128Upper(dh, a.v1, b.v1);
   2504  return ret;
   2505 }
   2506 
   2507 template <class D, typename T = TFromD<D>>
   2508 HWY_INLINE Vec256<T> Max128Upper(D d, Vec256<T> a, Vec256<T> b) {
   2509  const Half<decltype(d)> dh;
   2510  Vec256<T> ret;
   2511  ret.v0 = Max128Upper(dh, a.v0, b.v0);
   2512  ret.v1 = Max128Upper(dh, a.v1, b.v1);
   2513  return ret;
   2514 }
   2515 
   2516 // NOLINTNEXTLINE(google-readability-namespace-comments)
   2517 }  // namespace HWY_NAMESPACE
   2518 }  // namespace hwy
   2519 HWY_AFTER_NAMESPACE();