tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

generic_ops-inl.h (308815B)


      1 // Copyright 2021 Google LLC
      2 // Copyright 2023,2024 Arm Limited and/or
      3 // its affiliates <open-source-office@arm.com>
      4 // SPDX-License-Identifier: Apache-2.0
      5 // SPDX-License-Identifier: BSD-3-Clause
      6 //
      7 // Licensed under the Apache License, Version 2.0 (the "License");
      8 // you may not use this file except in compliance with the License.
      9 // You may obtain a copy of the License at
     10 //
     11 //      http://www.apache.org/licenses/LICENSE-2.0
     12 //
     13 // Unless required by applicable law or agreed to in writing, software
     14 // distributed under the License is distributed on an "AS IS" BASIS,
     15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     16 // See the License for the specific language governing permissions and
     17 // limitations under the License.
     18 
     19 // Target-independent types/functions defined after target-specific ops.
     20 
     21 // The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
     22 // the generic implementation here if native ops are already defined.
     23 
     24 #include "hwy/base.h"
     25 
     26 // Define detail::Shuffle1230 etc, but only when viewing the current header;
     27 // normally this is included via highway.h, which includes ops/*.h.
     28 #if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
     29 #include "hwy/detect_targets.h"
     30 #include "hwy/ops/emu128-inl.h"
     31 #endif  // HWY_IDE
     32 
     33 // Relies on the external include guard in highway.h.
     34 HWY_BEFORE_NAMESPACE();
     35 namespace hwy {
     36 namespace HWY_NAMESPACE {
     37 
// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
// Deduced from the return type of GetLane on a default-constructed V.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
// type of functions that do not take a vector argument, or as an argument type
// if the function only has a template argument for D, or for explicit type
// names instead of auto. This may be a built-in type.
// Deduced from the return type of Zero for the given descriptor D.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
// Deduced via MaskFromVec on the zero vector of D.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));
     54 
     55 // Returns the closest value to v within [lo, hi].
     56 template <class V>
     57 HWY_API V Clamp(const V v, const V lo, const V hi) {
     58  return Min(Max(lo, v), hi);
     59 }
     60 
     61 // CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
     62 // and RVV has its own implementation of -Lanes.
     63 #if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV) || HWY_IDE
     64 
     65 template <size_t kLanes, class D>
     66 HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
     67  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
     68  static_assert(kBytes < 16, "Shift count is per-block");
     69  return CombineShiftRightBytes<kBytes>(d, hi, lo);
     70 }
     71 
     72 #endif
     73 
     74 // Returns lanes with the most significant bit set and all other bits zero.
     75 template <class D>
     76 HWY_API Vec<D> SignBit(D d) {
     77  const RebindToUnsigned<decltype(d)> du;
     78  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
     79 }
     80 
     81 // Returns quiet NaN.
     82 template <class D>
     83 HWY_API Vec<D> NaN(D d) {
     84  const RebindToSigned<D> di;
     85  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
     86  // mantissa MSB (to indicate quiet) would be sufficient.
     87  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
     88 }
     89 
     90 // Returns positive infinity.
     91 template <class D>
     92 HWY_API Vec<D> Inf(D d) {
     93  const RebindToUnsigned<D> du;
     94  using T = TFromD<D>;
     95  using TU = TFromD<decltype(du)>;
     96  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
     97  return BitCast(d, Set(du, max_x2 >> 1));
     98 }
     99 
    100 // ------------------------------ MaskedSetOr/MaskedSet
    101 
    102 template <class V, typename T = TFromV<V>, typename D = DFromV<V>,
    103          typename M = MFromD<D>>
    104 HWY_API V MaskedSetOr(V no, M m, T a) {
    105  D d;
    106  return IfThenElse(m, Set(d, a), no);
    107 }
    108 
    109 template <class D, typename V = VFromD<D>, typename M = MFromD<D>,
    110          typename T = TFromD<D>>
    111 HWY_API V MaskedSet(D d, M m, T a) {
    112  return IfThenElseZero(m, Set(d, a));
    113 }
    114 
// ------------------------------ ZeroExtendResizeBitCast

// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
// target is in emu128-inl.h, and the implementation of
// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h
#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
namespace detail {

#if HWY_HAVE_SCALABLE
// Scalable targets: vector length is only known at runtime, so resize via a
// byte-wise cast and zero the bytes beyond the source vector's runtime length.
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  const Repartition<uint8_t, DTo> d_to_u8;
  const auto resized = ResizeBitCast(d_to_u8, v);
  // Zero the upper bytes which were not present/valid in d_from.
  const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
  return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
}
#else   // target that uses fixed-size vectors
// Truncating or same-size resizing cast: same as ResizeBitCast
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

// Resizing cast to vector that has twice the number of lanes of the source
// vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  // ZeroExtendVector fills the upper half with zeros, which is exactly the
  // required zero-extension here.
  const Twice<decltype(d_from)> dt_from;
  return BitCast(d_to, ZeroExtendVector(dt_from, v));
}

// Resizing cast to vector that has more than twice the number of lanes of the
// source vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  // Resize up to the destination size, then zero every lane at or past the
  // source vector's (compile-time) lane count.
  using TFrom = TFromD<DFrom>;
  constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom);
  const Repartition<TFrom, decltype(d_to)> d_resize_to;
  return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
                                      ResizeBitCast(d_resize_to, v)));
}
#endif  // HWY_HAVE_SCALABLE

}  // namespace detail
#endif  // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
    176 
    177 template <class DTo, class DFrom>
    178 HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
    179                                            VFromD<DFrom> v) {
    180  return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(),
    181                                         hwy::SizeTag<d_to.MaxBytes()>(), d_to,
    182                                         d_from, v);
    183 }
    184 
    185 // ------------------------------ SafeFillN
    186 
    187 template <class D, typename T = TFromD<D>>
    188 HWY_API void SafeFillN(const size_t num, const T value, D d,
    189                       T* HWY_RESTRICT to) {
    190 #if HWY_MEM_OPS_MIGHT_FAULT
    191  (void)d;
    192  for (size_t i = 0; i < num; ++i) {
    193    to[i] = value;
    194  }
    195 #else
    196  BlendedStore(Set(d, value), FirstN(d, num), d, to);
    197 #endif
    198 }
    199 
    200 // ------------------------------ SafeCopyN
    201 
    202 template <class D, typename T = TFromD<D>>
    203 HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
    204                       T* HWY_RESTRICT to) {
    205 #if HWY_MEM_OPS_MIGHT_FAULT
    206  (void)d;
    207  for (size_t i = 0; i < num; ++i) {
    208    to[i] = from[i];
    209  }
    210 #else
    211  const Mask<D> mask = FirstN(d, num);
    212  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
    213 #endif
    214 }
    215 
    216 // ------------------------------ IsNegative
    217 #if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
    218 #ifdef HWY_NATIVE_IS_NEGATIVE
    219 #undef HWY_NATIVE_IS_NEGATIVE
    220 #else
    221 #define HWY_NATIVE_IS_NEGATIVE
    222 #endif
    223 
    224 template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
    225 HWY_API Mask<DFromV<V>> IsNegative(V v) {
    226  const DFromV<decltype(v)> d;
    227  const RebindToSigned<decltype(d)> di;
    228  return RebindMask(d, MaskFromVec(BroadcastSignBit(BitCast(di, v))));
    229 }
    230 
    231 #endif  // HWY_NATIVE_IS_NEGATIVE
    232 
    233 // ------------------------------ MaskFalse
    234 #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
    235 #ifdef HWY_NATIVE_MASK_FALSE
    236 #undef HWY_NATIVE_MASK_FALSE
    237 #else
    238 #define HWY_NATIVE_MASK_FALSE
    239 #endif
    240 
    241 template <class D>
    242 HWY_API Mask<D> MaskFalse(D d) {
    243  return MaskFromVec(Zero(d));
    244 }
    245 
    246 #endif  // HWY_NATIVE_MASK_FALSE
    247 
    248 // ------------------------------ SetMask
    249 #if (defined(HWY_NATIVE_SET_MASK) == defined(HWY_TARGET_TOGGLE))
    250 #ifdef HWY_NATIVE_SET_MASK
    251 #undef HWY_NATIVE_SET_MASK
    252 #else
    253 #define HWY_NATIVE_SET_MASK
    254 #endif
    255 
    256 template <class D>
    257 HWY_API Mask<D> SetMask(D d, bool val) {
    258  const Repartition<int32_t, decltype(d)> di32;
    259  return MaskFromVec(ResizeBitCast(d, Set(di32, -static_cast<int32_t>(val))));
    260 }
    261 
    262 #endif  // HWY_NATIVE_SET_MASK
    263 
    264 // ------------------------------ IfNegativeThenElseZero
    265 #if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
    266 #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
    267 #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
    268 #else
    269 #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
    270 #endif
    271 
    272 template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
    273 HWY_API V IfNegativeThenElseZero(V v, V yes) {
    274  return IfThenElseZero(IsNegative(v), yes);
    275 }
    276 
    277 #endif  // HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
    278 
    279 // ------------------------------ IfNegativeThenZeroElse
    280 #if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
    281 #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
    282 #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
    283 #else
    284 #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
    285 #endif
    286 
    287 template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
    288 HWY_API V IfNegativeThenZeroElse(V v, V no) {
    289  return IfThenZeroElse(IsNegative(v), no);
    290 }
    291 
    292 #endif  // HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
    293 
    294 // ------------------------------ ZeroIfNegative (IfNegativeThenZeroElse)
    295 
    296 // ZeroIfNegative is generic for all vector lengths
    297 template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
    298 HWY_API V ZeroIfNegative(V v) {
    299  return IfNegativeThenZeroElse(v, v);
    300 }
    301 
    302 // ------------------------------ BitwiseIfThenElse
    303 #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
    304 #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
    305 #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
    306 #else
    307 #define HWY_NATIVE_BITWISE_IF_THEN_ELSE
    308 #endif
    309 
    310 template <class V>
    311 HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
    312  return Or(And(mask, yes), AndNot(mask, no));
    313 }
    314 
    315 #endif  // HWY_NATIVE_BITWISE_IF_THEN_ELSE
    316 
    317 // ------------------------------ PromoteMaskTo
    318 
    319 #if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
    320 #ifdef HWY_NATIVE_PROMOTE_MASK_TO
    321 #undef HWY_NATIVE_PROMOTE_MASK_TO
    322 #else
    323 #define HWY_NATIVE_PROMOTE_MASK_TO
    324 #endif
    325 
    326 template <class DTo, class DFrom>
    327 HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
    328  static_assert(
    329      sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
    330      "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
    331  static_assert(
    332      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
    333      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
    334 
    335  const RebindToSigned<decltype(d_to)> di_to;
    336  const RebindToSigned<decltype(d_from)> di_from;
    337 
    338  return MaskFromVec(BitCast(
    339      d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
    340 }
    341 
    342 #endif  // HWY_NATIVE_PROMOTE_MASK_TO
    343 
    344 // ------------------------------ DemoteMaskTo
    345 
    346 #if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
    347 #ifdef HWY_NATIVE_DEMOTE_MASK_TO
    348 #undef HWY_NATIVE_DEMOTE_MASK_TO
    349 #else
    350 #define HWY_NATIVE_DEMOTE_MASK_TO
    351 #endif
    352 
    353 template <class DTo, class DFrom>
    354 HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
    355  static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
    356                "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
    357  static_assert(
    358      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
    359      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");
    360 
    361  const RebindToSigned<decltype(d_to)> di_to;
    362  const RebindToSigned<decltype(d_from)> di_from;
    363 
    364  return MaskFromVec(
    365      BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
    366 }
    367 
    368 #endif  // HWY_NATIVE_DEMOTE_MASK_TO
    369 
    370 // ------------------------------ InsertIntoUpper
    371 #if (defined(HWY_NATIVE_LOAD_HIGHER) == defined(HWY_TARGET_TOGGLE))
    372 #ifdef HWY_NATIVE_LOAD_HIGHER
    373 #undef HWY_NATIVE_LOAD_HIGHER
    374 #else
    375 #define HWY_NATIVE_LOAD_HIGHER
    376 #endif
    377 template <class D, typename T, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1),
    378          HWY_IF_POW2_GT_D(D, -3)>
    379 HWY_API V InsertIntoUpper(D d, T* p, V a) {
    380  Half<D> dh;
    381  const VFromD<decltype(dh)> b = LoadU(dh, p);
    382  return Combine(d, b, LowerHalf(a));
    383 }
    384 #endif  // HWY_NATIVE_LOAD_HIGHER
    385 
    386 // ------------------------------ CombineMasks
    387 
    388 #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
    389 #ifdef HWY_NATIVE_COMBINE_MASKS
    390 #undef HWY_NATIVE_COMBINE_MASKS
    391 #else
    392 #define HWY_NATIVE_COMBINE_MASKS
    393 #endif
    394 
    395 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
    396 template <class D>
    397 HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
    398  const Half<decltype(d)> dh;
    399  return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
    400 }
    401 #endif
    402 
    403 #endif  // HWY_NATIVE_COMBINE_MASKS
    404 
    405 // ------------------------------ LowerHalfOfMask
    406 
    407 #if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
    408 #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
    409 #undef HWY_NATIVE_LOWER_HALF_OF_MASK
    410 #else
    411 #define HWY_NATIVE_LOWER_HALF_OF_MASK
    412 #endif
    413 
    414 template <class D>
    415 HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
    416  const Twice<decltype(d)> dt;
    417  return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
    418 }
    419 
    420 #endif  // HWY_NATIVE_LOWER_HALF_OF_MASK
    421 
    422 // ------------------------------ UpperHalfOfMask
    423 
    424 #if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
    425 #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
    426 #undef HWY_NATIVE_UPPER_HALF_OF_MASK
    427 #else
    428 #define HWY_NATIVE_UPPER_HALF_OF_MASK
    429 #endif
    430 
    431 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
    432 template <class D>
    433 HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
    434  const Twice<decltype(d)> dt;
    435  return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
    436 }
    437 #endif
    438 
    439 #endif  // HWY_NATIVE_UPPER_HALF_OF_MASK
    440 
    441 // ------------------------------ OrderedDemote2MasksTo
    442 
    443 #if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
    444     defined(HWY_TARGET_TOGGLE))
    445 #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
    446 #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
    447 #else
    448 #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
    449 #endif
    450 
    451 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
    452 template <class DTo, class DFrom>
    453 HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
    454                                        Mask<DFrom> b) {
    455  static_assert(
    456      sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
    457      "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
    458  static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
    459                "Mask<DTo> must be the same type as "
    460                "Mask<Repartition<TFromD<DTo>, DFrom>>>()");
    461 
    462  const RebindToSigned<decltype(d_from)> di_from;
    463  const RebindToSigned<decltype(d_to)> di_to;
    464 
    465  const auto va = BitCast(di_from, VecFromMask(d_from, a));
    466  const auto vb = BitCast(di_from, VecFromMask(d_from, b));
    467  return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
    468 }
    469 #endif
    470 
    471 #endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
    472 
// ------------------------------ RotateLeft

// Rotates each lane left by kBits, implemented as RotateRight by the
// complementary amount (lane width minus kBits).
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RotateLeft(V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");

  // kBits == 0 must map to 0, not kSizeInBits (which would be an invalid
  // rotate amount).
  constexpr int kRotateRightAmt =
      (kBits == 0) ? 0 : static_cast<int>(kSizeInBits) - kBits;
  return RotateRight<kRotateRightAmt>(v);
}
    483 
    484 // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
    485 #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
    486 #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
    487 #undef HWY_NATIVE_INTERLEAVE_WHOLE
    488 #else
    489 #define HWY_NATIVE_INTERLEAVE_WHOLE
    490 #endif
    491 
    492 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Interleaves the lower halves of whole vectors a and b. For vectors of at
// most one 16-byte block, this coincides with InterleaveLower.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveLower(d, a, b);
}
// Interleaves the upper halves of whole vectors a and b; same equivalence as
// above for single-block vectors.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveUpper(d, a, b);
}
    505 
    506 // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
    507 // is implemented in x86_256-inl.h.
    508 
    509 // InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
    510 // implemented in x86_512-inl.h.
    511 
    512 // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
    513 // is implemented in wasm_256-inl.h.
    514 #endif  // HWY_TARGET != HWY_SCALAR
    515 
    516 #endif  // HWY_NATIVE_INTERLEAVE_WHOLE
    517 
    518 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
    519 // The InterleaveWholeLower without the optional D parameter is generic for all
    520 // vector lengths.
    521 template <class V>
    522 HWY_API V InterleaveWholeLower(V a, V b) {
    523  return InterleaveWholeLower(DFromV<V>(), a, b);
    524 }
    525 #endif  // HWY_TARGET != HWY_SCALAR
    526 
    527 // ------------------------------ InterleaveEven
    528 
    529 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
    530 // InterleaveEven without the optional D parameter is generic for all vector
    531 // lengths
    532 template <class V>
    533 HWY_API V InterleaveEven(V a, V b) {
    534  return InterleaveEven(DFromV<V>(), a, b);
    535 }
    536 #endif
    537 
// ------------------------------ MinNumber/MaxNumber

#if (defined(HWY_NATIVE_FLOAT_MIN_MAX_NUMBER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#else
#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#endif

// Generic float fallback: defers to Min. NOTE(review): this assumes Min has
// the required NaN handling on targets that take this path (targets with
// different semantics define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER and provide
// their own implementation) — confirm per target.
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MinNumber(V a, V b) {
  return Min(a, b);
}

// Generic float fallback: defers to Max; same caveat as MinNumber above.
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MaxNumber(V a, V b) {
  return Max(a, b);
}

#endif

// Integer lanes have no NaN, so MinNumber/MaxNumber are plain Min/Max.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MinNumber(V a, V b) {
  return Min(a, b);
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MaxNumber(V a, V b) {
  return Max(a, b);
}
    568 
// ------------------------------ MinMagnitude/MaxMagnitude

#if (defined(HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
#undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
#else
#define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
#endif

// Returns the argument with the smaller absolute value; when the magnitudes
// are equal, returns Min(a, b) so the result is deterministic.
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MinMagnitude(V a, V b) {
  const V abs_a = Abs(a);
  const V abs_b = Abs(b);
  // Tie-break: if |a| == |b|, the inner select yields a, so min = Min(a, b).
  const V min = Min(IfThenElse(Eq(abs_a, abs_b), a, b), b);
  return IfThenElse(Lt(abs_a, abs_b), a, min);
}

// Returns the argument with the larger absolute value; when the magnitudes
// are equal, returns Max(a, b).
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MaxMagnitude(V a, V b) {
  const V abs_a = Abs(a);
  const V abs_b = Abs(b);
  // This lvalue appears to be necessary to avoid a clang bug on SVE.
  const V max = Max(IfThenElse(Eq(abs_a, abs_b), b, a), a);
  return IfThenElse(Lt(abs_a, abs_b), b, max);
}

#endif  // HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
    596 
    597 template <class V, HWY_IF_SIGNED_V(V)>
    598 HWY_API V MinMagnitude(V a, V b) {
    599  const DFromV<V> d;
    600  const RebindToUnsigned<decltype(d)> du;
    601  const auto abs_a = BitCast(du, Abs(a));
    602  const auto abs_b = BitCast(du, Abs(b));
    603  return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), a,
    604                    Min(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), a, b), b));
    605 }
    606 
    607 template <class V, HWY_IF_SIGNED_V(V)>
    608 HWY_API V MaxMagnitude(V a, V b) {
    609  const DFromV<V> d;
    610  const RebindToUnsigned<decltype(d)> du;
    611  const auto abs_a = BitCast(du, Abs(a));
    612  const auto abs_b = BitCast(du, Abs(b));
    613  return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), b,
    614                    Max(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), b, a), a));
    615 }
    616 
// For unsigned lanes, magnitude equals value, so these are plain Min/Max.
template <class V, HWY_IF_UNSIGNED_V(V)>
HWY_API V MinMagnitude(V a, V b) {
  return Min(a, b);
}

template <class V, HWY_IF_UNSIGNED_V(V)>
HWY_API V MaxMagnitude(V a, V b) {
  return Max(a, b);
}
    626 
// ------------------------------ AddSub

// One-lane case: only the "subtract" position (lane 0, an even lane) exists.
template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V AddSub(V a, V b) {
  // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
  return Sub(a, b);
}

// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
// SSSE3/SSE4/AVX2/AVX3

// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
// AVX2/AVX3

// AddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h

// AddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
    644 template <class V, HWY_IF_ADDSUB_V(V)>
    645 HWY_API V AddSub(V a, V b) {
    646  using D = DFromV<decltype(a)>;
    647  using T = TFromD<D>;
    648  using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;
    649 
    650  const D d;
    651  const Rebind<TNegate, D> d_negate;
    652 
    653  // Negate the even lanes of b
    654  const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));
    655 
    656  return Add(a, negated_even_b);
    657 }
    658 
    659 // ------------------------------ MaskedAddOr etc.
    660 #if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
    661 #ifdef HWY_NATIVE_MASKED_ARITH
    662 #undef HWY_NATIVE_MASKED_ARITH
    663 #else
    664 #define HWY_NATIVE_MASKED_ARITH
    665 #endif
    666 
    667 template <class V, class M>
    668 HWY_API V MaskedMinOr(V no, M m, V a, V b) {
    669  return IfThenElse(m, Min(a, b), no);
    670 }
    671 
    672 template <class V, class M>
    673 HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
    674  return IfThenElse(m, Max(a, b), no);
    675 }
    676 
    677 template <class V, class M>
    678 HWY_API V MaskedAddOr(V no, M m, V a, V b) {
    679  return IfThenElse(m, Add(a, b), no);
    680 }
    681 
    682 template <class V, class M>
    683 HWY_API V MaskedSubOr(V no, M m, V a, V b) {
    684  return IfThenElse(m, Sub(a, b), no);
    685 }
    686 
    687 template <class V, class M>
    688 HWY_API V MaskedMulOr(V no, M m, V a, V b) {
    689  return IfThenElse(m, Mul(a, b), no);
    690 }
    691 
    692 template <class V, class M>
    693 HWY_API V MaskedDivOr(V no, M m, V a, V b) {
    694  const DFromV<V> d;
    695  // Avoid division by zero for masked-out lanes.
    696  const V nonzero = Set(d, TFromD<decltype(d)>{1});
    697  return IfThenElse(m, Div(a, IfThenElse(m, b, nonzero)), no);
    698 }
    699 
    700 template <class V, class M>
    701 HWY_API V MaskedModOr(V no, M m, V a, V b) {
    702  const DFromV<V> d;
    703  // Avoid division by zero for masked-out lanes.
    704  const V nonzero = Set(d, TFromD<decltype(d)>{1});
    705  return IfThenElse(m, Mod(a, IfThenElse(m, b, nonzero)), no);
    706 }
    707 
    708 template <class V, class M>
    709 HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
    710  return IfThenElse(m, SaturatedAdd(a, b), no);
    711 }
    712 
    713 template <class V, class M>
    714 HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
    715  return IfThenElse(m, SaturatedSub(a, b), no);
    716 }
    717 #endif  // HWY_NATIVE_MASKED_ARITH
    718 
    719 #if (defined(HWY_NATIVE_ZERO_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
    720 #ifdef HWY_NATIVE_ZERO_MASKED_ARITH
    721 #undef HWY_NATIVE_ZERO_MASKED_ARITH
    722 #else
    723 #define HWY_NATIVE_ZERO_MASKED_ARITH
    724 #endif
    725 
    726 template <class V, class M>
    727 HWY_API V MaskedMax(M m, V a, V b) {
    728  return IfThenElseZero(m, (Max(a, b)));
    729 }
    730 
    731 template <class V, class M>
    732 HWY_API V MaskedAdd(M m, V a, V b) {
    733  return IfThenElseZero(m, Add(a, b));
    734 }
    735 
    736 template <class V, class M>
    737 HWY_API V MaskedSub(M m, V a, V b) {
    738  return IfThenElseZero(m, Sub(a, b));
    739 }
    740 
    741 template <class V, class M>
    742 HWY_API V MaskedMul(M m, V a, V b) {
    743  return IfThenElseZero(m, Mul(a, b));
    744 }
    745 
    746 template <class V, class M>
    747 HWY_API V MaskedDiv(M m, V a, V b) {
    748  return IfThenElseZero(m, Div(a, b));
    749 }
    750 
    751 template <class V, class M>
    752 HWY_API V MaskedSaturatedAdd(M m, V a, V b) {
    753  return IfThenElseZero(m, SaturatedAdd(a, b));
    754 }
    755 
    756 template <class V, class M>
    757 HWY_API V MaskedSaturatedSub(M m, V a, V b) {
    758  return IfThenElseZero(m, SaturatedSub(a, b));
    759 }
    760 
    761 template <class V, class M, typename D = DFromV<V>, HWY_IF_I16_D(D)>
    762 HWY_API V MaskedMulFixedPoint15(M m, V a, V b) {
    763  return IfThenElseZero(m, MulFixedPoint15(a, b));
    764 }
    765 
    766 template <class V, class M>
    767 HWY_API V MaskedMulAdd(M m, V mul, V x, V add) {
    768  return IfThenElseZero(m, MulAdd(mul, x, add));
    769 }
    770 
    771 template <class V, class M>
    772 HWY_API V MaskedNegMulAdd(M m, V mul, V x, V add) {
    773  return IfThenElseZero(m, NegMulAdd(mul, x, add));
    774 }
    775 
    776 template <class D, class M, HWY_IF_UI32_D(D),
    777          class V16 = VFromD<RepartitionToNarrow<D>>>
    778 HWY_API VFromD<D> MaskedWidenMulPairwiseAdd(D d32, M m, V16 a, V16 b) {
    779  return IfThenElseZero(m, WidenMulPairwiseAdd(d32, a, b));
    780 }
    781 
    782 template <class DF, class M, HWY_IF_F32_D(DF), class VBF>
    783 HWY_API VFromD<DF> MaskedWidenMulPairwiseAdd(DF df, M m, VBF a, VBF b) {
    784  return IfThenElseZero(m, WidenMulPairwiseAdd(df, a, b));
    785 }
    786 #endif  // HWY_NATIVE_ZERO_MASKED_ARITH
    787 
    788 // ------------------------------ MaskedShift
    789 template <int kShift, class V, class M>
    790 HWY_API V MaskedShiftLeft(M m, V a) {
    791  return IfThenElseZero(m, ShiftLeft<kShift>(a));
    792 }
    793 
    794 template <int kShift, class V, class M>
    795 HWY_API V MaskedShiftRight(M m, V a) {
    796  return IfThenElseZero(m, ShiftRight<kShift>(a));
    797 }
    798 
    799 template <int kShift, class V, class M>
    800 HWY_API V MaskedShiftRightOr(V no, M m, V a) {
    801  return IfThenElse(m, ShiftRight<kShift>(a), no);
    802 }
    803 
    804 template <class V, class M>
    805 HWY_API V MaskedShrOr(V no, M m, V a, V shifts) {
    806  return IfThenElse(m, Shr(a, shifts), no);
    807 }
    808 
    809 // ------------------------------ MaskedEq etc.
    810 #if (defined(HWY_NATIVE_MASKED_COMP) == defined(HWY_TARGET_TOGGLE))
    811 #ifdef HWY_NATIVE_MASKED_COMP
    812 #undef HWY_NATIVE_MASKED_COMP
    813 #else
    814 #define HWY_NATIVE_MASKED_COMP
    815 #endif
    816 
    817 template <class V, class M>
    818 HWY_API auto MaskedEq(M m, V a, V b) -> decltype(a == b) {
    819  return And(m, Eq(a, b));
    820 }
    821 
    822 template <class V, class M>
    823 HWY_API auto MaskedNe(M m, V a, V b) -> decltype(a == b) {
    824  return And(m, Ne(a, b));
    825 }
    826 
    827 template <class V, class M>
    828 HWY_API auto MaskedLt(M m, V a, V b) -> decltype(a == b) {
    829  return And(m, Lt(a, b));
    830 }
    831 
    832 template <class V, class M>
    833 HWY_API auto MaskedGt(M m, V a, V b) -> decltype(a == b) {
    834  return And(m, Gt(a, b));
    835 }
    836 
    837 template <class V, class M>
    838 HWY_API auto MaskedLe(M m, V a, V b) -> decltype(a == b) {
    839  return And(m, Le(a, b));
    840 }
    841 
    842 template <class V, class M>
    843 HWY_API auto MaskedGe(M m, V a, V b) -> decltype(a == b) {
    844  return And(m, Ge(a, b));
    845 }
    846 
    847 template <class V, class M, class D = DFromV<V>>
    848 HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
    849  return And(m, IsNaN(v));
    850 }
    851 #endif  // HWY_NATIVE_MASKED_COMP
    852 
    853 // ------------------------------ IfNegativeThenNegOrUndefIfZero
    854 
    855 #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
    856     defined(HWY_TARGET_TOGGLE))
    857 #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
    858 #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
    859 #else
    860 #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
    861 #endif
    862 
// Integer lanes: negates v[i] where mask[i] < 0, passes v[i] through where
// mask[i] > 0; the result for mask[i] == 0 is unspecified (both branches
// happen to return v[i] here, but callers must not rely on that).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
  const auto zero = Zero(DFromV<V>());
  // 0 - v where the mask lane is negative, otherwise v unchanged.
  return MaskedSubOr(v, Lt(mask, zero), zero, v);
#else
  return IfNegativeThenElse(mask, Neg(v), v);
#endif
}
    873 
    874 #endif  // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
    875 
    876 template <class V, HWY_IF_FLOAT_V(V)>
    877 HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
    878  return CopySign(v, Xor(mask, v));
    879 }
    880 
    881 // ------------------------------ SaturatedNeg
    882 
    883 #if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
    884 #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
    885 #undef HWY_NATIVE_SATURATED_NEG_8_16_32
    886 #else
    887 #define HWY_NATIVE_SATURATED_NEG_8_16_32
    888 #endif
    889 
    890 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
    891          HWY_IF_SIGNED_V(V)>
    892 HWY_API V SaturatedNeg(V v) {
    893  const DFromV<decltype(v)> d;
    894  return SaturatedSub(Zero(d), v);
    895 }
    896 
// I32: saturating negation; LimitsMin maps to LimitsMax instead of wrapping.
template <class V, HWY_IF_I32(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
  const DFromV<decltype(v)> d;

#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_PPC || HWY_TARGET_IS_SVE || \
    HWY_TARGET_IS_NEON
  // RVV/PPC/SVE/NEON have native I32 SaturatedSub instructions
  return SaturatedSub(Zero(d), v);
#else
  // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
  // (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
  // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
  // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
  // VecFromMask yields -1 (all ones) in true lanes, 0 in false lanes.
  return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
#endif
}
    913 #endif  // HWY_NATIVE_SATURATED_NEG_8_16_32
    914 
    915 #if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
    916 #ifdef HWY_NATIVE_SATURATED_NEG_64
    917 #undef HWY_NATIVE_SATURATED_NEG_64
    918 #else
    919 #define HWY_NATIVE_SATURATED_NEG_64
    920 #endif
    921 
// I64: saturating negation; LimitsMin maps to LimitsMax instead of wrapping.
template <class V, HWY_IF_I64(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_SVE || HWY_TARGET_IS_NEON
  // RVV/SVE/NEON have native I64 SaturatedSub instructions
  const DFromV<decltype(v)> d;
  return SaturatedSub(Zero(d), v);
#else
  // Neg(LimitsMin) wraps back to LimitsMin; in exactly that case v and neg_v
  // are both negative, so the broadcast sign bit is -1 and corrects the
  // result to LimitsMax. Otherwise the correction term is 0.
  const auto neg_v = Neg(v);
  return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
#endif
}
    933 #endif  // HWY_NATIVE_SATURATED_NEG_64
    934 
    935 // ------------------------------ SaturatedAbs
    936 
    937 #if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
    938 #ifdef HWY_NATIVE_SATURATED_ABS
    939 #undef HWY_NATIVE_SATURATED_ABS
    940 #else
    941 #define HWY_NATIVE_SATURATED_ABS
    942 #endif
    943 
    944 template <class V, HWY_IF_SIGNED_V(V)>
    945 HWY_API V SaturatedAbs(V v) {
    946  return Max(v, SaturatedNeg(v));
    947 }
    948 
    949 #endif
    950 
    951 // ------------------------------ MaskedAbsOr
    952 template <class V, HWY_IF_SIGNED_V(V), class M>
    953 HWY_API V MaskedAbsOr(V no, M m, V v) {
    954  return IfThenElse(m, Abs(v), no);
    955 }
    956 
    957 // ------------------------------ MaskedAbs
    958 template <class V, HWY_IF_SIGNED_V(V), class M>
    959 HWY_API V MaskedAbs(M m, V v) {
    960  return IfThenElseZero(m, Abs(v));
    961 }
    962 
    963 // ------------------------------ Reductions
    964 
    965 // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled,
    966 // they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set.
    967 // Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the
    968 // SumOfLanes overloads. For the latter group, we here define the remaining
    969 // overloads, plus ReduceSum which uses them plus GetLane.
    970 #if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
    971 #ifdef HWY_NATIVE_REDUCE_SCALAR
    972 #undef HWY_NATIVE_REDUCE_SCALAR
    973 #else
    974 #define HWY_NATIVE_REDUCE_SCALAR
    975 #endif
    976 
    977 namespace detail {
    978 
    979 // Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
    980 struct AddFunc {
    981  template <class V>
    982  V operator()(V a, V b) const {
    983    return Add(a, b);
    984  }
    985 };
    986 
    987 struct MinFunc {
    988  template <class V>
    989  V operator()(V a, V b) const {
    990    return Min(a, b);
    991  }
    992 };
    993 
    994 struct MaxFunc {
    995  template <class V>
    996  V operator()(V a, V b) const {
    997    return Max(a, b);
    998  }
    999 };
   1000 
// No-op for vectors of at most one block: there are no other blocks to
// combine with, so the vector is returned unchanged.
template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
  return v;
}
   1006 
   1007 // Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
   1008 // WASM_EMU256. AVX3 has its own overload.
   1009 template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
   1010 HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) {
   1011  return f(v, SwapAdjacentBlocks(v));
   1012 }
   1013 
   1014 // These return the reduction result broadcasted across all lanes. They assume
   1015 // the caller has already reduced across blocks.
   1016 
   1017 template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
   1018 HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
   1019  return f(v10, Reverse2(d, v10));
   1020 }
   1021 
   1022 template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
   1023 HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
   1024  const VFromD<D> v0123 = Reverse4(d, v3210);
   1025  const VFromD<D> v03_12_12_03 = f(v3210, v0123);
   1026  const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
   1027  return f(v03_12_12_03, v12_03_03_12);
   1028 }
   1029 
   1030 template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
   1031 HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
   1032  // The upper half is reversed from the lower half; omit for brevity.
   1033  const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
   1034  const VFromD<D> v0347_1625_1625_0347 =
   1035      f(v34_25_16_07, Reverse4(d, v34_25_16_07));
   1036  return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
   1037 }
   1038 
// 16 u8 lanes per block: widen to u16 pairs, reduce there, then broadcast the
// low byte of the u16 result back into every u8 lane.
template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = And(vw, Set(dw, 0xFF));  // zero-extend even bytes
  const VW odd = ShiftRight<8>(vw);        // zero-extend odd bytes
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  // The reduction lives in the low byte of each u16 lane; duplicate it into
  // the other byte so all u8 lanes hold the result.
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}
   1054 
// 16 i8 lanes per block: as the u8 overload, but sign-extends the bytes so
// Min/Max reductions order negative values correctly.
template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // Sign-extend
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = ShiftRight<8>(ShiftLeft<8>(vw));  // arithmetic shift
  const VW odd = ShiftRight<8>(vw);
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  // Broadcast the byte holding the result into both bytes of each i16 lane.
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}
   1071 
   1072 }  // namespace detail
   1073 
   1074 template <class D, HWY_IF_SUM_OF_LANES_D(D)>
   1075 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
   1076  const detail::AddFunc f;
   1077  v = detail::ReduceAcrossBlocks(d, f, v);
   1078  return detail::ReduceWithinBlocks(d, f, v);
   1079 }
   1080 template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
   1081 HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
   1082  const detail::MinFunc f;
   1083  v = detail::ReduceAcrossBlocks(d, f, v);
   1084  return detail::ReduceWithinBlocks(d, f, v);
   1085 }
   1086 template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
   1087 HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
   1088  const detail::MaxFunc f;
   1089  v = detail::ReduceAcrossBlocks(d, f, v);
   1090  return detail::ReduceWithinBlocks(d, f, v);
   1091 }
   1092 
   1093 template <class D, HWY_IF_REDUCE_D(D)>
   1094 HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
   1095  return GetLane(SumOfLanes(d, v));
   1096 }
   1097 template <class D, HWY_IF_REDUCE_D(D)>
   1098 HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
   1099  return GetLane(MinOfLanes(d, v));
   1100 }
   1101 template <class D, HWY_IF_REDUCE_D(D)>
   1102 HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
   1103  return GetLane(MaxOfLanes(d, v));
   1104 }
   1105 
   1106 #endif  // HWY_NATIVE_REDUCE_SCALAR
   1107 
   1108 // Corner cases for both generic and native implementations:
   1109 // N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
// Single-lane vector: the sum is simply the one lane.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}
// Single-lane vector: the minimum is simply the one lane.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}
// Single-lane vector: the maximum is simply the one lane.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}
   1122 
// Single-lane vector: already reduced; return unchanged.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}
// Single-lane vector: already reduced; return unchanged.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}
// Single-lane vector: already reduced; return unchanged.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}
   1135 
   1136 // N=4 for 8-bit is still less than the minimum native size.
   1137 
   1138 // ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
   1139 // ReduceSum operations
   1140 #if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
   1141 #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
   1142 #undef HWY_NATIVE_REDUCE_SUM_4_UI8
   1143 #else
   1144 #define HWY_NATIVE_REDUCE_SUM_4_UI8
   1145 #endif
   1146 
   1147 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
   1148 HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
   1149  const Twice<RepartitionToWide<decltype(d)>> dw;
   1150  return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
   1151 }
   1152 #endif  // HWY_NATIVE_REDUCE_SUM_4_UI8
   1153 
   1154 // RVV/SVE have target-specific implementations of the N=4 I8/U8
   1155 // ReduceMin/ReduceMax operations
   1156 #if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
   1157 #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
   1158 #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
   1159 #else
   1160 #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
   1161 #endif
   1162 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
   1163 HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
   1164  const Twice<RepartitionToWide<decltype(d)>> dw;
   1165  return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
   1166 }
   1167 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
   1168 HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
   1169  const Twice<RepartitionToWide<decltype(d)>> dw;
   1170  return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
   1171 }
   1172 #endif  // HWY_NATIVE_REDUCE_MINMAX_4_UI8
   1173 
   1174 #if (defined(HWY_NATIVE_MASKED_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
   1175 #ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
   1176 #undef HWY_NATIVE_MASKED_REDUCE_SCALAR
   1177 #else
   1178 #define HWY_NATIVE_MASKED_REDUCE_SCALAR
   1179 #endif
   1180 
   1181 template <class D, class M>
   1182 HWY_API TFromD<D> MaskedReduceSum(D d, M m, VFromD<D> v) {
   1183  return ReduceSum(d, IfThenElseZero(m, v));
   1184 }
   1185 template <class D, class M>
   1186 HWY_API TFromD<D> MaskedReduceMin(D d, M m, VFromD<D> v) {
   1187  return ReduceMin(
   1188      d, IfThenElse(m, v, Set(d, hwy::PositiveInfOrHighestValue<TFromD<D>>())));
   1189 }
   1190 template <class D, class M>
   1191 HWY_API TFromD<D> MaskedReduceMax(D d, M m, VFromD<D> v) {
   1192  return ReduceMax(
   1193      d, IfThenElse(m, v, Set(d, hwy::NegativeInfOrLowestValue<TFromD<D>>())));
   1194 }
   1195 
   1196 #endif  // HWY_NATIVE_MASKED_REDUCE_SCALAR
   1197 
   1198 // ------------------------------ IsEitherNaN
   1199 #if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
   1200 #ifdef HWY_NATIVE_IS_EITHER_NAN
   1201 #undef HWY_NATIVE_IS_EITHER_NAN
   1202 #else
   1203 #define HWY_NATIVE_IS_EITHER_NAN
   1204 #endif
   1205 
   1206 template <class V, HWY_IF_FLOAT_V(V)>
   1207 HWY_API MFromD<DFromV<V>> IsEitherNaN(V a, V b) {
   1208  return Or(IsNaN(a), IsNaN(b));
   1209 }
   1210 
   1211 #endif  // HWY_NATIVE_IS_EITHER_NAN
   1212 
   1213 // ------------------------------ IsInf, IsFinite
   1214 
   1215 // AVX3 has target-specific implementations of these.
   1216 #if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
   1217 #ifdef HWY_NATIVE_ISINF
   1218 #undef HWY_NATIVE_ISINF
   1219 #else
   1220 #define HWY_NATIVE_ISINF
   1221 #endif
   1222 
   1223 template <class V, class D = DFromV<V>>
   1224 HWY_API MFromD<D> IsInf(const V v) {
   1225  using T = TFromD<D>;
   1226  const D d;
   1227  const RebindToUnsigned<decltype(d)> du;
   1228  const VFromD<decltype(du)> vu = BitCast(du, v);
   1229  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
   1230  return RebindMask(
   1231      d,
   1232      Eq(Add(vu, vu),
   1233         Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
   1234 }
   1235 
// Returns whether normal/subnormal/zero (i.e. neither infinite nor NaN).
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) {
  using T = TFromD<D>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
// 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
// for AVX2 if we instead add vu + vu.
#if HWY_COMPILER_MSVC
  const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
#else
  const VFromD<decltype(du)> shl = Add(vu, vu);
#endif

  // Then shift right so we can compare with the max exponent (cannot compare
  // with MaxExponentTimes2 directly because it is negative and non-negative
  // floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
  // Finite lanes have a biased exponent strictly below the all-ones field.
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
   1259 
   1260 #endif  // HWY_NATIVE_ISINF
   1261 
   1262 // ------------------------------ CeilInt/FloorInt
   1263 #if (defined(HWY_NATIVE_CEIL_FLOOR_INT) == defined(HWY_TARGET_TOGGLE))
   1264 #ifdef HWY_NATIVE_CEIL_FLOOR_INT
   1265 #undef HWY_NATIVE_CEIL_FLOOR_INT
   1266 #else
   1267 #define HWY_NATIVE_CEIL_FLOOR_INT
   1268 #endif
   1269 
   1270 template <class V, HWY_IF_FLOAT_V(V)>
   1271 HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
   1272  const DFromV<decltype(v)> d;
   1273  const RebindToSigned<decltype(d)> di;
   1274  return ConvertTo(di, Ceil(v));
   1275 }
   1276 
   1277 template <class V, HWY_IF_FLOAT_V(V)>
   1278 HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
   1279  const DFromV<decltype(v)> d;
   1280  const RebindToSigned<decltype(d)> di;
   1281  return ConvertTo(di, Floor(v));
   1282 }
   1283 
   1284 #endif  // HWY_NATIVE_CEIL_FLOOR_INT
   1285 
   1286 // ------------------------------ MulByPow2/MulByFloorPow2
   1287 
   1288 #if (defined(HWY_NATIVE_MUL_BY_POW2) == defined(HWY_TARGET_TOGGLE))
   1289 #ifdef HWY_NATIVE_MUL_BY_POW2
   1290 #undef HWY_NATIVE_MUL_BY_POW2
   1291 #else
   1292 #define HWY_NATIVE_MUL_BY_POW2
   1293 #endif
   1294 
// Computes v[i] * 2^exp[i] (ldexp-like). The exponent is applied via three
// multiplications by powers of two so that intermediate factors stay
// representable even when exp is large in magnitude or v is subnormal.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) {
  const DFromV<decltype(v)> df;
  const RebindToUnsigned<decltype(df)> du;
  const RebindToSigned<decltype(df)> di;

  using TF = TFromD<decltype(df)>;
  using TI = TFromD<decltype(di)>;
  using TU = TFromD<decltype(du)>;

  using VF = VFromD<decltype(df)>;
  using VI = VFromD<decltype(di)>;

  constexpr TI kMaxBiasedExp = MaxExponentField<TF>();
  static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true");

  constexpr TI kExpBias = static_cast<TI>(kMaxBiasedExp >> 1);
  static_assert(kExpBias > 0, "kExpBias > 0 must be true");
  static_assert(kExpBias <= LimitsMax<TI>() / 3,
                "kExpBias <= LimitsMax<TI>() / 3 must be true");

// Per-target choice of the lane type used for the Min/Max clamping below:
// some targets only have narrow integer Min/Max instructions.
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
  using TExpMinMax = If<(sizeof(TI) <= 4), TI, int32_t>;
#elif (HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2) || \
    HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256
  using TExpMinMax = int16_t;
#else
  using TExpMinMax = TI;
#endif

// Per-target choice of the lane type used for the SaturatedSub on the
// extracted exponent field.
#if HWY_TARGET == HWY_EMU128 || HWY_TARGET == HWY_SCALAR
  using TExpSatSub = TU;
#elif HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
    HWY_TARGET == HWY_WASM_EMU256
  using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, uint16_t>;
#elif HWY_TARGET_IS_PPC
  using TExpSatSub = If<(sizeof(TF) >= 4), uint32_t, TU>;
#else
  using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, TU>;
#endif

  static_assert(kExpBias <= static_cast<TI>(LimitsMax<TExpMinMax>() / 3),
                "kExpBias <= LimitsMax<TExpMinMax>() / 3 must be true");

  const Repartition<TExpMinMax, decltype(df)> d_exp_min_max;
  const Repartition<TExpSatSub, decltype(df)> d_sat_exp_sub;

  constexpr int kNumOfExpBits = ExponentBits<TF>();
  constexpr int kNumOfMantBits = MantissaBits<TF>();

  // The sign bit of BitCastScalar<TU>(a[i]) >> kNumOfMantBits can be zeroed out
  // using SaturatedSub if kZeroOutSignUsingSatSub is true.

  // If kZeroOutSignUsingSatSub is true, then val_for_exp_sub will be bitcasted
  // to a vector that has a smaller lane size than TU for the SaturatedSub
  // operation below.
  constexpr bool kZeroOutSignUsingSatSub =
      ((sizeof(TExpSatSub) * 8) == static_cast<size_t>(kNumOfExpBits));

  // If kZeroOutSignUsingSatSub is true, then the upper
  // (sizeof(TU) - sizeof(TExpSatSub)) * 8 bits of kExpDecrBy1Bits will be all
  // ones and the lower sizeof(TExpSatSub) * 8 bits of kExpDecrBy1Bits will be
  // equal to 1.

  // Otherwise, if kZeroOutSignUsingSatSub is false, kExpDecrBy1Bits will be
  // equal to 1.
  constexpr TU kExpDecrBy1Bits = static_cast<TU>(
      TU{1} - (static_cast<TU>(kZeroOutSignUsingSatSub) << kNumOfExpBits));

  VF val_for_exp_sub = v;
  HWY_IF_CONSTEXPR(!kZeroOutSignUsingSatSub) {
    // If kZeroOutSignUsingSatSub is not true, zero out the sign bit of
    // val_for_exp_sub[i] using Abs
    val_for_exp_sub = Abs(val_for_exp_sub);
  }

  // min_exp1_plus_min_exp2[i] is the smallest exponent such that
  // min_exp1_plus_min_exp2[i] >= 2 - kExpBias * 2 and
  // std::ldexp(v[i], min_exp1_plus_min_exp2[i]) is a normal floating-point
  // number if v[i] is a normal number
  const VI min_exp1_plus_min_exp2 = BitCast(
      di,
      Max(BitCast(
              d_exp_min_max,
              Neg(BitCast(
                  di,
                  SaturatedSub(
                      BitCast(d_sat_exp_sub, ShiftRight<kNumOfMantBits>(
                                                 BitCast(du, val_for_exp_sub))),
                      BitCast(d_sat_exp_sub, Set(du, kExpDecrBy1Bits)))))),
          BitCast(d_exp_min_max,
                  Set(di, static_cast<TI>(2 - kExpBias - kExpBias)))));

  // Clamp exp to the range that the three factors can represent in total.
  const VI clamped_exp =
      Max(Min(exp, Set(di, static_cast<TI>(kExpBias * 3))),
          Add(min_exp1_plus_min_exp2, Set(di, static_cast<TI>(1 - kExpBias))));

  // Split the clamped exponent into exp1 + exp2 (roughly three quarters of it)
  // plus exp3 (the remainder), each within the representable exponent range.
  const VI exp1_plus_exp2 = BitCast(
      di, Max(Min(BitCast(d_exp_min_max,
                          Sub(clamped_exp, ShiftRight<2>(clamped_exp))),
                  BitCast(d_exp_min_max,
                          Set(di, static_cast<TI>(kExpBias + kExpBias)))),
              BitCast(d_exp_min_max, min_exp1_plus_min_exp2)));

  const VI exp1 = ShiftRight<1>(exp1_plus_exp2);
  const VI exp2 = Sub(exp1_plus_exp2, exp1);
  const VI exp3 = Sub(clamped_exp, exp1_plus_exp2);

  const VI exp_bias = Set(di, kExpBias);

  // Materialize 2^exp1, 2^exp2, 2^exp3 by placing the biased exponent into
  // the exponent field of an otherwise-zero float.
  const VF factor1 =
      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp1, exp_bias)));
  const VF factor2 =
      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp2, exp_bias)));
  const VF factor3 =
      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp3, exp_bias)));

  return Mul(Mul(Mul(v, factor1), factor2), factor3);
}
   1414 
// Computes v[i] * 2^floor(exp[i]) with a floating-point exponent, matching
// the special-case behavior of a true multiplication by an infinite or NaN
// power of two.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V MulByFloorPow2(V v, V exp) {
  const DFromV<decltype(v)> df;

  // MulByFloorPow2 special cases:
  // MulByFloorPow2(v, NaN) => NaN
  // MulByFloorPow2(0, inf) => NaN
  // MulByFloorPow2(inf, -inf) => NaN
  // MulByFloorPow2(-inf, -inf) => NaN
  // IfNegativeThenElseZero(exp, Inf(df)) is +inf where exp < 0 and zero
  // elsewhere, so the And picks out exactly the (0, +inf) and (+/-inf, -inf)
  // pairings listed above.
  const auto is_special_case_with_nan_result =
      Or(IsNaN(exp),
         And(Eq(Abs(v), IfNegativeThenElseZero(exp, Inf(df))), IsInf(exp)));

  return IfThenElse(is_special_case_with_nan_result, NaN(df),
                    MulByPow2(v, FloorInt(exp)));
}
   1431 
   1432 #endif  // HWY_NATIVE_MUL_BY_POW2
   1433 
   1434 // ------------------------------ GetBiasedExponent
   1435 #if (defined(HWY_NATIVE_GET_BIASED_EXPONENT) == defined(HWY_TARGET_TOGGLE))
   1436 #ifdef HWY_NATIVE_GET_BIASED_EXPONENT
   1437 #undef HWY_NATIVE_GET_BIASED_EXPONENT
   1438 #else
   1439 #define HWY_NATIVE_GET_BIASED_EXPONENT
   1440 #endif
   1441 
   1442 template <class V, HWY_IF_FLOAT_V(V)>
   1443 HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
   1444  using T = TFromV<V>;
   1445 
   1446  const DFromV<V> d;
   1447  const RebindToUnsigned<decltype(d)> du;
   1448 
   1449  constexpr int kNumOfMantBits = MantissaBits<T>();
   1450  return ShiftRight<kNumOfMantBits>(BitCast(du, Abs(v)));
   1451 }
   1452 
   1453 #endif
   1454 
   1455 // ------------------------------ GetExponent
   1456 
   1457 #if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE))
   1458 #ifdef HWY_NATIVE_GET_EXPONENT
   1459 #undef HWY_NATIVE_GET_EXPONENT
   1460 #else
   1461 #define HWY_NATIVE_GET_EXPONENT
   1462 #endif
   1463 
   1464 template <class V, HWY_IF_FLOAT_V(V)>
   1465 HWY_API V GetExponent(V v) {
   1466  const DFromV<V> d;
   1467  using T = TFromV<V>;
   1468  const RebindToSigned<decltype(d)> di;
   1469 
   1470  const auto exponent_offset = Set(di, MaxExponentField<T>() >> 1);
   1471 
   1472  // extract exponent bits as integer
   1473  const auto encoded_exponent = GetBiasedExponent(v);
   1474  const auto exponent_int = Sub(BitCast(di, encoded_exponent), exponent_offset);
   1475 
   1476  // convert integer to original type
   1477  return ConvertTo(d, exponent_int);
   1478 }
   1479 
   1480 #endif  // HWY_NATIVE_GET_EXPONENT
   1481 // ------------------------------ LoadInterleaved2
   1482 
   1483 #if HWY_IDE || \
   1484    (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
   1485 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   1486 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   1487 #else
   1488 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
   1489 #endif
   1490 
   1491 template <class D, HWY_IF_LANES_GT_D(D, 1)>
   1492 HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   1493                              VFromD<D>& v0, VFromD<D>& v1) {
   1494  const VFromD<D> A = LoadU(d, unaligned);  // v1[1] v0[1] v1[0] v0[0]
   1495  const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
   1496  v0 = ConcatEven(d, B, A);
   1497  v1 = ConcatOdd(d, B, A);
   1498 }
   1499 
   1500 template <class D, HWY_IF_LANES_D(D, 1)>
   1501 HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   1502                              VFromD<D>& v0, VFromD<D>& v1) {
   1503  v0 = LoadU(d, unaligned + 0);
   1504  v1 = LoadU(d, unaligned + 1);
   1505 }
   1506 
   1507 // ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
   1508 
   1509 namespace detail {
   1510 
   1511 #if HWY_IDE
// IDE-only stub so dependent code parses; targets provide the real shuffle.
template <class V>
HWY_INLINE V ShuffleTwo1230(V a, V /* b */) {
  return a;
}
// IDE-only stub so dependent code parses; targets provide the real shuffle.
template <class V>
HWY_INLINE V ShuffleTwo2301(V a, V /* b */) {
  return a;
}
// IDE-only stub so dependent code parses; targets provide the real shuffle.
template <class V>
HWY_INLINE V ShuffleTwo3012(V a, V /* b */) {
  return a;
}
   1524 #endif  // HWY_IDE
   1525 
   1526 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
   1527 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   1528 HWY_INLINE void LoadTransposedBlocks3(D d,
   1529                                      const TFromD<D>* HWY_RESTRICT unaligned,
   1530                                      VFromD<D>& A, VFromD<D>& B,
   1531                                      VFromD<D>& C) {
   1532  constexpr size_t kN = MaxLanes(d);
   1533  A = LoadU(d, unaligned + 0 * kN);
   1534  B = LoadU(d, unaligned + 1 * kN);
   1535  C = LoadU(d, unaligned + 2 * kN);
   1536 }
   1537 
   1538 }  // namespace detail
   1539 
// De-interleaves three streams from memory (x0 y0 z0 x1 y1 z1 ...) for
// 16-lane blocks. Each output is assembled from three byte-shuffles (one per
// loaded vector) whose non-contributing lanes are zeroed (index 0x80), then
// combined with XOR (equivalent to OR here since the shuffles are disjoint).
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  using VU = VFromD<decltype(du)>;
  // Compact notation so these fit on one line: 12 := v1[2].
  V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
  V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
  V C;  // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;  // TableLookupBytesOr0 zeroes these lanes
  const VU idx_v0A =
      Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v0B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
  const VU idx_v0C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
  const VU idx_v1A =
      Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v1B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
  const VU idx_v1C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
  const VU idx_v2A =
      Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v2B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
  const VU idx_v2C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
  // The three partial vectors are lane-disjoint, so XOR merges them.
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}
   1584 
   1585 // 8-bit lanes x8
// De-interleaves 3*8 consecutive 8-bit elements (r,g,b,r,g,b,...) into three
// vectors v0/v1/v2, for vectors with 8 one-byte lanes per block. Each output
// is assembled from three per-block byte shuffles whose index vectors select
// disjoint lanes; 0x80 ("Z") indices make TableLookupBytesOr0 write zero, so
// the three partial results can be merged losslessly below.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  using VU = VFromD<decltype(du)>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  // Only the lower 8 lanes hold data; the upper 8 index entries are unused
  // and set to 0.
  constexpr uint8_t Z = 0x80;
  const VU idx_v0A =
      Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v0B =
      Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v0C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1A =
      Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1B =
      Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2A =
      Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2B =
      Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
  // The three partial vectors are nonzero in disjoint lanes (the Z entries
  // above zeroed the rest), so Xor3 acts as a three-way OR/blend.
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}
   1629 
   1630 // 16-bit lanes x8
// De-interleaves 3*8 consecutive 16-bit elements into v0/v1/v2 for vectors
// with 8 two-byte lanes per block. Same approach as the 8-bit x8 case above,
// but the shuffle operates on bytes, so each lane contributes a pair of byte
// indices (lane i -> bytes 2i, 2i+1); 0x80 ("Z") entries produce zero bytes.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(du)> du8;
  using V = VFromD<D>;
  using VU8 = VFromD<decltype(du8)>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
  // but each element of the array contains a byte index for a byte of a lane.
  constexpr uint8_t Z = 0x80;
  const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
                                          0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
                                          0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
  const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          Z, 0x04, 0x05, 0x0A, 0x0B);
  const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
                                          0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
                                          0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
  const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
                                          Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
                                          0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
  const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
  // Each triple is nonzero in disjoint lanes, so Xor3 merges them like OR.
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}
   1676 
// De-interleaves 3*4 consecutive elements into v0/v1/v2 for vectors with 4
// lanes per block, using OddEven blends plus the detail::ShuffleTwo* helpers
// (per-block shuffles combining lanes from two sources — see their
// target-specific definitions).
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  using V = VFromD<D>;
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);

  // Gather v0[2] and v0[3] into one vector, then place all four v0 lanes.
  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);

  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
  // OddEven because it may have higher throughput than Shuffle.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);

  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
}
   1699 
   1700 template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
   1701 HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   1702                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   1703  VFromD<D> A;  // v1[0] v0[0]
   1704  VFromD<D> B;  // v0[1] v2[0]
   1705  VFromD<D> C;  // v2[1] v1[1]
   1706  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
   1707  v0 = OddEven(B, A);
   1708  v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
   1709  v2 = OddEven(C, B);
   1710 }
   1711 
   1712 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
   1713 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
   1714                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   1715  v0 = LoadU(d, unaligned + 0);
   1716  v1 = LoadU(d, unaligned + 1);
   1717  v2 = LoadU(d, unaligned + 2);
   1718 }
   1719 
   1720 // ------------------------------ LoadInterleaved4
   1721 
   1722 namespace detail {
   1723 
   1724 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
   1725 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   1726 HWY_INLINE void LoadTransposedBlocks4(D d,
   1727                                      const TFromD<D>* HWY_RESTRICT unaligned,
   1728                                      VFromD<D>& vA, VFromD<D>& vB,
   1729                                      VFromD<D>& vC, VFromD<D>& vD) {
   1730  constexpr size_t kN = MaxLanes(d);
   1731  vA = LoadU(d, unaligned + 0 * kN);
   1732  vB = LoadU(d, unaligned + 1 * kN);
   1733  vC = LoadU(d, unaligned + 2 * kN);
   1734  vD = LoadU(d, unaligned + 3 * kN);
   1735 }
   1736 
   1737 }  // namespace detail
   1738 
// De-interleaves 4*16 consecutive elements into v0..v3 for vectors with 16
// lanes per block, via a four-level interleave (transpose) network; the last
// level operates on 64-bit halves.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  using V = VFromD<D>;
  // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  // Here int[i] means the four interleaved values of the i-th 4-tuple and
  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
  V vA;  // int[13..10] int[3..0]
  V vB;  // int[17..14] int[7..4]
  V vC;  // int[1b..18] int[b..8]
  V vD;  // int[1f..1c] int[f..c]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  // For brevity, the comments only list the lower block (upper = lower + 0x10)
  const V v5140 = InterleaveLower(d, vA, vB);  // int[5,1,4,0]
  const V vd9c8 = InterleaveLower(d, vC, vD);  // int[d,9,c,8]
  const V v7362 = InterleaveUpper(d, vA, vB);  // int[7,3,6,2]
  const V vfbea = InterleaveUpper(d, vC, vD);  // int[f,b,e,a]

  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]

  // Third level separates v1/v0 pairs from v3/v2 pairs; view as 64-bit lanes
  // for the final split into the four outputs.
  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]

  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}
   1776 
// De-interleaves 4*8 consecutive elements into v0..v3 for vectors with 8
// lanes per block, via a three-level interleave network; the last level works
// on wider (half-block) lanes.
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  // In the last step, we interleave by half of the block size, which is usually
  // 8 bytes but half that for 8-bit x8 vectors.
  using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
  const Repartition<TW, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;

  // (Comments are for 256-bit vectors.)
  // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  VFromD<D> vA;  // v3210[9]v3210[8] v3210[1]v3210[0]
  VFromD<D> vB;  // v3210[b]v3210[a] v3210[3]v3210[2]
  VFromD<D> vC;  // v3210[d]v3210[c] v3210[5]v3210[4]
  VFromD<D> vD;  // v3210[f]v3210[e] v3210[7]v3210[6]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  const VFromD<D> va820 = InterleaveLower(d, vA, vB);  // v3210[a,8] v3210[2,0]
  const VFromD<D> vec64 = InterleaveLower(d, vC, vD);  // v3210[e,c] v3210[6,4]
  const VFromD<D> vb931 = InterleaveUpper(d, vA, vB);  // v3210[b,9] v3210[3,1]
  const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD);  // v3210[f,d] v3210[7,5]

  const VW v10_b830 =  // v10[b..8] v10[3..0]
      BitCast(dw, InterleaveLower(d, va820, vb931));
  const VW v10_fc74 =  // v10[f..c] v10[7..4]
      BitCast(dw, InterleaveLower(d, vec64, vfd75));
  const VW v32_b830 =  // v32[b..8] v32[3..0]
      BitCast(dw, InterleaveUpper(d, va820, vb931));
  const VW v32_fc74 =  // v32[f..c] v32[7..4]
      BitCast(dw, InterleaveUpper(d, vec64, vfd75));

  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}
   1814 
// De-interleaves 4*4 consecutive elements into v0..v3 for vectors with 4
// lanes per block: a two-level interleave network (even/odd tuples, then
// merge).
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  using V = VFromD<D>;
  V vA;  // v3210[4] v3210[0]
  V vB;  // v3210[5] v3210[1]
  V vC;  // v3210[6] v3210[2]
  V vD;  // v3210[7] v3210[3]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  const V v10e = InterleaveLower(d, vA, vC);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
  const V v10o = InterleaveLower(d, vB, vD);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
  const V v32e = InterleaveUpper(d, vA, vC);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
  const V v32o = InterleaveUpper(d, vB, vD);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]

  v0 = InterleaveLower(d, v10e, v10o);
  v1 = InterleaveUpper(d, v10e, v10o);
  v2 = InterleaveLower(d, v32e, v32o);
  v3 = InterleaveUpper(d, v32e, v32o);
}
   1835 
   1836 template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
   1837 HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   1838                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
   1839                              VFromD<D>& v3) {
   1840  VFromD<D> vA, vB, vC, vD;
   1841  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
   1842  v0 = InterleaveLower(d, vA, vC);
   1843  v1 = InterleaveUpper(d, vA, vC);
   1844  v2 = InterleaveLower(d, vB, vD);
   1845  v3 = InterleaveUpper(d, vB, vD);
   1846 }
   1847 
   1848 // Any T x1
   1849 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
   1850 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
   1851                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
   1852                              VFromD<D>& v3) {
   1853  v0 = LoadU(d, unaligned + 0);
   1854  v1 = LoadU(d, unaligned + 1);
   1855  v2 = LoadU(d, unaligned + 2);
   1856  v3 = LoadU(d, unaligned + 3);
   1857 }
   1858 
   1859 // ------------------------------ StoreInterleaved2
   1860 
   1861 namespace detail {
   1862 
   1863 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
   1864 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   1865 HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
   1866                                       TFromD<D>* HWY_RESTRICT unaligned) {
   1867  constexpr size_t kN = MaxLanes(d);
   1868  StoreU(A, d, unaligned + 0 * kN);
   1869  StoreU(B, d, unaligned + 1 * kN);
   1870 }
   1871 
   1872 }  // namespace detail
   1873 
   1874 // >= 128 bit vector
   1875 template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
   1876 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
   1877                               TFromD<D>* HWY_RESTRICT unaligned) {
   1878  const auto v10L = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
   1879  const auto v10U = InterleaveUpper(d, v0, v1);  // .. v1[kN/2] v0[kN/2]
   1880  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
   1881 }
   1882 
   1883 // <= 64 bits
   1884 template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   1885 HWY_API void StoreInterleaved2(V part0, V part1, D d,
   1886                               TFromD<D>* HWY_RESTRICT unaligned) {
   1887  const Twice<decltype(d)> d2;
   1888  const auto v0 = ZeroExtendVector(d2, part0);
   1889  const auto v1 = ZeroExtendVector(d2, part1);
   1890  const auto v10 = InterleaveLower(d2, v0, v1);
   1891  StoreU(v10, d2, unaligned);
   1892 }
   1893 
   1894 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
   1895 // TableLookupBytes)
   1896 
   1897 namespace detail {
   1898 
   1899 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
   1900 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   1901 HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
   1902                                       D d, TFromD<D>* HWY_RESTRICT unaligned) {
   1903  constexpr size_t kN = MaxLanes(d);
   1904  StoreU(A, d, unaligned + 0 * kN);
   1905  StoreU(B, d, unaligned + 1 * kN);
   1906  StoreU(C, d, unaligned + 2 * kN);
   1907 }
   1908 
   1909 }  // namespace detail
   1910 
   1911 // >= 128-bit vector, 8-bit lanes
// Interleaves v0/v1/v2 into 3 output vectors (r,g,b,r,g,b,...) for 8-bit
// lanes. Each output is the OR of three byte shuffles that scatter one
// input's lanes into place; 0x80 indices make TableLookupBytesOr0 emit zero.
// Later shuffle vectors are derived by adding 5 or 6 to earlier ones; the
// 0x80 "zero" entries stay >= 0x80 after the addition, so they keep zeroing.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  using VU = VFromD<decltype(du)>;
  const VU k5 = Set(du, TU{5});
  const VU k6 = Set(du, TU{6});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  const VFromD<decltype(du)> shuf_A0 =
      Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
                          0x80, 0x80, 4, 0x80, 0x80, 5);
  // Cannot reuse shuf_A0 because it contains 5.
  const VFromD<decltype(du)> shuf_A1 =
      Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
                          3, 0x80, 0x80, 4, 0x80, 0x80);
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  // cannot reuse shuf_A0 (has 5)
  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const VU vA0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const VU vA1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const VU vA2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const VFromD<D> A = BitCast(d, vA0 | vA1 | vA2);

  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
  const VU shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
  const VU shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);

  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
  const VU shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
  const VU shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
  const VU shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
  const VU vC0 = TableLookupBytesOr0(v0, shuf_C0);
  const VU vC1 = TableLookupBytesOr0(v1, shuf_C1);
  const VU vC2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
   1961 
   1962 // >= 128-bit vector, 16-bit lanes
// Interleaves v0/v1/v2 into 3 output vectors for 16-bit lanes. Same scheme as
// the 8-bit case above, but shuffle indices address bytes (a lane is a byte
// pair), so successive shuffle vectors differ by 2 or 3 lane widths (k2/k3).
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending. Note that these are byte
  // indices for 16-bit lanes.
  const VFromD<decltype(du8)> shuf_A1 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
                          0x80, 0x80, 0x80, 0x80, 4, 5);
  const VFromD<decltype(du8)> shuf_A2 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
                          0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);

  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  // shuf_A0 is shuf_A1 rotated right by one lane (2 bytes).
  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);

  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);

  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
  const VU8 shuf_B0 = shuf_A1 + k3;  // 5..4..3.
  const VU8 shuf_B1 = shuf_A2 + k3;  // ..4..3..
  const VU8 shuf_B2 = shuf_A0 + k2;  // .4..3..2
  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);

  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU8 shuf_C0 = shuf_B1 + k3;  // ..7..6..
  const VU8 shuf_C1 = shuf_B2 + k3;  // .7..6..5
  const VU8 shuf_C2 = shuf_B0 + k2;  // 7..6..5.
  const VU8 vC0 = TableLookupBytesOr0(v0, shuf_C0);
  const VU8 vC1 = TableLookupBytesOr0(v1, shuf_C1);
  const VU8 vC2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
   2011 
   2012 // >= 128-bit vector, 32-bit lanes
// Interleaves v0/v1/v2 into 3 output vectors for 32-bit lanes, using OddEven
// blends and lane shifts, then pairing up lanes via interleaves on the
// widened (64-bit) type.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;

  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
  const VFromD<D> A = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));

  // Shift inputs down so the lanes needed for B line up in even/odd position.
  const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1);
  const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0);
  const VFromD<D> v21_v11 = OddEven(v2, v1_321);
  const VFromD<D> v12_v02 = OddEven(v1_321, v0_32);
  // B: v1[2],v0[2], v2[1],v1[1]
  const VFromD<D> B = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));

  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
  const VFromD<D> v23_v13 = OddEven(v2, v1_321);
  const VFromD<D> v03_v22 = OddEven(v0, v2);
  // C: v2[3],v1[3],v0[3], v2[2]
  const VFromD<D> C = BitCast(
      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
   2041 
   2042 // >= 128-bit vector, 64-bit lanes
   2043 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
   2044 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   2045                               TFromD<D>* HWY_RESTRICT unaligned) {
   2046  const VFromD<D> A = InterleaveLower(d, v0, v1);
   2047  const VFromD<D> B = OddEven(v0, v2);
   2048  const VFromD<D> C = InterleaveUpper(d, v1, v2);
   2049  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
   2050 }
   2051 
   2052 // 64-bit vector, 8-bit lanes
// 64-bit vector, 8-bit lanes: promote the 8-lane parts to full 128-bit
// vectors (reusing the raw register), build one full interleaved vector plus
// one half vector, and store 16 + 8 = 24 bytes.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and first result.
  constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
  const Full128<uint8_t> du;
  using VU = VFromD<decltype(du)>;
  const Full128<TFromD<D>> d_full;
  const VU k5 = Set(du, uint8_t{5});
  const VU k6 = Set(du, uint8_t{6});

  // Widen the parts by reinterpreting their raw registers as full vectors;
  // only the lower 8 lanes are meaningful.
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU shuf_A0 = Load(du, tbl_v0);
  const VU shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (5 in MSB)
  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * kFullN);

  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  // Derived by adding 5/6 to the A shuffles; 0x80 entries stay >= 0x80 and
  // thus continue to select zero.
  const VU shuf_B0 = shuf_A2 + k6;  // ..7..6..
  const VU shuf_B1 = shuf_A0 + k5;  // .7..6..5
  const VU shuf_B2 = shuf_A1 + k5;  // 7..6..5.
  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
  // Truncate back to the half vector; only its lower 8 bytes are stored.
  const VFromD<D> B{BitCast(d_full, vB0 | vB1 | vB2).raw};
  StoreU(B, d, unaligned + 1 * kFullN);
}
   2099 
   2100 // 64-bit vector, 16-bit lanes
// 64-bit vector, 16-bit lanes: widen the 4-lane parts to full 128-bit
// vectors (reusing the raw register), build one full interleaved vector plus
// one half vector, and store 8 + 4 = 12 lanes.
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D dh,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<D> d_full;
  const Full128<uint8_t> du8;
  using VU8 = VFromD<decltype(du8)>;
  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});

  // Reinterpret the raw registers as full vectors; only the lower 4 lanes
  // are meaningful.
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0,    1,    0x80, 0x80, 0x80, 0x80,
      2,    3,    0x80, 0x80, 0x80, 0x80, 4,    5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0,    1,    0x80, 0x80,
      0x80, 0x80, 2,    3,    0x80, 0x80, 0x80, 0x80};

  // The interleaved vectors will be named A, B; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
                                          // .2..1..0
  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..

  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned);

  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
  // Derived by adding 2 or 3 lane widths; 0x80 entries stay >= 0x80 and thus
  // continue to select zero.
  const VU8 shuf_B0 = shuf_A1 + k3;  // ..3.
  const VU8 shuf_B1 = shuf_A2 + k3;  // .3..
  const VU8 shuf_B2 = shuf_A0 + k2;  // 3..2
  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<decltype(d_full)> B = BitCast(d_full, vB0 | vB1 | vB2);
  // Store only the lower half of B (4 lanes) after the full vector A.
  StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
}
   2149 
   2150 // 64-bit vector, 32-bit lanes
   2151 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
   2152 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   2153                               TFromD<D>* HWY_RESTRICT unaligned) {
   2154  // (same code as 128-bit vector, 64-bit lanes)
   2155  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
   2156  const VFromD<D> v01_v20 = OddEven(v0, v2);
   2157  const VFromD<D> v21_v11 = InterleaveUpper(d, v1, v2);
   2158  constexpr size_t kN = MaxLanes(d);
   2159  StoreU(v10_v00, d, unaligned + 0 * kN);
   2160  StoreU(v01_v20, d, unaligned + 1 * kN);
   2161  StoreU(v21_v11, d, unaligned + 2 * kN);
   2162 }
   2163 
   2164 // 64-bit lanes are handled by the N=1 case below.
   2165 
   2166 // <= 32-bit vector, 8-bit lanes
// <= 32-bit vector, 8-bit lanes (2 or 4 lanes): interleave inside one full
// 128-bit register, then copy only the 3 * vector-size bytes that are valid
// via a stack buffer (avoids writing past the caller's 6- or 12-byte区 area).
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du;
  using VU = VFromD<decltype(du)>;
  const Full128<TFromD<D>> d_full;

  // Reinterpret the raw registers as full vectors; only the lower lanes are
  // meaningful.
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0,    0x80, 0x80, 1,    0x80, 0x80, 2,    0x80,
      0x80, 3,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU shuf_A0 = Load(du, tbl_v0);
  // Rotations of shuf_A0 place v1/v2 lanes one and two bytes further along.
  const VU shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const VU shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}
   2200 
   2201 // 32-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Stores two interleaved triples of 16-bit lanes to `unaligned`.
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du8;
  using VU8 = VFromD<decltype(du8)>;
  const Full128<TFromD<D>> d_full;

  // Widen the 32-bit partial inputs to full vectors (upper lanes unused).
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2). The table expands v2 lanes to their place, with
  // 0x80 so lanes to be filled from other vectors are 0 to enable blending by
  // ORing together. (0x80 in a TableLookupBytesOr0 index yields a zero byte.)
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0,    1,    0x80, 0x80,
      0x80, 0x80, 2,    3,    0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
  // Rotating right by 2/4 bytes derives the v1/v0 placement tables.
  const VU8 shuf_A1 =
      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);  // ...1..0.
  const VU8 shuf_A0 =
      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);  // ....1..0
  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);       // ..1..0
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);       // .1..0.
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);       // 1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  // Store to a stack buffer, then copy only the 3*N valid bytes out.
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}
   2236 
   2237 // Single-element vector, any lane size: just store directly
   2238 template <class D, HWY_IF_LANES_D(D, 1)>
   2239 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   2240                               TFromD<D>* HWY_RESTRICT unaligned) {
   2241  StoreU(v0, d, unaligned + 0);
   2242  StoreU(v1, d, unaligned + 1);
   2243  StoreU(v2, d, unaligned + 2);
   2244 }
   2245 
   2246 // ------------------------------ StoreInterleaved4
   2247 
   2248 namespace detail {
   2249 
   2250 // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
   2251 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   2252 HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC,
   2253                                       VFromD<D> vD, D d,
   2254                                       TFromD<D>* HWY_RESTRICT unaligned) {
   2255  constexpr size_t kN = MaxLanes(d);
   2256  StoreU(vA, d, unaligned + 0 * kN);
   2257  StoreU(vB, d, unaligned + 1 * kN);
   2258  StoreU(vC, d, unaligned + 2 * kN);
   2259  StoreU(vD, d, unaligned + 3 * kN);
   2260 }
   2261 
   2262 }  // namespace detail
   2263 
   2264 // >= 128-bit vector, 8..32-bit lanes
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Two-level zip: first pair lanes of (v0,v1) and (v2,v3) into wide lanes,
  // then interleave those wide pairs so each group of four output lanes is
  // v3[i] v2[i] v1[i] v0[i].
  const RepartitionToWide<decltype(d)> dw;
  const auto v10L = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32L = ZipLower(dw, v2, v3);
  const auto v10U = ZipUpper(dw, v0, v1);
  const auto v32U = ZipUpper(dw, v2, v3);
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L));  // 3210
  const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L));
  const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U));
  const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U));
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}
   2281 
   2282 // >= 128-bit vector, 64-bit lanes
   2283 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
   2284 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
   2285                               VFromD<D> v3, D d,
   2286                               TFromD<D>* HWY_RESTRICT unaligned) {
   2287  // The interleaved vectors are vA, vB, vC, vD.
   2288  const VFromD<D> vA = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
   2289  const VFromD<D> vB = InterleaveLower(d, v2, v3);
   2290  const VFromD<D> vC = InterleaveUpper(d, v0, v1);
   2291  const VFromD<D> vD = InterleaveUpper(d, v2, v3);
   2292  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
   2293 }
   2294 
   2295 // 64-bit vector, 8..32-bit lanes
// 64-bit vector, 8..32-bit lanes: interleave within a full 128-bit vector so
// the 4*N result can be written with two stores instead of four.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  // Widen the 64-bit partial inputs to full vectors (upper halves unused).
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  // A holds the first half of the interleaved quads, B the second half.
  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
  StoreU(A, d_full, unaligned);
  StoreU(B, d_full, unaligned + MaxLanes(d_full));
}
   2314 
   2315 // 64-bit vector, 64-bit lane
   2316 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
   2317 HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
   2318                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
   2319                               TFromD<D>* HWY_RESTRICT unaligned) {
   2320  // Use full vectors to reduce the number of stores.
   2321  const Full128<TFromD<D>> d_full;
   2322  const VFromD<decltype(d_full)> v0{part0.raw};
   2323  const VFromD<decltype(d_full)> v1{part1.raw};
   2324  const VFromD<decltype(d_full)> v2{part2.raw};
   2325  const VFromD<decltype(d_full)> v3{part3.raw};
   2326  const auto A = InterleaveLower(d_full, v0, v1);  // v1[0] v0[0]
   2327  const auto B = InterleaveLower(d_full, v2, v3);
   2328  StoreU(A, d_full, unaligned);
   2329  StoreU(B, d_full, unaligned + MaxLanes(d_full));
   2330 }
   2331 
   2332 // <= 32-bit vectors
// <= 32-bit vectors: at most 4 lanes, so one zipped 128-bit vector holds the
// whole interleaved result; copy out only the valid prefix.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  // Widen partial inputs to full vectors (upper lanes unused).
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
  // Store the full vector to a stack buffer, then copy only 4*N valid bytes
  // to avoid writing past the caller's destination.
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(v3210, d_full, buf);
  CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
}
   2351 
   2352 #endif  // HWY_NATIVE_LOAD_STORE_INTERLEAVED
   2353 
   2354 // ------------------------------ PairwiseAdd/PairwiseSub
   2355 #if (defined(HWY_NATIVE_PAIRWISE_ADD) == defined(HWY_TARGET_TOGGLE))
   2356 #ifdef HWY_NATIVE_PAIRWISE_ADD
   2357 #undef HWY_NATIVE_PAIRWISE_ADD
   2358 #else
   2359 #define HWY_NATIVE_PAIRWISE_ADD
   2360 #endif
   2361 
   2362 template <class D, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1)>
   2363 HWY_API V PairwiseAdd(D d, V a, V b) {
   2364  return Add(InterleaveEven(d, a, b), InterleaveOdd(d, a, b));
   2365 }
   2366 
   2367 #endif
   2368 
   2369 #if (defined(HWY_NATIVE_PAIRWISE_SUB) == defined(HWY_TARGET_TOGGLE))
   2370 #ifdef HWY_NATIVE_PAIRWISE_SUB
   2371 #undef HWY_NATIVE_PAIRWISE_SUB
   2372 #else
   2373 #define HWY_NATIVE_PAIRWISE_SUB
   2374 #endif
   2375 
   2376 template <class D, class V = VFromD<D>(), HWY_IF_LANES_GT_D(D, 1)>
   2377 HWY_API V PairwiseSub(D d, V a, V b) {
   2378  return Sub(InterleaveOdd(d, a, b), InterleaveEven(d, a, b));
   2379 }
   2380 
   2381 #endif
   2382 
   2383 // Load/StoreInterleaved for special floats. Requires HWY_GENERIC_IF_EMULATED_D
   2384 // is defined such that it is true only for types that actually require these
   2385 // generic implementations.
   2386 #if HWY_IDE || (defined(HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED) == \
   2387                    defined(HWY_TARGET_TOGGLE) &&                           \
   2388                defined(HWY_GENERIC_IF_EMULATED_D))
   2389 #ifdef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
   2390 #undef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
   2391 #else
   2392 #define HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
   2393 #endif
   2394 #if HWY_IDE
   2395 #define HWY_GENERIC_IF_EMULATED_D(D) int
   2396 #endif
   2397 
   2398 template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
   2399 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
   2400                              VFromD<D>& v0, VFromD<D>& v1) {
   2401  const RebindToUnsigned<decltype(d)> du;
   2402  VFromD<decltype(du)> vu0, vu1;
   2403  LoadInterleaved2(du, detail::U16LanePointer(unaligned), vu0, vu1);
   2404  v0 = BitCast(d, vu0);
   2405  v1 = BitCast(d, vu1);
   2406 }
   2407 
   2408 template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
   2409 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
   2410                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   2411  const RebindToUnsigned<decltype(d)> du;
   2412  VFromD<decltype(du)> vu0, vu1, vu2;
   2413  LoadInterleaved3(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2);
   2414  v0 = BitCast(d, vu0);
   2415  v1 = BitCast(d, vu1);
   2416  v2 = BitCast(d, vu2);
   2417 }
   2418 
   2419 template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
   2420 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
   2421                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
   2422                              VFromD<D>& v3) {
   2423  const RebindToUnsigned<decltype(d)> du;
   2424  VFromD<decltype(du)> vu0, vu1, vu2, vu3;
   2425  LoadInterleaved4(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2, vu3);
   2426  v0 = BitCast(d, vu0);
   2427  v1 = BitCast(d, vu1);
   2428  v2 = BitCast(d, vu2);
   2429  v3 = BitCast(d, vu3);
   2430 }
   2431 
   2432 template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
   2433 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
   2434                               T* HWY_RESTRICT unaligned) {
   2435  const RebindToUnsigned<decltype(d)> du;
   2436  StoreInterleaved2(BitCast(du, v0), BitCast(du, v1), du,
   2437                    detail::U16LanePointer(unaligned));
   2438 }
   2439 
   2440 template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
   2441 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   2442                               T* HWY_RESTRICT unaligned) {
   2443  const RebindToUnsigned<decltype(d)> du;
   2444  StoreInterleaved3(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2), du,
   2445                    detail::U16LanePointer(unaligned));
   2446 }
   2447 
   2448 template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
   2449 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
   2450                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   2451  const RebindToUnsigned<decltype(d)> du;
   2452  StoreInterleaved4(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2),
   2453                    BitCast(du, v3), du, detail::U16LanePointer(unaligned));
   2454 }
   2455 
   2456 #endif  // HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
   2457 
   2458 // ------------------------------ LoadN
   2459 
   2460 #if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
   2461 
   2462 #ifdef HWY_NATIVE_LOAD_N
   2463 #undef HWY_NATIVE_LOAD_N
   2464 #else
   2465 #define HWY_NATIVE_LOAD_N
   2466 #endif
   2467 
   2468 #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
   2469 namespace detail {
   2470 
// Resizes a partial-load result to DTo, guaranteeing that lanes past the
// first Lanes(d_from) lanes of the result are zero.
template <class DTo, class DFrom>
HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
                                          VFromD<DFrom> v) {
#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw
  // past the first (lowest-index) Lanes(d_from) lanes of v.raw if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true
  (void)d_from;
  return ResizeBitCast(d_to, v);
#else
  // On other targets such as PPC/NEON, the contents of any lanes past the first
  // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true.
  return ZeroExtendResizeBitCast(d_to, d_from, v);
#endif
}
   2487 
   2488 }  // namespace detail
   2489 
   2490 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
   2491          HWY_IF_NOT_BF16_D(D)>
   2492 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   2493                        size_t num_lanes) {
   2494  return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
   2495 }
   2496 
   2497 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
   2498          HWY_IF_NOT_BF16_D(D)>
   2499 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   2500                          size_t num_lanes) {
   2501  return (num_lanes > 0) ? LoadU(d, p) : no;
   2502 }
   2503 
   2504 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
   2505          HWY_IF_NOT_BF16_D(D)>
   2506 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   2507                        size_t num_lanes) {
   2508  const FixedTag<TFromD<D>, 1> d1;
   2509 
   2510  if (num_lanes >= 2) return LoadU(d, p);
   2511  if (num_lanes == 0) return Zero(d);
   2512  return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
   2513 }
   2514 
   2515 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
   2516          HWY_IF_NOT_BF16_D(D)>
   2517 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   2518                          size_t num_lanes) {
   2519  const FixedTag<TFromD<D>, 1> d1;
   2520 
   2521  if (num_lanes >= 2) return LoadU(d, p);
   2522  if (num_lanes == 0) return no;
   2523  return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
   2524 }
   2525 
   2526 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
   2527          HWY_IF_NOT_BF16_D(D)>
   2528 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   2529                        size_t num_lanes) {
   2530  const FixedTag<TFromD<D>, 2> d2;
   2531  const Half<decltype(d2)> d1;
   2532 
   2533  if (num_lanes >= 4) return LoadU(d, p);
   2534  if (num_lanes == 0) return Zero(d);
   2535  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
   2536 
   2537  // Two or three lanes.
   2538  const VFromD<D> v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p));
   2539  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
   2540 }
   2541 
   2542 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
   2543          HWY_IF_NOT_BF16_D(D)>
   2544 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   2545                          size_t num_lanes) {
   2546  const FixedTag<TFromD<D>, 2> d2;
   2547 
   2548  if (num_lanes >= 4) return LoadU(d, p);
   2549  if (num_lanes == 0) return no;
   2550  if (num_lanes == 1) return InsertLane(no, 0, p[0]);
   2551 
   2552  // Two or three lanes.
   2553  const VFromD<D> v_lo =
   2554      ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p)));
   2555  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
   2556 }
   2557 
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  // Eight-lane vector: decompose num_lanes into a leading run of 4 lanes
  // (bit 2) plus a trailing part of 0..3 lanes (bits 1 and 0); each part is
  // loaded with the widest safe load. Missing lanes are zero.
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 8) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  // Number of lanes covered by the leading 4-lane load (0 or 4).
  const size_t leading_len = num_lanes & 4;
  VFromD<decltype(d4)> v_trailing = Zero(d4);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      // Three trailing lanes: 2-lane load plus a 1-lane load above it.
      v_trailing = Combine(
          d4,
          detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
          v_trailing_lo2);
    } else {
      v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  }

  if (leading_len != 0) {
    return Combine(d, v_trailing, LoadU(d4, p));
  } else {
    return detail::LoadNResizeBitCast(d, d4, v_trailing);
  }
}
   2593 
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  // As the 8-lane LoadN above, but lanes past num_lanes are filled from `no`.
  // NOTE(review): some paths fill padding from no's lane 0 or lower lanes
  // rather than the positionally-corresponding lane; callers are expected to
  // pass a splat, for which this is equivalent — confirm against the spec.
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 8) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  // Number of lanes covered by the leading 4-lane load (0 or 4).
  const size_t leading_len = num_lanes & 4;
  VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      // Three trailing lanes: 2 from memory, then 1 from memory + 1 from no.
      v_trailing = Combine(
          d4,
          InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
                          ResizeBitCast(d2, no)),
          v_trailing_lo2);
    } else {
      v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
                                    ResizeBitCast(d4, v_trailing_lo2));
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  }

  if (leading_len != 0) {
    return Combine(d, v_trailing, LoadU(d4, p));
  } else {
    return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing));
  }
}
   2631 
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  // Sixteen-lane vector: decompose num_lanes into a leading multiple of 4
  // (bits 3..2) plus a trailing part of 0..3 lanes (bits 1..0), each loaded
  // with the widest safe load. Missing lanes are zero.
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 16) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  // Number of lanes covered by the leading loads (0, 4, 8 or 12).
  const size_t leading_len = num_lanes & 12;
  VFromD<decltype(d4)> v_trailing = Zero(d4);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      // Three trailing lanes: 2-lane load plus a 1-lane load above it.
      v_trailing = Combine(
          d4,
          detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
          v_trailing_lo2);
    } else {
      v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  }

  if (leading_len != 0) {
    if (leading_len >= 8) {
      // v_hi7 holds result lanes 8..15 (8-lane leading load, plus optional
      // 4-lane load and the trailing lanes).
      const VFromD<decltype(d8)> v_hi7 =
          ((leading_len & 4) != 0)
              ? Combine(d8, v_trailing, LoadU(d4, p + 8))
              : detail::LoadNResizeBitCast(d8, d4, v_trailing);
      return Combine(d, v_hi7, LoadU(d8, p));
    } else {
      return detail::LoadNResizeBitCast(d, d8,
                                        Combine(d8, v_trailing, LoadU(d4, p)));
    }
  } else {
    return detail::LoadNResizeBitCast(d, d4, v_trailing);
  }
}
   2677 
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  // As the 16-lane LoadN above, but lanes past num_lanes are filled from
  // `no`. NOTE(review): some paths fill padding from lower lanes of `no`
  // rather than the positionally-corresponding lane; callers are expected to
  // pass a splat, for which this is equivalent — confirm against the spec.
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 16) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  // Number of lanes covered by the leading loads (0, 4, 8 or 12).
  const size_t leading_len = num_lanes & 12;
  VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      // Three trailing lanes: 2 from memory, then 1 from memory + 1 from no.
      v_trailing = Combine(
          d4,
          InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
                          ResizeBitCast(d2, no)),
          v_trailing_lo2);
    } else {
      v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
                                    ResizeBitCast(d4, v_trailing_lo2));
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  }

  if (leading_len != 0) {
    if (leading_len >= 8) {
      // v_hi7 holds result lanes 8..15.
      const VFromD<decltype(d8)> v_hi7 =
          ((leading_len & 4) != 0)
              ? Combine(d8, v_trailing, LoadU(d4, p + 8))
              : ConcatUpperLower(d8, ResizeBitCast(d8, no),
                                 ResizeBitCast(d8, v_trailing));
      return Combine(d, v_hi7, LoadU(d8, p));
    } else {
      return ConcatUpperLower(
          d, ResizeBitCast(d, no),
          ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p))));
    }
  } else {
    const Repartition<uint32_t, D> du32;
    // lowest 4 bytes from v_trailing, next 4 from no.
    const VFromD<decltype(du32)> lo8 =
        InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no));
    return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8));
  }
}
   2731 
   2732 #if HWY_MAX_BYTES >= 32
   2733 
   2734 template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
   2735 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   2736                        size_t num_lanes) {
   2737  if (num_lanes >= Lanes(d)) return LoadU(d, p);
   2738 
   2739  const Half<decltype(d)> dh;
   2740  const size_t half_N = Lanes(dh);
   2741  if (num_lanes <= half_N) {
   2742    return ZeroExtendVector(d, LoadN(dh, p, num_lanes));
   2743  } else {
   2744    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
   2745    const VFromD<decltype(dh)> v_hi = LoadN(dh, p + half_N, num_lanes - half_N);
   2746    return Combine(d, v_hi, v_lo);
   2747  }
   2748 }
   2749 
   2750 template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
   2751 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   2752                          size_t num_lanes) {
   2753  if (num_lanes >= Lanes(d)) return LoadU(d, p);
   2754 
   2755  const Half<decltype(d)> dh;
   2756  const size_t half_N = Lanes(dh);
   2757  const VFromD<decltype(dh)> no_h = LowerHalf(no);
   2758  if (num_lanes <= half_N) {
   2759    return ConcatUpperLower(d, no,
   2760                            ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes)));
   2761  } else {
   2762    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
   2763    const VFromD<decltype(dh)> v_hi =
   2764        LoadNOr(no_h, dh, p + half_N, num_lanes - half_N);
   2765    return Combine(d, v_hi, v_lo);
   2766  }
   2767 }
   2768 
   2769 #endif  // HWY_MAX_BYTES >= 32
   2770 
   2771 template <class D, HWY_IF_BF16_D(D)>
   2772 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   2773                        size_t num_lanes) {
   2774  const RebindToUnsigned<D> du;
   2775  return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
   2776 }
   2777 
   2778 template <class D, HWY_IF_BF16_D(D)>
   2779 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   2780                          size_t num_lanes) {
   2781  const RebindToUnsigned<D> du;
   2782  return BitCast(
   2783      d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
   2784 }
   2785 
   2786 #else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
   2787 
   2788 // For SVE and non-sanitizer AVX-512; RVV has its own specialization.
   2789 template <class D>
   2790 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
   2791                        size_t num_lanes) {
   2792 #if HWY_MEM_OPS_MIGHT_FAULT
   2793  if (num_lanes <= 0) return Zero(d);
   2794 #endif
   2795 
   2796  return MaskedLoad(FirstN(d, num_lanes), d, p);
   2797 }
   2798 
   2799 template <class D>
   2800 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
   2801                          size_t num_lanes) {
   2802 #if HWY_MEM_OPS_MIGHT_FAULT
   2803  if (num_lanes <= 0) return no;
   2804 #endif
   2805 
   2806  return MaskedLoadOr(no, FirstN(d, num_lanes), d, p);
   2807 }
   2808 
   2809 #endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
   2810 #endif  // HWY_NATIVE_LOAD_N
   2811 
   2812 // ------------------------------ StoreN
   2813 #if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
   2814 #ifdef HWY_NATIVE_STORE_N
   2815 #undef HWY_NATIVE_STORE_N
   2816 #else
   2817 #define HWY_NATIVE_STORE_N
   2818 #endif
   2819 
   2820 #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
   2821 namespace detail {
   2822 
// Extracts the upper half of v for halves of at most 4 bytes, where a direct
// UpperHalf may not be available: byte-shift within a wider vector instead.
template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  // Minimum vector width for which ShiftRightBytes is usable on this target.
  constexpr size_t kMinShrVectBytes = HWY_TARGET_IS_NEON ? 8 : 16;
  const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
  // Shifting right by the half's byte count moves the upper half into the
  // lowest bytes, which the final ResizeBitCast then selects.
  return ResizeBitCast(
      dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
}
   2830 
// Halves larger than 4 bytes can use UpperHalf directly.
template <class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  return UpperHalf(dh, v);
}
   2835 
   2836 }  // namespace detail
   2837 
   2838 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
   2839          typename T = TFromD<D>>
   2840 HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
   2841                    size_t max_lanes_to_store) {
   2842  if (max_lanes_to_store > 0) {
   2843    StoreU(v, d, p);
   2844  }
   2845 }
   2846 
   2847 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
   2848          typename T = TFromD<D>>
   2849 HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
   2850                    size_t max_lanes_to_store) {
   2851  if (max_lanes_to_store > 1) {
   2852    StoreU(v, d, p);
   2853  } else if (max_lanes_to_store == 1) {
   2854    const FixedTag<TFromD<D>, 1> d1;
   2855    StoreU(LowerHalf(d1, v), d1, p);
   2856  }
   2857 }
   2858 
   2859 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
   2860          typename T = TFromD<D>>
   2861 HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
   2862                    size_t max_lanes_to_store) {
   2863  const FixedTag<TFromD<D>, 2> d2;
   2864  const Half<decltype(d2)> d1;
   2865 
   2866  if (max_lanes_to_store > 1) {
   2867    if (max_lanes_to_store >= 4) {
   2868      StoreU(v, d, p);
   2869    } else {
   2870      StoreU(ResizeBitCast(d2, v), d2, p);
   2871      if (max_lanes_to_store == 3) {
   2872        StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2);
   2873      }
   2874    }
   2875  } else if (max_lanes_to_store == 1) {
   2876    StoreU(ResizeBitCast(d1, v), d1, p);
   2877  }
   2878 }
   2879 
   2880 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
   2881          typename T = TFromD<D>>
   2882 HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
   2883                    size_t max_lanes_to_store) {
   2884  const FixedTag<TFromD<D>, 4> d4;
   2885  const Half<decltype(d4)> d2;
   2886  const Half<decltype(d2)> d1;
   2887 
   2888  if (max_lanes_to_store <= 1) {
   2889    if (max_lanes_to_store == 1) {
   2890      StoreU(ResizeBitCast(d1, v), d1, p);
   2891    }
   2892  } else if (max_lanes_to_store >= 8) {
   2893    StoreU(v, d, p);
   2894  } else if (max_lanes_to_store >= 4) {
   2895    StoreU(LowerHalf(d4, v), d4, p);
   2896    StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4,
   2897           max_lanes_to_store - 4);
   2898  } else {
   2899    StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store);
   2900  }
   2901 }
   2902 
// Generic StoreN for <=16-byte vectors with exactly 16 lanes: stores the first
// HWY_MIN(max_lanes_to_store, 16) lanes of v to p without writing any bytes
// past that count.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  // Tags for the 8/4/2/1-lane sub-vectors; d4 and d2 only exist so the
  // smaller tags can be derived via Half.
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store <= 1) {
    if (max_lanes_to_store == 1) {
      // Exactly one lane requested: store just the first lane.
      StoreU(ResizeBitCast(d1, v), d1, p);
    }
    // Zero lanes requested: no store at all.
  } else if (max_lanes_to_store >= 16) {
    // Entire vector fits within the requested count.
    StoreU(v, d, p);
  } else if (max_lanes_to_store >= 8) {
    // Store the lower 8 lanes, then recurse for the remaining 0..7 lanes of
    // the upper half.
    StoreU(LowerHalf(d8, v), d8, p);
    StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8,
           max_lanes_to_store - 8);
  } else {
    // 2..7 lanes: delegate to the 8-lane StoreN overload.
    StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store);
  }
}
   2926 
   2927 #if HWY_MAX_BYTES >= 32
// StoreN for vectors wider than 16 bytes: recursively splits the vector in
// half until the remaining partial store fits one of the fixed-size overloads
// above.
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  if (max_lanes_to_store >= N) {
    // Entire vector fits within the requested count.
    StoreU(v, d, p);
    return;
  }

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (max_lanes_to_store <= half_N) {
    // Everything to store lies in the lower half.
    StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store);
  } else {
    // Store the full lower half, then the remaining lanes of the upper half.
    StoreU(LowerHalf(dh, v), dh, p);
    StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N);
  }
}
   2946 #endif  // HWY_MAX_BYTES >= 32
   2947 
   2948 #else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
// StoreN fallback for targets where masked stores are usable (scalable
// vectors, or partial stores that cannot fault): a single BlendedStore of the
// first lanes.
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  // Clamp so FirstN never exceeds the vector length.
  const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
#if HWY_MEM_OPS_MIGHT_FAULT
  // Skip the store entirely when no lanes are requested.
  if (clamped_max_lanes_to_store == 0) return;
#endif

  BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);

  // Inform memory sanitizers that the stored bytes are initialized.
  detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
}
   2962 #endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
   2963 
   2964 #endif  // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
   2965 
   2966 // ------------------------------ TruncateStore
   2967 #if (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))
   2968 #ifdef HWY_NATIVE_STORE_TRUNCATED
   2969 #undef HWY_NATIVE_STORE_TRUNCATED
   2970 #else
   2971 #define HWY_NATIVE_STORE_TRUNCATED
   2972 #endif
   2973 
   2974 template <class D, class T, HWY_IF_T_SIZE_GT_D(D, sizeof(T)),
   2975          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
   2976 HWY_API void TruncateStore(VFromD<D> v, const D /*d*/, T* HWY_RESTRICT p) {
   2977  using DTo = Rebind<T, D>;
   2978  DTo dsmall;
   2979  StoreU(TruncateTo(dsmall, v), dsmall, p);
   2980 }
   2981 
   2982 #endif  // (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))
   2983 
   2984 // ------------------------------ Scatter
   2985 
   2986 #if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
   2987 #ifdef HWY_NATIVE_SCATTER
   2988 #undef HWY_NATIVE_SCATTER
   2989 #else
   2990 #define HWY_NATIVE_SCATTER
   2991 #endif
   2992 
// Generic ScatterOffset: writes lane i of v to the address base + offset[i]
// bytes (offsets are in BYTES, not elements). Implemented by spilling lanes
// and offsets to stack buffers and copying scalar-by-scalar.
template <class D, typename T = TFromD<D>>
HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> offset) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, di, offset_lanes);

  // CopyBytes avoids alignment/aliasing issues at the byte-offset targets.
  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}
   3011 
   3012 template <class D, typename T = TFromD<D>>
   3013 HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base,
   3014                          VFromD<RebindToSigned<D>> index) {
   3015  const RebindToSigned<decltype(d)> di;
   3016  using TI = TFromD<decltype(di)>;
   3017  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
   3018 
   3019  HWY_ALIGN T lanes[MaxLanes(d)];
   3020  Store(v, d, lanes);
   3021 
   3022  HWY_ALIGN TI index_lanes[MaxLanes(d)];
   3023  Store(index, di, index_lanes);
   3024 
   3025  for (size_t i = 0; i < MaxLanes(d); ++i) {
   3026    base[index_lanes[i]] = lanes[i];
   3027  }
   3028 }
   3029 
// Generic MaskedScatterIndex: writes lane i of v to base[index[i]] only where
// the mask is true; masked-off lanes are not stored.
template <class D, typename T = TFromD<D>>
HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
                                T* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  // Materialize the mask as all-zero/all-one integer lanes so it can be
  // tested per lane in the scalar loop below.
  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask_lanes[i]) base[index_lanes[i]] = lanes[i];
  }
}
   3051 
   3052 template <class D, typename T = TFromD<D>>
   3053 HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
   3054                           VFromD<RebindToSigned<D>> index,
   3055                           const size_t max_lanes_to_store) {
   3056  const RebindToSigned<decltype(d)> di;
   3057  using TI = TFromD<decltype(di)>;
   3058  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
   3059 
   3060  for (size_t i = 0; i < MaxLanes(d); ++i) {
   3061    if (i < max_lanes_to_store) base[ExtractLane(index, i)] = ExtractLane(v, i);
   3062  }
   3063 }
   3064 #else
// ScatterIndexN for targets with a native masked scatter: store the first
// max_lanes_to_store lanes via a FirstN mask.
template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> index,
                           const size_t max_lanes_to_store) {
  MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
}
   3071 #endif  // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
   3072 
   3073 // ------------------------------ Gather
   3074 
   3075 #if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
   3076 #ifdef HWY_NATIVE_GATHER
   3077 #undef HWY_NATIVE_GATHER
   3078 #else
   3079 #define HWY_NATIVE_GATHER
   3080 #endif
   3081 
// Generic GatherOffset: loads lane i from the address base + offset[i] bytes
// (offsets are in BYTES, not elements). Offsets must be non-negative.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> offset) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, di, offset_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  // CopyBytes avoids alignment/aliasing issues at the byte-offset sources.
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(offset_lanes[i] >= 0);
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}
   3100 
   3101 template <class D, typename T = TFromD<D>>
   3102 HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
   3103                              VFromD<RebindToSigned<D>> index) {
   3104  const RebindToSigned<D> di;
   3105  using TI = TFromD<decltype(di)>;
   3106  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
   3107 
   3108  HWY_ALIGN TI index_lanes[MaxLanes(d)];
   3109  Store(index, di, index_lanes);
   3110 
   3111  HWY_ALIGN T lanes[MaxLanes(d)];
   3112  for (size_t i = 0; i < MaxLanes(d); ++i) {
   3113    HWY_DASSERT(index_lanes[i] >= 0);
   3114    lanes[i] = base[index_lanes[i]];
   3115  }
   3116  return Load(d, lanes);
   3117 }
   3118 
// Generic MaskedGatherIndex: loads lane i from base[index[i]] where the mask
// is true, else zero. Masked-off lanes do NOT access memory.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
                                    const T* HWY_RESTRICT base,
                                    VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(di)];
  Store(index, di, index_lanes);

  // Materialize the mask as all-zero/all-one integer lanes for scalar tests.
  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
  }
  return Load(d, lanes);
}
   3140 
// Generic MaskedGatherIndexOr: loads lane i from base[index[i]] where the
// mask is true, else takes lane i of `no`. Masked-off lanes do NOT access
// memory.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
                                      const T* HWY_RESTRICT base,
                                      VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(di)];
  Store(index, di, index_lanes);

  // Materialize the mask as all-zero/all-one integer lanes for scalar tests.
  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  // Fallback values for masked-off lanes.
  HWY_ALIGN T no_lanes[MaxLanes(d)];
  Store(no, d, no_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
  }
  return Load(d, lanes);
}
   3165 
// Generic GatherIndexN: gathers the first max_lanes_to_load lanes; the
// remaining lanes are zero (delegates to GatherIndexNOr with a zero default).
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> index,
                               const size_t max_lanes_to_load) {
  return GatherIndexNOr(Zero(d), d, base, index, max_lanes_to_load);
}
   3172 
   3173 template <class D, typename T = TFromD<D>>
   3174 HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
   3175                               VFromD<RebindToSigned<D>> index,
   3176                               const size_t max_lanes_to_load) {
   3177  const RebindToSigned<D> di;
   3178  using TI = TFromD<decltype(di)>;
   3179  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
   3180 
   3181  VFromD<D> v = no;
   3182  for (size_t i = 0; i < MaxLanes(d); ++i) {
   3183    if (i < max_lanes_to_load)
   3184      v = InsertLane(v, i, base[ExtractLane(index, i)]);
   3185  }
   3186  return v;
   3187 }
   3188 #else
// GatherIndexN for targets with a native masked gather: gather the first
// max_lanes_to_load lanes via a FirstN mask; remaining lanes are zero.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> index,
                               const size_t max_lanes_to_load) {
  return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
}
// GatherIndexNOr for targets with a native masked gather: remaining lanes
// take the corresponding lane of `no`.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
                                 VFromD<RebindToSigned<D>> index,
                                 const size_t max_lanes_to_load) {
  return MaskedGatherIndexOr(no, FirstN(d, max_lanes_to_load), d, base, index);
}
   3201 #endif  // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
   3202 
   3203 // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
   3204 
   3205 #if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
   3206 #ifdef HWY_NATIVE_INTEGER_ABS_DIFF
   3207 #undef HWY_NATIVE_INTEGER_ABS_DIFF
   3208 #else
   3209 #define HWY_NATIVE_INTEGER_ABS_DIFF
   3210 #endif
   3211 
   3212 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   3213 HWY_API V AbsDiff(V a, V b) {
   3214  return Sub(Max(a, b), Min(a, b));
   3215 }
   3216 
   3217 #endif  // HWY_NATIVE_INTEGER_ABS_DIFF
   3218 
   3219 #if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
   3220 #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
   3221 #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
   3222 #else
   3223 #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
   3224 #endif
   3225 
// Sums of absolute differences over groups of 8 adjacent u8/i8 lanes,
// widened to 64-bit lanes: composed from AbsDiff followed by SumsOf8 on the
// unsigned reinterpretation.
template <class V, HWY_IF_UI8_D(DFromV<V>),
          HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  // 8x-wider lanes (8 -> 64 bit) to hold the per-group sums.
  const RepartitionToWideX3<decltype(d)> dw;

  return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b))));
}
   3235 
   3236 #endif  // HWY_NATIVE_SUMS_OF_8_ABS_DIFF
   3237 
   3238 // ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64
   3239 
   3240 #if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
   3241 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
   3242 #undef HWY_NATIVE_I32_SATURATED_ADDSUB
   3243 #else
   3244 #define HWY_NATIVE_I32_SATURATED_ADDSUB
   3245 #endif
   3246 
// Saturating i32 addition. Signed overflow occurs iff a and b have the same
// sign and the wrapped sum has a different sign than a.
template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  // Sign bit of overflow_mask is set exactly when overflow happened:
  // (a, b same sign) AND (a, sum different sign).
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  // On overflow, saturate toward the sign of a: MAX if a >= 0, MIN if a < 0
  // (BroadcastSignBit(a) ^ MAX flips MAX to MIN when a is negative).
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
   3256 
// Saturating i32 subtraction. Signed overflow occurs iff a and b have
// different signs and the wrapped difference has a different sign than a.
template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  // Sign bit of overflow_mask is set exactly when overflow happened:
  // (a, b different sign) AND (a, diff different sign).
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  // On overflow, saturate toward the sign of a: MAX if a >= 0, MIN if a < 0.
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
   3266 
   3267 #endif  // HWY_NATIVE_I32_SATURATED_ADDSUB
   3268 
   3269 #if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
   3270 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
   3271 #undef HWY_NATIVE_I64_SATURATED_ADDSUB
   3272 #else
   3273 #define HWY_NATIVE_I64_SATURATED_ADDSUB
   3274 #endif
   3275 
// Saturating i64 addition; same sign-bit overflow detection as the i32
// version above.
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  // Overflow iff (a, b same sign) AND (a, sum different sign).
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  // Saturate toward the sign of a: MAX if a >= 0, MIN if a < 0.
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
   3285 
// Saturating i64 subtraction; same sign-bit overflow detection as the i32
// version above.
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  // Overflow iff (a, b different sign) AND (a, diff different sign).
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  // Saturate toward the sign of a: MAX if a >= 0, MIN if a < 0.
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
   3295 
   3296 #endif  // HWY_NATIVE_I64_SATURATED_ADDSUB
   3297 
   3298 #if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
   3299 #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
   3300 #undef HWY_NATIVE_U32_SATURATED_ADDSUB
   3301 #else
   3302 #define HWY_NATIVE_U32_SATURATED_ADDSUB
   3303 #endif
   3304 
   3305 template <class V, HWY_IF_U32_D(DFromV<V>)>
   3306 HWY_API V SaturatedAdd(V a, V b) {
   3307  return Add(a, Min(b, Not(a)));
   3308 }
   3309 
   3310 template <class V, HWY_IF_U32_D(DFromV<V>)>
   3311 HWY_API V SaturatedSub(V a, V b) {
   3312  return Sub(a, Min(a, b));
   3313 }
   3314 
   3315 #endif  // HWY_NATIVE_U32_SATURATED_ADDSUB
   3316 
   3317 #if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
   3318 #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
   3319 #undef HWY_NATIVE_U64_SATURATED_ADDSUB
   3320 #else
   3321 #define HWY_NATIVE_U64_SATURATED_ADDSUB
   3322 #endif
   3323 
   3324 template <class V, HWY_IF_U64_D(DFromV<V>)>
   3325 HWY_API V SaturatedAdd(V a, V b) {
   3326  return Add(a, Min(b, Not(a)));
   3327 }
   3328 
   3329 template <class V, HWY_IF_U64_D(DFromV<V>)>
   3330 HWY_API V SaturatedSub(V a, V b) {
   3331  return Sub(a, Min(a, b));
   3332 }
   3333 
   3334 #endif  // HWY_NATIVE_U64_SATURATED_ADDSUB
   3335 
   3336 // ------------------------------ Unsigned to signed demotions
   3337 
// Unsigned -> narrower signed demotion with saturation, built from the
// signed -> signed DemoteTo plus an unsigned Min to clamp the result.
template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
          class V2 = VFromD<Rebind<TFromV<V>, DN>>,
          hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(dn)> dn_u;

  // First, do a signed to signed demotion. This will convert any values
  // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
  // negative value.
  const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v));

  // Second, convert any negative values (which, reinterpreted as unsigned,
  // are larger than any valid result) to hwy::HighestValue<TFromD<DN>>()
  // using an unsigned Min operation.
  const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());

  return BitCast(
      dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
}
   3360 
   3361 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Two-vector unsigned -> narrower signed demotion with saturation; same
// approach as the single-vector DemoteTo above but using ReorderDemote2To.
template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(dn)> dn_u;

  // First, do a signed to signed demotion. This will convert any values
  // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
  // negative value.
  const auto i2i_demote_result =
      ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b));

  // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
  // using an unsigned Min operation.
  const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());

  return BitCast(
      dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
}
   3385 #endif
   3386 
   3387 // ------------------------------ PromoteLowerTo
   3388 
   3389 // There is no codegen advantage for a native version of this. It is provided
   3390 // only for convenience.
// Promotes the lower half of v's lanes to the wider type of d. There is no
// codegen advantage for a native version of this; provided for convenience.
template <class D, class V>
HWY_API VFromD<D> PromoteLowerTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, LowerHalf(dh, v));
}
   3398 
   3399 // ------------------------------ PromoteUpperTo
   3400 
   3401 #if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE))
   3402 #ifdef HWY_NATIVE_PROMOTE_UPPER_TO
   3403 #undef HWY_NATIVE_PROMOTE_UPPER_TO
   3404 #else
   3405 #define HWY_NATIVE_PROMOTE_UPPER_TO
   3406 #endif
   3407 
   3408 // This requires UpperHalf.
   3409 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   3410 
// Promotes the upper half of v's lanes to the wider type of d.
template <class D, class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}
   3418 
   3419 #endif  // HWY_TARGET != HWY_SCALAR
   3420 #endif  // HWY_NATIVE_PROMOTE_UPPER_TO
   3421 
   3422 // ------------------------------ float16_t <-> float
   3423 
   3424 #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
   3425 #ifdef HWY_NATIVE_F16C
   3426 #undef HWY_NATIVE_F16C
   3427 #else
   3428 #define HWY_NATIVE_F16C
   3429 #endif
   3430 
// Generic f16 -> f32 promotion via integer bit manipulation: decompose the
// 16-bit pattern (1 sign, 5 exponent, 10 mantissa bits), then rebuild a
// 32-bit float.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  const Rebind<uint16_t, decltype(df32)> du16;
  using VU32 = VFromD<decltype(du32)>;

  // Widen the raw f16 bits into 32-bit lanes, then extract the fields.
  const VU32 bits16 = PromoteTo(du32, BitCast(du16, v));
  const VU32 sign = ShiftRight<15>(bits16);
  const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F));
  const VU32 mantissa = And(bits16, Set(du32, 0x3FF));
  // Subnormal f16 (biased_exp == 0): value is mantissa * 2^-24
  // (1/16384/1024 == 2^-14 * 2^-10), computed exactly via int -> f32 convert.
  const VU32 subnormal =
      BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)),
                        Set(df32, 1.0f / 16384 / 1024)));

  // Normal case: rebias the exponent (f32 bias 127 vs f16 bias 15) and shift
  // the mantissa up to the f32 mantissa's 23 bits.
  const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15));
  const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32);
  // NOTE(review): biased_exp == 31 (f16 inf/NaN) also takes the `normal`
  // path here and maps to a large finite f32 rather than inf/NaN — confirm
  // this is the intended behavior of this generic fallback.
  const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal);
  return BitCast(df32, Or(ShiftLeft<31>(sign), bits32));
}
   3452 
// Generic f32 -> f16 demotion with round-to-nearest-even, handling normals,
// subnormals, overflow to infinity and NaN, all via integer bit manipulation.
// Strategy: add a carefully chosen power-of-two `round_incr` to v so that the
// rounded f16 mantissa bits land in the low bits of the f32 sum, then
// reassemble the f16 exponent/mantissa and narrow to 16 bits.
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  const RebindToSigned<decltype(df16)> di16;
  const Rebind<int32_t, decltype(df16)> di32;
  const RebindToFloat<decltype(di32)> df32;
  const RebindToUnsigned<decltype(df32)> du32;

  // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
  // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
  // mantissa of a F16.

  // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13
  // as 2^(-14) is the smallest positive normal F16 value and as we want 13
  // mantissa bits (including the implicit 1 bit) to the left of the
  // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13.

  // The biased exponent of round_incr[i] thus needs to be at least 126 as
  // (-14) + 13 + 127 is equal to 126, and at most 255 (the maximum F32
  // exponent field). It is therefore
  // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126).

  // hi9_bits[i] is the upper 9 bits of v[i]: sign bit plus exponent field.
  const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));

  const auto k13 = Set(du32, uint32_t{13u});

  // Minimum biased F32 exponent of round_incr
  const auto k126 = Set(du32, uint32_t{126u});

  // round_incr_hi9_bits[i] is equivalent to
  // (hi9_bits[i] & 0x100) |
  // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)

#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  const auto k255 = Set(du32, uint32_t{255u});
  const auto round_incr_hi9_bits = BitwiseIfThenElse(
      k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
#else
  // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits
  // can be incremented by 13 and clamped to the [13, 255] range without
  // overflowing into the sign bit of hi9_bits by using U8 SaturatedAdd as
  // there are 8 exponent bits in an F32. U8 Max then clamps
  // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the
  // sign bit.

  const Repartition<uint8_t, decltype(du32)> du32_as_u8;
  const auto round_incr_hi9_bits = BitCast(
      du32,
      Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
          BitCast(du32_as_u8, k126)));
#endif

  // (round_incr_hi9_bits >> 8) equals (hi9_bits >> 8) (the sign), and
  // (round_incr_hi9_bits & 0xFF) is the clamped exponent described above.

  // round_incr is a power of two (or inf) with v's sign and the clamped
  // exponent.
  const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));

  // Add round_incr[i] to v[i] to round the mantissa to the nearest F16
  // mantissa and to move the fractional bits of the resulting non-NaN
  // mantissa down to the lower 10 bits of rounded_val if
  // (v[i] + round_incr[i]) is a non-NaN value.
  const auto rounded_val = Add(v, round_incr);

  // rounded_val_bits is the bits of rounded_val as a U32
  const auto rounded_val_bits = BitCast(du32, rounded_val);

  // rounded_val[i] is known to have the same biased exponent as round_incr[i]
  // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
  // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
  // is either a power of 2 that is greater than or equal to 2^-1 or infinity.

  // If rounded_val[i] is a finite F32 value, then
  // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
  // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number in the
  // range [0, 2]: between 0 and 0x0800, with
  // (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the
  // resulting F16 mantissa.

  // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
  // rounded_val[i] is a non-NaN value.

  // The biased exponent of rounded_val[i] is at least 126 since the biased
  // exponent of round_incr[i] is at least 126 and both share a sign bit. The
  // ULP of an F32 with biased exponent 126 is 2^(126 - 127 - 23) = 2^(-24),
  // which is also the ULP of an F16 with biased exponent 0 or 1.

  // The biased exponent (before subtracting 126) must be clamped to
  // [126, 157] as 126 + 31 == 157 and 31 is the largest F16 biased exponent.
  // The resulting F16 biased exponent is
  // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) +
  //         ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126.

#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  const auto k157Shl10 = Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10));
  auto f16_exp_bits =
      Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
              And(rounded_val_bits,
                  Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
          k157Shl10);
  const auto f16_result_is_inf_mask =
      RebindMask(df32, Eq(f16_exp_bits, k157Shl10));
#else
  // Same computation via U8 SaturatedAdd/Min on the packed exponent bytes.
  const auto k157 = Set(du32, uint32_t{157});
  auto f16_exp_bits = BitCast(
      du32,
      Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
                       BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
          BitCast(du32_as_u8, k157)));
  const auto f16_result_is_inf_mask = RebindMask(df32, Eq(f16_exp_bits, k157));
  f16_exp_bits = ShiftLeft<10>(f16_exp_bits);
#endif

  // Rebias from the clamped F32 range to the F16 exponent field position.
  f16_exp_bits =
      Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));

  // Mantissa source: zero when the result is infinity (so inf has an empty
  // mantissa), all-ones when rounded_val is NaN (forcing a NaN mantissa).
  const auto f16_unmasked_mant_bits =
      BitCast(di32, Or(IfThenZeroElse(f16_result_is_inf_mask, rounded_val),
                       VecFromMask(df32, IsNaN(rounded_val))));

  // Combine exponent bits with the low 10 mantissa bits.
  const auto f16_exp_mant_bits =
      OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
            Set(di32, int32_t{0x03FF}));

  // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper
  // 17 bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to
  // allow efficient truncation of the F16 bits to an I16 using an I32->I16
  // DemoteTo operation.
  const auto f16_bits_as_i32 =
      OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
            Set(di32, static_cast<int32_t>(0xFFFF8000u)));
  return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
}
   3600 
   3601 #endif  // HWY_NATIVE_F16C
   3602 
   3603 // ------------------------------ F64->F16 DemoteTo
   3604 #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
   3605 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
   3606 #undef HWY_NATIVE_DEMOTE_F64_TO_F16
   3607 #else
   3608 #define HWY_NATIVE_DEMOTE_F64_TO_F16
   3609 #endif
   3610 
   3611 #if HWY_HAVE_FLOAT64
// Generic f64 -> f16 demotion via f32, using round-to-odd on the first step
// so the two-step demotion does not suffer double-rounding errors.
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const Rebind<double, D> df64;
  const Rebind<uint64_t, D> du64;
  const Rebind<float, D> df32;

  // The mantissa bits of v[i] are first rounded using round-to-odd rounding
  // to the nearest F64 value that has the lower 29 bits zeroed out to ensure
  // that the result is correctly rounded to a F16.
  // Round-to-odd: keep the upper mantissa bits, and OR in bit 29 iff adding
  // 0x1FFFFFFF carried past it (i.e. the discarded bits were nonzero).
  const auto vf64_rounded = OrAnd(
      And(v,
          BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
      BitCast(df64, Add(BitCast(du64, v),
                        Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
      BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));

  return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
}
   3631 #endif  // HWY_HAVE_FLOAT64
   3632 
   3633 #endif  // HWY_NATIVE_DEMOTE_F64_TO_F16
   3634 
   3635 // ------------------------------ F16->F64 PromoteTo
   3636 #if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
   3637 #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
   3638 #undef HWY_NATIVE_PROMOTE_F16_TO_F64
   3639 #else
   3640 #define HWY_NATIVE_PROMOTE_F16_TO_F64
   3641 #endif
   3642 
   3643 #if HWY_HAVE_FLOAT64
   3644 template <class D, HWY_IF_F64_D(D)>
   3645 HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
   3646  return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
   3647 }
   3648 #endif  // HWY_HAVE_FLOAT64
   3649 
   3650 #endif  // HWY_NATIVE_PROMOTE_F16_TO_F64
   3651 
   3652 // ------------------------------ F32 to BF16 DemoteTo
   3653 #if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
   3654 #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
   3655 #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
   3656 #else
   3657 #define HWY_NATIVE_DEMOTE_F32_TO_BF16
   3658 #endif
   3659 
   3660 namespace detail {
   3661 
   3662 // Round a F32 value to the nearest BF16 value, with the result returned as the
   3663 // rounded F32 value bitcasted to an U32
   3664 
   3665 // RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
   3666 // NaN F32 values from being converted to an infinity
template <class V, HWY_IF_F32(TFromV<V>)>
HWY_INLINE VFromD<RebindToUnsigned<DFromV<V>>> RoundF32ForDemoteToBF16(V v) {
 const DFromV<decltype(v)> d;
 const RebindToUnsigned<decltype(d)> du32;

 const auto is_non_nan = Not(IsNaN(v));
 const auto bits32 = BitCast(du32, v);

 // Round-to-nearest-even increment: 0x7FFF plus bit 16 (the LSB of the
 // resulting bf16), so ties round to the even bf16 value.
 const auto round_incr =
     Add(And(ShiftRight<16>(bits32), Set(du32, uint32_t{1})),
         Set(du32, uint32_t{0x7FFFu}));
 // MaskedAddOr(no, m, a, b): a + b where m is true, else no. Non-NaN lanes
 // get the rounding increment; NaN lanes instead get their quiet bit
 // (0x00400000) forced on so truncation cannot turn a NaN into infinity.
 return MaskedAddOr(Or(bits32, Set(du32, uint32_t{0x00400000u})),
                    RebindMask(du32, is_non_nan), bits32, round_incr);
}
   3681 
   3682 }  // namespace detail
   3683 
// Demotes f32 lanes to bf16 with round-to-nearest-even (NaN preserved as
// quiet NaN). The rounded F32 bit pattern's upper 16 bits are the bf16 bits;
// which u16 half that is depends on endianness.
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
 const RebindToUnsigned<decltype(dbf16)> du16;
 const Twice<decltype(du16)> dt_u16;

 const auto rounded_bits = BitCast(dt_u16, detail::RoundF32ForDemoteToBF16(v));
#if HWY_IS_LITTLE_ENDIAN
 // Little-endian: the high half of each u32 lane is the odd u16 lane.
 return BitCast(
     dbf16, LowerHalf(du16, ConcatOdd(dt_u16, rounded_bits, rounded_bits)));
#else
 // Big-endian: the high half of each u32 lane is the even u16 lane.
 return BitCast(
     dbf16, LowerHalf(du16, ConcatEven(dt_u16, rounded_bits, rounded_bits)));
#endif
}
   3698 
   3699 template <class D, HWY_IF_BF16_D(D)>
   3700 HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
   3701                                   VFromD<Repartition<float, D>> b) {
   3702  const RebindToUnsigned<decltype(dbf16)> du16;
   3703 
   3704  const auto rounded_a_bits32 =
   3705      BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
   3706  const auto rounded_b_bits32 =
   3707      BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
   3708 #if HWY_IS_LITTLE_ENDIAN
   3709  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, rounded_b_bits32),
   3710                                  BitCast(du16, rounded_a_bits32)));
   3711 #else
   3712  return BitCast(dbf16, ConcatEven(du16, BitCast(du16, rounded_b_bits32),
   3713                                   BitCast(du16, rounded_a_bits32)));
   3714 #endif
   3715 }
   3716 
// Demotes a and b to bf16 in an implementation-defined interleaved order
// (a's results land in the odd u16 lanes, b's in the even lanes), which is
// cheaper than OrderedDemote2To when lane order does not matter.
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
                                  VFromD<Repartition<float, D>> b) {
 const RebindToUnsigned<decltype(dbf16)> du16;

 // The bf16 bits are the upper 16 bits of the rounded F32 pattern: on little
 // endian they already occupy the odd u16 lanes, so only b must be shifted
 // down into the even lanes; on big endian the roles are swapped.
#if HWY_IS_LITTLE_ENDIAN
 const auto a_in_odd = detail::RoundF32ForDemoteToBF16(a);
 const auto b_in_even = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
#else
 const auto a_in_odd = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(a));
 const auto b_in_even = detail::RoundF32ForDemoteToBF16(b);
#endif

 // OddEven: odd lanes from the first operand, even lanes from the second.
 return BitCast(dbf16,
                OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
}
   3733 
   3734 #endif  // HWY_NATIVE_DEMOTE_F32_TO_BF16
   3735 
   3736 // ------------------------------ PromoteInRangeTo
   3737 #if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
   3738     defined(HWY_TARGET_TOGGLE))
   3739 #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   3740 #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   3741 #else
   3742 #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   3743 #endif
   3744 
   3745 #if HWY_HAVE_INTEGER64
   3746 template <class D64, HWY_IF_UI64_D(D64)>
   3747 HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
   3748  return PromoteTo(d64, v);
   3749 }
   3750 #endif
   3751 
   3752 #endif  // HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   3753 
   3754 // ------------------------------ ConvertInRangeTo
   3755 #if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
   3756 #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   3757 #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   3758 #else
   3759 #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   3760 #endif
   3761 
   3762 template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
   3763          HWY_IF_T_SIZE_ONE_OF_D(DI, (HWY_HAVE_FLOAT16 ? (1 << 2) : 0) |
   3764                                         (1 << 4) |
   3765                                         (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
   3766 HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
   3767  return ConvertTo(di, v);
   3768 }
   3769 
   3770 #endif  // HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
   3771 
   3772 // ------------------------------ DemoteInRangeTo
   3773 #if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
   3774     defined(HWY_TARGET_TOGGLE))
   3775 #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   3776 #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   3777 #else
   3778 #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   3779 #endif
   3780 
   3781 #if HWY_HAVE_FLOAT64
   3782 template <class D32, HWY_IF_UI32_D(D32)>
   3783 HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
   3784  return DemoteTo(d32, v);
   3785 }
   3786 #endif
   3787 
   3788 #endif  // HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
   3789 
   3790 // ------------------------------ PromoteInRangeLowerTo/PromoteInRangeUpperTo
   3791 
// Promotes the lower half of v (f32 lanes) to u64/i64; results for
// out-of-range inputs are target-specific.
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeLowerTo(D d, V v) {
 // Lanes(d) may differ from Lanes(DFromV<V>()), so build an f32 descriptor
 // with d's lane count (the source lane type comes from V, not D) before
 // taking the lower half.
 const Rebind<TFromV<V>, decltype(d)> dh;
 return PromoteInRangeTo(d, LowerHalf(dh, v));
}
   3799 
   3800 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Promotes the upper half of v (f32 lanes) to u64/i64; results for
// out-of-range inputs are target-specific.
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeUpperTo(D d, V v) {
#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
    (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
 // On targets that provide target-specific implementations of F32->UI64
 // PromoteInRangeTo, promote the upper half of v using PromoteInRangeTo

 // Lanes(d) may differ from Lanes(DFromV<V>()), so build an f32 descriptor
 // with d's lane count (the source lane type comes from V, not D) before
 // taking the upper half.
 const Rebind<TFromV<V>, decltype(d)> dh;
 return PromoteInRangeTo(d, UpperHalf(dh, v));
#else
 // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
 // around F32->UI64 PromoteTo, promote the upper half of v to TFromD<D> using
 // PromoteUpperTo
 return PromoteUpperTo(d, v);
#endif
}
   3819 #endif  // HWY_TARGET != HWY_SCALAR
   3820 
   3821 // ------------------------------ PromoteInRangeEvenTo/PromoteInRangeOddTo
   3822 
// Promotes the even lanes of v (f32) to u64/i64; results for out-of-range
// inputs are target-specific.
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeEvenTo(D d, V v) {
#if HWY_TARGET == HWY_SCALAR
 return PromoteInRangeTo(d, v);
#elif (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
      (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
 // On targets that provide target-specific implementations of F32->UI64
 // PromoteInRangeTo, promote the even lanes of v using PromoteInRangeTo

 // Lanes(d) may differ from Lanes(DFromV<V>()), so build an f32 descriptor
 // with d's lane count (the source lane type comes from V, not D).
 const DFromV<decltype(v)> d_from;
 const Rebind<TFromV<V>, decltype(d)> dh;
 return PromoteInRangeTo(d, LowerHalf(dh, ConcatEven(d_from, v, v)));
#else
 // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
 // around F32->UI64 PromoteTo, promote the even lanes of v to TFromD<D> using
 // PromoteEvenTo
 return PromoteEvenTo(d, v);
#endif  // HWY_TARGET == HWY_SCALAR
}
   3844 
   3845 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Promotes the odd lanes of v (f32) to u64/i64; results for out-of-range
// inputs are target-specific.
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeOddTo(D d, V v) {
#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
    (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
 // On targets that provide target-specific implementations of F32->UI64
 // PromoteInRangeTo, promote the odd lanes of v using PromoteInRangeTo

 // Lanes(d) may differ from Lanes(DFromV<V>()), so build an f32 descriptor
 // with d's lane count (the source lane type comes from V, not D).
 const DFromV<decltype(v)> d_from;
 const Rebind<TFromV<V>, decltype(d)> dh;
 return PromoteInRangeTo(d, LowerHalf(dh, ConcatOdd(d_from, v, v)));
#else
 // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
 // around F32->UI64 PromoteTo, promote the odd lanes of v to TFromD<D> using
 // PromoteOddTo
 return PromoteOddTo(d, v);
#endif
}
   3865 #endif  // HWY_TARGET != HWY_SCALAR
   3866 
   3867 // ------------------------------ SumsOf2
   3868 
   3869 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   3870 namespace detail {
   3871 
   3872 template <class TypeTag, size_t kLaneSize, class V>
   3873 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
   3874    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
   3875  const DFromV<decltype(v)> d;
   3876  const RepartitionToWide<decltype(d)> dw;
   3877  return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
   3878 }
   3879 
   3880 }  // namespace detail
   3881 
   3882 template <class V>
   3883 HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
   3884  return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
   3885                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
   3886 }
   3887 #endif  // HWY_TARGET != HWY_SCALAR
   3888 
   3889 // ------------------------------ SumsOf4
   3890 
   3891 namespace detail {
   3892 
   3893 template <class TypeTag, size_t kLaneSize, class V>
   3894 HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
   3895    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
   3896  using hwy::HWY_NAMESPACE::SumsOf2;
   3897  return SumsOf2(SumsOf2(v));
   3898 }
   3899 
   3900 }  // namespace detail
   3901 
   3902 template <class V>
   3903 HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) {
   3904  return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
   3905                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
   3906 }
   3907 
   3908 // ------------------------------ OrderedTruncate2To
   3909 
   3910 #if HWY_IDE || \
   3911    (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))
   3912 
   3913 #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
   3914 #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
   3915 #else
   3916 #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
   3917 #endif
   3918 
   3919 // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
   3920 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   3921 template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
   3922          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   3923          HWY_IF_LANES_D(DFromV<VFromD<DN>>, HWY_MAX_LANES_D(DFromV<V>) * 2)>
   3924 HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
   3925  return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
   3926 }
   3927 #endif  // HWY_TARGET != HWY_SCALAR
   3928 #endif  // HWY_NATIVE_ORDERED_TRUNCATE_2_TO
   3929 
   3930 // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
   3931 
   3932 #if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
   3933 #ifdef HWY_NATIVE_LEADING_ZERO_COUNT
   3934 #undef HWY_NATIVE_LEADING_ZERO_COUNT
   3935 #else
   3936 #define HWY_NATIVE_LEADING_ZERO_COUNT
   3937 #endif
   3938 
   3939 namespace detail {
   3940 
// Returns, per u32 lane, the biased F32 exponent (the 8 bits above the
// mantissa) of the value v[i] converted to float. This equals
// HighestSetBitIndex(v[i]) + 127 for nonzero inputs that were normalized by
// the caller.
template <class D, HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
 const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
 const RebindToSigned<decltype(d)> di;
 const Repartition<int16_t, decltype(d)> di16;

 // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed
 // by a unsigned right shift of the uint32_t bit representation of the
 // floating point values by 23, followed by an int16_t Min
 // operation as we are only interested in the biased exponent that would
 // result from a uint32_t to float conversion.

 // An int32_t to float vector conversion is also much more efficient on
 // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion
 // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2
 // requires multiple instructions whereas an int32_t to float vector
 // conversion can be carried out using a single instruction on
 // SSE2/SSSE3/SSE4/AVX2.

 // Inputs >= 2^31 convert to a negative float whose shifted bits include the
 // sign; clamping with 158 (the biased exponent of 2^31) yields the exponent
 // a true u32->f32 conversion would have produced.
 const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v)));
 return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)),
                       BitCast(di16, Set(d, 158))));
#else
 const auto f32_bits = BitCast(d, ConvertTo(df, v));
 return BitCast(d, ShiftRight<23>(f32_bits));
#endif
}
   3969 
template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) {
 // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but
 // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647.
 const DFromV<decltype(v)> d;
 const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
 // Within the i32 range, a signed conversion is exact for these inputs and is
 // cheaper on SSE2..AVX2 than an unsigned conversion, so the clamp needed by
 // UIntToF32BiasedExp is unnecessary here.
 const RebindToSigned<decltype(d)> d_src;
#else
 const RebindToUnsigned<decltype(d)> d_src;
#endif
 const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v)));
 return ShiftRight<23>(f32_bits);
}
   3984 
   3985 template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
   3986 HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
   3987  const Rebind<uint32_t, decltype(d)> du32;
   3988  const auto f32_biased_exp_as_u32 =
   3989      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
   3990  return TruncateTo(d, f32_biased_exp_as_u32);
   3991 }
   3992 
   3993 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// u16 variant for vectors too wide to promote in one step: process the two
// halves as u32, then recombine the narrowed results in order.
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
 const Half<decltype(d)> dh;
 const Rebind<uint32_t, decltype(dh)> du32;

 const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
 const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));

 const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
 const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
#if HWY_TARGET <= HWY_SSE2
 // SSE2..AVX3: signed OrderedDemote2To is the efficient narrowing; the
 // exponents (<= 255) are small enough not to saturate.
 const RebindToSigned<decltype(du32)> di32;
 const RebindToSigned<decltype(d)> di;
 return BitCast(d,
                OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32),
                                 BitCast(di32, hi_f32_biased_exp_as_u32)));
#else
 return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32,
                           hi_f32_biased_exp_as_u32);
#endif
}
   4015 #endif  // HWY_TARGET != HWY_SCALAR
   4016 
   4017 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
   4018 HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
   4019  const Rebind<uint32_t, decltype(d)> du32;
   4020  const auto f32_biased_exp_as_u32 =
   4021      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
   4022  return U8FromU32(f32_biased_exp_as_u32);
   4023 }
   4024 
   4025 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// u8 variant for vectors needing two promotion steps: process both halves as
// u32, narrow to 16 bits, then narrow again to u8.
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
         HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
 const Half<decltype(d)> dh;
 const Rebind<uint32_t, decltype(dh)> du32;
 const Repartition<uint16_t, decltype(du32)> du16;

 const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
 const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));

 const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
 const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);

#if HWY_TARGET <= HWY_SSE2
 // SSE2..AVX3: signed OrderedDemote2To is the efficient narrowing; the
 // exponents (<= 255) are small enough not to saturate.
 const RebindToSigned<decltype(du32)> di32;
 const RebindToSigned<decltype(du16)> di16;
 const auto f32_biased_exp_as_i16 =
     OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32),
                      BitCast(di32, hi_f32_biased_exp_as_u32));
 return DemoteTo(d, f32_biased_exp_as_i16);
#else
 const auto f32_biased_exp_as_u16 = OrderedTruncate2To(
     du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32);
 return TruncateTo(d, f32_biased_exp_as_u16);
#endif
}
   4052 
// u8 variant for full-width vectors: split into quarters, compute each
// quarter's biased exponents as u32, then narrow twice (u32 -> u16 -> u8),
// preserving lane order throughout.
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
 const Half<decltype(d)> dh;
 const Half<decltype(dh)> dq;
 const Rebind<uint32_t, decltype(dq)> du32;
 const Repartition<uint16_t, decltype(du32)> du16;

 const auto lo_half = LowerHalf(dh, v);
 const auto hi_half = UpperHalf(dh, v);

 const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half));
 const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half));
 const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half));
 const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half));

 const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0);
 const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1);
 const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2);
 const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3);

#if HWY_TARGET <= HWY_SSE2
 // SSE2..AVX3: signed OrderedDemote2To is the efficient narrowing; the
 // exponents (<= 255) are small enough not to saturate.
 const RebindToSigned<decltype(du32)> di32;
 const RebindToSigned<decltype(du16)> di16;

 const auto lo_f32_biased_exp_as_i16 =
     OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0),
                      BitCast(di32, f32_biased_exp_as_u32_q1));
 const auto hi_f32_biased_exp_as_i16 =
     OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2),
                      BitCast(di32, f32_biased_exp_as_u32_q3));
 return OrderedDemote2To(d, lo_f32_biased_exp_as_i16,
                         hi_f32_biased_exp_as_i16);
#else
 const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To(
     du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1);
 const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To(
     du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3);
 return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16,
                           hi_f32_biased_exp_as_u16);
#endif
}
   4094 #endif  // HWY_TARGET != HWY_SCALAR
   4095 
// Descriptor used for the Min/Max comparisons in the biased-exponent and
// zero-count helpers below, chosen per target so the comparison is cheap.
// NOTE(review): SSE2/SSSE3 use u8 lanes, presumably because u8 is the only
// unsigned Min/Max lane type available there — confirm against x86 ops.
#if HWY_TARGET == HWY_SCALAR
template <class D>
using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>;
#elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2
template <class D>
using F32ExpLzcntMinMaxRepartition = Repartition<uint8_t, D>;
#else
template <class D>
using F32ExpLzcntMinMaxRepartition =
   Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>;
#endif
   4107 
// Vector type corresponding to V reinterpreted with the comparison-friendly
// lane type chosen above.
template <class V>
using F32ExpLzcntMinMaxCmpV = VFromD<F32ExpLzcntMinMaxRepartition<DFromV<V>>>;
   4110 
   4111 template <class V>
   4112 HWY_INLINE F32ExpLzcntMinMaxCmpV<V> F32ExpLzcntMinMaxBitCast(V v) {
   4113  const DFromV<decltype(v)> d;
   4114  const F32ExpLzcntMinMaxRepartition<decltype(d)> d2;
   4115  return BitCast(d2, v);
   4116 }
   4117 
// u64 variant: derives the biased F32 exponent of each u64 lane from the
// exponents of its two u32 halves.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
#if HWY_TARGET == HWY_SCALAR
 // Single lane: convert via scalar float and extract the exponent bits.
 const uint64_t u64_val = GetLane(v);
 const float f32_val = static_cast<float>(u64_val);
 const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
 return Set(d, static_cast<uint64_t>(f32_bits >> 23));
#else
 const Repartition<uint32_t, decltype(d)> du32;
 const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v));
 // Add 32 to the biased exponent of the upper u32 half of each u64 lane
 // (0x20 in the upper 32 bits of the u64 constant), except where that half
 // is zero (exponent 0) and must not contribute.
 const auto f32_biased_exp_adj =
     IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)),
                    BitCast(du32, Set(d, 0x0000002000000000u)));
 const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj);

 // Per u64 lane, the result is max(exp(lo), exp(hi) + 32): take the pairwise
 // Max with the lane-swapped vector, then shift the winner into the low half.
 return ShiftRight<32>(BitCast(
     d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp),
            F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp)))));
#endif
}
   4138 
   4139 template <class V, HWY_IF_UNSIGNED_V(V)>
   4140 HWY_INLINE V UIntToF32BiasedExp(V v) {
   4141  const DFromV<decltype(v)> d;
   4142  return UIntToF32BiasedExp(d, v);
   4143 }
   4144 
// u8/u16 lanes are always exactly representable in F32 (<= 2^16 < 2^24), so
// no pre-conversion normalization is needed.
template <class V, HWY_IF_UNSIGNED_V(V),
         HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
 return v;
}
   4150 
// u32/u64 lanes can exceed F32's 24-bit mantissa; ensure any inexact F32
// conversion rounds down so the biased exponent is not overestimated.
template <class V, HWY_IF_UNSIGNED_V(V),
         HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
 // If v[i] >= 16777216 is true, make sure that the bit at
 // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact
 // conversion to single-precision floating point is rounded down.

 // This zeroing-out can be accomplished through the AndNot operation below.
 return AndNot(ShiftRight<24>(v), v);
}
   4161 
   4162 }  // namespace detail
   4163 
   4164 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   4165 HWY_API V HighestSetBitIndex(V v) {
   4166  const DFromV<decltype(v)> d;
   4167  const RebindToUnsigned<decltype(d)> du;
   4168  using TU = TFromD<decltype(du)>;
   4169 
   4170  const auto f32_biased_exp = detail::UIntToF32BiasedExp(
   4171      detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
   4172  return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127})));
   4173 }
   4174 
// Returns the number of leading zero bits per lane, computed from the biased
// F32 exponent: lzcnt(x) = bits - 1 - HighestSetBitIndex(x)
//                        = (bits + 126) - (HighestSetBitIndex(x) + 127).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
 const DFromV<decltype(v)> d;
 const RebindToUnsigned<decltype(d)> du;
 using TU = TFromD<decltype(du)>;

 constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
 const auto f32_biased_exp = detail::UIntToF32BiasedExp(
     detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
 const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp);

 // A zero lane has biased exponent 0, giving bits + 126; clamp to bits.
 return BitCast(d,
                Min(detail::F32ExpLzcntMinMaxBitCast(lz_count),
                    detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
}
   4190 
// Returns the number of trailing zero bits per lane: isolate the lowest set
// bit with x & -x, whose biased F32 exponent minus 127 is the bit index,
// which equals the trailing-zero count.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
 const DFromV<decltype(v)> d;
 const RebindToUnsigned<decltype(d)> du;
 const RebindToSigned<decltype(d)> di;
 using TU = TFromD<decltype(du)>;

 const auto vi = BitCast(di, v);
 const auto lowest_bit = BitCast(du, And(vi, Neg(vi)));

 constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
 const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit);
 const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127}));

 // A zero lane has biased exponent 0; 0 - 127 wraps to a large unsigned
 // value, so the Min clamps the result to bits.
 return BitCast(d,
                Min(detail::F32ExpLzcntMinMaxBitCast(tz_count),
                    detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
}
   4209 #endif  // HWY_NATIVE_LEADING_ZERO_COUNT
   4210 
   4211 // ------------------------------ MaskedLeadingZeroCount
   4212 #if (defined(HWY_NATIVE_MASKED_LEADING_ZERO_COUNT) == \
   4213     defined(HWY_TARGET_TOGGLE))
   4214 #ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
   4215 #undef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
   4216 #else
   4217 #define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
   4218 #endif
   4219 
   4220 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), class M>
   4221 HWY_API V MaskedLeadingZeroCount(M m, V v) {
   4222  return IfThenElseZero(m, LeadingZeroCount(v));
   4223 }
   4224 #endif  // HWY_NATIVE_MASKED_LEADING_ZERO_COUNT
   4225 
   4226 // ------------------------------ AESRound
   4227 
   4228 // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
   4229 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   4230 
   4231 // Define for white-box testing, even if native instructions are available.
   4232 namespace detail {
   4233 
   4234 // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
   4235 // Vector Permute Instructions" and the accompanying assembly language
   4236 // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
   4237 // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
   4238 //
   4239 // A brute-force 256 byte table lookup can also be made constant-time, and
   4240 // possibly competitive on NEON, but this is more performance-portable
   4241 // especially for x86 and large vectors.
   4242 
// Shared core of SubBytes/InvSubBytes: computes the GF(2^8) multiplicative
// inverse of each byte via GF(2^4) arithmetic, then applies the affine
// transform given as two nibble lookup tables (low/high nibble halves).
// All tables are constant-time nibble lookups; do not alter the constants.
template <class V>  // u8
HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
                                              V affine_tblU) {
 const DFromV<V> du;
 const auto mask = Set(du, uint8_t{0xF});

 // Change polynomial basis to GF(2^4)
 {
   const VFromD<decltype(du)> basisL =
       Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
                           0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
   const VFromD<decltype(du)> basisU =
       Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
                           0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
   const auto sL = And(state, mask);
   const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
   const auto gf4L = TableLookupBytes(basisL, sL);
   const auto gf4U = TableLookupBytes(basisU, sU);
   state = Xor(gf4L, gf4U);
 }

 // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
 // cause TableLookupBytesOr0 to return 0.
 const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
     du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
 const VFromD<decltype(du)> tbl = Dup128VecFromValues(
     du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
 const auto sL = And(state, mask);      // L=low nibble, U=upper
 const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
 const auto sX = Xor(sU, sL);
 const auto invL = TableLookupBytes(zetaInv, sL);
 const auto invU = TableLookupBytes(tbl, sU);
 const auto invX = TableLookupBytes(tbl, sX);
 const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
 const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));

 // Apply the caller-supplied affine transform to the two nibble halves.
 const auto affL = TableLookupBytesOr0(affine_tblL, outL);
 const auto affU = TableLookupBytesOr0(affine_tblU, outU);
 return Xor(affL, affU);
}
   4283 
// AES SubBytes (the S-box): GF(2^8) inverse followed by the AES affine
// transform, with the 0x63 bias XORed in afterwards.
template <class V>  // u8
HWY_INLINE V SubBytes(V state) {
 const DFromV<V> du;
 // Linear skew (cannot bake 0x63 bias into the table because out* indices
 // may have the infinity flag set).
 const VFromD<decltype(du)> affineL =
     Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
                         0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
 const VFromD<decltype(du)> affineU =
     Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
                         0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
 return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
            Set(du, uint8_t{0x63}));
}
   4298 
// AES InvSubBytes (inverse S-box): undo the affine transform first, then
// compute the GF(2^8) multiplicative inverse.
template <class V>  // u8
HWY_INLINE V InvSubBytes(V state) {
 const DFromV<V> du;
 const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
     Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
                         0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
 const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
     Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
                         0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);

 // Apply the inverse affine transformation
 // (byte-rotations by 1, 3 and 6 bits, XORed together with constant 0x05).
 const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
                         Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
                         Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
                    Set(du, uint8_t{0x05}));

 // The GF(2^8) multiplicative inverse is computed as follows:
 // - Changing the polynomial basis to GF(2^4)
 // - Computing the GF(2^4) multiplicative inverse
 // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
 //   multiplicative inverse through table lookups using the
 //   kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
 return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
                                          gF2P4InvToGF2P8InvU);
}
   4324 
   4325 }  // namespace detail
   4326 
   4327 #endif  // HWY_TARGET != HWY_SCALAR
   4328 
   4329 #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
   4330 #ifdef HWY_NATIVE_AES
   4331 #undef HWY_NATIVE_AES
   4332 #else
   4333 #define HWY_NATIVE_AES
   4334 #endif
   4335 
   4336 // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
   4337 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   4338 
   4339 namespace detail {
   4340 
// AES ShiftRows: cyclically rotates row r of the 4x4 state left by r bytes,
// expressed as a single byte shuffle over the column-major state layout.
template <class V>  // u8
HWY_INLINE V ShiftRows(const V state) {
 const DFromV<V> du;
 // transposed: state is column major
 const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
     du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
 return TableLookupBytes(state, shift_row);
}
   4349 
   4350 template <class V>  // u8
   4351 HWY_INLINE V InvShiftRows(const V state) {
   4352  const DFromV<V> du;
   4353  // transposed: state is column major
   4354  const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
   4355      du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
   4356  return TableLookupBytes(state, shift_row);
   4357 }
   4358 
// Multiplies each u8 lane by 2 in GF(2^8) modulo the AES reduction polynomial
// 0x11B: doubles the value and XORs in 0x1B wherever the top bit overflowed.
template <class V>  // u8
HWY_INLINE V GF2P8Mod11BMulBy2(V v) {
  const DFromV<V> du;
  const RebindToSigned<decltype(du)> di;  // can only do signed comparisons
  // msb[i] is all-ones iff the top bit of v[i] is set, i.e. doubling carries
  // out of 8 bits and requires reduction.
  const auto msb = Lt(BitCast(di, v), Zero(di));
  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
  return Xor(Add(v, v), overflow);  // = v*2 in GF(2^8).
}
   4367 
// AES MixColumns: multiplies each column of the (column-major) state by the
// fixed circulant GF(2^8) matrix shown below.
template <class V>  // u8
HWY_INLINE V MixColumns(const V state) {
  const DFromV<V> du;
  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  // 2 3 1 1  // Let s := state*1, d := state*2, t := state*3.
  // 1 2 3 1  // d are on diagonal, no permutation needed.
  // 1 1 2 3  // t1230 indicates column indices of threes for the 4 rows.
  // 3 1 1 2  // We also need to compute s2301 and s3012 (=1230 o 2301).
  const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
      du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
  const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
      du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
  const auto d = GF2P8Mod11BMulBy2(state);  // = state*2 in GF(2^8).
  const auto s2301 = TableLookupBytes(state, v2301);
  const auto d_s2301 = Xor(d, s2301);
  const auto t_s2301 = Xor(state, d_s2301);  // t(s*3) = XOR-sum {s, d(s*2)}
  const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230);
  return Xor(d_s2301, t1230_s3012);  // XOR-sum of 4 terms
}
   4387 
// AES InvMixColumns: multiplies each column of the (column-major) state by
// the fixed circulant GF(2^8) matrix shown below (inverse of MixColumns).
template <class V>  // u8
HWY_INLINE V InvMixColumns(const V state) {
  const DFromV<V> du;
  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  // 14 11 13  9
  //  9 14 11 13
  // 13  9 14 11
  // 11 13  9 14
  const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
      du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
  const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
      du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);

  // Build the required scalar multiples 9, 11, 13, 14 from repeated GF(2^8)
  // doubling plus XOR combinations.
  const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
  const auto sx4 = GF2P8Mod11BMulBy2(sx2);   /* = state*4 in GF(2^8) */
  const auto sx8 = GF2P8Mod11BMulBy2(sx4);   /* = state*8 in GF(2^8) */
  const auto sx9 = Xor(sx8, state);          /* = state*9 in GF(2^8) */
  const auto sx11 = Xor(sx9, sx2);           /* = state*11 in GF(2^8) */
  const auto sx13 = Xor(sx9, sx4);           /* = state*13 in GF(2^8) */
  const auto sx14 = Xor3(sx8, sx4, sx2);     /* = state*14 in GF(2^8) */

  // Combine the four terms per row using two byte permutations, mirroring the
  // XOR-sum structure used in MixColumns.
  const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
  const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
  const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301);
  return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
}
   4414 
   4415 }  // namespace detail
   4416 
   4417 template <class V>  // u8
   4418 HWY_API V AESRound(V state, const V round_key) {
   4419  // Intel docs swap the first two steps, but it does not matter because
   4420  // ShiftRows is a permutation and SubBytes is independent of lane index.
   4421  state = detail::SubBytes(state);
   4422  state = detail::ShiftRows(state);
   4423  state = detail::MixColumns(state);
   4424  state = Xor(state, round_key);  // AddRoundKey
   4425  return state;
   4426 }
   4427 
   4428 template <class V>  // u8
   4429 HWY_API V AESLastRound(V state, const V round_key) {
   4430  // LIke AESRound, but without MixColumns.
   4431  state = detail::SubBytes(state);
   4432  state = detail::ShiftRows(state);
   4433  state = Xor(state, round_key);  // AddRoundKey
   4434  return state;
   4435 }
   4436 
// Applies the AES InvMixColumns transformation to the state.
template <class V>
HWY_API V AESInvMixColumns(V state) {
  return detail::InvMixColumns(state);
}
   4441 
   4442 template <class V>  // u8
   4443 HWY_API V AESRoundInv(V state, const V round_key) {
   4444  state = detail::InvSubBytes(state);
   4445  state = detail::InvShiftRows(state);
   4446  state = detail::InvMixColumns(state);
   4447  state = Xor(state, round_key);  // AddRoundKey
   4448  return state;
   4449 }
   4450 
   4451 template <class V>  // u8
   4452 HWY_API V AESLastRoundInv(V state, const V round_key) {
   4453  // Like AESRoundInv, but without InvMixColumns.
   4454  state = detail::InvSubBytes(state);
   4455  state = detail::InvShiftRows(state);
   4456  state = Xor(state, round_key);  // AddRoundKey
   4457  return state;
   4458 }
   4459 
   4460 template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
   4461 HWY_API V AESKeyGenAssist(V v) {
   4462  const DFromV<decltype(v)> d;
   4463  const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
   4464                                            0, 0, kRcon, 0, 0, 0);
   4465  const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
   4466                                               13, 14, 15, 13, 14, 15, 12);
   4467  const auto sub_word_result = detail::SubBytes(v);
   4468  const auto rot_word_result =
   4469      TableLookupBytes(sub_word_result, rotWordShuffle);
   4470  return Xor(rot_word_result, rconXorMask);
   4471 }
   4472 
// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.
template <class V>
HWY_API V CLMulLower(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  // Split each operand into four strided bit-groups (every 4th bit). Integer
  // multiplication of any two such groups cannot carry into positions
  // belonging to other groups, so Mul acts as a carryless multiply per group.
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  // mK accumulates the partial products whose bit positions are congruent to
  // K (mod 4): ai * bj contributes to m[(i + j) mod 4].
  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
  // Keep only each group's own bit positions and recombine.
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}
   4503 
// As CLMulLower, but multiplies the upper 64-bit lane of each 128-bit pair
// (MulOdd instead of MulEven). See CLMulLower for the bit-group technique.
template <class V>
HWY_API V CLMulUpper(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  // mK accumulates the partial products whose bit positions are congruent to
  // K (mod 4): ai * bj contributes to m[(i + j) mod 4].
  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}
   4531 
   4532 #endif  // HWY_NATIVE_AES
   4533 #endif  // HWY_TARGET != HWY_SCALAR
   4534 
   4535 // ------------------------------ PopulationCount
   4536 
   4537 #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
   4538 #ifdef HWY_NATIVE_POPCNT
   4539 #undef HWY_NATIVE_POPCNT
   4540 #else
   4541 #define HWY_NATIVE_POPCNT
   4542 #endif
   4543 
// Lane-wise population count (number of set bits) for u8 vectors.
template <class V, class D = DFromV<V>, HWY_IF_U8_D(D)>
HWY_API V PopulationCount(V v) {
  const D d;

#if HWY_TARGET == HWY_SSE2
  // TableLookupBytes is slow on SSE2

  // Bit-parallel popcount: see https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  const V k33 = Set(d, uint8_t{0x33});
  v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
  v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
  return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
#else  // HWY_TARGET != HWY_SSE2

#if HWY_TARGET == HWY_RVV
  // Need at least LMUL=1 on RVV to ensure that Lanes(d_tbl) is at least 16
  const ScalableTag<uint8_t, HWY_MAX(HWY_POW2_D(D), 0)> d_tbl;
#else
  const FixedTag<uint8_t, HWY_MAX(HWY_MAX_LANES_D(D), 16)> d_tbl;
#endif

  // Nibble lookup table: popcount of each value 0..15.
  const auto lookup = Dup128VecFromValues(d_tbl, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2,
                                          2, 3, 2, 3, 3, 4);
  const auto lo = And(v, Set(d, uint8_t{0xF}));
  const auto hi = ShiftRight<4>(v);

#if HWY_TARGET == HWY_RVV
  // On RVV, use TableLookupLanes to avoid unnecessary overhead
  const auto hi_popcnt =
      ResizeBitCast(d, TableLookupLanes(lookup, ResizeBitCast(d_tbl, hi)));
  const auto lo_popcnt =
      ResizeBitCast(d, TableLookupLanes(lookup, ResizeBitCast(d_tbl, lo)));
#else  // HWY_TARGET != HWY_RVV
  const auto hi_popcnt = TableLookupBytes(lookup, hi);
  const auto lo_popcnt = TableLookupBytes(lookup, lo);
#endif  // HWY_TARGET == HWY_RVV

  // Per-byte popcount is the sum of the two nibble popcounts.
  return Add(hi_popcnt, lo_popcnt);
#endif  // HWY_TARGET == HWY_SSE2
}
   4584 
   4585 template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
   4586 HWY_API V PopulationCount(V v) {
   4587  const D d;
   4588  const Repartition<uint8_t, decltype(d)> d8;
   4589  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
   4590  return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
   4591 }
   4592 
   4593 template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
   4594 HWY_API V PopulationCount(V v) {
   4595  const D d;
   4596  Repartition<uint16_t, decltype(d)> d16;
   4597  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
   4598  return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
   4599 }
   4600 
   4601 #if HWY_HAVE_INTEGER64
   4602 template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
   4603 HWY_API V PopulationCount(V v) {
   4604  const D d;
   4605  Repartition<uint32_t, decltype(d)> d32;
   4606  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
   4607  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
   4608 }
   4609 #endif
   4610 
   4611 #endif  // HWY_NATIVE_POPCNT
   4612 
   4613 // ------------------------------ 8-bit multiplication
   4614 
   4615 #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
   4616 #ifdef HWY_NATIVE_MUL_8
   4617 #undef HWY_NATIVE_MUL_8
   4618 #else
   4619 #define HWY_NATIVE_MUL_8
   4620 #endif
   4621 
   4622 // 8 bit and fits in wider reg: promote
   4623 template <class V, HWY_IF_T_SIZE_V(V, 1),
   4624          HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
   4625 HWY_API V operator*(const V a, const V b) {
   4626  const DFromV<decltype(a)> d;
   4627  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
   4628  const RebindToUnsigned<decltype(d)> du;    // TruncateTo result
   4629  const RebindToUnsigned<decltype(dw)> dwu;  // TruncateTo input
   4630  const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
   4631  // TruncateTo is cheaper than ConcatEven.
   4632  return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
   4633 }
   4634 
   4635 // 8 bit full reg: promote halves
   4636 template <class V, HWY_IF_T_SIZE_V(V, 1),
   4637          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
   4638 HWY_API V operator*(const V a, const V b) {
   4639  const DFromV<decltype(a)> d;
   4640  const Half<decltype(d)> dh;
   4641  const Twice<RepartitionToWide<decltype(dh)>> dw;
   4642  const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
   4643  const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
   4644  const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
   4645  const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
   4646  const VFromD<decltype(dw)> m0 = a0 * b0;
   4647  const VFromD<decltype(dw)> m1 = a1 * b1;
   4648  return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
   4649 }
   4650 
   4651 #endif  // HWY_NATIVE_MUL_8
   4652 
   4653 // ------------------------------ 64-bit multiplication
   4654 
   4655 #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
   4656 #ifdef HWY_NATIVE_MUL_64
   4657 #undef HWY_NATIVE_MUL_64
   4658 #else
   4659 #define HWY_NATIVE_MUL_64
   4660 #endif
   4661 
   4662 // Single-lane i64 or u64
   4663 template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
   4664          HWY_IF_NOT_FLOAT_V(V)>
   4665 HWY_API V operator*(V x, V y) {
   4666  const DFromV<V> d;
   4667  using T = TFromD<decltype(d)>;
   4668  using TU = MakeUnsigned<T>;
   4669  const TU xu = static_cast<TU>(GetLane(x));
   4670  const TU yu = static_cast<TU>(GetLane(y));
   4671  return Set(d, static_cast<T>(xu * yu));
   4672 }
   4673 
   4674 template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
   4675          HWY_IF_V_SIZE_GT_D(D64, 8)>
   4676 HWY_API V operator*(V x, V y) {
   4677  RepartitionToNarrow<D64> d32;
   4678  auto x32 = BitCast(d32, x);
   4679  auto y32 = BitCast(d32, y);
   4680  auto lolo = BitCast(d32, MulEven(x32, y32));
   4681  auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
   4682  auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
   4683  auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
   4684  return BitCast(D64{}, lolo + hi);
   4685 }
   4686 template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
   4687          HWY_IF_V_SIZE_GT_D(DI64, 8)>
   4688 HWY_API V operator*(V x, V y) {
   4689  RebindToUnsigned<DI64> du64;
   4690  return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
   4691 }
   4692 
   4693 #endif  // HWY_NATIVE_MUL_64
   4694 
   4695 // ------------------------------ MulRound
// Multiplies a and b, then rounds the product to the nearest integral value.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V MulRound(V a, V b) {
  return Round(Mul(a, b));
}
   4700 
   4701 // ------------------------------ MulAdd / NegMulAdd
   4702 
   4703 #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
   4704 #ifdef HWY_NATIVE_INT_FMA
   4705 #undef HWY_NATIVE_INT_FMA
   4706 #else
   4707 #define HWY_NATIVE_INT_FMA
   4708 #endif
   4709 
   4710 #ifdef HWY_NATIVE_INT_FMSUB
   4711 #undef HWY_NATIVE_INT_FMSUB
   4712 #else
   4713 #define HWY_NATIVE_INT_FMSUB
   4714 #endif
   4715 
// Integer multiply-add: returns mul * x + add.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulAdd(V mul, V x, V add) {
  return Add(Mul(mul, x), add);
}
   4720 
// Integer negated multiply-add: returns add - mul * x.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V NegMulAdd(V mul, V x, V add) {
  return Sub(add, Mul(mul, x));
}
   4725 
// Integer multiply-subtract: returns mul * x - sub.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MulSub(V mul, V x, V sub) {
  return Sub(Mul(mul, x), sub);
}
   4730 #endif  // HWY_NATIVE_INT_FMA
   4731 // ------------------------------ MulComplex* / MaskedMulComplex*
   4732 
   4733 #if (defined(HWY_NATIVE_CPLX) == defined(HWY_TARGET_TOGGLE))
   4734 #ifdef HWY_NATIVE_CPLX
   4735 #undef HWY_NATIVE_CPLX
   4736 #else
   4737 #define HWY_NATIVE_CPLX
   4738 #endif
   4739 
   4740 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   4741 
// Complex conjugate of interleaved (re, im) pairs: negates the imaginary
// (odd) lanes and keeps the real (even) lanes unchanged.
template <class V, HWY_IF_NOT_UNSIGNED(TFromV<V>)>
HWY_API V ComplexConj(V a) {
  return OddEven(Neg(a), a);
}
   4746 
   4747 template <class V>
   4748 HWY_API V MulComplex(V a, V b) {
   4749  // a = u + iv, b = x + iy
   4750  const auto u = DupEven(a);
   4751  const auto v = DupOdd(a);
   4752  const auto x = DupEven(b);
   4753  const auto y = DupOdd(b);
   4754 
   4755  return OddEven(MulAdd(u, y, Mul(v, x)), Sub(Mul(u, x), Mul(v, y)));
   4756 }
   4757 
   4758 template <class V>
   4759 HWY_API V MulComplexConj(V a, V b) {
   4760  // a = u + iv, b = x + iy
   4761  const auto u = DupEven(a);
   4762  const auto v = DupOdd(a);
   4763  const auto x = DupEven(b);
   4764  const auto y = DupOdd(b);
   4765 
   4766  return OddEven(Sub(Mul(v, x), Mul(u, y)), MulAdd(u, x, Mul(v, y)));
   4767 }
   4768 
// Complex multiply-add: returns a * b + c for interleaved (re, im) pairs.
template <class V>
HWY_API V MulComplexAdd(V a, V b, V c) {
  return Add(MulComplex(a, b), c);
}
   4773 
// Complex conjugate multiply-add: returns MulComplexConj(a, b) + c.
template <class V>
HWY_API V MulComplexConjAdd(V a, V b, V c) {
  return Add(MulComplexConj(a, b), c);
}
   4778 
// As MulComplexConjAdd, but lanes where mask is false are zeroed.
template <class V, class M>
HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) {
  return IfThenElseZero(mask, MulComplexConjAdd(a, b, c));
}
   4783 
// As MulComplexConj, but lanes where mask is false are zeroed.
template <class V, class M>
HWY_API V MaskedMulComplexConj(M mask, V a, V b) {
  return IfThenElseZero(mask, MulComplexConj(a, b));
}
   4788 
// Returns MulComplex(a, b) in lanes where mask is true, otherwise no.
template <class V, class M>
HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) {
  return IfThenElse(mask, MulComplex(a, b), no);
}
   4793 #endif  // HWY_TARGET != HWY_SCALAR
   4794 
   4795 #endif  // HWY_NATIVE_CPLX
   4796 
   4797 // ------------------------------ MaskedMulAddOr
   4798 #if (defined(HWY_NATIVE_MASKED_INT_FMA) == defined(HWY_TARGET_TOGGLE))
   4799 #ifdef HWY_NATIVE_MASKED_INT_FMA
   4800 #undef HWY_NATIVE_MASKED_INT_FMA
   4801 #else
   4802 #define HWY_NATIVE_MASKED_INT_FMA
   4803 #endif
   4804 
// Returns MulAdd(mul, x, add) in lanes where m is true, otherwise no.
template <class V, class M>
HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) {
  return IfThenElse(m, MulAdd(mul, x, add), no);
}
   4809 
   4810 #endif  // HWY_NATIVE_MASKED_INT_FMA
   4811 
   4812 // ------------------------------ Integer MulSub / NegMulSub
   4813 #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
   4814 #ifdef HWY_NATIVE_INT_FMSUB
   4815 #undef HWY_NATIVE_INT_FMSUB
   4816 #else
   4817 #define HWY_NATIVE_INT_FMSUB
   4818 #endif
   4819 
   4820 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   4821 HWY_API V MulSub(V mul, V x, V sub) {
   4822  const DFromV<decltype(mul)> d;
   4823  const RebindToSigned<decltype(d)> di;
   4824  return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub))));
   4825 }
   4826 
   4827 #endif  // HWY_NATIVE_INT_FMSUB
   4828 
   4829 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   4830 HWY_API V NegMulSub(V mul, V x, V sub) {
   4831  const DFromV<decltype(mul)> d;
   4832  const RebindToSigned<decltype(d)> di;
   4833 
   4834  return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub))));
   4835 }
   4836 
   4837 // ------------------------------ MulAddSub
   4838 
// MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to
// MulSub(mul, x, sub_or_add), since lane 0 (an even lane) subtracts.
template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
  return MulSub(mul, x, sub_or_add);
}
   4845 
   4846 // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
   4847 // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
   4848 // x86_512-inl.h
   4849 
   4850 // MulAddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
   4851 
   4852 // MulAddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
   4853 template <class V, HWY_IF_MULADDSUB_V(V)>
   4854 HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
   4855  using D = DFromV<V>;
   4856  using T = TFromD<D>;
   4857  using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
   4858 
   4859  const D d;
   4860  const Rebind<TNegate, D> d_negate;
   4861 
   4862  const auto add =
   4863      OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
   4864  return MulAdd(mul, x, add);
   4865 }
   4866 // ------------------------------ MulSubAdd
   4867 
   4868 template <class V>
   4869 HWY_API V MulSubAdd(V mul, V x, V sub_or_add) {
   4870  using D = DFromV<V>;
   4871  using T = TFromD<D>;
   4872  using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
   4873 
   4874  const D d;
   4875  const Rebind<TNegate, D> d_negate;
   4876 
   4877  return MulAddSub(mul, x, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
   4878 }
   4879 
   4880 // ------------------------------ MaskedConvertTo
// Converts v to VFromD<D>, zeroing lanes where m is false.
template <class D, class V, class M>
HWY_API VFromD<D> MaskedConvertTo(M m, D d, V v) {
  return IfThenElseZero(m, ConvertTo(d, v));
}
   4885 
   4886 // ------------------------------ Integer division
   4887 #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
   4888 #ifdef HWY_NATIVE_INT_DIV
   4889 #undef HWY_NATIVE_INT_DIV
   4890 #else
   4891 #define HWY_NATIVE_INT_DIV
   4892 #endif
   4893 
   4894 namespace detail {
   4895 
   4896 // DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo are okay to use in
   4897 // the implementation of detail::IntDiv in generic_ops-inl.h as the current
   4898 // implementations of DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo
   4899 // will convert values that are outside of the range of TFromD<DI> by either
   4900 // saturation, truncation, or converting values that are outside of the
   4901 // destination range to LimitsMin<TFromD<DI>>() (which is equal to
   4902 // static_cast<TFromD<DI>>(LimitsMax<TFromD<DI>>() + 1))
   4903 
// Same-width float->int conversion used by the IntDiv implementation.
template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
  return ConvertInRangeTo(di, vf);
}
   4908 
// Same-width int->float conversion used by the IntDiv implementation.
template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
  return ConvertTo(df, vi);
}
   4913 
   4914 #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
   4915 template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
   4916 HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) {
   4917  return PromoteInRangeTo(df, vi);
   4918 }
   4919 
// If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
// IntDivConvIntToFloat(df, vi) returns an approximation of
// static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i])
template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
  const Twice<decltype(df32)> dt_f32;

  // Convert the lower and upper 32-bit halves of each i64 lane separately
  // (each half converted as an i32).
  auto vf32 =
      ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));

#if HWY_IS_LITTLE_ENDIAN
  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
#else
  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
#endif

  const RebindToSigned<decltype(df32)> di32;

  // The low half was converted as a signed i32, so where it is negative, add
  // 1 to the high half to compensate so that hi * 2^32 + lo reconstructs the
  // original signed 64-bit value.
  hi_f32 =
      Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
                      Set(df32, 1.0f)));
  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
}
   4945 
// U64->F32 approximation used by IntDiv when F64 is unavailable: converts the
// two u32 halves of each lane separately and recombines as hi * 2^32 + lo.
template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
  const Twice<decltype(df32)> dt_f32;

  auto vf32 =
      ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));

#if HWY_IS_LITTLE_ENDIAN
  const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
#else
  const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
#endif

  // Both halves are non-negative, so no sign compensation is needed here.
  return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
}
   4963 #endif  // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
   4964 
// Computes the truncated integer quotient a / b via floating-point division,
// for lanes whose values fit in kOrigLaneSize bytes, which is strictly
// narrower than the actual lane type TFromV<V> (see HWY_IF_T_SIZE_GT). The
// extra headroom makes the float quotient exact enough to truncate directly;
// see the precision argument in the comments below.
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToFloat<decltype(d)> df;

  // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the
  // [LimitsMin<SignedFromSize<kOrigLaneSize>>(),
  // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.

  // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also
  // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1
  // mantissa bits (including the implied one bit), where flt_q is equal to
  // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]),
  // even in the case where the magnitude of an inexact floating point division
  // result is rounded up.

  // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true
  // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least
  // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in
  // the case where the magnitude of an inexact floating point division result
  // is rounded up.

  // It is okay to do conversions from MakeFloat<TFromV<V>> to TFromV<V> using
  // ConvertInRangeTo if sizeof(TFromV<V>) > kOrigLaneSize as the result of the
  // floating point division is always greater than LimitsMin<TFromV<V>>() and
  // less than LimitsMax<TFromV<V>>() if sizeof(TFromV<V>) > kOrigLaneSize and
  // b[i] != 0.

#if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
  // On Armv7, do division by multiplying by the ApproximateReciprocal
  // to avoid unnecessary overhead as F32 Div refines the approximate
  // reciprocal using 4 Newton-Raphson iterations

  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto flt_b = ConvertTo(df, b);
  auto flt_recip_b = ApproximateReciprocal(flt_b);
  // Refine with one Newton-Raphson step when the original lanes are wider
  // than one byte.
  if (kOrigLaneSize > 1) {
    flt_recip_b =
        Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
  }

  // q0 is the initial quotient estimate; r0 = a - q0 * b is its remainder.
  auto q0 = ConvertInRangeTo(d, Mul(ConvertTo(df, a), flt_recip_b));
  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));

  auto r1 = r0;

  // Need to negate r1[i] if a[i] < 0 is true
  if (IsSigned<TFromV<V>>()) {
    r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
  }

  // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]

  auto abs_b = BitCast(du, b);
  if (IsSigned<TFromV<V>>()) {
    abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
  }

  // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1.
  // Otherwise, set q1[i] to 0.

  // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned
  // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
  // will be true if r1[i] < 0 is true.
  auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));

  // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0

  // Need to negate q1[i] if r0[i] and b[i] do not have the same sign
  auto q1_negate_mask = r0;
  if (IsSigned<TFromV<V>>()) {
    q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
  }
  q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);

  // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
  //                       (((r0[i] ^ b[i]) < 0) ? 1 : -1)

  // Need to subtract q1[i] from q0[i] to get the final result
  return Sub(q0, BitCast(d, q1));
#else
  // On targets other than Armv7 NEON, use F16 or F32 division as most targets
  // other than Armv7 NEON have native F32 divide instructions
  return ConvertInRangeTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
#endif
}
   5054 
// Integer division of full-width 32/64-bit lanes via floating-point
// reciprocal multiplication. Computes a[i] / b[i] (truncated toward zero);
// the result is implementation-defined in lanes where b[i] == 0.
// An initial quotient estimate q0 is refined with remainder-based correction
// steps (q1, and q2/q3 when no F64 is available) plus a final +/-1 fixup.
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
  // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal
  // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer
  // than kOrigLaneSize*8 + 1 bits

  using T = TFromV<V>;

#if HWY_HAVE_FLOAT64
  using TF = MakeFloat<T>;
#else
  // Without F64 support, even 64-bit lanes must go through F32 (see the
  // extra correction steps below).
  using TF = float;
#endif

  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const Rebind<TF, decltype(d)> df;

  if (!IsSigned<T>()) {
    // If T is unsigned, set a[i] to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1 if
    // b[i] > LimitsMax<MakeSigned<T>>() is true
    // (such large divisors can only yield a quotient of 0 or 1, which is
    // computed directly here so that the remaining lanes fit in the signed
    // range required by the float conversion).

    const auto one = Set(di, MakeSigned<T>{1});
    a = BitCast(
        d, IfNegativeThenElse(BitCast(di, b),
                              IfThenElseZero(RebindMask(di, Ge(a, b)), one),
                              BitCast(di, a)));
    b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b)));
  }

  // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true

  const auto flt_b = IntDivConvIntToFloat(df, b);

#if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
  // Armv7 NEON lacks a vector divide; refine the reciprocal estimate with one
  // Newton-Raphson step instead.
  auto flt_recip_b = ApproximateReciprocal(flt_b);
  flt_recip_b =
      Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
#else
  const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
#endif

  // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
  // IntDivConvFloatToInt returns incorrect results in any lanes where b[i] == 0
  // as the result of IntDivUsingFloatDiv(a, b) is implementation-defined in any
  // lanes where b[i] == 0.

  // If ScalarAbs(b[i]) == 1 is true, then it is possible for
  // a[i] * flt_recip_b[i] to be rounded up to a value that is outside of the
  // range of T. If a[i] * flt_recip_b[i] is outside of the range of T,
  // IntDivConvFloatToInt will convert any values that are out of the range of T
  // by either saturation, truncation, or wrapping around to LimitsMin<T>().

  // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
  // IntDivConvFloatToInt wraps around if ScalarAbs(b[i]) == 1 as r0 will have
  // the correct sign if ScalarAbs(b[i]) == 1, even in the cases where the
  // conversion of a[i] * flt_recip_b[i] to T using IntDivConvFloatToInt is
  // truncated or wraps around.

  // If ScalarAbs(b[i]) >= 2 is true, a[i] * flt_recip_b[i] will be within the
  // range of T, even in the cases where the conversion of a[i] to TF is
  // rounded up or the result of multiplying a[i] by flt_recip_b[i] is rounded
  // up.

  // ScalarAbs(r0[i]) will also always be less than (LimitsMax<T>() / 2) if
  // b[i] != 0, even in the cases where the conversion of a[i] * flt_recip_b[i]
  // to T using IntDivConvFloatToInt is truncated or is wrapped around.

  // First quotient estimate and its remainder r0 = a - q0*b.
  auto q0 =
      IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
  const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));

  // If b[i] != 0 is true, r0[i] * flt_recip_b[i] is always within the range of
  // T, even in the cases where the conversion of r0[i] to TF is rounded up or
  // the multiplication of r0[i] by flt_recip_b[i] is rounded up.

  // Second correction step: quotient of the remainder, and remainder r1.
  auto q1 =
      IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
  const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);

  auto r3 = r1;

#if !HWY_HAVE_FLOAT64
  // Need two additional reciprocal multiplication steps for I64/U64 vectors if
  // HWY_HAVE_FLOAT64 is 0
  if (sizeof(T) == 8) {
    const auto q2 = IntDivConvFloatToInt(
        di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
    const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);

    const auto q3 = IntDivConvFloatToInt(
        di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
    r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);

    // Fold the extra partial quotients into the running totals.
    q0 = Add(q0, BitCast(d, q2));
    q1 = Add(q1, q3);
  }
#endif  // !HWY_HAVE_FLOAT64

  auto r4 = r3;

  // Need to negate r4[i] if a[i] < 0 is true
  if (IsSigned<TFromV<V>>()) {
    r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
  }

  // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]

  auto abs_b = BitCast(du, b);
  if (IsSigned<TFromV<V>>()) {
    abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
  }

  // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
  // Otherwise, set r4[i] to 0.

  // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned
  // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
  // will be true if r4[i] < 0 is true.
  auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));

  // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0

  // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
  auto q4_negate_mask = r3;
  if (IsSigned<TFromV<V>>()) {
    q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
  }
  q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);

  // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
  //                       (((r3[i] ^ b[i]) < 0) ? 1 : -1) : 0

  // The final result is equal to q0[i] + q1[i] - q4[i]
  return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
}
   5194 
// Integer division for 8/16-bit lanes on vectors small enough to promote the
// whole vector: widen, divide via IntDivUsingFloatDiv, then demote back.
template <size_t kOrigLaneSize, class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE_V(
              V, HWY_MAX_BYTES /
                     ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
HWY_INLINE V IntDiv(V a, V b) {
  using T = TFromV<V>;

  // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32
  using TW = MakeWide<
      If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;

  const DFromV<decltype(a)> d;
  const Rebind<TW, decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
  // unnecessary overhead
  const RebindToSigned<decltype(dw)> dw_i;

  // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if
  // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead
  const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
           decltype(d)>
      d_demote_to;
#else
  // On other targets, promote to TW and demote to T
  const decltype(dw) dw_i;
  const decltype(d) d_demote_to;
#endif

  // Promote both operands, divide in the wider type, demote; the BitCast
  // restores the caller's signedness if a signed demotion tag was used.
  return BitCast(
      d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
                                   PromoteTo(dw_i, a), PromoteTo(dw_i, b))));
}
   5230 
// Integer division for 8/16-bit lanes on full-width vectors: too wide to
// promote in one step, so divide the lower and upper halves separately and
// re-interleave with OrderedDemote2To.
template <size_t kOrigLaneSize, class V,
          HWY_IF_T_SIZE_ONE_OF_V(V,
                                 (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
  // unnecessary overhead
  const RebindToSigned<decltype(dw)> dw_i;

  // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if
  // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead
  const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
           decltype(d)>
      d_demote_to;
#else
  // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V>
  const decltype(dw) dw_i;
  const decltype(d) d_demote_to;
#endif

  return BitCast(d, OrderedDemote2To(
                        d_demote_to,
                        IntDivUsingFloatDiv<kOrigLaneSize>(
                            PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)),
                        IntDivUsingFloatDiv<kOrigLaneSize>(
                            PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b))));
}
   5262 
   5263 #if !HWY_HAVE_FLOAT16
// Without native F16: 8-bit division on half-width-or-smaller vectors.
// Promote to 16-bit, recurse into the 16-bit IntDiv (which itself promotes
// further), then demote back to 8 bits.
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
  // overhead
  const RebindToSigned<decltype(dw)> dw_i;
#else
  // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
  const decltype(dw) dw_i;
#endif

  // kOrigLaneSize of 1 is forwarded so the inner division knows the lanes
  // were originally 8 bits wide.
  return DemoteTo(d,
                  BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
}
// Without native F16: 8-bit division on vectors wider than half a block.
// Divide each promoted half separately and re-interleave the demoted results.
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;

#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
  // overhead
  const RebindToSigned<decltype(dw)> dw_i;
#else
  // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
  const decltype(dw) dw_i;
#endif

  return OrderedDemote2To(
      d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))),
      BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))));
}
   5301 #endif  // !HWY_HAVE_FLOAT16
   5302 
// 64-bit lanes (and 32-bit lanes when F64 is unavailable) are divided
// directly at full width via the float-reciprocal algorithm.
template <size_t kOrigLaneSize, class V,
          HWY_IF_T_SIZE_ONE_OF_V(V,
                                 (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
HWY_INLINE V IntDiv(V a, V b) {
  return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
}
   5309 
   5310 #if HWY_HAVE_FLOAT64
// With F64: 32-bit division on half-width-or-smaller vectors via a single
// exact F64 divide (the f64 mantissa holds any i32/u32 exactly).
template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
          HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const Rebind<double, decltype(d)> df64;

  // It is okay to demote the F64 Div result to int32_t or uint32_t using
  // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
  // will always be within the range of TFromV<V> if b[i] != 0 and
  // sizeof(TFromV<V>) <= 4.

  return DemoteInRangeTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
}
// With F64: 32-bit division on vectors wider than half a block. Each half is
// promoted to F64, divided exactly, demoted, and the halves recombined.
template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntDiv(V a, V b) {
  const DFromV<decltype(a)> d;
  const Half<decltype(d)> dh;
  const Repartition<double, decltype(d)> df64;

  // It is okay to demote the F64 Div result to int32_t or uint32_t using
  // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
  // will always be within the range of TFromV<V> if b[i] != 0 and
  // sizeof(TFromV<V>) <= 4.

  const VFromD<decltype(df64)> div1 =
      Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b));
  const VFromD<decltype(df64)> div0 =
      Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b));
  return Combine(d, DemoteInRangeTo(dh, div1), DemoteInRangeTo(dh, div0));
}
   5342 #endif  // HWY_HAVE_FLOAT64
   5343 
// Integer modulo computed from the quotient: a % b == a - (a / b) * b,
// expressed as a single NegMulAdd. 8-bit lanes on the listed targets are
// instead handled by the promote-based overloads below.
template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
                                      HWY_TARGET == HWY_WASM ||
                                      HWY_TARGET == HWY_WASM_EMU256 ||
                                      HWY_TARGET == HWY_LSX ||
                                      HWY_TARGET == HWY_LASX)
                                         ? 0
                                         : (1 << 1)) |
                                       (1 << 2) | (1 << 4) | (1 << 8))>
HWY_INLINE V IntMod(V a, V b) {
  return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
}
   5356 
   5357 #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM ||       \
   5358    HWY_TARGET == HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || \
   5359    HWY_TARGET == HWY_LASX
// 8-bit modulo on half-width-or-smaller vectors: promote to 16-bit, take the
// modulo there (the result still fits in 8 bits), then demote.
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntMod(V a, V b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
  return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
}
   5367 
// 8-bit modulo on wider vectors: process the promoted lower and upper halves
// separately and re-interleave.
template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
          HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
HWY_INLINE V IntMod(V a, V b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(
      d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
      IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
}
   5377 #endif  // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
   5378        // HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
   5379 
   5380 }  // namespace detail
   5381 
   5382 #if HWY_TARGET == HWY_SCALAR
   5383 
// Scalar-target integer division/modulo operators; dispatch on the original
// lane size so the detail implementations can pick the right float width.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
   5392 
   5393 #else  // HWY_TARGET != HWY_SCALAR
   5394 
// 128-bit-vector integer division/modulo operators, forwarding to the
// generic detail implementations with the original lane size.
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}

template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
   5404 
   5405 #if HWY_CAP_GE256
// 256-bit-vector overloads (only on targets providing Vec256).
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
   5414 #endif
   5415 
   5416 #if HWY_CAP_GE512
// 512-bit-vector overloads (only on targets providing Vec512).
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
  return detail::IntDiv<sizeof(T)>(a, b);
}
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
  return detail::IntMod<sizeof(T)>(a, b);
}
   5425 #endif
   5426 
   5427 #endif  // HWY_TARGET == HWY_SCALAR
   5428 
   5429 #endif  // HWY_NATIVE_INT_DIV
   5430 
   5431 // ------------------------------ AverageRound
   5432 
   5433 #if (defined(HWY_NATIVE_AVERAGE_ROUND_UI32) == defined(HWY_TARGET_TOGGLE))
   5434 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
   5435 #undef HWY_NATIVE_AVERAGE_ROUND_UI32
   5436 #else
   5437 #define HWY_NATIVE_AVERAGE_ROUND_UI32
   5438 #endif
   5439 
   5440 template <class V, HWY_IF_UI32(TFromV<V>)>
   5441 HWY_API V AverageRound(V a, V b) {
   5442  return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
   5443 }
   5444 
#endif  // HWY_NATIVE_AVERAGE_ROUND_UI32
   5446 
   5447 #if (defined(HWY_NATIVE_AVERAGE_ROUND_UI64) == defined(HWY_TARGET_TOGGLE))
   5448 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
   5449 #undef HWY_NATIVE_AVERAGE_ROUND_UI64
   5450 #else
   5451 #define HWY_NATIVE_AVERAGE_ROUND_UI64
   5452 #endif
   5453 
   5454 #if HWY_HAVE_INTEGER64
   5455 template <class V, HWY_IF_UI64(TFromV<V>)>
   5456 HWY_API V AverageRound(V a, V b) {
   5457  return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
   5458 }
   5459 #endif
   5460 
   5461 #endif  // HWY_NATIVE_AVERAGE_ROUND_UI64
   5462 
   5463 // ------------------------------ RoundingShiftRight (AverageRound)
   5464 
   5465 #if (defined(HWY_NATIVE_ROUNDING_SHR) == defined(HWY_TARGET_TOGGLE))
   5466 #ifdef HWY_NATIVE_ROUNDING_SHR
   5467 #undef HWY_NATIVE_ROUNDING_SHR
   5468 #else
   5469 #define HWY_NATIVE_ROUNDING_SHR
   5470 #endif
   5471 
// Right shift by a compile-time amount with round-to-nearest (ties up):
// shifts by kShiftAmt - 1, then uses AverageRound with zero to perform the
// final halving plus rounding-bit addition without overflow.
template <int kShiftAmt, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RoundingShiftRight(V v) {
  const DFromV<V> d;
  using T = TFromD<decltype(d)>;

  static_assert(
      0 <= kShiftAmt && kShiftAmt <= static_cast<int>(sizeof(T) * 8 - 1),
      "kShiftAmt is out of range");

  // Shift by all but the last bit; the last bit is handled by AverageRound.
  constexpr int kScaleDownShrAmt = HWY_MAX(kShiftAmt - 1, 0);

  auto scaled_down_v = v;
  HWY_IF_CONSTEXPR(kScaleDownShrAmt > 0) {
    scaled_down_v = ShiftRight<kScaleDownShrAmt>(v);
  }

  // A zero shift must return v unchanged (no rounding step).
  HWY_IF_CONSTEXPR(kShiftAmt == 0) { return scaled_down_v; }

  // AverageRound(x, 0) == (x + 1) >> 1, i.e. the final rounded halving.
  return AverageRound(scaled_down_v, Zero(d));
}
   5492 
// Runtime-amount variant of RoundingShiftRight: shifts all lanes by the same
// shift_amt with round-to-nearest (ties up).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RoundingShiftRightSame(V v, int shift_amt) {
  const DFromV<V> d;

  // Shift by (shift_amt - 1); the unsigned addition of shift_amt_is_zero
  // clamps the amount to 0 when shift_amt == 0, avoiding a negative shift.
  const bool shift_amt_is_zero = (shift_amt == 0);
  const auto scaled_down_v = ShiftRightSame(
      v, static_cast<int>(static_cast<unsigned>(shift_amt) +
                          static_cast<unsigned>(shift_amt_is_zero) - 1u));

  // Final rounded halving: AverageRound(x, 0) == (x + 1) >> 1. When
  // shift_amt == 0, averaging x with itself yields x, so v is unchanged.
  return AverageRound(
      scaled_down_v,
      IfThenElseZero(SetMask(d, shift_amt_is_zero), scaled_down_v));
}
   5506 
// Per-lane-amount rounding right shift: each lane i is shifted by amt[i]
// with round-to-nearest (ties up).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RoundingShr(V v, V amt) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  using T = TFromD<decltype(d)>;
  using TU = MakeUnsigned<T>;

  // Shift by (amt - 1); SaturatedSub clamps lanes with amt == 0 to a shift
  // of 0 instead of underflowing.
  const auto unsigned_amt = BitCast(du, amt);
  const auto scale_down_shr_amt =
      BitCast(d, SaturatedSub(unsigned_amt, Set(du, TU{1})));

  const auto scaled_down_v = Shr(v, scale_down_shr_amt);
  // Final rounded halving: AverageRound(x, 0) == (x + 1) >> 1. Lanes with
  // amt == 0 average x with itself, leaving v unchanged there.
  return AverageRound(scaled_down_v,
                      IfThenElseZero(Eq(amt, Zero(d)), scaled_down_v));
}
   5522 
   5523 #endif  // HWY_NATIVE_ROUNDING_SHR
   5524 
   5525 // ------------------------------ MulEvenAdd (PromoteEvenTo)
   5526 
   5527 // SVE with bf16 and NEON with bf16 override this.
   5528 #if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE))
   5529 #ifdef HWY_NATIVE_MUL_EVEN_BF16
   5530 #undef HWY_NATIVE_MUL_EVEN_BF16
   5531 #else
   5532 #define HWY_NATIVE_MUL_EVEN_BF16
   5533 #endif
   5534 
   5535 template <class DF, HWY_IF_F32_D(DF),
   5536          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
   5537 HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
   5538  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), c);
   5539 }
   5540 
   5541 template <class DF, HWY_IF_F32_D(DF),
   5542          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
   5543 HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
   5544  return MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), c);
   5545 }
   5546 
   5547 #endif  // HWY_NATIVE_MUL_EVEN_BF16
   5548 
   5549 // ------------------------------ ReorderWidenMulAccumulate (MulEvenAdd)
   5550 
   5551 // AVX3_SPR/ZEN4, and NEON with bf16 but not(!) SVE override this.
   5552 #if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \
   5553     defined(HWY_TARGET_TOGGLE))
   5554 #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   5555 #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   5556 #else
   5557 #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   5558 #endif
   5559 
   5560 template <class DF, HWY_IF_F32_D(DF),
   5561          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
   5562 HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
   5563                                             VFromD<DF> sum0,
   5564                                             VFromD<DF>& sum1) {
   5565  // Lane order within sum0/1 is undefined, hence we can avoid the
   5566  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
   5567  sum1 = MulOddAdd(df, a, b, sum1);
   5568  return MulEvenAdd(df, a, b, sum0);
   5569 }
   5570 
   5571 #endif  // HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   5572 
   5573 // ------------------------------ WidenMulAccumulate
   5574 
   5575 #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE))
   5576 #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
   5577 #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
   5578 #else
   5579 #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
   5580 #endif
   5581 
   5582 template<class D, HWY_IF_INTEGER(TFromD<D>),
   5583         class DN = RepartitionToNarrow<D>>
   5584 HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
   5585                                     VFromD<D> low, VFromD<D>& high) {
   5586  high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
   5587  return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
   5588 }
   5589 
   5590 #endif  // HWY_NATIVE_WIDEN_MUL_ACCUMULATE
   5591 
   5592 #if 0
   5593 #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE))
   5594 
   5595 #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
   5596 #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
   5597 #else
   5598 #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
   5599 #endif
   5600 
   5601 #if HWY_HAVE_FLOAT16
   5602 
// NOTE(review): currently compiled out by the enclosing "#if 0"; mirrors the
// integer WidenMulAccumulate above for f16 -> f32 accumulation.
template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
                                    VFromD<D> low, VFromD<D>& high) {
  // Upper halves accumulate into high (by reference); lower halves into the
  // returned vector.
  high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
  return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
}
   5609 
   5610 #endif  // HWY_HAVE_FLOAT16
   5611 
   5612 #endif  // HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
   5613 #endif  // #if 0
   5614 
   5615 // ------------------------------ SatWidenMulPairwiseAdd
   5616 
   5617 #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
   5618     defined(HWY_TARGET_TOGGLE))
   5619 
   5620 #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   5621 #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   5622 #else
   5623 #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
   5624 #endif
   5625 
   5626 template <class DI16, class VU8, class VI8,
   5627          class VU8_2 = Vec<Repartition<uint8_t, DI16>>, HWY_IF_I16_D(DI16),
   5628          HWY_IF_U8_D(DFromV<VU8>), HWY_IF_I8_D(DFromV<VI8>),
   5629          HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VI8)),
   5630          HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VU8_2))>
   5631 HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
   5632  const RebindToUnsigned<decltype(di16)> du16;
   5633 
   5634  const auto a0 = BitCast(di16, PromoteEvenTo(du16, a));
   5635  const auto b0 = PromoteEvenTo(di16, b);
   5636 
   5637  const auto a1 = BitCast(di16, PromoteOddTo(du16, a));
   5638  const auto b1 = PromoteOddTo(di16, b);
   5639 
   5640  return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
   5641 }
   5642 
   5643 #endif
   5644 
   5645 // ------------------------------ SatWidenMulPairwiseAccumulate
   5646 
   5647 #if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \
   5648     defined(HWY_TARGET_TOGGLE))
   5649 
   5650 #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
   5651 #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
   5652 #else
   5653 #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
   5654 #endif
   5655 
// Pairwise widening i16*i16 multiply-add, accumulated into sum with signed
// saturation: out[i] = sat(sum[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]).
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
    DI32 di32, VFromD<Repartition<int16_t, DI32>> a,
    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
  // WidenMulPairwiseAdd(di32, a, b) is okay here as
  // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
  // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
  // a[0], b[0], a[1], and b[1] are all equal to -32768.

  const auto product = WidenMulPairwiseAdd(di32, a, b);

  // In an overflowed lane, product wrapped from +2^31 to LimitsMin<int32_t>
  // and mul_overflow is all-ones (-1); elsewhere mul_overflow is 0.
  const auto mul_overflow =
      VecFromMask(di32, Eq(product, Set(di32, LimitsMin<int32_t>())));

  // Overflowed lanes: product - 1 turns LimitsMin into LimitsMax (one short
  // of the true +2^31); the shortfall is compensated by adding 1 to negative
  // sums (sum - (-1)), while non-negative sums saturate to LimitsMax anyway.
  return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
                      Add(product, mul_overflow));
}
   5673 
   5674 #endif  // HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
   5675 
   5676 // ------------------------------ SatWidenMulAccumFixedPoint
   5677 
   5678 #if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \
   5679     defined(HWY_TARGET_TOGGLE))
   5680 
   5681 #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   5682 #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   5683 #else
   5684 #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   5685 #endif
   5686 
   5687 template <class DI32, HWY_IF_I32_D(DI32)>
   5688 HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
   5689                                                VFromD<Rebind<int16_t, DI32>> a,
   5690                                                VFromD<Rebind<int16_t, DI32>> b,
   5691                                                VFromD<DI32> sum) {
   5692  const Repartition<int16_t, DI32> dt_i16;
   5693 
   5694  const auto vt_a = ResizeBitCast(dt_i16, a);
   5695  const auto vt_b = ResizeBitCast(dt_i16, b);
   5696 
   5697  const auto dup_a = InterleaveWholeLower(dt_i16, vt_a, vt_a);
   5698  const auto dup_b = InterleaveWholeLower(dt_i16, vt_b, vt_b);
   5699 
   5700  return SatWidenMulPairwiseAccumulate(di32, dup_a, dup_b, sum);
   5701 }
   5702 
   5703 #endif  // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   5704 
   5705 // ------------------------------ MaskedSqrt
   5706 
   5707 #if (defined(HWY_NATIVE_MASKED_SQRT) == defined(HWY_TARGET_TOGGLE))
   5708 
   5709 #ifdef HWY_NATIVE_MASKED_SQRT
   5710 #undef HWY_NATIVE_MASKED_SQRT
   5711 #else
   5712 #define HWY_NATIVE_MASKED_SQRT
   5713 #endif
   5714 template <class V, HWY_IF_FLOAT_V(V), class M>
   5715 HWY_API V MaskedSqrt(M m, V v) {
   5716  return IfThenElseZero(m, Sqrt(v));
   5717 }
   5718 
   5719 template <class V, HWY_IF_FLOAT_V(V), class M>
   5720 HWY_API V MaskedSqrtOr(V no, M m, V v) {
   5721  return IfThenElse(m, Sqrt(v), no);
   5722 }
   5723 #endif
   5724 
   5725 // ------------------------------ SumOfMulQuadAccumulate
   5726 
   5727 #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
   5728     defined(HWY_TARGET_TOGGLE))
   5729 
   5730 #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
   5731 #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
   5732 #else
   5733 #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
   5734 #endif
   5735 
   5736 template <class DI32, HWY_IF_I32_D(DI32)>
   5737 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
   5738                                            VFromD<Repartition<int8_t, DI32>> a,
   5739                                            VFromD<Repartition<int8_t, DI32>> b,
   5740                                            VFromD<DI32> sum) {
   5741  const Repartition<int16_t, decltype(di32)> di16;
   5742 
   5743  const auto a0 = PromoteEvenTo(di16, a);
   5744  const auto b0 = PromoteEvenTo(di16, b);
   5745 
   5746  const auto a1 = PromoteOddTo(di16, a);
   5747  const auto b1 = PromoteOddTo(di16, b);
   5748 
   5749  return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
   5750                      WidenMulPairwiseAdd(di32, a1, b1)));
   5751 }
   5752 
   5753 #endif
   5754 
   5755 #if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \
   5756     defined(HWY_TARGET_TOGGLE))
   5757 
   5758 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
   5759 #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
   5760 #else
   5761 #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
   5762 #endif
   5763 
   5764 template <class DU32, HWY_IF_U32_D(DU32)>
   5765 HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
   5766    DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
   5767    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
   5768  const Repartition<uint16_t, decltype(du32)> du16;
   5769  const RebindToSigned<decltype(du16)> di16;
   5770  const RebindToSigned<decltype(du32)> di32;
   5771 
   5772  const auto lo8_mask = Set(di16, int16_t{0x00FF});
   5773  const auto a0 = And(BitCast(di16, a), lo8_mask);
   5774  const auto b0 = And(BitCast(di16, b), lo8_mask);
   5775 
   5776  const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
   5777  const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b)));
   5778 
   5779  return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)),
   5780                      BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1))));
   5781 }
   5782 
   5783 #endif
   5784 
   5785 #if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \
   5786     defined(HWY_TARGET_TOGGLE))
   5787 
   5788 #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
   5789 #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
   5790 #else
   5791 #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
   5792 #endif
   5793 
   5794 template <class DI32, HWY_IF_I32_D(DI32)>
   5795 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
   5796    DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
   5797    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
   5798  const Repartition<int16_t, decltype(di32)> di16;
   5799  const RebindToUnsigned<decltype(di16)> du16;
   5800 
   5801  const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF}));
   5802  const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i)));
   5803 
   5804  const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u)));
   5805  const auto b1 = ShiftRight<8>(BitCast(di16, b_i));
   5806 
   5807  // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in
   5808  // SumOfMulQuadAccumulate as it is possible for
   5809  // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0],
   5810  // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same
   5811  // sign.
   5812 
   5813  return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
   5814                      WidenMulPairwiseAdd(di32, a1, b1)));
   5815 }
   5816 
   5817 #endif
   5818 
   5819 #if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \
   5820     defined(HWY_TARGET_TOGGLE))
   5821 
   5822 #ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
   5823 #undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
   5824 #else
   5825 #define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
   5826 #endif
   5827 
   5828 #if HWY_HAVE_INTEGER64
// Per i64 lane, accumulates the dot product of the four i16 pairs in it:
// sum[i] += a[4i]*b[4i] + ... + a[4i+3]*b[4i+3], computed via i32 pairwise
// sums with explicit repair of the single possible i32 overflow case.
template <class DI64, HWY_IF_I64_D(DI64)>
HWY_API VFromD<DI64> SumOfMulQuadAccumulate(
    DI64 di64, VFromD<Repartition<int16_t, DI64>> a,
    VFromD<Repartition<int16_t, DI64>> b, VFromD<DI64> sum) {
  const Repartition<int32_t, decltype(di64)> di32;

  // WidenMulPairwiseAdd(di32, a, b) is okay here as
  // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
  // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
  // a[0], b[0], a[1], and b[1] are all equal to -32768.

  // An overflowed pairwise sum wrapped from +2^31 to LimitsMin<int32_t>;
  // i32_pairwise_sum_overflow is all-ones in exactly those i32 lanes.
  const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b);
  const auto i32_pairwise_sum_overflow =
      VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin<int32_t>())));

  // The upper 32 bits of sum0 and sum1 need to be zeroed out in the case of
  // overflow.
  // (Sign extension of the wrapped LimitsMin<int32_t> yields
  // 0xFFFFFFFF80000000; clearing the upper half restores the true +2^31.)
  const auto hi32_mask = Set(di64, static_cast<int64_t>(~int64_t{0xFFFFFFFF}));
  const auto p0_zero_out_mask =
      ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow));
  const auto p1_zero_out_mask =
      And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask);

  // p0: the even (low-half) i32 sums, sign-extended to i64 via shl+sar;
  // p1: the odd (high-half) i32 sums, sign-extended via arithmetic shr.
  const auto p0 =
      AndNot(p0_zero_out_mask,
             ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum))));
  const auto p1 =
      AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum)));

  return Add(sum, Add(p0, p1));
}
   5860 #endif  // HWY_HAVE_INTEGER64
   5861 #endif  // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
   5862 
   5863 #if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \
   5864     defined(HWY_TARGET_TOGGLE))
   5865 
   5866 #ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
   5867 #undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
   5868 #else
   5869 #define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
   5870 #endif
   5871 
   5872 #if HWY_HAVE_INTEGER64
   5873 template <class DU64, HWY_IF_U64_D(DU64)>
   5874 HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
   5875    DU64 du64, VFromD<Repartition<uint16_t, DU64>> a,
   5876    VFromD<Repartition<uint16_t, DU64>> b, VFromD<DU64> sum) {
   5877  const auto u32_even_prod = MulEven(a, b);
   5878  const auto u32_odd_prod = MulOdd(a, b);
   5879 
   5880  const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod),
   5881                      PromoteEvenTo(du64, u32_odd_prod));
   5882  const auto p1 =
   5883      Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));
   5884 
   5885  return Add(sum, Add(p0, p1));
   5886 }
   5887 #endif  // HWY_HAVE_INTEGER64
   5888 #endif  // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
   5889 
   5890 // ------------------------------ F64 ApproximateReciprocal
   5891 
   5892 #if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE))
   5893 #ifdef HWY_NATIVE_F64_APPROX_RECIP
   5894 #undef HWY_NATIVE_F64_APPROX_RECIP
   5895 #else
   5896 #define HWY_NATIVE_F64_APPROX_RECIP
   5897 #endif
   5898 
   5899 #if HWY_HAVE_FLOAT64
   5900 template <class V, HWY_IF_F64_D(DFromV<V>)>
   5901 HWY_API V ApproximateReciprocal(V v) {
   5902  const DFromV<decltype(v)> d;
   5903  return Div(Set(d, 1.0), v);
   5904 }
   5905 #endif  // HWY_HAVE_FLOAT64
   5906 
   5907 #endif  // HWY_NATIVE_F64_APPROX_RECIP
   5908 
   5909 // ------------------------------ MaskedApproximateReciprocal
   5910 template <class V, HWY_IF_FLOAT_V(V), class M>
   5911 HWY_API V MaskedApproximateReciprocal(M m, V v) {
   5912  return IfThenElseZero(m, ApproximateReciprocal(v));
   5913 }
   5914 
   5915 // ------------------------------ F64 ApproximateReciprocalSqrt
   5916 
   5917 #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
   5918 #ifdef HWY_NATIVE_F64_APPROX_RSQRT
   5919 #undef HWY_NATIVE_F64_APPROX_RSQRT
   5920 #else
   5921 #define HWY_NATIVE_F64_APPROX_RSQRT
   5922 #endif
   5923 
   5924 #if HWY_HAVE_FLOAT64
   5925 template <class V, HWY_IF_F64_D(DFromV<V>)>
   5926 HWY_API V ApproximateReciprocalSqrt(V v) {
   5927  const DFromV<decltype(v)> d;
   5928  const RebindToUnsigned<decltype(d)> du;
   5929  const auto half = Mul(v, Set(d, 0.5));
   5930  // Initial guess based on log2(f)
   5931  const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}),
   5932                                    ShiftRight<1>(BitCast(du, v))));
   5933  // One Newton-Raphson iteration
   5934  return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5)));
   5935 }
   5936 #endif  // HWY_HAVE_FLOAT64
   5937 
   5938 #endif  // HWY_NATIVE_F64_APPROX_RSQRT
   5939 
   5940 // ------------------------------ MaskedApproximateReciprocalSqrt
   5941 template <class V, HWY_IF_FLOAT_V(V), class M>
   5942 HWY_API V MaskedApproximateReciprocalSqrt(M m, V v) {
   5943  return IfThenElseZero(m, ApproximateReciprocalSqrt(v));
   5944 }
   5945 
   5946 // ------------------------------ Compress*
   5947 
   5948 #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
   5949 #ifdef HWY_NATIVE_COMPRESS8
   5950 #undef HWY_NATIVE_COMPRESS8
   5951 #else
   5952 #define HWY_NATIVE_COMPRESS8
   5953 #endif
   5954 
// Stores the lanes of `v` whose corresponding bit in `bits` is set,
// contiguously to `unaligned`, and returns the number of lanes written
// (the popcount of the mask bits). Operates 8 byte-lanes at a time via a
// 256-entry table of shuffle indices.
template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d,
                                 T* unaligned) {
  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  // 8-lane (or smaller, for short vectors) view used per table lookup.
  const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8;
  T* HWY_RESTRICT pos = unaligned;

  // 256 entries of 8 byte-indices each: entry i moves the lanes whose bit in
  // i is set to the front, preserving their relative order.
  HWY_ALIGN constexpr T table[2048] = {
      0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7,  //
      1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7,  //
      1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7,  //
      2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7,  //
      1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7,  //
      1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7,  //
      2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7,  //
      1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7,  //
      3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7,  //
      1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7,  //
      2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7,  //
      1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7,  //
      1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7,  //
      2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7,  //
      1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7,  //
      3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7,  //
      1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7,  //
      2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7,  //
      1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7,  //
      4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7,  //
      1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7,  //
      2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7,  //
      1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7,  //
      3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7,  //
      1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7,  //
      2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7,  //
      1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7,  //
      1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7,  //
      2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7,  //
      1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7,  //
      3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7,  //
      1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7,  //
      2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7,  //
      1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7,  //
      4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7,  //
      1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7,  //
      2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7,  //
      1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7,  //
      3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7,  //
      1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7,  //
      2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7,  //
      1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7,  //
      5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7,  //
      1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7,  //
      2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7,  //
      1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7,  //
      3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7,  //
      1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7,  //
      2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7,  //
      1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7,  //
      4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7,  //
      1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7,  //
      2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7,  //
      1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7,  //
      3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7,  //
      1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7,  //
      2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7,  //
      1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
      7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6,  //
      1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6,  //
      2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6,  //
      1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6,  //
      3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6,  //
      1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6,  //
      2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6,  //
      1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6,  //
      4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6,  //
      1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6,  //
      2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6,  //
      1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6,  //
      3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6,  //
      1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6,  //
      2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6,  //
      1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6,  //
      5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6,  //
      1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6,  //
      2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6,  //
      1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6,  //
      3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6,  //
      1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6,  //
      2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6,  //
      1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6,  //
      4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6,  //
      1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6,  //
      2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6,  //
      1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6,  //
      3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6,  //
      1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6,  //
      2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6,  //
      1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6,  //
      6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5,  //
      1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5,  //
      2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5,  //
      1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5,  //
      3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5,  //
      1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5,  //
      2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5,  //
      1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5,  //
      4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5,  //
      1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5,  //
      2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5,  //
      1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5,  //
      3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5,  //
      1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5,  //
      2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5,  //
      1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5,  //
      5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4,  //
      1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4,  //
      2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4,  //
      1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4,  //
      3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4,  //
      1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4,  //
      2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4,  //
      1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4,  //
      4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3,  //
      1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3,  //
      2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3,  //
      1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3,  //
      3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2,  //
      1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2,  //
      2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1,  //
      1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7};

  for (size_t i = 0; i < Lanes(d); i += 8) {
    // Each byte worth of bits is the index of one of 256 8-byte ranges, and its
    // population count determines how far to advance the write position.
    const size_t bits8 = bits[i / 8];
    const auto indices = Load(d8, table + bits8 * 8);
    const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices);
    StoreU(compressed, d8, pos);
    pos += PopCount(bits8);
  }
  return static_cast<size_t>(pos - unaligned);
}
   6105 
   6106 template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
   6107 HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) {
   6108  uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)];
   6109  (void)StoreMaskBits(d, mask, bits);
   6110  return CompressBitsStore(v, bits, d, unaligned);
   6111 }
   6112 
   6113 template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
   6114 HWY_API size_t CompressBlendedStore(V v, M mask, D d,
   6115                                    T* HWY_RESTRICT unaligned) {
   6116  HWY_ALIGN T buf[MaxLanes(d)];
   6117  const size_t bytes = CompressStore(v, mask, d, buf);
   6118  BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned);
   6119  return bytes;
   6120 }
   6121 
   6122 // For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE.
   6123 template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
   6124 HWY_API V Compress(V v, const M mask) {
   6125  const DFromV<V> d;
   6126  HWY_ALIGN T lanes[MaxLanes(d)];
   6127  (void)CompressStore(v, mask, d, lanes);
   6128  return Load(d, lanes);
   6129 }
   6130 
   6131 template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
   6132 HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
   6133  const DFromV<V> d;
   6134  HWY_ALIGN T lanes[MaxLanes(d)];
   6135  (void)CompressBitsStore(v, bits, d, lanes);
   6136  return Load(d, lanes);
   6137 }
   6138 
   6139 template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
   6140 HWY_API V CompressNot(V v, M mask) {
   6141  return Compress(v, Not(mask));
   6142 }
   6143 
   6144 #endif  // HWY_NATIVE_COMPRESS8
   6145 
   6146 // ------------------------------ Expand
   6147 
   6148 // Note that this generic implementation assumes <= 128 bit fixed vectors;
   6149 // the SVE and RVV targets provide their own native implementations.
   6150 #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
   6151 #ifdef HWY_NATIVE_EXPAND
   6152 #undef HWY_NATIVE_EXPAND
   6153 #else
   6154 #define HWY_NATIVE_EXPAND
   6155 #endif
   6156 
   6157 namespace detail {
   6158 
// Returns the TableLookupBytesOr0 indices that scatter the first
// PopCount(mask_bits) packed input bytes to the lane positions whose bit in
// `mask_bits` is set; 128 (high bit set) marks lanes that must become zero.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) {
  static_assert(N <= 8, "Should only be called for half-vectors");
  const Simd<uint8_t, N, 0> du8;
  HWY_DASSERT(mask_bits < 0x100);
  // 256 entries of 8 indices each, one entry per possible 8-bit mask.
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintExpand8x8Tables
      128, 128, 128, 128, 128, 128, 128, 128,  //
      0,   128, 128, 128, 128, 128, 128, 128,  //
      128, 0,   128, 128, 128, 128, 128, 128,  //
      0,   1,   128, 128, 128, 128, 128, 128,  //
      128, 128, 0,   128, 128, 128, 128, 128,  //
      0,   128, 1,   128, 128, 128, 128, 128,  //
      128, 0,   1,   128, 128, 128, 128, 128,  //
      0,   1,   2,   128, 128, 128, 128, 128,  //
      128, 128, 128, 0,   128, 128, 128, 128,  //
      0,   128, 128, 1,   128, 128, 128, 128,  //
      128, 0,   128, 1,   128, 128, 128, 128,  //
      0,   1,   128, 2,   128, 128, 128, 128,  //
      128, 128, 0,   1,   128, 128, 128, 128,  //
      0,   128, 1,   2,   128, 128, 128, 128,  //
      128, 0,   1,   2,   128, 128, 128, 128,  //
      0,   1,   2,   3,   128, 128, 128, 128,  //
      128, 128, 128, 128, 0,   128, 128, 128,  //
      0,   128, 128, 128, 1,   128, 128, 128,  //
      128, 0,   128, 128, 1,   128, 128, 128,  //
      0,   1,   128, 128, 2,   128, 128, 128,  //
      128, 128, 0,   128, 1,   128, 128, 128,  //
      0,   128, 1,   128, 2,   128, 128, 128,  //
      128, 0,   1,   128, 2,   128, 128, 128,  //
      0,   1,   2,   128, 3,   128, 128, 128,  //
      128, 128, 128, 0,   1,   128, 128, 128,  //
      0,   128, 128, 1,   2,   128, 128, 128,  //
      128, 0,   128, 1,   2,   128, 128, 128,  //
      0,   1,   128, 2,   3,   128, 128, 128,  //
      128, 128, 0,   1,   2,   128, 128, 128,  //
      0,   128, 1,   2,   3,   128, 128, 128,  //
      128, 0,   1,   2,   3,   128, 128, 128,  //
      0,   1,   2,   3,   4,   128, 128, 128,  //
      128, 128, 128, 128, 128, 0,   128, 128,  //
      0,   128, 128, 128, 128, 1,   128, 128,  //
      128, 0,   128, 128, 128, 1,   128, 128,  //
      0,   1,   128, 128, 128, 2,   128, 128,  //
      128, 128, 0,   128, 128, 1,   128, 128,  //
      0,   128, 1,   128, 128, 2,   128, 128,  //
      128, 0,   1,   128, 128, 2,   128, 128,  //
      0,   1,   2,   128, 128, 3,   128, 128,  //
      128, 128, 128, 0,   128, 1,   128, 128,  //
      0,   128, 128, 1,   128, 2,   128, 128,  //
      128, 0,   128, 1,   128, 2,   128, 128,  //
      0,   1,   128, 2,   128, 3,   128, 128,  //
      128, 128, 0,   1,   128, 2,   128, 128,  //
      0,   128, 1,   2,   128, 3,   128, 128,  //
      128, 0,   1,   2,   128, 3,   128, 128,  //
      0,   1,   2,   3,   128, 4,   128, 128,  //
      128, 128, 128, 128, 0,   1,   128, 128,  //
      0,   128, 128, 128, 1,   2,   128, 128,  //
      128, 0,   128, 128, 1,   2,   128, 128,  //
      0,   1,   128, 128, 2,   3,   128, 128,  //
      128, 128, 0,   128, 1,   2,   128, 128,  //
      0,   128, 1,   128, 2,   3,   128, 128,  //
      128, 0,   1,   128, 2,   3,   128, 128,  //
      0,   1,   2,   128, 3,   4,   128, 128,  //
      128, 128, 128, 0,   1,   2,   128, 128,  //
      0,   128, 128, 1,   2,   3,   128, 128,  //
      128, 0,   128, 1,   2,   3,   128, 128,  //
      0,   1,   128, 2,   3,   4,   128, 128,  //
      128, 128, 0,   1,   2,   3,   128, 128,  //
      0,   128, 1,   2,   3,   4,   128, 128,  //
      128, 0,   1,   2,   3,   4,   128, 128,  //
      0,   1,   2,   3,   4,   5,   128, 128,  //
      128, 128, 128, 128, 128, 128, 0,   128,  //
      0,   128, 128, 128, 128, 128, 1,   128,  //
      128, 0,   128, 128, 128, 128, 1,   128,  //
      0,   1,   128, 128, 128, 128, 2,   128,  //
      128, 128, 0,   128, 128, 128, 1,   128,  //
      0,   128, 1,   128, 128, 128, 2,   128,  //
      128, 0,   1,   128, 128, 128, 2,   128,  //
      0,   1,   2,   128, 128, 128, 3,   128,  //
      128, 128, 128, 0,   128, 128, 1,   128,  //
      0,   128, 128, 1,   128, 128, 2,   128,  //
      128, 0,   128, 1,   128, 128, 2,   128,  //
      0,   1,   128, 2,   128, 128, 3,   128,  //
      128, 128, 0,   1,   128, 128, 2,   128,  //
      0,   128, 1,   2,   128, 128, 3,   128,  //
      128, 0,   1,   2,   128, 128, 3,   128,  //
      0,   1,   2,   3,   128, 128, 4,   128,  //
      128, 128, 128, 128, 0,   128, 1,   128,  //
      0,   128, 128, 128, 1,   128, 2,   128,  //
      128, 0,   128, 128, 1,   128, 2,   128,  //
      0,   1,   128, 128, 2,   128, 3,   128,  //
      128, 128, 0,   128, 1,   128, 2,   128,  //
      0,   128, 1,   128, 2,   128, 3,   128,  //
      128, 0,   1,   128, 2,   128, 3,   128,  //
      0,   1,   2,   128, 3,   128, 4,   128,  //
      128, 128, 128, 0,   1,   128, 2,   128,  //
      0,   128, 128, 1,   2,   128, 3,   128,  //
      128, 0,   128, 1,   2,   128, 3,   128,  //
      0,   1,   128, 2,   3,   128, 4,   128,  //
      128, 128, 0,   1,   2,   128, 3,   128,  //
      0,   128, 1,   2,   3,   128, 4,   128,  //
      128, 0,   1,   2,   3,   128, 4,   128,  //
      0,   1,   2,   3,   4,   128, 5,   128,  //
      128, 128, 128, 128, 128, 0,   1,   128,  //
      0,   128, 128, 128, 128, 1,   2,   128,  //
      128, 0,   128, 128, 128, 1,   2,   128,  //
      0,   1,   128, 128, 128, 2,   3,   128,  //
      128, 128, 0,   128, 128, 1,   2,   128,  //
      0,   128, 1,   128, 128, 2,   3,   128,  //
      128, 0,   1,   128, 128, 2,   3,   128,  //
      0,   1,   2,   128, 128, 3,   4,   128,  //
      128, 128, 128, 0,   128, 1,   2,   128,  //
      0,   128, 128, 1,   128, 2,   3,   128,  //
      128, 0,   128, 1,   128, 2,   3,   128,  //
      0,   1,   128, 2,   128, 3,   4,   128,  //
      128, 128, 0,   1,   128, 2,   3,   128,  //
      0,   128, 1,   2,   128, 3,   4,   128,  //
      128, 0,   1,   2,   128, 3,   4,   128,  //
      0,   1,   2,   3,   128, 4,   5,   128,  //
      128, 128, 128, 128, 0,   1,   2,   128,  //
      0,   128, 128, 128, 1,   2,   3,   128,  //
      128, 0,   128, 128, 1,   2,   3,   128,  //
      0,   1,   128, 128, 2,   3,   4,   128,  //
      128, 128, 0,   128, 1,   2,   3,   128,  //
      0,   128, 1,   128, 2,   3,   4,   128,  //
      128, 0,   1,   128, 2,   3,   4,   128,  //
      0,   1,   2,   128, 3,   4,   5,   128,  //
      128, 128, 128, 0,   1,   2,   3,   128,  //
      0,   128, 128, 1,   2,   3,   4,   128,  //
      128, 0,   128, 1,   2,   3,   4,   128,  //
      0,   1,   128, 2,   3,   4,   5,   128,  //
      128, 128, 0,   1,   2,   3,   4,   128,  //
      0,   128, 1,   2,   3,   4,   5,   128,  //
      128, 0,   1,   2,   3,   4,   5,   128,  //
      0,   1,   2,   3,   4,   5,   6,   128,  //
      128, 128, 128, 128, 128, 128, 128, 0,    //
      0,   128, 128, 128, 128, 128, 128, 1,    //
      128, 0,   128, 128, 128, 128, 128, 1,    //
      0,   1,   128, 128, 128, 128, 128, 2,    //
      128, 128, 0,   128, 128, 128, 128, 1,    //
      0,   128, 1,   128, 128, 128, 128, 2,    //
      128, 0,   1,   128, 128, 128, 128, 2,    //
      0,   1,   2,   128, 128, 128, 128, 3,    //
      128, 128, 128, 0,   128, 128, 128, 1,    //
      0,   128, 128, 1,   128, 128, 128, 2,    //
      128, 0,   128, 1,   128, 128, 128, 2,    //
      0,   1,   128, 2,   128, 128, 128, 3,    //
      128, 128, 0,   1,   128, 128, 128, 2,    //
      0,   128, 1,   2,   128, 128, 128, 3,    //
      128, 0,   1,   2,   128, 128, 128, 3,    //
      0,   1,   2,   3,   128, 128, 128, 4,    //
      128, 128, 128, 128, 0,   128, 128, 1,    //
      0,   128, 128, 128, 1,   128, 128, 2,    //
      128, 0,   128, 128, 1,   128, 128, 2,    //
      0,   1,   128, 128, 2,   128, 128, 3,    //
      128, 128, 0,   128, 1,   128, 128, 2,    //
      0,   128, 1,   128, 2,   128, 128, 3,    //
      128, 0,   1,   128, 2,   128, 128, 3,    //
      0,   1,   2,   128, 3,   128, 128, 4,    //
      128, 128, 128, 0,   1,   128, 128, 2,    //
      0,   128, 128, 1,   2,   128, 128, 3,    //
      128, 0,   128, 1,   2,   128, 128, 3,    //
      0,   1,   128, 2,   3,   128, 128, 4,    //
      128, 128, 0,   1,   2,   128, 128, 3,    //
      0,   128, 1,   2,   3,   128, 128, 4,    //
      128, 0,   1,   2,   3,   128, 128, 4,    //
      0,   1,   2,   3,   4,   128, 128, 5,    //
      128, 128, 128, 128, 128, 0,   128, 1,    //
      0,   128, 128, 128, 128, 1,   128, 2,    //
      128, 0,   128, 128, 128, 1,   128, 2,    //
      0,   1,   128, 128, 128, 2,   128, 3,    //
      128, 128, 0,   128, 128, 1,   128, 2,    //
      0,   128, 1,   128, 128, 2,   128, 3,    //
      128, 0,   1,   128, 128, 2,   128, 3,    //
      0,   1,   2,   128, 128, 3,   128, 4,    //
      128, 128, 128, 0,   128, 1,   128, 2,    //
      0,   128, 128, 1,   128, 2,   128, 3,    //
      128, 0,   128, 1,   128, 2,   128, 3,    //
      0,   1,   128, 2,   128, 3,   128, 4,    //
      128, 128, 0,   1,   128, 2,   128, 3,    //
      0,   128, 1,   2,   128, 3,   128, 4,    //
      128, 0,   1,   2,   128, 3,   128, 4,    //
      0,   1,   2,   3,   128, 4,   128, 5,    //
      128, 128, 128, 128, 0,   1,   128, 2,    //
      0,   128, 128, 128, 1,   2,   128, 3,    //
      128, 0,   128, 128, 1,   2,   128, 3,    //
      0,   1,   128, 128, 2,   3,   128, 4,    //
      128, 128, 0,   128, 1,   2,   128, 3,    //
      0,   128, 1,   128, 2,   3,   128, 4,    //
      128, 0,   1,   128, 2,   3,   128, 4,    //
      0,   1,   2,   128, 3,   4,   128, 5,    //
      128, 128, 128, 0,   1,   2,   128, 3,    //
      0,   128, 128, 1,   2,   3,   128, 4,    //
      128, 0,   128, 1,   2,   3,   128, 4,    //
      0,   1,   128, 2,   3,   4,   128, 5,    //
      128, 128, 0,   1,   2,   3,   128, 4,    //
      0,   128, 1,   2,   3,   4,   128, 5,    //
      128, 0,   1,   2,   3,   4,   128, 5,    //
      0,   1,   2,   3,   4,   5,   128, 6,    //
      128, 128, 128, 128, 128, 128, 0,   1,    //
      0,   128, 128, 128, 128, 128, 1,   2,    //
      128, 0,   128, 128, 128, 128, 1,   2,    //
      0,   1,   128, 128, 128, 128, 2,   3,    //
      128, 128, 0,   128, 128, 128, 1,   2,    //
      0,   128, 1,   128, 128, 128, 2,   3,    //
      128, 0,   1,   128, 128, 128, 2,   3,    //
      0,   1,   2,   128, 128, 128, 3,   4,    //
      128, 128, 128, 0,   128, 128, 1,   2,    //
      0,   128, 128, 1,   128, 128, 2,   3,    //
      128, 0,   128, 1,   128, 128, 2,   3,    //
      0,   1,   128, 2,   128, 128, 3,   4,    //
      128, 128, 0,   1,   128, 128, 2,   3,    //
      0,   128, 1,   2,   128, 128, 3,   4,    //
      128, 0,   1,   2,   128, 128, 3,   4,    //
      0,   1,   2,   3,   128, 128, 4,   5,    //
      128, 128, 128, 128, 0,   128, 1,   2,    //
      0,   128, 128, 128, 1,   128, 2,   3,    //
      128, 0,   128, 128, 1,   128, 2,   3,    //
      0,   1,   128, 128, 2,   128, 3,   4,    //
      128, 128, 0,   128, 1,   128, 2,   3,    //
      0,   128, 1,   128, 2,   128, 3,   4,    //
      128, 0,   1,   128, 2,   128, 3,   4,    //
      0,   1,   2,   128, 3,   128, 4,   5,    //
      128, 128, 128, 0,   1,   128, 2,   3,    //
      0,   128, 128, 1,   2,   128, 3,   4,    //
      128, 0,   128, 1,   2,   128, 3,   4,    //
      0,   1,   128, 2,   3,   128, 4,   5,    //
      128, 128, 0,   1,   2,   128, 3,   4,    //
      0,   128, 1,   2,   3,   128, 4,   5,    //
      128, 0,   1,   2,   3,   128, 4,   5,    //
      0,   1,   2,   3,   4,   128, 5,   6,    //
      128, 128, 128, 128, 128, 0,   1,   2,    //
      0,   128, 128, 128, 128, 1,   2,   3,    //
      128, 0,   128, 128, 128, 1,   2,   3,    //
      0,   1,   128, 128, 128, 2,   3,   4,    //
      128, 128, 0,   128, 128, 1,   2,   3,    //
      0,   128, 1,   128, 128, 2,   3,   4,    //
      128, 0,   1,   128, 128, 2,   3,   4,    //
      0,   1,   2,   128, 128, 3,   4,   5,    //
      128, 128, 128, 0,   128, 1,   2,   3,    //
      0,   128, 128, 1,   128, 2,   3,   4,    //
      128, 0,   128, 1,   128, 2,   3,   4,    //
      0,   1,   128, 2,   128, 3,   4,   5,    //
      128, 128, 0,   1,   128, 2,   3,   4,    //
      0,   128, 1,   2,   128, 3,   4,   5,    //
      128, 0,   1,   2,   128, 3,   4,   5,    //
      0,   1,   2,   3,   128, 4,   5,   6,    //
      128, 128, 128, 128, 0,   1,   2,   3,    //
      0,   128, 128, 128, 1,   2,   3,   4,    //
      128, 0,   128, 128, 1,   2,   3,   4,    //
      0,   1,   128, 128, 2,   3,   4,   5,    //
      128, 128, 0,   128, 1,   2,   3,   4,    //
      0,   128, 1,   128, 2,   3,   4,   5,    //
      128, 0,   1,   128, 2,   3,   4,   5,    //
      0,   1,   2,   128, 3,   4,   5,   6,    //
      128, 128, 128, 0,   1,   2,   3,   4,    //
      0,   128, 128, 1,   2,   3,   4,   5,    //
      128, 0,   128, 1,   2,   3,   4,   5,    //
      0,   1,   128, 2,   3,   4,   5,   6,    //
      128, 128, 0,   1,   2,   3,   4,   5,    //
      0,   128, 1,   2,   3,   4,   5,   6,    //
      128, 0,   1,   2,   3,   4,   5,   6,    //
      0,   1,   2,   3,   4,   5,   6,   7};
  return LoadU(du8, table + mask_bits * 8);
}
   6424 
   6425 }  // namespace detail
   6426 
   6427 // Half vector of bytes: one table lookup
   6428 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
   6429 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
   6430  const DFromV<decltype(v)> d;
   6431 
   6432  const uint64_t mask_bits = BitsFromMask(d, mask);
   6433  const Vec128<uint8_t, N> indices =
   6434      detail::IndicesForExpandFromBits<N>(mask_bits);
   6435  return BitCast(d, TableLookupBytesOr0(v, indices));
   6436 }
   6437 
// Full vector of bytes: two table lookups, one per 8-bit half of the mask.
// Lanes with a set mask bit receive the next packed input byte; others are 0.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  const Full128<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const Half<decltype(du)> duh;
  const Vec128<uint8_t> vu = BitCast(du, v);

  // Split the 16 mask bits into the low and high byte.
  const uint64_t mask_bits = BitsFromMask(d, mask);
  const uint64_t maskL = mask_bits & 0xFF;
  const uint64_t maskH = mask_bits >> 8;

  // We want to skip past the v bytes already consumed by idxL. There is no
  // instruction for shift-reg by variable bytes. Storing v itself would work
  // but would involve a store-load forwarding stall. We instead shuffle using
  // loaded indices.
  // TODO: MultiRotateRight would also help, but if we have that, we probably
  // also have native 8-bit Expand?
  alignas(16) static constexpr uint8_t iota[32] = {
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,
      11,  12,  13,  14,  15,  128, 128, 128, 128, 128, 128,
      128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
  // Loading iota at offset PopCount(maskL) yields indices that shift vu left
  // by that many bytes; the trailing 128 entries zero out-of-range lanes.
  const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL));
  const VFromD<decltype(duh)> vL = LowerHalf(duh, vu);
  const VFromD<decltype(duh)> vH =
      LowerHalf(duh, TableLookupBytesOr0(vu, shift));

  const VFromD<decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL);
  const VFromD<decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH);

  // Expand each half independently, then reassemble the full vector.
  const VFromD<decltype(duh)> expandL = TableLookupBytesOr0(vL, idxL);
  const VFromD<decltype(duh)> expandH = TableLookupBytesOr0(vH, idxH);
  return BitCast(d, Combine(du, expandH, expandL));
}
   6472 
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  const Rebind<uint8_t, decltype(d)> du8;
  const uint64_t mask_bits = BitsFromMask(d, mask);

  // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
  // the nibble trick used below because not all indices fit within one lane.
  // Each 8-byte row corresponds to one of the 256 possible masks: entry j is
  // the byte offset of the source u16 lane written to output lane j, or 128
  // for output lanes that are zeroed (mask bit j clear).
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintExpand16x8ByteTables
      128, 128, 128, 128, 128, 128, 128, 128,  //
      0,   128, 128, 128, 128, 128, 128, 128,  //
      128, 0,   128, 128, 128, 128, 128, 128,  //
      0,   2,   128, 128, 128, 128, 128, 128,  //
      128, 128, 0,   128, 128, 128, 128, 128,  //
      0,   128, 2,   128, 128, 128, 128, 128,  //
      128, 0,   2,   128, 128, 128, 128, 128,  //
      0,   2,   4,   128, 128, 128, 128, 128,  //
      128, 128, 128, 0,   128, 128, 128, 128,  //
      0,   128, 128, 2,   128, 128, 128, 128,  //
      128, 0,   128, 2,   128, 128, 128, 128,  //
      0,   2,   128, 4,   128, 128, 128, 128,  //
      128, 128, 0,   2,   128, 128, 128, 128,  //
      0,   128, 2,   4,   128, 128, 128, 128,  //
      128, 0,   2,   4,   128, 128, 128, 128,  //
      0,   2,   4,   6,   128, 128, 128, 128,  //
      128, 128, 128, 128, 0,   128, 128, 128,  //
      0,   128, 128, 128, 2,   128, 128, 128,  //
      128, 0,   128, 128, 2,   128, 128, 128,  //
      0,   2,   128, 128, 4,   128, 128, 128,  //
      128, 128, 0,   128, 2,   128, 128, 128,  //
      0,   128, 2,   128, 4,   128, 128, 128,  //
      128, 0,   2,   128, 4,   128, 128, 128,  //
      0,   2,   4,   128, 6,   128, 128, 128,  //
      128, 128, 128, 0,   2,   128, 128, 128,  //
      0,   128, 128, 2,   4,   128, 128, 128,  //
      128, 0,   128, 2,   4,   128, 128, 128,  //
      0,   2,   128, 4,   6,   128, 128, 128,  //
      128, 128, 0,   2,   4,   128, 128, 128,  //
      0,   128, 2,   4,   6,   128, 128, 128,  //
      128, 0,   2,   4,   6,   128, 128, 128,  //
      0,   2,   4,   6,   8,   128, 128, 128,  //
      128, 128, 128, 128, 128, 0,   128, 128,  //
      0,   128, 128, 128, 128, 2,   128, 128,  //
      128, 0,   128, 128, 128, 2,   128, 128,  //
      0,   2,   128, 128, 128, 4,   128, 128,  //
      128, 128, 0,   128, 128, 2,   128, 128,  //
      0,   128, 2,   128, 128, 4,   128, 128,  //
      128, 0,   2,   128, 128, 4,   128, 128,  //
      0,   2,   4,   128, 128, 6,   128, 128,  //
      128, 128, 128, 0,   128, 2,   128, 128,  //
      0,   128, 128, 2,   128, 4,   128, 128,  //
      128, 0,   128, 2,   128, 4,   128, 128,  //
      0,   2,   128, 4,   128, 6,   128, 128,  //
      128, 128, 0,   2,   128, 4,   128, 128,  //
      0,   128, 2,   4,   128, 6,   128, 128,  //
      128, 0,   2,   4,   128, 6,   128, 128,  //
      0,   2,   4,   6,   128, 8,   128, 128,  //
      128, 128, 128, 128, 0,   2,   128, 128,  //
      0,   128, 128, 128, 2,   4,   128, 128,  //
      128, 0,   128, 128, 2,   4,   128, 128,  //
      0,   2,   128, 128, 4,   6,   128, 128,  //
      128, 128, 0,   128, 2,   4,   128, 128,  //
      0,   128, 2,   128, 4,   6,   128, 128,  //
      128, 0,   2,   128, 4,   6,   128, 128,  //
      0,   2,   4,   128, 6,   8,   128, 128,  //
      128, 128, 128, 0,   2,   4,   128, 128,  //
      0,   128, 128, 2,   4,   6,   128, 128,  //
      128, 0,   128, 2,   4,   6,   128, 128,  //
      0,   2,   128, 4,   6,   8,   128, 128,  //
      128, 128, 0,   2,   4,   6,   128, 128,  //
      0,   128, 2,   4,   6,   8,   128, 128,  //
      128, 0,   2,   4,   6,   8,   128, 128,  //
      0,   2,   4,   6,   8,   10,  128, 128,  //
      128, 128, 128, 128, 128, 128, 0,   128,  //
      0,   128, 128, 128, 128, 128, 2,   128,  //
      128, 0,   128, 128, 128, 128, 2,   128,  //
      0,   2,   128, 128, 128, 128, 4,   128,  //
      128, 128, 0,   128, 128, 128, 2,   128,  //
      0,   128, 2,   128, 128, 128, 4,   128,  //
      128, 0,   2,   128, 128, 128, 4,   128,  //
      0,   2,   4,   128, 128, 128, 6,   128,  //
      128, 128, 128, 0,   128, 128, 2,   128,  //
      0,   128, 128, 2,   128, 128, 4,   128,  //
      128, 0,   128, 2,   128, 128, 4,   128,  //
      0,   2,   128, 4,   128, 128, 6,   128,  //
      128, 128, 0,   2,   128, 128, 4,   128,  //
      0,   128, 2,   4,   128, 128, 6,   128,  //
      128, 0,   2,   4,   128, 128, 6,   128,  //
      0,   2,   4,   6,   128, 128, 8,   128,  //
      128, 128, 128, 128, 0,   128, 2,   128,  //
      0,   128, 128, 128, 2,   128, 4,   128,  //
      128, 0,   128, 128, 2,   128, 4,   128,  //
      0,   2,   128, 128, 4,   128, 6,   128,  //
      128, 128, 0,   128, 2,   128, 4,   128,  //
      0,   128, 2,   128, 4,   128, 6,   128,  //
      128, 0,   2,   128, 4,   128, 6,   128,  //
      0,   2,   4,   128, 6,   128, 8,   128,  //
      128, 128, 128, 0,   2,   128, 4,   128,  //
      0,   128, 128, 2,   4,   128, 6,   128,  //
      128, 0,   128, 2,   4,   128, 6,   128,  //
      0,   2,   128, 4,   6,   128, 8,   128,  //
      128, 128, 0,   2,   4,   128, 6,   128,  //
      0,   128, 2,   4,   6,   128, 8,   128,  //
      128, 0,   2,   4,   6,   128, 8,   128,  //
      0,   2,   4,   6,   8,   128, 10,  128,  //
      128, 128, 128, 128, 128, 0,   2,   128,  //
      0,   128, 128, 128, 128, 2,   4,   128,  //
      128, 0,   128, 128, 128, 2,   4,   128,  //
      0,   2,   128, 128, 128, 4,   6,   128,  //
      128, 128, 0,   128, 128, 2,   4,   128,  //
      0,   128, 2,   128, 128, 4,   6,   128,  //
      128, 0,   2,   128, 128, 4,   6,   128,  //
      0,   2,   4,   128, 128, 6,   8,   128,  //
      128, 128, 128, 0,   128, 2,   4,   128,  //
      0,   128, 128, 2,   128, 4,   6,   128,  //
      128, 0,   128, 2,   128, 4,   6,   128,  //
      0,   2,   128, 4,   128, 6,   8,   128,  //
      128, 128, 0,   2,   128, 4,   6,   128,  //
      0,   128, 2,   4,   128, 6,   8,   128,  //
      128, 0,   2,   4,   128, 6,   8,   128,  //
      0,   2,   4,   6,   128, 8,   10,  128,  //
      128, 128, 128, 128, 0,   2,   4,   128,  //
      0,   128, 128, 128, 2,   4,   6,   128,  //
      128, 0,   128, 128, 2,   4,   6,   128,  //
      0,   2,   128, 128, 4,   6,   8,   128,  //
      128, 128, 0,   128, 2,   4,   6,   128,  //
      0,   128, 2,   128, 4,   6,   8,   128,  //
      128, 0,   2,   128, 4,   6,   8,   128,  //
      0,   2,   4,   128, 6,   8,   10,  128,  //
      128, 128, 128, 0,   2,   4,   6,   128,  //
      0,   128, 128, 2,   4,   6,   8,   128,  //
      128, 0,   128, 2,   4,   6,   8,   128,  //
      0,   2,   128, 4,   6,   8,   10,  128,  //
      128, 128, 0,   2,   4,   6,   8,   128,  //
      0,   128, 2,   4,   6,   8,   10,  128,  //
      128, 0,   2,   4,   6,   8,   10,  128,  //
      0,   2,   4,   6,   8,   10,  12,  128,  //
      128, 128, 128, 128, 128, 128, 128, 0,    //
      0,   128, 128, 128, 128, 128, 128, 2,    //
      128, 0,   128, 128, 128, 128, 128, 2,    //
      0,   2,   128, 128, 128, 128, 128, 4,    //
      128, 128, 0,   128, 128, 128, 128, 2,    //
      0,   128, 2,   128, 128, 128, 128, 4,    //
      128, 0,   2,   128, 128, 128, 128, 4,    //
      0,   2,   4,   128, 128, 128, 128, 6,    //
      128, 128, 128, 0,   128, 128, 128, 2,    //
      0,   128, 128, 2,   128, 128, 128, 4,    //
      128, 0,   128, 2,   128, 128, 128, 4,    //
      0,   2,   128, 4,   128, 128, 128, 6,    //
      128, 128, 0,   2,   128, 128, 128, 4,    //
      0,   128, 2,   4,   128, 128, 128, 6,    //
      128, 0,   2,   4,   128, 128, 128, 6,    //
      0,   2,   4,   6,   128, 128, 128, 8,    //
      128, 128, 128, 128, 0,   128, 128, 2,    //
      0,   128, 128, 128, 2,   128, 128, 4,    //
      128, 0,   128, 128, 2,   128, 128, 4,    //
      0,   2,   128, 128, 4,   128, 128, 6,    //
      128, 128, 0,   128, 2,   128, 128, 4,    //
      0,   128, 2,   128, 4,   128, 128, 6,    //
      128, 0,   2,   128, 4,   128, 128, 6,    //
      0,   2,   4,   128, 6,   128, 128, 8,    //
      128, 128, 128, 0,   2,   128, 128, 4,    //
      0,   128, 128, 2,   4,   128, 128, 6,    //
      128, 0,   128, 2,   4,   128, 128, 6,    //
      0,   2,   128, 4,   6,   128, 128, 8,    //
      128, 128, 0,   2,   4,   128, 128, 6,    //
      0,   128, 2,   4,   6,   128, 128, 8,    //
      128, 0,   2,   4,   6,   128, 128, 8,    //
      0,   2,   4,   6,   8,   128, 128, 10,   //
      128, 128, 128, 128, 128, 0,   128, 2,    //
      0,   128, 128, 128, 128, 2,   128, 4,    //
      128, 0,   128, 128, 128, 2,   128, 4,    //
      0,   2,   128, 128, 128, 4,   128, 6,    //
      128, 128, 0,   128, 128, 2,   128, 4,    //
      0,   128, 2,   128, 128, 4,   128, 6,    //
      128, 0,   2,   128, 128, 4,   128, 6,    //
      0,   2,   4,   128, 128, 6,   128, 8,    //
      128, 128, 128, 0,   128, 2,   128, 4,    //
      0,   128, 128, 2,   128, 4,   128, 6,    //
      128, 0,   128, 2,   128, 4,   128, 6,    //
      0,   2,   128, 4,   128, 6,   128, 8,    //
      128, 128, 0,   2,   128, 4,   128, 6,    //
      0,   128, 2,   4,   128, 6,   128, 8,    //
      128, 0,   2,   4,   128, 6,   128, 8,    //
      0,   2,   4,   6,   128, 8,   128, 10,   //
      128, 128, 128, 128, 0,   2,   128, 4,    //
      0,   128, 128, 128, 2,   4,   128, 6,    //
      128, 0,   128, 128, 2,   4,   128, 6,    //
      0,   2,   128, 128, 4,   6,   128, 8,    //
      128, 128, 0,   128, 2,   4,   128, 6,    //
      0,   128, 2,   128, 4,   6,   128, 8,    //
      128, 0,   2,   128, 4,   6,   128, 8,    //
      0,   2,   4,   128, 6,   8,   128, 10,   //
      128, 128, 128, 0,   2,   4,   128, 6,    //
      0,   128, 128, 2,   4,   6,   128, 8,    //
      128, 0,   128, 2,   4,   6,   128, 8,    //
      0,   2,   128, 4,   6,   8,   128, 10,   //
      128, 128, 0,   2,   4,   6,   128, 8,    //
      0,   128, 2,   4,   6,   8,   128, 10,   //
      128, 0,   2,   4,   6,   8,   128, 10,   //
      0,   2,   4,   6,   8,   10,  128, 12,   //
      128, 128, 128, 128, 128, 128, 0,   2,    //
      0,   128, 128, 128, 128, 128, 2,   4,    //
      128, 0,   128, 128, 128, 128, 2,   4,    //
      0,   2,   128, 128, 128, 128, 4,   6,    //
      128, 128, 0,   128, 128, 128, 2,   4,    //
      0,   128, 2,   128, 128, 128, 4,   6,    //
      128, 0,   2,   128, 128, 128, 4,   6,    //
      0,   2,   4,   128, 128, 128, 6,   8,    //
      128, 128, 128, 0,   128, 128, 2,   4,    //
      0,   128, 128, 2,   128, 128, 4,   6,    //
      128, 0,   128, 2,   128, 128, 4,   6,    //
      0,   2,   128, 4,   128, 128, 6,   8,    //
      128, 128, 0,   2,   128, 128, 4,   6,    //
      0,   128, 2,   4,   128, 128, 6,   8,    //
      128, 0,   2,   4,   128, 128, 6,   8,    //
      0,   2,   4,   6,   128, 128, 8,   10,   //
      128, 128, 128, 128, 0,   128, 2,   4,    //
      0,   128, 128, 128, 2,   128, 4,   6,    //
      128, 0,   128, 128, 2,   128, 4,   6,    //
      0,   2,   128, 128, 4,   128, 6,   8,    //
      128, 128, 0,   128, 2,   128, 4,   6,    //
      0,   128, 2,   128, 4,   128, 6,   8,    //
      128, 0,   2,   128, 4,   128, 6,   8,    //
      0,   2,   4,   128, 6,   128, 8,   10,   //
      128, 128, 128, 0,   2,   128, 4,   6,    //
      0,   128, 128, 2,   4,   128, 6,   8,    //
      128, 0,   128, 2,   4,   128, 6,   8,    //
      0,   2,   128, 4,   6,   128, 8,   10,   //
      128, 128, 0,   2,   4,   128, 6,   8,    //
      0,   128, 2,   4,   6,   128, 8,   10,   //
      128, 0,   2,   4,   6,   128, 8,   10,   //
      0,   2,   4,   6,   8,   128, 10,  12,   //
      128, 128, 128, 128, 128, 0,   2,   4,    //
      0,   128, 128, 128, 128, 2,   4,   6,    //
      128, 0,   128, 128, 128, 2,   4,   6,    //
      0,   2,   128, 128, 128, 4,   6,   8,    //
      128, 128, 0,   128, 128, 2,   4,   6,    //
      0,   128, 2,   128, 128, 4,   6,   8,    //
      128, 0,   2,   128, 128, 4,   6,   8,    //
      0,   2,   4,   128, 128, 6,   8,   10,   //
      128, 128, 128, 0,   128, 2,   4,   6,    //
      0,   128, 128, 2,   128, 4,   6,   8,    //
      128, 0,   128, 2,   128, 4,   6,   8,    //
      0,   2,   128, 4,   128, 6,   8,   10,   //
      128, 128, 0,   2,   128, 4,   6,   8,    //
      0,   128, 2,   4,   128, 6,   8,   10,   //
      128, 0,   2,   4,   128, 6,   8,   10,   //
      0,   2,   4,   6,   128, 8,   10,  12,   //
      128, 128, 128, 128, 0,   2,   4,   6,    //
      0,   128, 128, 128, 2,   4,   6,   8,    //
      128, 0,   128, 128, 2,   4,   6,   8,    //
      0,   2,   128, 128, 4,   6,   8,   10,   //
      128, 128, 0,   128, 2,   4,   6,   8,    //
      0,   128, 2,   128, 4,   6,   8,   10,   //
      128, 0,   2,   128, 4,   6,   8,   10,   //
      0,   2,   4,   128, 6,   8,   10,  12,   //
      128, 128, 128, 0,   2,   4,   6,   8,    //
      0,   128, 128, 2,   4,   6,   8,   10,   //
      128, 0,   128, 2,   4,   6,   8,   10,   //
      0,   2,   128, 4,   6,   8,   10,  12,   //
      128, 128, 0,   2,   4,   6,   8,   10,   //
      0,   128, 2,   4,   6,   8,   10,  12,   //
      128, 0,   2,   4,   6,   8,   10,  12,   //
      0,   2,   4,   6,   8,   10,  12,  14};
  // Extend to double length because InterleaveLower will only use the (valid)
  // lower half, and we want N u16.
  const Twice<decltype(du8)> du8x2;
  const Vec128<uint8_t, 2 * N> indices8 =
      ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8));
  const Vec128<uint16_t, N> indices16 =
      BitCast(du, InterleaveLower(du8x2, indices8, indices8));
  // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
  // indices, add 0 to even and 1 to odd byte lanes.
  const Vec128<uint16_t, N> byte_indices = Add(
      indices16,
      Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
  return BitCast(d, TableLookupBytesOr0(v, byte_indices));
}
   6755 
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  const uint64_t mask_bits = BitsFromMask(d, mask);

  // Nibble-packed source-lane indices for each of the 16 possible masks;
  // 0xf marks lanes that are zeroed by IfThenElseZero below.
  alignas(16) static constexpr uint32_t packed_array[16] = {
      // PrintExpand64x4Nibble - same for 32x4.
      0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
      0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
      0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 4); the And with
  // N - 1 below keeps only the valid low bits.
  const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]);
  alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12};
  Vec128<uint32_t, N> indices = packed >> Load(du, shifts);
  // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec
  // checks bounds, so clear the upper bits.
  indices = And(indices, Set(du, N - 1));
  const Vec128<uint32_t, N> expand =
      TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices));
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
}
   6781 
   6782 template <typename T, HWY_IF_T_SIZE(T, 8)>
   6783 HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
   6784  // Same as Compress, just zero out the mask=false lanes.
   6785  return IfThenElseZero(mask, Compress(v, mask));
   6786 }
   6787 
// For single-element vectors, this is at least as fast as native.
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  // With a single lane, Expand degenerates to keeping or zeroing that lane.
  return IfThenElseZero(mask, v);
}
   6793 
   6794 // ------------------------------ LoadExpand
   6795 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
   6796 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
   6797                             const TFromD<D>* HWY_RESTRICT unaligned) {
   6798  return Expand(LoadU(d, unaligned), mask);
   6799 }
   6800 
   6801 #endif  // HWY_NATIVE_EXPAND
   6802 
   6803 // ------------------------------ TwoTablesLookupLanes
   6804 
// The index-vector type accepted by TableLookupLanes for descriptor D.
template <class D>
using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));
   6807 
   6808 // RVV/SVE have their own implementations of
   6809 // TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
   6810 #if HWY_TARGET != HWY_RVV && !HWY_TARGET_IS_SVE
template <class D>
HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
                                       IndicesFromD<D> idx) {
  // The tag is only used for overload resolution; forward to the tag-free
  // overload.
  return TwoTablesLookupLanes(a, b, idx);
}
   6816 #endif
   6817 
   6818 // ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit)
   6819 
   6820 #if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
   6821 #ifdef HWY_NATIVE_REVERSE2_8
   6822 #undef HWY_NATIVE_REVERSE2_8
   6823 #else
   6824 #define HWY_NATIVE_REVERSE2_8
   6825 #endif
   6826 
   6827 #undef HWY_PREFER_ROTATE
   6828 // Platforms on which RotateRight is likely faster than TableLookupBytes.
   6829 // RVV and SVE anyway have their own implementation of this.
   6830 #if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
   6831    HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
   6832 #define HWY_PREFER_ROTATE 1
   6833 #else
   6834 #define HWY_PREFER_ROTATE 0
   6835 #endif
   6836 
// Reverses adjacent pairs of bytes.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions.
#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3
  // Swapping the two bytes of each pair == rotating each u16 lane by 8 bits.
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, RotateRight<8>(BitCast(du16, v)));
#else
  const VFromD<D> shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
                                                11, 10, 13, 12, 15, 14);
  return TableLookupBytes(v, shuffle);
#endif
}
   6849 
// Reverses each group of 4 bytes.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  // Reverse bytes within each pair, then swap the two u16 of each group.
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}
   6862 
// Reverses each group of 8 bytes.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  // Reverse each 4-byte group, then swap the two u32 of each 8-byte group.
  const Repartition<uint32_t, D> du32;
  return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}
   6875 
   6876 #endif  // HWY_NATIVE_REVERSE2_8
   6877 
   6878 // ------------------------------ ReverseLaneBytes
   6879 
   6880 #if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
   6881 #ifdef HWY_NATIVE_REVERSE_LANE_BYTES
   6882 #undef HWY_NATIVE_REVERSE_LANE_BYTES
   6883 #else
   6884 #define HWY_NATIVE_REVERSE_LANE_BYTES
   6885 #endif
   6886 
   6887 template <class V, HWY_IF_T_SIZE_V(V, 2)>
   6888 HWY_API V ReverseLaneBytes(V v) {
   6889  const DFromV<V> d;
   6890  const Repartition<uint8_t, decltype(d)> du8;
   6891  return BitCast(d, Reverse2(du8, BitCast(du8, v)));
   6892 }
   6893 
   6894 template <class V, HWY_IF_T_SIZE_V(V, 4)>
   6895 HWY_API V ReverseLaneBytes(V v) {
   6896  const DFromV<V> d;
   6897  const Repartition<uint8_t, decltype(d)> du8;
   6898  return BitCast(d, Reverse4(du8, BitCast(du8, v)));
   6899 }
   6900 
   6901 template <class V, HWY_IF_T_SIZE_V(V, 8)>
   6902 HWY_API V ReverseLaneBytes(V v) {
   6903  const DFromV<V> d;
   6904  const Repartition<uint8_t, decltype(d)> du8;
   6905  return BitCast(d, Reverse8(du8, BitCast(du8, v)));
   6906 }
   6907 
   6908 #endif  // HWY_NATIVE_REVERSE_LANE_BYTES
   6909 
   6910 // ------------------------------ ReverseBits
   6911 
   6912 // On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore
   6913 // require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit
   6914 // shifts because those would add extra masking already taken care of by
   6915 // UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to
   6916 // implement ReverseBits, so this code is not used there.
   6917 #undef HWY_REVERSE_BITS_MIN_BYTES
   6918 #if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
   6919     HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
   6920 #define HWY_REVERSE_BITS_MIN_BYTES 2
   6921 #else
   6922 #define HWY_REVERSE_BITS_MIN_BYTES 1
   6923 #endif
   6924 
   6925 #if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
   6926 #ifdef HWY_NATIVE_REVERSE_BITS_UI8
   6927 #undef HWY_NATIVE_REVERSE_BITS_UI8
   6928 #else
   6929 #define HWY_NATIVE_REVERSE_BITS_UI8
   6930 #endif
   6931 
   6932 namespace detail {
   6933 
// One step of the bit-reversal ladder: within each u8 lane, bits selected by
// kShrResultMask are taken from the right shift by kShiftAmt, the remaining
// bits from the left shift, thereby swapping kShiftAmt-bit groups.
template <int kShiftAmt, int kShrResultMask, class V,
          HWY_IF_V_SIZE_GT_D(DFromV<V>, HWY_REVERSE_BITS_MIN_BYTES - 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_REVERSE_BITS_MIN_BYTES == 2
  // These targets emulate 8-bit shifts via 16-bit shifts; shr_result_mask
  // below removes the bits that crossed a byte boundary.
  const Repartition<uint16_t, decltype(d)> d_shift;
#else
  const RebindToUnsigned<decltype(d)> d_shift;
#endif

  const auto v_to_shift = BitCast(d_shift, v);
  const auto shl_result = BitCast(d, ShiftLeft<kShiftAmt>(v_to_shift));
  const auto shr_result = BitCast(d, ShiftRight<kShiftAmt>(v_to_shift));
  const auto shr_result_mask =
      BitCast(d, Set(du, static_cast<uint8_t>(kShrResultMask)));
  return Or(And(shr_result, shr_result_mask),
            AndNot(shr_result_mask, shl_result));
}
   6953 
   6954 #if HWY_REVERSE_BITS_MIN_BYTES == 2
// Single-byte vectors cannot BitCast to u16, so widen to a full
// Vec128<uint8_t> sharing the same raw register, recurse, and narrow back.
template <int kShiftAmt, int kShrResultMask, class V,
          HWY_IF_V_SIZE_D(DFromV<V>, 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) {
  return V{UI8ReverseBitsStep<kShiftAmt, kShrResultMask>(Vec128<uint8_t>{v.raw})
               .raw};
}
   6961 #endif
   6962 
   6963 }  // namespace detail
   6964 
   6965 template <class V, HWY_IF_T_SIZE_V(V, 1)>
   6966 HWY_API V ReverseBits(V v) {
   6967  auto result = detail::UI8ReverseBitsStep<1, 0x55>(v);
   6968  result = detail::UI8ReverseBitsStep<2, 0x33>(result);
   6969  result = detail::UI8ReverseBitsStep<4, 0x0F>(result);
   6970  return result;
   6971 }
   6972 
   6973 #endif  // HWY_NATIVE_REVERSE_BITS_UI8
   6974 
   6975 #if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
   6976 #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
   6977 #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
   6978 #else
   6979 #define HWY_NATIVE_REVERSE_BITS_UI16_32_64
   6980 #endif
   6981 
   6982 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)),
   6983          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
   6984 HWY_API V ReverseBits(V v) {
   6985  const DFromV<decltype(v)> d;
   6986  const Repartition<uint8_t, decltype(d)> du8;
   6987  return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))));
   6988 }
   6989 #endif  // HWY_NATIVE_REVERSE_BITS_UI16_32_64
   6990 
   6991 // ------------------------------ Per4LaneBlockShuffle
   6992 
   6993 #if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
   6994 #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   6995 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   6996 #else
   6997 #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   6998 #endif
   6999 
   7000 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   7001 namespace detail {
   7002 
// Broadcasts the 128-bit block of u32 lanes {x0, x1, x2, x3} (x0 in the
// lowest lane) across the whole vector of type D.
template <class D>
HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                             const uint32_t x2,
                                             const uint32_t x1,
                                             const uint32_t x0) {
#if HWY_TARGET == HWY_RVV
  // Clamp the load tag's pow2 so Dup128VecFromValues has a valid u32 tag even
  // when D is a fractional-register tag.
  constexpr int kPow2 = d.Pow2();
  constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
  const ScalableTag<uint32_t, kLoadPow2> d_load;
#else
  constexpr size_t kMaxBytes = d.MaxBytes();
#if HWY_TARGET_IS_NEON
  constexpr size_t kMinLanesToLoad = 2;
#else
  constexpr size_t kMinLanesToLoad = 4;
#endif
  // Load at least kMinLanesToLoad u32 lanes, more if D is wider.
  constexpr size_t kNumToLoad =
      HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
  const CappedTag<uint32_t, kNumToLoad> d_load;
#endif
  return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
}
   7025 
   7026 }  // namespace detail
   7027 #endif
   7028 
   7029 #endif  // HWY_NATIVE_PER4LANEBLKSHUF_DUP32
   7030 
   7031 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
   7032 namespace detail {
   7033 
// Index pattern {0, 0}: broadcast the even lane of each pair.
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) {
  return DupEven(v);
}
   7038 
// Index pattern {0, 1} (idx_10 == 1): swap the lanes of each pair.
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);
}
   7044 
// Index pattern {1, 0} (idx_10 == 2): identity, lanes already in place.
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) {
  return v;
}
   7049 
// Index pattern {1, 1} (idx_10 == 3): broadcast the odd lane of each pair.
template <class V>
HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) {
  return DupOdd(v);
}
   7054 
// Packs four u8 shuffle indices into a u32 such that, when stored to memory,
// byte i holds idx_i regardless of endianness.
HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3,
                                           const uint32_t idx2,
                                           const uint32_t idx1,
                                           const uint32_t idx0) {
#if HWY_IS_LITTLE_ENDIAN
  return static_cast<uint32_t>((idx3 << 24) | (idx2 << 16) | (idx1 << 8) |
                               idx0);
#else
  return static_cast<uint32_t>(idx3 | (idx2 << 8) | (idx1 << 16) |
                               (idx0 << 24));
#endif
}
   7067 
// Broadcasts the four packed u8 indices to every 4-byte group of a vector of
// type D.
template <class D>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) {
#if HWY_TARGET == HWY_RVV
  // Adjust the u32 tag's pow2 so it is valid for the minimum vector size.
  const AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
#else
  const Repartition<uint32_t, D> du32;
#endif

  return ResizeBitCast(
      d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
}
   7082 
   7083 #if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_EMU128
   7084 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
   7085 #else
   7086 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)
   7087 
   7088 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
   7089 HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) {
   7090  const DFromV<decltype(v)> d;
   7091  const Repartition<uint8_t, decltype(d)> du8;
   7092  return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx)));
   7093 }
   7094 
// Builds the byte indices for a per-4-lane block shuffle of u8 lanes: every
// 4-byte block b repeats the base indices plus 4*b (adding 0x04040404 per
// block offsets the indices into that block).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) {
  const Repartition<uint32_t, decltype(d)> du32;
  const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0);
  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
      du32, static_cast<uint32_t>(idx3210 + 0x0C0C0C0C),
      static_cast<uint32_t>(idx3210 + 0x08080808),
      static_cast<uint32_t>(idx3210 + 0x04040404),
      static_cast<uint32_t>(idx3210));
  return ResizeBitCast(d, v_byte_idx);
}
   7109 
   7110 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   7111 HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
   7112                                              const uint32_t idx2,
   7113                                              const uint32_t idx1,
   7114                                              const uint32_t idx0) {
   7115  const Repartition<uint32_t, decltype(d)> du32;
   7116 #if HWY_IS_LITTLE_ENDIAN
   7117  const uint32_t idx10 = static_cast<uint32_t>((idx1 << 16) | idx0);
   7118  const uint32_t idx32 = static_cast<uint32_t>((idx3 << 16) | idx2);
   7119  constexpr uint32_t kLaneByteOffsets{0x01000100};
   7120 #else
   7121  const uint32_t idx10 = static_cast<uint32_t>(idx1 | (idx0 << 16));
   7122  const uint32_t idx32 = static_cast<uint32_t>(idx3 | (idx2 << 16));
   7123  constexpr uint32_t kLaneByteOffsets{0x00010001};
   7124 #endif
   7125  constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u};
   7126 
   7127  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
   7128      du32, static_cast<uint32_t>(idx32 * 0x0202u + kHiLaneByteOffsets),
   7129      static_cast<uint32_t>(idx10 * 0x0202u + kHiLaneByteOffsets),
   7130      static_cast<uint32_t>(idx32 * 0x0202u + kLaneByteOffsets),
   7131      static_cast<uint32_t>(idx10 * 0x0202u + kLaneByteOffsets));
   7132  return ResizeBitCast(d, v_byte_idx);
   7133 }
   7134 
   7135 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   7136 HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
   7137                                              const uint32_t idx2,
   7138                                              const uint32_t idx1,
   7139                                              const uint32_t idx0) {
   7140  const Repartition<uint32_t, decltype(d)> du32;
   7141 #if HWY_IS_LITTLE_ENDIAN
   7142  constexpr uint32_t kLaneByteOffsets{0x03020100};
   7143 #else
   7144  constexpr uint32_t kLaneByteOffsets{0x00010203};
   7145 #endif
   7146 
   7147  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
   7148      du32, static_cast<uint32_t>(idx3 * 0x04040404u + kLaneByteOffsets),
   7149      static_cast<uint32_t>(idx2 * 0x04040404u + kLaneByteOffsets),
   7150      static_cast<uint32_t>(idx1 * 0x04040404u + kLaneByteOffsets),
   7151      static_cast<uint32_t>(idx0 * 0x04040404u + kLaneByteOffsets));
   7152  return ResizeBitCast(d, v_byte_idx);
   7153 }
   7154 #endif
   7155 
// TblLookupPer4LaneBlkIdxInBlk returns a vector of type Vec<D> whose lanes,
// in repeating groups of four, hold idx0..idx3 (each in [0, 3]) widened to
// the lane type of D.

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) {
  // 1-byte lanes: the u8 broadcast already has the required layout.
  return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0);
}

#if HWY_TARGET == HWY_RVV
// RVV: build the indices as u8 and widen them to the lane type of D.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) {
  const Rebind<uint8_t, decltype(d)> du8;
  return PromoteTo(d,
                   TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0));
}
#else
// 2-byte lanes: load the repeating pattern of four u16 indices directly.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) {
  const uint16_t u16_idx0 = static_cast<uint16_t>(idx0);
  const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
  const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
  const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
  // NOTE(review): Dup128VecFromValues appears to need a tag of at least a
  // full 128-bit pattern (8 u16 lanes) on most targets, while NEON also has
  // 64-bit vectors so 4 lanes suffice there - confirm.
#if HWY_TARGET_IS_NEON
  constexpr size_t kMinLanesToLoad = 4;
#else
  constexpr size_t kMinLanesToLoad = 8;
#endif
  constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
  const CappedTag<uint16_t, kNumToLoad> d_load;
  return ResizeBitCast(
      d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3,
                             u16_idx0, u16_idx1, u16_idx2, u16_idx3));
}

// 4-byte lanes: duplicate the four u32 indices per 16-byte block.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) {
  return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0);
}

// 8-byte lanes: build the u32 pattern, then widen each index to 64 bits.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                 const uint32_t idx2,
                                                 const uint32_t idx1,
                                                 const uint32_t idx0) {
  const RebindToUnsigned<decltype(d)> du;
  const Rebind<uint32_t, decltype(d)> du32;
  return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2,
                                                             idx1, idx0)));
}
#endif
   7215 
// Builds TableLookupLanes indices for the per-4-lane-block shuffle: each
// lane's index is its block's first lane number (lane index rounded down to
// a multiple of 4) plus the in-block index selected by idx0..idx3.
template <class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)>
HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                                      const uint32_t idx2,
                                                      const uint32_t idx1,
                                                      const uint32_t idx0) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0);

  constexpr size_t kN = HWY_MAX_LANES_D(D);
  if (kN < 4) {
    // Fewer than 4 lanes: wrap the in-block indices so they stay in range.
    idx_in_blk = And(idx_in_blk, Set(du, static_cast<TU>(kN - 1)));
  }

#if HWY_TARGET == HWY_RVV
  // Iota0 & ~3 yields the first lane number of each 4-lane block.
  const auto blk_offsets = AndS(Iota0(du), static_cast<TU>(~TU{3}));
#else
  const auto blk_offsets =
      And(Iota(du, TU{0}), Set(du, static_cast<TU>(~TU{3})));
#endif
  return IndicesFromVec(d, Add(idx_in_blk, blk_offsets));
}
   7238 
// Performs the per-4-lane-block shuffle via TableLookupLanes, for the lane
// sizes enabled by HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE above.
template <class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)>
HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD<DFromV<V>> idx) {
  return TableLookupLanes(v, idx);
}

#undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE
   7245 
   7246 template <class V>
   7247 HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) {
   7248  const DFromV<decltype(v)> d;
   7249  const uint32_t idx3 = static_cast<uint32_t>((idx3210 >> 6) & 3);
   7250  const uint32_t idx2 = static_cast<uint32_t>((idx3210 >> 4) & 3);
   7251  const uint32_t idx1 = static_cast<uint32_t>((idx3210 >> 2) & 3);
   7252  const uint32_t idx0 = static_cast<uint32_t>(idx3210 & 3);
   7253  const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0);
   7254  return Per4LaneBlkShufDoTblLookup(v, idx);
   7255 }
   7256 
// The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag
// and vect_size_tag parameters are only called for vectors that have at
// least 4 lanes (or scalable vectors that might possibly have 4 or more lanes)
template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                 hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
                                 hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                 V v) {
  // Generic fallback: materialize the shuffle as a table lookup.
  return TblLookupPer4LaneBlkShuf(v, kIdx3210);
}
   7267 
// Per4LaneBlockShufCastToWide reinterprets v so that each adjacent pair of
// lanes becomes one lane of twice the width, allowing pair-wise shuffles to
// be expressed as single-lane ops on the wider vector.

#if HWY_HAVE_FLOAT64
// f32 lanes: pairs become f64 lanes (only when f64 vectors are available).
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return BitCast(dw, v);
}
#endif

// Other float lane sizes: pairs become unsigned integer lanes of twice the
// lane size (there is no wider float type to use).
template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<RebindToUnsigned<DFromV<V>>>>
Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */,
                           hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> dw;
  return BitCast(dw, v);
}

// Non-float lanes: pairs become lanes of the twice-as-wide integer type.
template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::NonFloatTag /* type_tag */,
    hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return BitCast(dw, v);
}
   7296 
// Per4LaneBlockShuffle specializations mapping specific index patterns to
// cheaper ops than a generic table lookup. The tag packs the indices as
// (idx3 << 6) | (idx2 << 4) | (idx1 << 2) | idx0; the comments list the
// source lane selected by each result lane, result lane 0 first.

// 0x1B = {3, 2, 1, 0}: reverse each group of 4 lanes.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse4(d, v);
}

// 0x44 = {0, 1, 0, 1}: duplicate the even lane pair, done as DupEven on
// lanes of twice the width.
template <class V,
         HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                       (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  return BitCast(d, DupEven(vw));
}

// 0x4E = {2, 3, 0, 1}: swap lane pairs, done as Reverse2 on lanes of twice
// the width.
template <class V,
         HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                       (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  const DFromV<decltype(vw)> dw;
  return BitCast(d, Reverse2(dw, vw));
}
   7323 
#if HWY_MAX_BYTES >= 32
// 0x4E with 8-byte lanes: a group of 4 lanes spans two 128-bit blocks, so
// swapping lane pairs equals swapping adjacent blocks.
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
  return SwapAdjacentBlocks(v);
}
#endif

// 0x50 = {0, 0, 1, 1}: interleave the lower half of v with itself.
template <class V, HWY_IF_LANES_D(DFromV<V>, 4),
         HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

// 0x50 for 4-byte lanes (any lane count).
template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

// 0x88 = {0, 2, 0, 2} (4-lane vectors): the even lanes, twice.
template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatEven(d, v, v);
}

// 0xA0 = {0, 0, 2, 2}: duplicate each even lane.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) {
  return DupEven(v);
}

// 0xB1 = {1, 0, 3, 2}: swap the lanes within each adjacent pair.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);
}

// 0xDD = {1, 3, 1, 3} (4-lane vectors): the odd lanes, twice.
template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatOdd(d, v, v);
}

// 0xE4 = {0, 1, 2, 3}: identity.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) {
  return v;
}

// 0xEE = {2, 3, 2, 3}: duplicate the odd lane pair, done as DupOdd on lanes
// of twice the width.
template <class V,
         HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                       (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  return BitCast(d, DupOdd(vw));
}

// 0xF5 = {1, 1, 3, 3}: duplicate each odd lane.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) {
  return DupOdd(v);
}

// 0xFA = {2, 2, 3, 3}: interleave the upper half of v with itself.
template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveUpper(d, v, v);
}
   7392 
// Fallback when no pattern-specific specialization above matched: dispatch on
// lane size and vector size to the generic table-lookup implementation.
template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, V v) {
  const DFromV<decltype(v)> d;
  return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag<sizeof(TFromV<V>)>(),
                              hwy::SizeTag<d.MaxBytes()>(), v);
}
   7399 
   7400 }  // namespace detail
   7401 #endif  // HWY_TARGET != HWY_SCALAR
   7402 
// Per4LaneBlockShuffle for single-lane vectors: only lane 0 exists, so any
// valid shuffle returns the vector unchanged.
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
         HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  return v;
}
   7413 
   7414 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Per4LaneBlockShuffle for 2-lane vectors: reduces to a 2-lane block shuffle
// driven by the low two indices.
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
         HWY_IF_LANES_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  // isReverse2 is true when the low two indices look like a swapped pair:
  // they differ, and either kIdx0 selects lane 1 or kIdx1 selects lane 0.
  constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1);
  // Indices >= 2 refer to lanes beyond a 2-lane vector; they are remapped to
  // 0/1 consistently with the swap pattern detected above.
  // NOTE(review): confirm this matches the documented Per4LaneBlockShuffle
  // behavior for vectors with fewer than 4 lanes.
  constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0);
  constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 0 : 1);

  constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0;
  static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true");
  return detail::Per2LaneBlockShuffle(hwy::SizeTag<kIdx10>(), v);
}
   7431 
// Per4LaneBlockShuffle for vectors with more than 2 lanes: pack the four
// 2-bit indices into one tag value so the detail overloads can
// pattern-match common shuffles at compile time.
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
         HWY_IF_LANES_GT_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  constexpr size_t kIdx3210 =
      (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0;
  return detail::Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210>(), v);
}
   7444 #endif
   7445 
   7446 // ------------------------------ PairwiseAdd128/PairwiseSub128
   7447 //                                (Per4LaneBlockShuffle)
   7448 #if (defined(HWY_NATIVE_PAIRWISE_ADD_128) == defined(HWY_TARGET_TOGGLE))
   7449 #ifdef HWY_NATIVE_PAIRWISE_ADD_128
   7450 #undef HWY_NATIVE_PAIRWISE_ADD_128
   7451 #else
   7452 #define HWY_NATIVE_PAIRWISE_ADD_128
   7453 #endif
   7454 
   7455 namespace detail {
   7456 
   7457 // detail::BlockwiseConcatOddEven(d, v) returns the even lanes of each block of
   7458 // v followed by the odd lanes of v
   7459 #if HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV || \
   7460    HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
         HWY_IF_V_SIZE_GT_D(D, 8)>
static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
                                                                Vec<D> v) {
#if HWY_TARGET == HWY_RVV
  // NOTE(review): the Pow2 is clamped to >= 0, presumably so that the u64
  // tag covers at least a whole vector register - confirm.
  const ScalableTag<uint64_t, HWY_MAX(HWY_POW2_D(D), 0)> du64;
#else
  const Repartition<uint64_t, DFromV<decltype(v)>> du64;
#endif

  const Repartition<TFromD<decltype(d)>, decltype(du64)> d_concat;
  const auto v_to_concat = ResizeBitCast(d_concat, v);

  // Gather all even lanes and all odd lanes of the whole vector, then
  // re-interleave them 64 bits at a time so that each 128-bit block of the
  // result holds its even lanes followed by its odd lanes.
  const auto evens = ConcatEven(d, v_to_concat, v_to_concat);
  const auto odds = ConcatOdd(d, v_to_concat, v_to_concat);
  return ResizeBitCast(
      d, InterleaveWholeLower(BitCast(du64, evens), BitCast(du64, odds)));
}
   7479 
   7480 #else  // !(HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV)
   7481 
// 1-byte lanes.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
                                                                Vec<D> v) {
#if HWY_TARGET == HWY_SSE2
  // SSE2 lacks a byte shuffle: widen even/odd bytes to 16 bits, then
  // OrderedDemote2To emits all evens followed by all odds per block.
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<RepartitionToWide<decltype(du)>> dw;

  const auto vu = BitCast(du, v);
  return BitCast(
      d, OrderedDemote2To(du, PromoteEvenTo(dw, vu), PromoteOddTo(dw, vu)));
#else
  // Byte shuffle with per-block indices {even bytes, then odd bytes}.
  const Repartition<uint8_t, decltype(d)> du8;
  const auto idx =
      BitCast(d, Dup128VecFromValues(du8, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,
                                     9, 11, 13, 15));
  return TableLookupBytes(v, idx);
#endif
}

// 2-byte lanes: same approach as above with u16 lanes.
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d,
                                                                Vec<D> v) {
#if HWY_TARGET == HWY_SSE2
  const RebindToSigned<decltype(d)> di;
  const RepartitionToWide<decltype(di)> dw;
  const auto vi = BitCast(di, v);
  return BitCast(
      d, OrderedDemote2To(di, PromoteEvenTo(dw, vi), PromoteOddTo(dw, vi)));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const auto idx = BitCast(d, Dup128VecFromValues(du8, 0, 1, 4, 5, 8, 9, 12, 13,
                                                  2, 3, 6, 7, 10, 11, 14, 15));
  return TableLookupBytes(v, idx);
#endif
}

// 4-byte lanes: a 128-bit block holds exactly 4 lanes, so the shuffle
// {0, 2, 1, 3} (evens then odds) does the job.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D /*d*/,
                                                                Vec<D> v) {
  return Per4LaneBlockShuffle<3, 1, 2, 0>(v);
}
   7523 #endif  // HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV
   7524 
// 8-byte lanes: a 128-bit block holds one even and one odd lane, which are
// already in (even, odd) order, so this is the identity.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D /*d*/,
                                                                Vec<D> v) {
  return v;
}
   7530 
   7531 }  // namespace detail
   7532 
// Pairwise add with output in 128 bit blocks of a and b.
// NOTE(review): relies on PairwiseAdd placing sums of a in the even lanes
// and sums of b in the odd lanes, so that regrouping each block as
// (even lanes, odd lanes) yields all sums of a before all sums of b -
// confirm against the PairwiseAdd documentation.
template <class D, HWY_IF_PAIRWISE_ADD_128_D(D)>
HWY_API Vec<D> PairwiseAdd128(D d, Vec<D> a, Vec<D> b) {
  return detail::BlockwiseConcatOddEven(d, PairwiseAdd(d, a, b));
}

// Pairwise sub with output in 128 bit blocks of a and b (same layout
// reasoning as PairwiseAdd128 above).
template <class D, HWY_IF_PAIRWISE_SUB_128_D(D)>
HWY_API Vec<D> PairwiseSub128(D d, Vec<D> a, Vec<D> b) {
  return detail::BlockwiseConcatOddEven(d, PairwiseSub(d, a, b));
}
   7544 
   7545 #endif
   7546 
   7547 // ------------------------------ Blocks
   7548 
   7549 template <class D>
   7550 HWY_API size_t Blocks(D d) {
   7551  return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD<D>) + 15) / 16);
   7552 }
   7553 
   7554 // ------------------------------ Block insert/extract/broadcast ops
   7555 #if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
   7556 #ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
   7557 #undef HWY_NATIVE_BLK_INSERT_EXTRACT
   7558 #else
   7559 #define HWY_NATIVE_BLK_INSERT_EXTRACT
   7560 #endif
   7561 
// For vectors of at most 16 bytes there is exactly one 128-bit block, so the
// only valid block index is 0 and the block ops below are trivial.

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V InsertBlock(V /*v*/, V blk_to_insert) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  // Inserting into block 0 of a single-block vector replaces it entirely.
  return blk_to_insert;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V ExtractBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}
   7579 
   7580 #endif  // HWY_NATIVE_BLK_INSERT_EXTRACT
   7581 
   7582 // ------------------------------ BroadcastLane
   7583 #if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
   7584 #ifdef HWY_NATIVE_BROADCASTLANE
   7585 #undef HWY_NATIVE_BROADCASTLANE
   7586 #else
   7587 #define HWY_NATIVE_BROADCASTLANE
   7588 #endif
   7589 
// Broadcasts lane kLane to all lanes. For vectors of at most 16 bytes this
// is the same as the within-block Broadcast.
template <int kLane, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastLane(V v) {
  return Broadcast<kLane>(v);
}
   7594 
   7595 #endif  // HWY_NATIVE_BROADCASTLANE
   7596 
   7597 // ------------------------------ Slide1Up and Slide1Down
   7598 #if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
   7599 #ifdef HWY_NATIVE_SLIDE1_UP_DOWN
   7600 #undef HWY_NATIVE_SLIDE1_UP_DOWN
   7601 #else
   7602 #define HWY_NATIVE_SLIDE1_UP_DOWN
   7603 #endif
   7604 
// Single-lane vectors: sliding by one lane shifts the only lane out, so the
// result is all zero.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> /*v*/) {
  return Zero(d);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> /*v*/) {
  return Zero(d);
}
   7613 
   7614 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Vectors of at most 16 bytes are a single block, so a slide by one lane is
// simply a lane shift within the block.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  return ShiftLeftLanes<1>(d, v);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  return ShiftRightLanes<1>(d, v);
}
   7623 #endif  // HWY_TARGET != HWY_SCALAR
   7624 
   7625 #endif  // HWY_NATIVE_SLIDE1_UP_DOWN
   7626 
   7627 // ------------------------------ SlideUpBlocks
   7628 
// Vectors of at most 16 bytes consist of a single block; only a slide by
// zero blocks is valid, which is the identity.
template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
// Larger vectors: slide up by kBlocks * (lanes per 16-byte block) lanes.
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideUpLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif
   7644 
   7645 // ------------------------------ SlideDownBlocks
   7646 
// Vectors of at most 16 bytes consist of a single block; only a slide by
// zero blocks is valid, which is the identity.
template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
// Larger vectors: slide down by kBlocks * (lanes per 16-byte block) lanes.
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideDownLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif
   7662 
   7663 // ------------------------------ Slide mask up/down
   7664 #if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
   7665 
   7666 #ifdef HWY_NATIVE_SLIDE_MASK
   7667 #undef HWY_NATIVE_SLIDE_MASK
   7668 #else
   7669 #define HWY_NATIVE_SLIDE_MASK
   7670 #endif
   7671 
   7672 template <class D>
   7673 HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) {
   7674  return MaskFromVec(Slide1Up(d, VecFromMask(d, m)));
   7675 }
   7676 
   7677 template <class D>
   7678 HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) {
   7679  return MaskFromVec(Slide1Down(d, VecFromMask(d, m)));
   7680 }
   7681 
   7682 template <class D>
   7683 HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) {
   7684  return MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), amt));
   7685 }
   7686 
   7687 template <class D>
   7688 HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) {
   7689  return MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), amt));
   7690 }
   7691 
   7692 #endif  // HWY_NATIVE_SLIDE_MASK
   7693 
   7694 // ------------------------------ SumsOfAdjQuadAbsDiff
   7695 
   7696 #if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
   7697     defined(HWY_TARGET_TOGGLE))
   7698 #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
   7699 #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
   7700 #else
   7701 #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
   7702 #endif
   7703 
   7704 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Generic SumsOfAdjQuadAbsDiff for u8/i8 vectors: each 16-bit result lane i
// receives the sum of absolute differences between the four bytes of a
// starting at a[kAOffset*4 + i] and the four bytes of b starting at
// b[kBOffset*4]. Implemented as two 2-byte SAD halves (absdiff_sum_01 and
// absdiff_sum_23) that are added at the end.
// NOTE(review): semantics appear to correspond to x86 MPSADBW - confirm
// against the Highway op documentation.
template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
  using D8 = DFromV<V8>;
  const D8 d8;
  const RebindToUnsigned<decltype(d8)> du8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(du8)> du16;

  // Ensure that a is resized to a vector that has at least
  // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
  // CombineShiftRightBytes operations below.
#if HWY_TARGET == HWY_RVV
  // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
  // to ensure that Lanes(d8_interleave) >= 16 is true.

  // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
#elif HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // On SVE targets, Lanes(d8_interleave) >= 16 and
  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
  // tag for a full u8/i8 vector on SVE.
  const D8 d8_interleave;
#else
  // On targets that use non-scalable vector types, Lanes(d8_interleave) is
  // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
  constexpr size_t kInterleaveLanes =
      HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
  const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
#endif

  // The ResizeBitCast operation below will resize a to a vector that has
  // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
  // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
  // below.
  const auto a_to_interleave = ResizeBitCast(d8_interleave, a);

  const auto a_interleaved_lo =
      InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
  const auto a_interleaved_hi =
      InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);

  /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
            a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
   */
  /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
            a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
    } */

  // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
  // the CombineShiftRightBytes are needed for the subsequent AbsDiff operations
  // and as a01 and a23 need to be the same vector type as b01 and b23 for the
  // AbsDiff operations below.
  const V8 a01 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
  const V8 a23 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));

  /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
   */
  /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
   */
  const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
  const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));

  const VFromD<decltype(du16)> absdiff_sum_01 =
      SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
  const VFromD<decltype(du16)> absdiff_sum_23 =
      SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
  return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
}
   7793 #endif  // HWY_TARGET != HWY_SCALAR
   7794 
   7795 #endif  // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
   7796 
   7797 // ------------------------------ SumsOfShuffledQuadAbsDiff
   7798 
   7799 #if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
   7800     defined(HWY_TARGET_TOGGLE))
   7801 #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
   7802 #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
   7803 #else
   7804 #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
   7805 #endif
   7806 
   7807 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Sums of absolute differences between sliding 4-byte windows of a shuffled
// `a` and the aligned 4-byte quads of `b`, widened to u16 lanes.
//
// The u32 lanes of `a` are first permuted within each 16-byte block by
// Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>. Even result lanes hold
// SADs of windows starting at even byte offsets of the shuffled `a`, odd
// result lanes those starting at odd offsets (see the a_0123_2345 and
// a_1234_3456 diagrams below).
// NOTE(review): this looks like a generic emulation of x86 DBSAD-style
// semantics - confirm against Highway's documented contract for
// SumsOfShuffledQuadAbsDiff.
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
          HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
                                                                     V8 b) {
  // Each index selects one of the four u32 lanes within a 16-byte block.
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");

#if HWY_TARGET == HWY_RVV
  // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
  // both vA and vB can be bitcasted to a u32 vector.
  const detail::AdjustSimdTagToMinVecPow2<
      RepartitionToWideX2<DFromV<decltype(a)>>>
      d32;
  const RepartitionToNarrow<decltype(d32)> d16;
  const RepartitionToNarrow<decltype(d16)> d8;

  const auto vA = ResizeBitCast(d8, a);
  const auto vB = ResizeBitCast(d8, b);
#else
  const DFromV<decltype(a)> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;

  const auto vA = a;
  const auto vB = b;
#endif

  const RebindToUnsigned<decltype(d8)> du8;

  // Permute the u32 lanes of `a` within each 4-lane (16-byte) block.
  const auto a_shuf =
      Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
  /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
                   a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
                   a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
                   a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
  /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
                   a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
                   a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
                   a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
  // lanes that are shifted into an adjacent 16-byte block as any lanes that are
  // shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
  // replaced by the OddEven operation.
  const auto a_0123_2345 = BitCast(
      d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
                          BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
#else
  const auto a_0123_2345 =
      BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
  const auto a_1234_3456 = BitCast(
      d8,
      OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
#endif

  // Each u32 lane of a SumsOf4 result is the SAD of one 4-byte window of the
  // shuffled `a` against the corresponding quad of vB.
  auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
  auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));

  // Place the odd (resp. even, on big-endian) sums into the upper 16 bits of
  // each u32 lane before merging the two sets of u16 results with OddEven.
#if HWY_IS_LITTLE_ENDIAN
  odd_sums = ShiftLeft<16>(odd_sums);
#else
  even_sums = ShiftLeft<16>(even_sums);
#endif

  const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));

#if HWY_TARGET == HWY_RVV
  // Undo the LMUL adjustment applied above.
  return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
#else
  return sums;
#endif
}
   7885 #endif  // HWY_TARGET != HWY_SCALAR
   7886 
   7887 #endif  // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
   7888 
   7889 // ------------------------------ BitShuffle (Rol)
   7890 #if (defined(HWY_NATIVE_BITSHUFFLE) == defined(HWY_TARGET_TOGGLE))
   7891 #ifdef HWY_NATIVE_BITSHUFFLE
   7892 #undef HWY_NATIVE_BITSHUFFLE
   7893 #else
   7894 #define HWY_NATIVE_BITSHUFFLE
   7895 #endif
   7896 
   7897 #if HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
// Generic emulation of bit gather: for each u64 lane, bit i of the low 8 bits
// of the result is bit idx[8*lane + i] (bit index taken modulo 64) of the
// corresponding u64 lane of v. The upper 56 bits of each result lane are zero
// (SumsOf8 of eight single-bit-per-byte values fits in one byte).
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>)>
HWY_API V BitShuffle(V v, VI idx) {
  const DFromV<decltype(v)> d64;
  const RebindToUnsigned<decltype(d64)> du64;
  const Repartition<uint8_t, decltype(d64)> du8;

  // Tag used to shift idx right by 3. NOTE(review): presumably these targets
  // lack an efficient per-u8 variable shift, so a u16 shift is used instead;
  // the 3 bits shifted in from the neighboring byte are discarded by the
  // BitwiseIfThenElse masking below. Confirm against per-target ops.
#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
    HWY_TARGET == HWY_WASM_EMU256
  const Repartition<uint16_t, decltype(d64)> d_idx_shr;
#else
  const Repartition<uint8_t, decltype(d64)> d_idx_shr;
#endif

  // Byte i of each u64 lane must contribute bit i of the 8-bit result, so
  // keep exactly one bit per byte: bit i of byte i on little-endian, with the
  // byte order reversed on big-endian.
#if HWY_IS_LITTLE_ENDIAN
  constexpr uint64_t kExtractedBitsMask =
      static_cast<uint64_t>(0x8040201008040201u);
#else
  constexpr uint64_t kExtractedBitsMask =
      static_cast<uint64_t>(0x0102040810204080u);
#endif

  const auto k7 = Set(du8, uint8_t{0x07});

  // Byte index within the u64 lane that holds the requested bit: idx >> 3.
  auto unmasked_byte_idx = BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx)));
#if HWY_IS_BIG_ENDIAN
  // Need to invert the lower 3 bits of unmasked_byte_idx[i] on big-endian
  // targets
  unmasked_byte_idx = Xor(unmasked_byte_idx, k7);
#endif  // HWY_IS_BIG_ENDIAN

  // Keep only the low 3 bits of the byte index, and add 8 for bytes in the
  // upper u64 of each 16-byte block so TableLookupBytes reads from the
  // correct lane (its indices address a whole 16-byte block).
  const auto byte_idx = BitwiseIfThenElse(
      k7, unmasked_byte_idx,
      BitCast(du8, Dup128VecFromValues(du64, uint64_t{0},
                                       uint64_t{0x0808080808080808u})));
  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
  // and left by iota & 7 to put it in the correct output bit. To correctly
  // handle shift counts from -7 to 7, we rotate.
  const auto rotate_left_bits = Sub(Iota(du8, uint8_t{0}), BitCast(du8, idx));

  const auto extracted_bits =
      And(Rol(TableLookupBytes(v, byte_idx), rotate_left_bits),
          BitCast(du8, Set(du64, kExtractedBitsMask)));
  // Combine bit-sliced (one bit per byte) into one 64-bit sum.
  return BitCast(d64, SumsOf8(extracted_bits));
}
   7943 #endif  // HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
   7944 
   7945 #endif  // HWY_NATIVE_BITSHUFFLE
   7946 
   7947 template <class V, class M>
   7948 HWY_API V MaskedOr(M m, V a, V b) {
   7949  return IfThenElseZero(m, Or(a, b));
   7950 }
   7951 // ------------------------------ AllBits1/AllBits0
   7952 #if (defined(HWY_NATIVE_ALLONES) == defined(HWY_TARGET_TOGGLE))
   7953 #ifdef HWY_NATIVE_ALLONES
   7954 #undef HWY_NATIVE_ALLONES
   7955 #else
   7956 #define HWY_NATIVE_ALLONES
   7957 #endif
   7958 
   7959 template <class D, class V = VFromD<D>>
   7960 HWY_API bool AllBits1(D d, V v) {
   7961  const RebindToUnsigned<decltype(d)> du;
   7962  using TU = TFromD<decltype(du)>;
   7963  return AllTrue(du, Eq(BitCast(du, v), Set(du, hwy::HighestValue<TU>())));
   7964 }
   7965 #endif  // HWY_NATIVE_ALLONES
   7966 
   7967 #if (defined(HWY_NATIVE_ALLZEROS) == defined(HWY_TARGET_TOGGLE))
   7968 #ifdef HWY_NATIVE_ALLZEROS
   7969 #undef HWY_NATIVE_ALLZEROS
   7970 #else
   7971 #define HWY_NATIVE_ALLZEROS
   7972 #endif
   7973 
   7974 template <class D, class V = VFromD<D>>
   7975 HWY_API bool AllBits0(D d, V v) {
   7976  return AllTrue(d, Eq(v, Zero(d)));
   7977 }
   7978 #endif  // HWY_NATIVE_ALLZEROS
   7979 
   7980 // ------------------------------ MultiRotateRight
   7981 #if (defined(HWY_NATIVE_MULTIROTATERIGHT) == defined(HWY_TARGET_TOGGLE))
   7982 #ifdef HWY_NATIVE_MULTIROTATERIGHT
   7983 #undef HWY_NATIVE_MULTIROTATERIGHT
   7984 #else
   7985 #define HWY_NATIVE_MULTIROTATERIGHT
   7986 #endif
   7987 
// Emulated multishift: byte j of each u64 lane of the result is the 8-bit
// field of the corresponding lane of v starting at bit offset
// idx[8*lane + j] & 63, wrapping around within the lane.
// NOTE(review): appears to match AVX-512VBMI VPMULTISHIFTQB semantics -
// confirm against Highway's MultiRotateRight documentation.
//
// This overload handles vectors with a single u64 lane (V_SIZE == 8); it
// widens to a two-lane (16-byte) tag because TableLookupBytes selects bytes
// within 16-byte blocks.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
          HWY_IF_V_SIZE_V(V, 8)>
HWY_API V MultiRotateRight(V v, VI idx) {
  const DFromV<V> d64;
  const Twice<decltype(d64)> dt64;
  const Repartition<uint8_t, decltype(d64)> du8;
  const Repartition<uint8_t, decltype(dt64)> dt_u8;
  const Repartition<uint16_t, decltype(dt64)> dt_u16;
  const auto k7 = Set(du8, uint8_t{0x07});
  const auto k63 = Set(du8, uint8_t{0x3F});

  // Bit offsets are taken modulo 64.
  const auto masked_idx = And(k63, BitCast(du8, idx));

  // byte_idx: byte holding the low bits of the requested field;
  // hi_byte_idx: the next-more-significant byte, needed because an unaligned
  // 8-bit field spans two bytes.
  auto byte_idx = ShiftRight<3>(masked_idx);
#if HWY_IS_LITTLE_ENDIAN
  const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
#else
  // Big-endian: bytes of the u64 are stored in reverse order, so mirror the
  // byte index; +7 is -1 plus 8, reading the more-significant byte from the
  // duplicated upper copy (see DupEven below) so offsets near 63 wrap around.
  byte_idx = Xor(byte_idx, k7);
  const auto hi_byte_idx = Add(byte_idx, k7);
#endif

  // Shift amount within the 16-bit segment: bit offset modulo 8.
  const auto idx_shift = And(k7, masked_idx);

  // Calculate even lanes
  // Duplicating the u64 lane into both halves of the 16-byte block makes
  // byte index 8 alias byte 0 of the same value, providing the wrap-around.
  const auto even_src = DupEven(ResizeBitCast(dt64, v));
  // Expand indexes to pull out 16 bit segments of idx and idx + 1
#if HWY_IS_LITTLE_ENDIAN
  const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, byte_idx),
                                        ResizeBitCast(dt_u8, hi_byte_idx));
#else
  const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, hi_byte_idx),
                                        ResizeBitCast(dt_u8, byte_idx));
#endif
  // TableLookupBytes indexes select from within a 16 byte block
  const auto even_segments = TableLookupBytes(even_src, even_idx);
  // Extract unaligned bytes from 16 bit segments
  const auto even_idx_shift = PromoteTo(dt_u16, idx_shift);
  const auto extracted_even_bytes =
      Shr(BitCast(dt_u16, even_segments), even_idx_shift);

  // Extract the even bytes of each 128 bit block and pack into lower 64 bits
#if HWY_IS_LITTLE_ENDIAN
  const auto even_lanes = BitCast(
      dt64,
      ConcatEven(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
#else
  const auto even_lanes = BitCast(
      dt64,
      ConcatOdd(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
#endif

  // Only one input u64 lane, so the result is the lower half of the widened
  // vector.
  return LowerHalf(d64, even_lanes);
}
   8043 
// Emulated multishift: byte j of each u64 lane of the result is the 8-bit
// field of the corresponding lane of v starting at bit offset
// idx[8*lane + j] & 63, wrapping around within the lane.
// NOTE(review): appears to match AVX-512VBMI VPMULTISHIFTQB semantics -
// confirm against Highway's MultiRotateRight documentation.
//
// This overload handles vectors with two or more u64 lanes (V_SIZE > 8):
// even and odd u64 lanes are processed separately (each duplicated to fill a
// 16-byte block for TableLookupBytes) and interleaved at the end.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
          HWY_IF_V_SIZE_GT_V(V, 8)>
HWY_API V MultiRotateRight(V v, VI idx) {
  const DFromV<V> d64;
  const Repartition<uint8_t, decltype(d64)> du8;
  const Repartition<uint16_t, decltype(d64)> du16;
  const auto k7 = Set(du8, uint8_t{0x07});
  const auto k63 = Set(du8, uint8_t{0x3F});

  // Bit offsets are taken modulo 64.
  const auto masked_idx = And(k63, BitCast(du8, idx));

  // byte_idx: byte holding the low bits of the requested field;
  // hi_byte_idx: the next-more-significant byte, needed because an unaligned
  // 8-bit field spans two bytes.
  auto byte_idx = ShiftRight<3>(masked_idx);
#if HWY_IS_LITTLE_ENDIAN
  const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
#else
  // Big-endian: bytes of the u64 are stored in reverse order, so mirror the
  // byte index; +7 is -1 plus 8, reading the more-significant byte from the
  // duplicated copy (see DupEven/DupOdd below) so offsets near 63 wrap.
  byte_idx = Xor(byte_idx, k7);
  const auto hi_byte_idx = Add(byte_idx, k7);
#endif

  // Shift amount within the 16-bit segment: bit offset modulo 8.
  const auto idx_shift = And(k7, masked_idx);

  // Calculate even lanes
  // DupEven duplicates each even u64 lane into the adjacent odd lane, so byte
  // index 8 aliases byte 0 of the same value (wrap-around).
  const auto even_src = DupEven(v);
  // Expand indexes to pull out 16 bit segments of idx and idx + 1
#if HWY_IS_LITTLE_ENDIAN
  const auto even_idx = InterleaveLower(byte_idx, hi_byte_idx);
#else
  const auto even_idx = InterleaveLower(hi_byte_idx, byte_idx);
#endif
  // TableLookupBytes indexes select from within a 16 byte block
  const auto even_segments = TableLookupBytes(even_src, even_idx);
  // Extract unaligned bytes from 16 bit segments
#if HWY_IS_LITTLE_ENDIAN
  const auto even_idx_shift = ZipLower(idx_shift, Zero(du8));
#else
  const auto even_idx_shift = ZipLower(Zero(du8), idx_shift);
#endif
  const auto extracted_even_bytes =
      Shr(BitCast(du16, even_segments), even_idx_shift);

  // Calculate odd lanes
  // DupOdd likewise fills the 16-byte block with the odd u64 lane.
  const auto odd_src = DupOdd(v);
  // Expand indexes to pull out 16 bit segments of idx and idx + 1
#if HWY_IS_LITTLE_ENDIAN
  const auto odd_idx = InterleaveUpper(du8, byte_idx, hi_byte_idx);
#else
  const auto odd_idx = InterleaveUpper(du8, hi_byte_idx, byte_idx);
#endif
  // TableLookupBytes indexes select from within a 16 byte block
  const auto odd_segments = TableLookupBytes(odd_src, odd_idx);
  // Extract unaligned bytes from 16 bit segments
#if HWY_IS_LITTLE_ENDIAN
  const auto odd_idx_shift = ZipUpper(du16, idx_shift, Zero(du8));
#else
  const auto odd_idx_shift = ZipUpper(du16, Zero(du8), idx_shift);
#endif
  const auto extracted_odd_bytes =
      Shr(BitCast(du16, odd_segments), odd_idx_shift);

  // Extract the even bytes of each 128 bit block and pack into lower 64 bits
#if HWY_IS_LITTLE_ENDIAN
  const auto even_lanes = BitCast(
      d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
  const auto odd_lanes = BitCast(
      d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
#else
  const auto even_lanes = BitCast(
      d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
  const auto odd_lanes = BitCast(
      d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
#endif
  // Interleave at 64 bit level
  return InterleaveWholeLower(even_lanes, odd_lanes);
}
   8120 
   8121 #if HWY_TARGET == HWY_RVV
   8122 
   8123 // MultiRotateRight for LMUL=1/2 case on RVV
   8124 template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
   8125          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
   8126          HWY_IF_POW2_LE_D(DFromV<V>, 0),
   8127          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2) / 2)>
   8128 HWY_API V MultiRotateRight(V v, VI idx) {
   8129  return MultiRotateRight(v, ResizeBitCast(Twice<DFromV<VI>>(), idx));
   8130 }
   8131 
#endif  // HWY_TARGET == HWY_RVV
   8133 
#endif  // HWY_NATIVE_MULTIROTATERIGHT
   8135 
   8136 // ================================================== Operator wrapper
   8137 
   8138 // SVE* and RVV currently cannot define operators and have already defined
   8139 // (only) the corresponding functions such as Add.
   8140 #if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
   8141 #ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
   8142 #undef HWY_NATIVE_OPERATOR_REPLACEMENTS
   8143 #else
   8144 #define HWY_NATIVE_OPERATOR_REPLACEMENTS
   8145 #endif
   8146 
   8147 template <class V>
   8148 HWY_API V Add(V a, V b) {
   8149  return a + b;
   8150 }
   8151 template <class V>
   8152 HWY_API V Sub(V a, V b) {
   8153  return a - b;
   8154 }
   8155 
   8156 template <class V>
   8157 HWY_API V Mul(V a, V b) {
   8158  return a * b;
   8159 }
   8160 template <class V>
   8161 HWY_API V Div(V a, V b) {
   8162  return a / b;
   8163 }
   8164 template <class V>
   8165 HWY_API V Mod(V a, V b) {
   8166  return a % b;
   8167 }
   8168 
   8169 template <class V>
   8170 V Shl(V a, V b) {
   8171  return a << b;
   8172 }
   8173 template <class V>
   8174 V Shr(V a, V b) {
   8175  return a >> b;
   8176 }
   8177 
   8178 template <class V>
   8179 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
   8180  return a == b;
   8181 }
   8182 template <class V>
   8183 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
   8184  return a != b;
   8185 }
   8186 template <class V>
   8187 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
   8188  return a < b;
   8189 }
   8190 
   8191 template <class V>
   8192 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
   8193  return a > b;
   8194 }
   8195 template <class V>
   8196 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
   8197  return a >= b;
   8198 }
   8199 
   8200 template <class V>
   8201 HWY_API auto Le(V a, V b) -> decltype(a == b) {
   8202  return a <= b;
   8203 }
   8204 
   8205 #endif  // HWY_NATIVE_OPERATOR_REPLACEMENTS
   8206 
   8207 #undef HWY_GENERIC_IF_EMULATED_D
   8208 
   8209 // TODO: remove once callers are updated.
   8210 // SVE and RVV do not support DFromM because their masks are loosely typed.
   8211 #if HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV
   8212 namespace detail {
   8213 template <class M>
   8214 uint64_t BitsFromMask(M m) {
   8215  const DFromM<M> d;
   8216  return ::hwy::HWY_NAMESPACE::BitsFromMask(d, m);
   8217 }
   8218 }  // namespace detail
#endif  // HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV
   8220 
   8221 // NOLINTNEXTLINE(google-readability-namespace-comments)
   8222 }  // namespace HWY_NAMESPACE
   8223 }  // namespace hwy
   8224 HWY_AFTER_NAMESPACE();