tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rvv-inl.h (282168B)


      1 // Copyright 2021 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // RISC-V V vectors (length not known at compile time).
     17 // External include guard in highway.h - see comment there.
     18 
     19 #pragma push_macro("__riscv_v_elen")
     20 
     21 // Workaround that ensures that all of the __riscv_vsetvl_* and
     22 // __riscv_vsetvlmax_* macros in riscv_vector.h are defined when compiling with
     23 // Clang 20 with dynamic dispatch and a baseline target of SCALAR or EMU128
     24 #if HWY_COMPILER_CLANG >= 2000 && HWY_COMPILER_CLANG < 2100 && \
     25    (!defined(__riscv_v_elen) || __riscv_v_elen < 64)
     26 #undef __riscv_v_elen
     27 #define __riscv_v_elen 64
     28 #endif
     29 
     30 #include <riscv_vector.h>
     31 
     32 #pragma pop_macro("__riscv_v_elen")
     33 
     34 #include "hwy/ops/shared-inl.h"
     35 
     36 HWY_BEFORE_NAMESPACE();
     37 namespace hwy {
     38 namespace HWY_NAMESPACE {
     39 
      40 // Support for vfloat16m*_t and PromoteTo/DemoteTo.
         // Set to 1 iff the target enables the Zvfhmin extension (detected via the
         // compiler-predefined __riscv_zvfhmin macro), which provides the f16
         // vector types plus f16<->f32 conversions used by PromoteTo/DemoteTo.
      41 #ifdef __riscv_zvfhmin
      42 #define HWY_RVV_HAVE_F16C 1
      43 #else
      44 #define HWY_RVV_HAVE_F16C 0
      45 #endif
     46 
         // DFromV_t maps an intrinsic vector type V back to its Simd descriptor;
         // the per-type specializations are generated by HWY_SPECIALIZE further
         // down in this file.
      47 template <class V>
      48 struct DFromV_t {};  // specialized in macros
         // Tag type (Simd<...>) for a given vector type V; strips const first.
      49 template <class V>
      50 using DFromV = typename DFromV_t<RemoveConst<V>>::type;
      51 
         // Lane type of a vector type V, obtained via its descriptor.
      52 template <class V>
      53 using TFromV = TFromD<DFromV<V>>;
     54 
         // MLEN (the NN in vboolNN_t) of the mask type matching descriptor D:
         // SEW / LMUL, capped at 64. Used to select the vbool##MLEN##_t type via
         // HWY_RVV_M below.
      55 template <typename T, size_t N, int kPow2>
      56 constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
      57  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
      58  // argument enables fractional LMUL < 1. Limit to 64 because that is the
      59  // largest value for which vbool##_t are defined.
      60  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
      61 }
     62 
      63 namespace detail {
      64 
         // Maps a Simd tag to one whose kPow2 is clamped up to the minimum LMUL
         // supported by the hardware for its lane size (kMinVecPow2 = -3 + log2
         // sizeof(T), i.e. LMUL >= SEW/64), adjusting N so the lane cap is kept.
      65 template <class D>
      66 class AdjustSimdTagToMinVecPow2_t {};
      67 
      68 template <typename T, size_t N, int kPow2>
      69 class AdjustSimdTagToMinVecPow2_t<Simd<T, N, kPow2>> {
      70 private:
      71  using D = Simd<T, N, kPow2>;
         // Smallest legal kPow2 for this lane size: -3 for 1-byte lanes up to 0
         // for 8-byte lanes.
      72  static constexpr int kMinVecPow2 =
      73      -3 + static_cast<int>(FloorLog2(sizeof(T)));
      74  static constexpr size_t kNumMaxLanes = HWY_MAX_LANES_D(D);
      75  static constexpr int kNewPow2 = HWY_MAX(kPow2, kMinVecPow2);
         // Recompute N for the (possibly raised) kNewPow2 while preserving the
         // original MaxLanes bound.
      76  static constexpr size_t kNewN = D::template NewN<kNewPow2, kNumMaxLanes>();
      77 
      78 public:
      79  using type = Simd<T, kNewN, kNewPow2>;
      80 };
      81 
      82 template <class D>
      83 using AdjustSimdTagToMinVecPow2 =
      84    typename AdjustSimdTagToMinVecPow2_t<RemoveConst<D>>::type;
      85 
      86 }  // namespace detail
     87 
     88 // ================================================== MACROS
     89 
     90 // Generate specializations and function definitions using X macros. Although
     91 // harder to read and debug, writing everything manually is too bulky.
     92 
      93 namespace detail {  // for code folding
      94 
      95 // For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
      96 // The first three arguments are arbitrary SEW, LMUL, SHIFT such that
      97 // SEW >> SHIFT = MLEN.
      98 #define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
      99  X_MACRO(64, 0, 64, NAME, OP)               \
     100  X_MACRO(32, 0, 32, NAME, OP)               \
     101  X_MACRO(16, 0, 16, NAME, OP)               \
     102  X_MACRO(8, 0, 8, NAME, OP)                 \
     103  X_MACRO(8, 1, 4, NAME, OP)                 \
     104  X_MACRO(8, 2, 2, NAME, OP)                 \
     105  X_MACRO(8, 3, 1, NAME, OP)
     106 
     107 // For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
     108 // reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
     109 // _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
     110 //
     111 // Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
     112 // reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
     113 // respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
     114 // Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP
     115 
     116 // LMULS = _TRUNC: truncatable (not the smallest LMUL)
     117 #define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)            \
     118  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
     119  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
     120  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
     121  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)      \
     122  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)      \
     123  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
     124 
     125 #define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)           \
     126  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
     127  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)   \
     128  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)     \
     129  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)     \
     130  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
     131 
     132 #define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)          \
     133  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
     134  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)  \
     135  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)   \
     136  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
     137 
     138 #define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)         \
     139  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
     140  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
     141  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
     142 
         // LMULS = _GET_SET: only LMUL in {m2, m4, m8}, i.e. each entry has a
         // valid half-width LMULH; see the Get/Set comment before _GET_SET_SMALLEST.
     143 #define HWY_RVV_FOREACH_08_GET_SET(X_MACRO, BASE, CHAR, NAME, OP)     \
     144  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
     145  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
     146  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
     147 
     148 #define HWY_RVV_FOREACH_16_GET_SET(X_MACRO, BASE, CHAR, NAME, OP)     \
     149  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
     150  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
     151  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
     152 
     153 #define HWY_RVV_FOREACH_32_GET_SET(X_MACRO, BASE, CHAR, NAME, OP)       \
     154  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
     155  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)  \
     156  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
     157 
     158 #define HWY_RVV_FOREACH_64_GET_SET(X_MACRO, BASE, CHAR, NAME, OP)       \
     159  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
     160  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
     161  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
     162 
     163 // LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
     164 #define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
     165  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
     166  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
     167  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
     168  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)      \
     169  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)      \
     170  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
     171 
     172 #define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
     173  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
     174  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP)  \
     175  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)    \
     176  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)      \
     177  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)      \
     178  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
     179 
     180 #define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
     181  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
     182  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)   \
     183  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)    \
     184  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)     \
     185  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
     186 
     187 #define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)         \
     188  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
     189  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)  \
     190  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)  \
     191  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
     192 
     193 // LMULS = _LE2: <= 2
     194 #define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
     195  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP)  \
     196  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
     197  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
     198  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
     199  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)
     200 
     201 #define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
     202  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
     203  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP)  \
     204  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)    \
     205  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)
     206 
     207 #define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
     208  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
     209  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)   \
     210  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)
     211 
     212 #define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
     213  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
     214  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)
     215 
     216 // LMULS = _EXT: not the largest LMUL
     217 #define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
     218  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
     219  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)
     220 
     221 #define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
     222  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
     223  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)
     224 
     225 #define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
     226  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
     227  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)
     228 
     229 #define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
     230  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
     231  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)
     232 
     233 // LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
     234 #define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
     235  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
     236  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
     237 
     238 #define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
     239  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
     240  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
     241 
     242 #define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
     243  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
     244  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
     245 
     246 #define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
     247  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
     248  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
     249 
     250 // 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
     251 // 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
     252 // though RISC-V LMUL must be at least SEW/64 (notice that this rules out
     253 // LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
     254 // one less than should be supported, with all other parameters (vector type
     255 // etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
     256 // returns half of what it usually would.
     257 //
     258 // Notice that we can only add overloads whenever there is a D argument: those
     259 // are unique with respect to non-virtual-LMUL overloads because their kPow2
     260 // template argument differs. Otherwise, there is no actual vuint64mf2_t, and
     261 // defining another overload with the same LMUL would be an error. Thus we have
     262 // a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
     263 // _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
     264 // functions that take a D.
     265 
     266 #define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
         // (intentionally empty: for SEW=8, mf8 with SHIFT=-3 already exists above,
         // so there is no kPow2 below the supported minimum to emulate)
     267 
     268 #define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     269  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)
     270 
     271 #define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     272  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)
     273 
     274 #define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     275  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)
     276 
     277 // ALL + VIRT
     278 #define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     279  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
     280  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     281 
     282 #define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     283  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
     284  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     285 
     286 #define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     287  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
     288  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     289 
     290 #define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     291  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
     292  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     293 
     294 // LE2 + VIRT
     295 #define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     296  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
     297  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     298 
     299 #define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     300  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
     301  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     302 
     303 #define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     304  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
     305  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     306 
     307 #define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     308  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
     309  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     310 
     311 // GET/SET + VIRT
     312 #define HWY_RVV_FOREACH_08_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP)     \
     313  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
     314  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
     315  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)
     316 
     317 #define HWY_RVV_FOREACH_16_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP)    \
     318  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
     319  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)
     320 
     321 #define HWY_RVV_FOREACH_32_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     322  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)
     323 
     324 #define HWY_RVV_FOREACH_64_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     325 
     326 // For the smallest LMUL for each SEW, similar to the LowerHalf operator, we
     327 // provide the Get and Set operator that returns the same vector type.
     328 #define HWY_RVV_FOREACH_08_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
     329  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP)
     330 
     331 #define HWY_RVV_FOREACH_16_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
     332  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP)
     333 
     334 #define HWY_RVV_FOREACH_32_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
     335  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP)
     336 
     337 #define HWY_RVV_FOREACH_64_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
     338  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP)
     339 
     340 // EXT + VIRT
     341 #define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     342  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
     343  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     344 
     345 #define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     346  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
     347  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     348 
     349 #define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     350  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
     351  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     352 
     353 #define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     354  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
     355  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     356 
     357 // DEMOTE + VIRT
     358 #define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     359  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
     360  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     361 
     362 #define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     363  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
     364  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     365 
     366 #define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     367  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
     368  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     369 
     370 #define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
     371  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
     372  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
     373 
     374 // SEW for unsigned:
     375 #define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
     376  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
     377 #define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
     378  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
     379 #define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
     380  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
     381 #define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
     382  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)
     383 
     384 // SEW for signed:
     385 #define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
     386  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
     387 #define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
     388  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
     389 #define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
     390  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
     391 #define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
     392  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)
     393 
     394 // SEW for float:
     395 
     396 // Used for conversion instructions if HWY_RVV_HAVE_F16C.
     397 #define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS) \
     398  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
     399 
     400 #if HWY_HAVE_FLOAT16
     401 // Full support for f16 in all ops
     402 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
     403  HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
     404 // Only BF16 is emulated.
     405 #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
     406 #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
     407 #define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
     408 #else
     409 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
     410 #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
     411 #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
     412 #define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
     413 #endif
     414 #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
     415  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
     416 #define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
     417  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)
     418 
     419 // Commonly used type/SEW groups:
     420 #define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
     421  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)        \
     422  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
     423 
     424 #define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
     425  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)        \
     426  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
     427 
     428 #define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
     429  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)        \
     430  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
     431 
     432 #define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
     433  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)        \
     434  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
     435 
     436 #define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
     437  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS)         \
     438  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)
     439 
     440 #define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
     441  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)           \
     442  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)           \
     443  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
     444 
     445 #define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
     446  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)           \
     447  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)           \
     448  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
     449 
     450 #define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
     451  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)        \
     452  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
     453 
     454 #define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
     455  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS)         \
     456  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
     457 
     458 // For all combinations of SEW:
     459 #define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
     460  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)     \
     461  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
     462 
     463 #define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
     464  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)     \
     465  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
     466 
     467 #define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
     468  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)     \
     469  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
     470 
     471 // Commonly used type categories:
     472 #define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
     473  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)        \
     474  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
     475 
     476 #define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
     477  HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS)    \
     478  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
     479 
     480 // Assemble types for use in x-macros
         // e.g. HWY_RVV_T(uint, 8) -> uint8_t, HWY_RVV_V(uint, 8, m2) -> vuint8m2_t,
         // HWY_RVV_M(8) -> vbool8_t. These rely on token pasting, so arguments must
         // be plain tokens (not expressions).
     481 #define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
     482 #define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
     483 #define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
     484 #define HWY_RVV_TUP(BASE, SEW, LMUL, TUP) v##BASE##SEW##LMUL##x##TUP##_t
     485 #define HWY_RVV_M(MLEN) vbool##MLEN##_t
     486 
     487 }  // namespace detail
    488 
     489 // Until we have full intrinsic support for fractional LMUL, mixed-precision
     490 // code can use LMUL 1..8 (adequate unless they need many registers).
         // Generates the DFromV_t specialization for every vector type so that
         // DFromV<V> maps each intrinsic vector type to its ScalableTag<Lane, SHIFT>.
         // Expanded once via HWY_RVV_FOREACH below, then undefined.
     490 
     491 #define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
     492                       MLEN, NAME, OP)                                         \
     493  template <>                                                                  \
     494  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> {                                \
     495    using Lane = HWY_RVV_T(BASE, SEW);                                         \
     496    using type = ScalableTag<Lane, SHIFT>;                                     \
     497  };
     498 
     499 HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
     500 #undef HWY_SPECIALIZE
    501 
     502 // ------------------------------ Lanes
     503 
     504 // WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!
     505 
     506 #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
     507 // HWY_RVV_CAPPED_LANES_SPECIAL_CASES provides some additional optimizations
     508 // to CappedLanes in non-debug builds
         // NOTE(review): relies on the __builtin_constant_p extension, hence the
         // HWY_COMPILER_GCC guard (presumably also covering clang — confirm). The
         // macro body expects `cap`, `d`, `kMaxLanes` and `kMinLanesPerFullVec` to
         // be in scope at the expansion site, and may `return` early from the
         // enclosing function when the cap is a compile-time constant for which
         // a single vsetvl provably equals HWY_MIN(cap, Lanes(d)).
     509 #define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)                    \
     510  if (__builtin_constant_p(cap >= kMaxLanes) && (cap >= kMaxLanes)) {          \
     511    /* If cap is known to be greater than or equal to MaxLanes(d), */          \
     512    /* HWY_MIN(cap, Lanes(d)) will be equal to Lanes(d) */                     \
     513    return Lanes(d);                                                           \
     514  }                                                                            \
     515                                                                               \
     516  if ((__builtin_constant_p((cap & (cap - 1)) == 0) &&                         \
     517       ((cap & (cap - 1)) == 0)) ||                                            \
     518      (__builtin_constant_p(cap <= HWY_MAX(kMinLanesPerFullVec, 4)) &&         \
     519       (cap <= HWY_MAX(kMinLanesPerFullVec, 4)))) {                            \
     520    /* If cap is known to be a power of 2, then */                             \
     521    /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */     \
     522    /* result as HWY_MIN(cap, Lanes(d)) as kMaxLanes is a power of 2 and */    \
     523    /* as (cap > VLMAX && cap < 2 * VLMAX) can only be true if cap is not a */ \
     524    /* power of 2 since VLMAX is always a power of 2 */                        \
     525                                                                               \
     526    /* If cap is known to be less than or equal to 4, then */                  \
     527    /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */     \
     528    /* result as HWY_MIN(cap, Lanes(d)) as HWY_MIN(cap, kMaxLanes) <= 4 is */  \
     529    /* true if cap <= 4 and as vsetvl(HWY_MIN(cap, kMaxLanes)) is */           \
     530    /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) */       \
     531    /* if HWY_MIN(cap, kMaxLanes) <= 4 is true */                              \
     532                                                                               \
     533    /* If cap is known to be less than or equal to kMinLanesPerFullVec, */     \
     534    /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */     \
     535    /* same result as HWY_MIN(cap, Lanes(d)) as */                             \
     536    /* HWY_MIN(cap, kMaxLanes) <= kMinLanesPerFullVec is true if */            \
     537    /* cap <= kMinLanesPerFullVec is true */                                   \
     538                                                                               \
     539    /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then either */       \
     540    /* cap <= 4 or cap <= kMinLanesPerFullVec must be true */                  \
     541                                                                               \
     542    /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is known to be true, */       \
     543    /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */     \
     544    /* same result as HWY_MIN(cap, Lanes(d)) */                                \
     545                                                                               \
     546    /* If no cap, avoid the HWY_MIN. */                                        \
     547    return detail::IsFull(d)                                                   \
     548               ? __riscv_vsetvl_e##SEW##LMUL(cap)                              \
     549               : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes));         \
     550  }
     551 #else
         // Debug or non-GCC builds: no special cases; CappedLanes takes the
         // general path.
     552 #define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)
     553 #endif
    554 
// Defines Lanes(d) and CappedLanes(d, cap) for a native (non-virtual) LMUL.
// Lanes(d) returns the runtime lane count, clamped to the compile-time cap N;
// CappedLanes additionally clamps a runtime `cap`, compensating for the
// latitude vsetvl has (see the NOTE below) to return fewer lanes than
// HWY_MIN(cap, Lanes(d)).
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
                      MLEN, NAME, OP)                                          \
  template <size_t N>                                                          \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) {                      \
    constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW));                  \
    constexpr size_t kCap = MaxLanes(d);                                       \
    /* If no cap, avoid generating a constant by using VLMAX. */               \
    return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL()                       \
                      : __riscv_vsetvl_e##SEW##LMUL(kCap);                     \
  }                                                                            \
  template <size_t N>                                                          \
  HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) {  \
    /* NOTE: Section 6.3 of the RVV specification, which can be found at */    \
    /* https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc, */       \
    /* allows vsetvl to return a result less than Lanes(d) but greater than */ \
    /* or equal to ((cap + 1) / 2) if */                                       \
    /* (Lanes(d) > 2 && cap > HWY_MAX(Lanes(d), 4) && cap < (2 * Lanes(d))) */ \
    /* is true */                                                              \
                                                                               \
    /* VLMAX is the number of lanes in a vector of type */                     \
    /* VFromD<decltype(d)>, which is returned by */                            \
    /* Lanes(DFromV<VFromD<decltype(d)>>()) */                                 \
                                                                               \
    /* VLMAX is guaranteed to be a power of 2 under Section 2 of the RVV */    \
    /* specification */                                                        \
                                                                               \
    /* The VLMAX of a vector of type VFromD<decltype(d)> is at least 2 as */   \
    /* the HWY_RVV target requires support for the RVV Zvl128b extension, */   \
    /* which guarantees that vectors with LMUL=1 are at least 16 bytes */      \
                                                                               \
    /* If VLMAX == 2 is true, then vsetvl(cap) is equal to HWY_MIN(cap, 2) */  \
    /* as cap == 3 is the only value such that */                              \
    /* (cap > VLMAX && cap < 2 * VLMAX) if VLMAX == 2 and as */                \
    /* ((3 + 1) / 2) is equal to 2 */                                          \
                                                                               \
    /* If cap <= 4 is true, then vsetvl(cap) must be equal to */               \
    /* HWY_MIN(cap, VLMAX) as cap <= VLMAX is true if VLMAX >= 4 is true */    \
    /* and as vsetvl(cap) is guaranteed to be equal to HWY_MIN(cap, VLMAX) */  \
    /* if VLMAX == 2 */                                                        \
                                                                               \
    /* We want CappedLanes(d, cap) to return Lanes(d) if cap > Lanes(d) as */  \
    /* LoadN(d, p, cap) expects to load exactly HWY_MIN(cap, Lanes(d)) */      \
    /* lanes and StoreN(v, d, p, cap) expects to store exactly */              \
    /* HWY_MIN(cap, Lanes(d)) lanes, even in the case where vsetvl returns */  \
    /* a result that is less than HWY_MIN(cap, Lanes(d)) */                    \
                                                                               \
    /* kMinLanesPerFullVec is the minimum value of VLMAX for a vector of */    \
    /* type VFromD<decltype(d)> */                                             \
    constexpr size_t kMinLanesPerFullVec =                                     \
        detail::ScaleByPower(16 / (SEW / 8), SHIFT);                           \
    /* kMaxLanes is the maximum number of lanes returned by Lanes(d) */        \
    constexpr size_t kMaxLanes = MaxLanes(d);                                  \
                                                                               \
    HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)                        \
                                                                               \
    if (kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4)) {                        \
      /* If kMaxLanes <= kMinLanesPerFullVec is true, then */                  \
      /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return */            \
      /* HWY_MIN(cap, Lanes(d)) as */                                          \
      /* HWY_MIN(cap, kMaxLanes) <= kMaxLanes <= VLMAX is true if */           \
      /* kMaxLanes <= kMinLanesPerFullVec is true */                           \
                                                                               \
      /* If kMaxLanes <= 4 is true, then vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
      /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) as */  \
      /* HWY_MIN(cap, kMaxLanes) <= 4 is true if kMaxLanes <= 4 is true */     \
                                                                               \
      /* If kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then */      \
      /* either kMaxLanes <= 4 or kMaxLanes <= kMinLanesPerFullVec must be */  \
      /* true */                                                               \
                                                                               \
      return __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes));             \
    } else {                                                                   \
      /* If kMaxLanes > HWY_MAX(kMinLanesPerFullVec, 4) is true, need to */    \
      /* obtain the actual number of lanes using Lanes(d) and clamp cap to */  \
      /* the result of Lanes(d) */                                             \
      const size_t actual = Lanes(d);                                          \
      return HWY_MIN(actual, cap);                                             \
    }                                                                          \
  }
    634 
// Same as HWY_RVV_LANES, but for "virtual" LMULs, i.e. fractional LMULs for
// which the intrinsics provide no types/vsetvl; the lane count is instead
// computed from the vector length in bytes (vlenb).
#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                           SHIFT, MLEN, NAME, OP)                             \
  template <size_t N>                                                         \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) {                     \
    constexpr size_t kCap = MaxLanes(d);                                      \
    /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */   \
    /* vsetvl may or may not be correct, so do it ourselves. */               \
    const size_t actual =                                                     \
        detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT);             \
    return HWY_MIN(actual, kCap);                                             \
  }                                                                           \
  template <size_t N>                                                         \
  HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
    /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */   \
    /* vsetvl may or may not be correct, so do it ourselves. */               \
    const size_t actual =                                                     \
        detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT);             \
    /* If no cap, avoid an extra HWY_MIN. */                                  \
    return detail::IsFull(d) ? HWY_MIN(actual, cap)                           \
                             : HWY_MIN(HWY_MIN(actual, cap), MaxLanes(d));    \
  }
    656 
// Instantiate Lanes/CappedLanes for every lane type: native LMULs use the
// vsetvl/vsetvlmax intrinsics, virtual LMULs derive the count from vlenb.
HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
#undef HWY_RVV_LANES
#undef HWY_RVV_LANES_VIRT
#undef HWY_RVV_CAPPED_LANES_SPECIAL_CASES
    662 
    663 template <class D, HWY_RVV_IF_EMULATED_D(D)>
    664 HWY_API size_t Lanes(D /* tag*/) {
    665  return Lanes(RebindToUnsigned<D>());
    666 }
    667 
    668 template <class D, HWY_RVV_IF_EMULATED_D(D)>
    669 HWY_API size_t CappedLanes(D /* tag*/, size_t cap) {
    670  return CappedLanes(RebindToUnsigned<D>(), cap);
    671 }
    672 
    673 // ------------------------------ Common x-macros
    674 
// Last argument to most intrinsics. Use when the op has no d arg of its own,
// which means there is no user-specified cap.
// Expands to the full lane count of uint<SEW> vectors at this SHIFT (LMUL),
// so the intrinsic operates on the whole vector.
#define HWY_RVV_AVL(SEW, SHIFT) \
  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())
    679 
// vector = f(vector), e.g. Not
// Generates a unary op NAME(v) that forwards to the __riscv_v<OP> intrinsic
// with AVL = full vector length.
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
                          SHIFT, MLEN, NAME, OP)                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {   \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
  }
    686 
// vector = f(vector, scalar), e.g. detail::AddS
// Generates NAME(a, b) with a scalar second operand (the _vx/_vf intrinsic
// forms), allowing the compiler to use an immediate where possible.
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
                           SHIFT, MLEN, NAME, OP)                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {           \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }
    694 
// vector = f(vector, vector), e.g. Add
// Generates a binary vector-vector op NAME(a, b) using the _vv intrinsic form.
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {    \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(a, b,                       \
                                                HWY_RVV_AVL(SEW, SHIFT));   \
  }
    703 
// vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
// Generates NAME(no, m, a, b): the masked (_mu, mask-undisturbed) intrinsic
// form, where lanes with a false mask bit are taken from `no`.
#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                            SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m,                   \
           HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b,              \
                                                     HWY_RVV_AVL(SEW, SHIFT)); \
  }
    713 
// mask = f(mask)
// Generates a unary mask op NAME(m) via the __riscv_vm<OP> mask intrinsics.
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)              \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {                \
    return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
  }
    719 
    720 // ================================================== INIT
    721 
    722 // ------------------------------ Set
    723 
// Set(d, arg): broadcasts the scalar `arg` to all Lanes(d) lanes, via the
// integer vmv.v.x / float vfmv.v.f intrinsics.
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
  template <size_t N>                                                       \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) {    \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d));                \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
#undef HWY_RVV_SET
    735 
    736 // Treat bfloat16_t as int16_t (using the previously defined Set overloads);
    737 // required for Zero and VFromD.
    738 template <class D, HWY_IF_BF16_D(D)>
    739 decltype(Set(RebindToSigned<D>(), 0)) Set(D d, hwy::bfloat16_t arg) {
    740  return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
    741 }
    742 #if !HWY_HAVE_FLOAT16  // Otherwise already defined above.
    743 // WARNING: returns a different type than emulated bfloat16_t so that we can
    744 // implement PromoteTo overloads for both bfloat16_t and float16_t, and also
    745 // provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
    746 template <class D, HWY_IF_F16_D(D)>
    747 decltype(Set(RebindToUnsigned<D>(), 0)) Set(D d, hwy::float16_t arg) {
    748  return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
    749 }
    750 #endif
    751 
// Vector type corresponding to tag D: the return type of Set (which for
// emulated bfloat16_t/float16_t is the i16/u16 representation, see above).
template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));
    754 
    755 // ------------------------------ Zero
    756 
    757 template <class D>
    758 HWY_API VFromD<D> Zero(D d) {
    759  // Cast to support bfloat16_t.
    760  const RebindToUnsigned<decltype(d)> du;
    761  return BitCast(d, Set(du, 0));
    762 }
    763 
    764 // ------------------------------ Undefined
    765 
    766 // RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
    767 // by it gives unpredictable results. It should only be used for maskoff, so
    768 // keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
// Wraps the vundefined intrinsic; see the warning above about its poisoned
// result - only used internally as a maskoff operand.
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                      \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) {                     \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(); /* no AVL */               \
  }

HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
#undef HWY_RVV_UNDEFINED
}  // namespace detail
    781 
    782 template <class D>
    783 HWY_API VFromD<D> Undefined(D d) {
    784  return Zero(d);
    785 }
    786 
    787 // ------------------------------ BitCast
    788 
    789 namespace detail {
    790 
// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
// detail::Trunc(v): keeps the lower half of the register group; a pure
// reinterpret, no data movement.
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {    \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(          \
        v); /* no AVL */                                                      \
  }
HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
#undef HWY_RVV_TRUNC
    800 
// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
// detail::Ext(d2, v): widens the register group; upper half is unspecified.
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
  template <size_t N>                                                       \
  HWY_API HWY_RVV_V(BASE, SEW, LMULD)                                       \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */,                     \
           HWY_RVV_V(BASE, SEW, LMUL) v) {                                  \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(        \
        v); /* no AVL */                                                    \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
#undef HWY_RVV_EXT
    813 
// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
// the same as the actual input type - so extending is the identity here.
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                     \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */,                   \
           HWY_RVV_V(BASE, SEW, LMUL) v) {                                \
    return v;                                                             \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
#undef HWY_RVV_EXT_VIRT
    826 
    827 template <class D, HWY_RVV_IF_EMULATED_D(D)>
    828 VFromD<D> Ext(D d, VFromD<Half<D>> v) {
    829  const RebindToUnsigned<decltype(d)> du;
    830  const Half<decltype(du)> duh;
    831  return BitCast(d, Ext(du, BitCast(duh, v)));
    832 }
    833 
// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
// by _ALL_VIRT.

// There is no reinterpret from u8 <-> u8, so just return.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                        \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
                                         vuint8##LMUL##_t v) {           \
    return v;                                                            \
  }                                                                      \
  template <size_t N>                                                    \
  HWY_API vuint8##LMUL##_t BitCastFromByte(                              \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
    return v;                                                            \
  }
    850 
// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                        \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
                                         vint8##LMUL##_t v) {            \
    return __riscv_vreinterpret_v_i8##LMUL##_u8##LMUL(v);                \
  }                                                                      \
  template <size_t N>                                                    \
  HWY_API vint8##LMUL##_t BitCastFromByte(                               \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
    return __riscv_vreinterpret_v_u8##LMUL##_i8##LMUL(v);                \
  }
    864 
// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
// Unsigned u16/u32/u64: a single reinterpret to/from u8 suffices.
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <typename T, size_t N>                                              \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,            \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) {       \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v);                  \
  }                                                                            \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                          \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {            \
    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v);                  \
  }
    879 
// Signed/Float: first cast to/from unsigned (of the same SEW), because the
// intrinsics only reinterpret u8 <-> u<SEW>, hence two reinterprets per
// direction.
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                        \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL(                    \
        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v));          \
  }                                                                      \
  template <size_t N>                                                    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                    \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL(           \
        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v));                   \
  }
    895 
// Additional versions for virtual LMUL using LMULH for byte vectors.
// The byte view of a virtual-LMUL vector is half the register group, hence
// Trunc on the way out and Ext on the way back in.
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                            \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,         \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) {    \
    return detail::Trunc(__riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
  }                                                                          \
  template <size_t N>                                                        \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                        \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) {         \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2;                                     \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v);                          \
    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2);               \
  }
    911 
// Signed/Float: first cast to/from unsigned, combined with the virtual-LMUL
// Trunc/Ext handling of the byte vector (see HWY_RVV_CAST_VIRT_U).
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
  template <typename T, size_t N>                                             \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,          \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) {     \
    return detail::Trunc(__riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL(           \
        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)));              \
  }                                                                           \
  template <size_t N>                                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                         \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) {          \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2;                                      \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v);                           \
    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL(                \
        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2));                       \
  }
    929 
// Instantiate BitCastToByte/BitCastFromByte for all lane types.
HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
#if HWY_HAVE_FLOAT16     // HWY_RVV_FOREACH_F already covered float16_
#elif HWY_RVV_HAVE_F16C  // zvfhmin provides reinterpret* intrinsics:
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
#else
// No f16 vector type at all: float16_t is emulated as u16 (see Set above), so
// reuse the u16 BitCastFromByte.
template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<RebindToUnsigned<D>> BitCastFromByte(
    D /* d */, VFromD<Repartition<uint8_t, D>> v) {
  return BitCastFromByte(RebindToUnsigned<D>(), v);
}
#endif

#undef HWY_RVV_CAST_U8
#undef HWY_RVV_CAST_I8
#undef HWY_RVV_CAST_U
#undef HWY_RVV_CAST_IF
#undef HWY_RVV_CAST_VIRT_U
#undef HWY_RVV_CAST_VIRT_IF
    956 
    957 template <class D, HWY_IF_BF16_D(D)>
    958 HWY_INLINE VFromD<RebindToSigned<D>> BitCastFromByte(
    959    D d, VFromD<Repartition<uint8_t, D>> v) {
    960  return BitCastFromByte(RebindToSigned<decltype(d)>(), v);
    961 }
    962 
    963 }  // namespace detail
    964 
    965 template <class D, class FromV>
    966 HWY_API VFromD<D> BitCast(D d, FromV v) {
    967  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
    968 }
    969 
    970 // ------------------------------ Iota
    971 
    972 namespace detail {
    973 
// detail::Iota0(d) = {0, 1, 2, ...} via the vid (vector index) intrinsic.
#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
                     MLEN, NAME, OP)                                          \
  template <size_t N>                                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(Lanes(d));                       \
  }

// For i8 lanes, this may well wrap around. Unsigned only is less error-prone.
HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
#undef HWY_RVV_IOTA
    984 
// Used by Expand.
// detail::MaskedIota(d, mask): the viota intrinsic - each lane receives the
// count of set mask bits in lower-indexed lanes.
#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                        \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) mask) {         \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(mask, Lanes(d));                \
  }

HWY_RVV_FOREACH_U(HWY_RVV_MASKED_IOTA, MaskedIota, iota_m, _ALL_VIRT)
#undef HWY_RVV_MASKED_IOTA
    996 
    997 }  // namespace detail
    998 
    999 // ================================================== LOGICAL
   1000 
   1001 // ------------------------------ Not
   1002 
// Bitwise NOT for all unsigned/signed integer lane types (vnot).
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)
   1004 
   1005 template <class V, HWY_IF_FLOAT_V(V)>
   1006 HWY_API V Not(const V v) {
   1007  using DF = DFromV<V>;
   1008  using DU = RebindToUnsigned<DF>;
   1009  return BitCast(DF(), Not(BitCast(DU(), v)));
   1010 }
   1011 
// ------------------------------ And

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
}  // namespace detail

// Bitwise AND for all unsigned/signed integer lane types.
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)
   1020 
   1021 template <class V, HWY_IF_FLOAT_V(V)>
   1022 HWY_API V And(const V a, const V b) {
   1023  using DF = DFromV<V>;
   1024  using DU = RebindToUnsigned<DF>;
   1025  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
   1026 }
   1027 
// ------------------------------ Or

// Bitwise OR for all unsigned/signed integer lane types.
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)
   1031 
   1032 template <class V, HWY_IF_FLOAT_V(V)>
   1033 HWY_API V Or(const V a, const V b) {
   1034  using DF = DFromV<V>;
   1035  using DU = RebindToUnsigned<DF>;
   1036  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
   1037 }
   1038 
// ------------------------------ Xor

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
}  // namespace detail

// Bitwise XOR for all unsigned/signed integer lane types.
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)
   1047 
   1048 template <class V, HWY_IF_FLOAT_V(V)>
   1049 HWY_API V Xor(const V a, const V b) {
   1050  using DF = DFromV<V>;
   1051  using DU = RebindToUnsigned<DF>;
   1052  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
   1053 }
   1054 
   1055 // ------------------------------ AndNot
   1056 template <class V>
   1057 HWY_API V AndNot(const V not_a, const V b) {
   1058  return And(Not(not_a), b);
   1059 }
   1060 
   1061 // ------------------------------ Xor3
   1062 template <class V>
   1063 HWY_API V Xor3(V x1, V x2, V x3) {
   1064  return Xor(x1, Xor(x2, x3));
   1065 }
   1066 
   1067 // ------------------------------ Or3
   1068 template <class V>
   1069 HWY_API V Or3(V o1, V o2, V o3) {
   1070  return Or(o1, Or(o2, o3));
   1071 }
   1072 
   1073 // ------------------------------ OrAnd
   1074 template <class V>
   1075 HWY_API V OrAnd(const V o, const V a1, const V a2) {
   1076  return Or(o, And(a1, a2));
   1077 }
   1078 
// ------------------------------ CopySign

// CopySign(magn, sign): fsgnj injects the sign bit of the second operand into
// the magnitude of the first.
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)
   1082 
   1083 template <class V>
   1084 HWY_API V CopySignToAbs(const V abs, const V sign) {
   1085  // RVV can also handle abs < 0, so no extra action needed.
   1086  return CopySign(abs, sign);
   1087 }
   1088 
   1089 // ================================================== ARITHMETIC
   1090 
// Per-target flags to prevent generic_ops-inl.h defining Add etc.
// NOTE(review): the #ifdef/#undef/#else/#define pattern appears to follow
// Highway's usual per-target toggling for feature flags - confirm against
// generic_ops-inl.h before changing.
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif
   1097 
   1098 // ------------------------------ Add
   1099 
namespace detail {
// Vector + scalar helpers; ReverseSubS(v, s) computes s - v (used by Neg).
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
}  // namespace detail

// Lane-wise addition for integer (vadd) and float (vfadd) vectors.
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)
   1109 
// ------------------------------ Sub
namespace detail {
// Vector - scalar helper.
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL)
}  // namespace detail

// Lane-wise subtraction for integer (vsub) and float (vfsub) vectors.
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
   1117 
   1118 // ------------------------------ Neg (ReverseSubS, Xor)
   1119 
// Negates each lane of a signed-integer vector, computed as 0 - v.
template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V Neg(const V v) {
  return detail::ReverseSubS(v, 0);
}
   1124 
// vector = f(vector), but the single argument is passed twice: used for
// intrinsics such as fsgnjn whose .vv form takes two operands.
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {   \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v,                       \
                                                HWY_RVV_AVL(SEW, SHIFT));   \
  }

// Float Neg = fsgnjn(v, v): write v with its own sign bit inverted.
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
   1134 
   1135 #if !HWY_HAVE_FLOAT16
   1136 
   1137 template <class V, HWY_IF_U16_D(DFromV<V>)>  // hwy::float16_t
   1138 HWY_API V Neg(V v) {
   1139  const DFromV<decltype(v)> d;
   1140  const RebindToUnsigned<decltype(d)> du;
   1141  using TU = TFromD<decltype(du)>;
   1142  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
   1143 }
   1144 
   1145 #endif  // !HWY_HAVE_FLOAT16
   1146 
   1147 // ------------------------------ SaturatedAdd
   1148 
   1149 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
   1150 #undef HWY_NATIVE_I32_SATURATED_ADDSUB
   1151 #else
   1152 #define HWY_NATIVE_I32_SATURATED_ADDSUB
   1153 #endif
   1154 
   1155 #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
   1156 #undef HWY_NATIVE_U32_SATURATED_ADDSUB
   1157 #else
   1158 #define HWY_NATIVE_U32_SATURATED_ADDSUB
   1159 #endif
   1160 
   1161 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
   1162 #undef HWY_NATIVE_I64_SATURATED_ADDSUB
   1163 #else
   1164 #define HWY_NATIVE_I64_SATURATED_ADDSUB
   1165 #endif
   1166 
   1167 #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
   1168 #undef HWY_NATIVE_U64_SATURATED_ADDSUB
   1169 #else
   1170 #define HWY_NATIVE_U64_SATURATED_ADDSUB
   1171 #endif
   1172 
   1173 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
   1174 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
   1175 
   1176 // ------------------------------ SaturatedSub
   1177 
   1178 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
   1179 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
   1180 
   1181 // ------------------------------ AverageRound
   1182 
   1183 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
   1184 #undef HWY_NATIVE_AVERAGE_ROUND_UI32
   1185 #else
   1186 #define HWY_NATIVE_AVERAGE_ROUND_UI32
   1187 #endif
   1188 
   1189 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
   1190 #undef HWY_NATIVE_AVERAGE_ROUND_UI64
   1191 #else
   1192 #define HWY_NATIVE_AVERAGE_ROUND_UI64
   1193 #endif
   1194 
   1195 // Define this to opt-out of the default behavior, which is AVOID on certain
   1196 // compiler versions. You can define only this to use VXRM, or define both this
   1197 // and HWY_RVV_AVOID_VXRM to always avoid VXRM.
   1198 #ifndef HWY_RVV_CHOOSE_VXRM
   1199 
   1200 // Assume that GCC-13 defaults to 'avoid VXRM'. Tested with GCC 13.1.0.
   1201 #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
   1202 #define HWY_RVV_AVOID_VXRM
   1203 // Clang 16 with __riscv_v_intrinsic == 11000 may either require VXRM or avoid.
   1204 // Assume that Clang 16 and earlier avoid VXRM.
   1205 #elif HWY_COMPILER_CLANG && \
   1206    (HWY_COMPILER_CLANG < 1700 || __riscv_v_intrinsic < 11000)
   1207 #define HWY_RVV_AVOID_VXRM
   1208 #endif
   1209 
   1210 #endif  // HWY_RVV_CHOOSE_VXRM
   1211 
   1212 // Adding __RISCV_VXRM_* was a backwards-incompatible change and it is not clear
   1213 // how to detect whether it is supported or required. #ifdef __RISCV_VXRM_RDN
   1214 // does not work because it seems to be a compiler built-in, but neither does
   1215 // __has_builtin(__RISCV_VXRM_RDN). The intrinsics version was also not updated,
   1216 // so we require a macro to opt out of the new intrinsics.
   1217 #ifdef HWY_RVV_AVOID_VXRM
   1218 #define HWY_RVV_INSERT_VXRM(vxrm, avl) avl
   1219 #define __RISCV_VXRM_RNU
   1220 #define __RISCV_VXRM_RDN
   1221 #else  // default: use new vxrm arguments
   1222 #define HWY_RVV_INSERT_VXRM(vxrm, avl) vxrm, avl
   1223 #endif
   1224 
   1225 // Extra rounding mode = up argument.
   1226 #define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
   1227                             SHIFT, MLEN, NAME, OP)                            \
   1228  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
   1229      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
   1230    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(                               \
   1231        a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
   1232  }
   1233 
   1234 HWY_RVV_FOREACH_I(HWY_RVV_RETV_AVERAGE, AverageRound, aadd, _ALL)
   1235 HWY_RVV_FOREACH_U(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)
   1236 
   1237 #undef HWY_RVV_RETV_AVERAGE
   1238 
   1239 // ------------------------------ ShiftLeft[Same]
   1240 
   1241 // Intrinsics do not define .vi forms, so use .vx instead.
   1242 #define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
   1243                      MLEN, NAME, OP)                                          \
   1244  template <int kBits>                                                         \
   1245  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {      \
   1246    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, kBits,                      \
   1247                                                HWY_RVV_AVL(SEW, SHIFT));      \
   1248  }                                                                            \
   1249  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
   1250      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) {                     \
   1251    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
   1252                                                HWY_RVV_AVL(SEW, SHIFT));      \
   1253  }
   1254 
   1255 HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)
   1256 
   1257 // ------------------------------ ShiftRight[Same]
   1258 
   1259 HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
   1260 HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
   1261 
   1262 #undef HWY_RVV_SHIFT
   1263 
   1264 // ------------------------------ RoundingShiftRight[Same]
   1265 
   1266 #ifdef HWY_NATIVE_ROUNDING_SHR
   1267 #undef HWY_NATIVE_ROUNDING_SHR
   1268 #else
   1269 #define HWY_NATIVE_ROUNDING_SHR
   1270 #endif
   1271 
   1272 // Intrinsics do not define .vi forms, so use .vx instead.
   1273 #define HWY_RVV_ROUNDING_SHR(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
   1274                             SHIFT, MLEN, NAME, OP)                           \
   1275  template <int kBits>                                                        \
   1276  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {     \
   1277    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(                              \
   1278        v, kBits,                                                             \
   1279        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT)));      \
   1280  }                                                                           \
   1281  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
   1282      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) {                    \
   1283    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(                              \
   1284        v, static_cast<uint8_t>(bits),                                        \
   1285        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT)));      \
   1286  }
   1287 
   1288 HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssrl, _ALL)
   1289 HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssra, _ALL)
   1290 
   1291 #undef HWY_RVV_ROUNDING_SHR
   1292 
   1293 // ------------------------------ SumsOf8 (ShiftRight, Add)
// Sums each group of 8 consecutive u8 lanes into one u64 lane, via a
// 3-step pairwise (log2(8)) reduction within each u64.
template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
  const DFromV<VU8> du8;
  const RepartitionToWide<decltype(du8)> du16;
  const RepartitionToWide<decltype(du16)> du32;
  const RepartitionToWide<decltype(du32)> du64;
  using VU16 = VFromD<decltype(du16)>;

  // Step 1: within each u16, add the high byte to the low byte. Hex letters
  // in variable names denote original byte positions (F..0); s = partial sum.
  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  // Step 2: within each u32, add the upper u16 to the lower u16. zz = zero,
  // xx = don't-care garbage produced above the sum.
  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
  // Step 3: within each u64, add the upper u32 to the lower u32. The total
  // (at most 8 * 255 = 2040) fits in the low 16 bits of each u64.
  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
  // Clear the don't-care bytes above the low 16 bits of each u64.
  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
}
   1316 
// Signed variant: sums each group of 8 consecutive i8 lanes into one i64
// lane. Partial sums are accumulated toward the *upper* half at each step so
// a final arithmetic shift can sign-extend the result.
template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
  const DFromV<VI8> di8;
  const RepartitionToWide<decltype(di8)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const RepartitionToWide<decltype(di32)> di64;
  const RebindToUnsigned<decltype(di32)> du32;
  const RebindToUnsigned<decltype(di64)> du64;
  using VI16 = VFromD<decltype(di16)>;

  // Sign-extend odd bytes (arithmetic >> 8) and even bytes (<< 8, then
  // arithmetic >> 8), then add. Hex letters name original byte positions.
  const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
  const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
  const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  // Shift the lower i16 up (logical, on the unsigned type) and add, so the
  // running sum lives in the upper i16 of each u32. zz = zero, xx = garbage.
  const VI16 sDC_zz_98_zz_54_zz_10_zz =
      BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VI16 sFC_xx_B8_xx_74_xx_30_xx =
      Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
  // Same again at u64 granularity: sum ends up in the top 16 bits.
  const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
      BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
  const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
      Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
  // Arithmetic shift right by 48 sign-extends the 16-bit sum to the i64.
  return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
}
   1341 
   1342 // ------------------------------ RotateRight
// Rotates each lane right by the compile-time constant kBits,
// with kBits in [0, lane size in bits).
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RotateRight(const V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // ShiftRight on the unsigned type yields a logical (zero-fill) shift.
  // HWY_MIN clamps the ShiftLeft count: for kBits == 0 we returned above,
  // but that branch is not `if constexpr`, so the ShiftLeft instantiation
  // must still be valid (kSizeInBits would be out of range).
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
   1355 
   1356 // ------------------------------ Shl
   1357 #define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
   1358                         SHIFT, MLEN, NAME, OP)                             \
   1359  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
   1360      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
   1361    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, bits,                    \
   1362                                                HWY_RVV_AVL(SEW, SHIFT));   \
   1363  }
   1364 
   1365 HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)
   1366 
   1367 #define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
   1368                         SHIFT, MLEN, NAME, OP)                             \
   1369  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
   1370      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
   1371    const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du;  \
   1372    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, BitCast(du, bits),       \
   1373                                                HWY_RVV_AVL(SEW, SHIFT));   \
   1374  }
   1375 
   1376 HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)
   1377 
   1378 // ------------------------------ Shr
   1379 
   1380 HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
   1381 HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)
   1382 
   1383 #undef HWY_RVV_SHIFT_II
   1384 #undef HWY_RVV_SHIFT_VV
   1385 
   1386 // ------------------------------ RoundingShr
   1387 #define HWY_RVV_ROUNDING_SHR_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD,   \
   1388                                LMULH, SHIFT, MLEN, NAME, OP)               \
   1389  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
   1390      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
   1391    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(                            \
   1392        v, bits,                                                            \
   1393        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT)));    \
   1394  }
   1395 
   1396 HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR_VV, RoundingShr, ssrl, _ALL)
   1397 
   1398 #define HWY_RVV_ROUNDING_SHR_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD,   \
   1399                                LMULH, SHIFT, MLEN, NAME, OP)               \
   1400  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
   1401      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
   1402    const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du;  \
   1403    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(                            \
   1404        v, BitCast(du, bits),                                               \
   1405        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT)));    \
   1406  }
   1407 
   1408 HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR_II, RoundingShr, ssra, _ALL)
   1409 
   1410 #undef HWY_RVV_ROUNDING_SHR_VV
   1411 #undef HWY_RVV_ROUNDING_SHR_II
   1412 
   1413 // ------------------------------ Min
   1414 
   1415 namespace detail {
   1416 
   1417 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MinS, minu_vx, _ALL)
   1418 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MinS, min_vx, _ALL)
   1419 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MinS, fmin_vf, _ALL)
   1420 
   1421 }  // namespace detail
   1422 
   1423 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
   1424 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
   1425 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)
   1426 
   1427 // ------------------------------ Max
   1428 
   1429 namespace detail {
   1430 
   1431 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
   1432 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
   1433 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)
   1434 
   1435 }  // namespace detail
   1436 
   1437 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
   1438 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
   1439 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
   1440 
   1441 // ------------------------------ Mul
   1442 
   1443 // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
   1444 #ifdef HWY_NATIVE_MUL_8
   1445 #undef HWY_NATIVE_MUL_8
   1446 #else
   1447 #define HWY_NATIVE_MUL_8
   1448 #endif
   1449 #ifdef HWY_NATIVE_MUL_64
   1450 #undef HWY_NATIVE_MUL_64
   1451 #else
   1452 #define HWY_NATIVE_MUL_64
   1453 #endif
   1454 
   1455 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
   1456 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
   1457 
   1458 // ------------------------------ MulHigh
   1459 
   1460 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
   1461 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
   1462 
   1463 // ------------------------------ MulFixedPoint15
   1464 
   1465 // Extra rounding mode = up argument.
   1466 #define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
   1467                      MLEN, NAME, OP)                                          \
   1468  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
   1469      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {       \
   1470    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(                               \
   1471        a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
   1472  }
   1473 
   1474 HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL)
   1475 
   1476 #undef HWY_RVV_MUL15
   1477 
   1478 // ------------------------------ Div
   1479 #ifdef HWY_NATIVE_INT_DIV
   1480 #undef HWY_NATIVE_INT_DIV
   1481 #else
   1482 #define HWY_NATIVE_INT_DIV
   1483 #endif
   1484 
   1485 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Div, divu, _ALL)
   1486 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Div, div, _ALL)
   1487 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
   1488 
   1489 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Mod, remu, _ALL)
   1490 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Mod, rem, _ALL)
   1491 
   1492 // ------------------------------ MaskedAddOr etc.
   1493 
   1494 #ifdef HWY_NATIVE_MASKED_ARITH
   1495 #undef HWY_NATIVE_MASKED_ARITH
   1496 #else
   1497 #define HWY_NATIVE_MASKED_ARITH
   1498 #endif
   1499 
   1500 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMinOr, minu, _ALL)
   1501 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMinOr, min, _ALL)
   1502 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMinOr, fmin, _ALL)
   1503 
   1504 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, maxu, _ALL)
   1505 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, max, _ALL)
   1506 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, fmax, _ALL)
   1507 
   1508 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedAddOr, add, _ALL)
   1509 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedAddOr, fadd, _ALL)
   1510 
   1511 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedSubOr, sub, _ALL)
   1512 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedSubOr, fsub, _ALL)
   1513 
   1514 HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedMulOr, mul, _ALL)
   1515 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMulOr, fmul, _ALL)
   1516 
   1517 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedDivOr, divu, _ALL)
   1518 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedDivOr, div, _ALL)
   1519 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedDivOr, fdiv, _ALL)
   1520 
   1521 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedModOr, remu, _ALL)
   1522 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedModOr, rem, _ALL)
   1523 
   1524 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, saddu, _ALL)
   1525 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, sadd, _ALL)
   1526 
   1527 HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssubu, _ALL)
   1528 HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssub, _ALL)
   1529 
   1530 // ------------------------------ ApproximateReciprocal
   1531 #ifdef HWY_NATIVE_F64_APPROX_RECIP
   1532 #undef HWY_NATIVE_F64_APPROX_RECIP
   1533 #else
   1534 #define HWY_NATIVE_F64_APPROX_RECIP
   1535 #endif
   1536 
   1537 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)
   1538 
   1539 // ------------------------------ Sqrt
   1540 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)
   1541 
   1542 // ------------------------------ ApproximateReciprocalSqrt
   1543 #ifdef HWY_NATIVE_F64_APPROX_RSQRT
   1544 #undef HWY_NATIVE_F64_APPROX_RSQRT
   1545 #else
   1546 #define HWY_NATIVE_F64_APPROX_RSQRT
   1547 #endif
   1548 
   1549 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)
   1550 
   1551 // ------------------------------ MulAdd
   1552 
   1553 // Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
   1554 #ifdef HWY_NATIVE_INT_FMA
   1555 #undef HWY_NATIVE_INT_FMA
   1556 #else
   1557 #define HWY_NATIVE_INT_FMA
   1558 #endif
   1559 
   1560 // Note: op is still named vv, not vvv.
   1561 #define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
   1562                    MLEN, NAME, OP)                                         \
   1563  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
   1564      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x,    \
   1565           HWY_RVV_V(BASE, SEW, LMUL) add) {                                \
   1566    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x,                \
   1567                                                HWY_RVV_AVL(SEW, SHIFT));   \
   1568  }
   1569 
   1570 HWY_RVV_FOREACH_UI(HWY_RVV_FMA, MulAdd, macc, _ALL)
   1571 HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)
   1572 
   1573 // ------------------------------ NegMulAdd
   1574 HWY_RVV_FOREACH_UI(HWY_RVV_FMA, NegMulAdd, nmsac, _ALL)
   1575 HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)
   1576 
   1577 // ------------------------------ MulSub
   1578 HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)
   1579 
   1580 // ------------------------------ NegMulSub
   1581 HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
   1582 
   1583 #undef HWY_RVV_FMA
   1584 
   1585 // ================================================== COMPARE
   1586 
   1587 // ------------------------------ MClear
   1588 
   1589 // mask = f()
   1590 #define HWY_RVV_RETM(SEW, SHIFT, MLEN, NAME, OP)                \
   1591  HWY_API HWY_RVV_M(MLEN) NAME##MLEN() {                        \
   1592    return __riscv_vm##OP##_m_b##MLEN(HWY_RVV_AVL(SEW, SHIFT)); \
   1593  }
   1594 
   1595 namespace detail {
   1596 HWY_RVV_FOREACH_B(HWY_RVV_RETM, MClear, clr)  // with ##MLEN suffix
   1597 }  // namespace detail
   1598 
   1599 #undef HWY_RVV_RETM
   1600 
   1601 // Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
   1602 // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
   1603 // of all bits; SEW=8 / LMUL=4 = half of all bits.
   1604 
   1605 // mask = f(vector, vector)
   1606 #define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
   1607                           SHIFT, MLEN, NAME, OP)                           \
   1608  HWY_API HWY_RVV_M(MLEN)                                                   \
   1609      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {    \
   1610    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(                  \
   1611        a, b, HWY_RVV_AVL(SEW, SHIFT));                                     \
   1612  }
   1613 
   1614 // mask = f(mask, vector, vector)
   1615 #define HWY_RVV_RETM_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
   1616                            SHIFT, MLEN, NAME, OP)                           \
   1617  HWY_API HWY_RVV_M(MLEN)                                                    \
   1618      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) a,                  \
   1619           HWY_RVV_V(BASE, SEW, LMUL) b) {                                   \
   1620    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN##_mu(              \
   1621        m, detail::MClear##MLEN(), a, b, HWY_RVV_AVL(SEW, SHIFT));           \
   1622  }
   1623 
   1624 // mask = f(vector, scalar)
   1625 #define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
   1626                           SHIFT, MLEN, NAME, OP)                           \
   1627  HWY_API HWY_RVV_M(MLEN)                                                   \
   1628      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {          \
   1629    return __riscv_v##OP##_##CHAR##SEW##LMUL##_b##MLEN(                     \
   1630        a, b, HWY_RVV_AVL(SEW, SHIFT));                                     \
   1631  }
   1632 
   1633 #ifdef HWY_NATIVE_MASKED_COMP
   1634 #undef HWY_NATIVE_MASKED_COMP
   1635 #else
   1636 #define HWY_NATIVE_MASKED_COMP
   1637 #endif
   1638 
   1639 // ------------------------------ Eq
   1640 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
   1641 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
   1642 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedEq, mseq, _ALL)
   1643 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedEq, mfeq, _ALL)
   1644 
   1645 namespace detail {
   1646 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
   1647 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
   1648 }  // namespace detail
   1649 
   1650 // ------------------------------ Ne
   1651 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
   1652 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
   1653 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedNe, msne, _ALL)
   1654 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedNe, mfne, _ALL)
   1655 
   1656 namespace detail {
   1657 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
   1658 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
   1659 }  // namespace detail
   1660 
   1661 // ------------------------------ Lt
   1662 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
   1663 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
   1664 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
   1665 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLt, msltu, _ALL)
   1666 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLt, mslt, _ALL)
   1667 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLt, mflt, _ALL)
   1668 
   1669 namespace detail {
   1670 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
   1671 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
   1672 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
   1673 }  // namespace detail
   1674 
   1675 // ------------------------------ Le
   1676 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Le, msleu, _ALL)
   1677 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Le, msle, _ALL)
   1678 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
   1679 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLe, msleu, _ALL)
   1680 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLe, msle, _ALL)
   1681 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLe, mfle, _ALL)
   1682 
// Mask type for descriptor D, deduced from the result of a comparison.
template <class D>
using MFromD = decltype(Eq(Zero(D()), Zero(D())));

// For lanes where m is true, returns whether v is NaN (using NaN != NaN).
template <class V, class M, class D = DFromV<V>>
HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
  return MaskedNe(m, v, v);
}
   1690 
   1691 #undef HWY_RVV_RETM_ARGMVV
   1692 #undef HWY_RVV_RETM_ARGVV
   1693 #undef HWY_RVV_RETM_ARGVS
   1694 
   1695 // ------------------------------ Gt/Ge (Lt, Le)
   1696 
// Swap args to reverse comparisons:
template <class V>
HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
  return Lt(b, a);  // a > b  <=>  b < a
}

template <class V>
HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);  // a >= b  <=>  b <= a
}
   1707 
// a > b for lanes where m is true; inactive lanes are false (the underlying
// masked comparisons merge with a cleared mask).
template <class V, class M, class D = DFromV<V>>
HWY_API MFromD<D> MaskedGt(M m, V a, V b) {
  return MaskedLt(m, b, a);
}

// a >= b for lanes where m is true; inactive lanes are false.
template <class V, class M, class D = DFromV<V>>
HWY_API MFromD<D> MaskedGe(M m, V a, V b) {
  return MaskedLe(m, b, a);
}
   1717 
   1718 // ------------------------------ TestBit
// Returns whether any of the bits set in `bit` are also set in `a`,
// i.e. (a & bit) != 0 per lane.
template <class V>
HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
  return detail::NeS(And(a, bit), 0);
}
   1723 
   1724 // ------------------------------ Not
   1725 // NOLINTNEXTLINE
   1726 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
   1727 
   1728 // ------------------------------ And
   1729 
   1730 // mask = f(mask_a, mask_b) (note arg2,arg1 order!)
   1731 #define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)                 \
   1732  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
   1733    return __riscv_vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
   1734  }
   1735 
   1736 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)
   1737 
   1738 // ------------------------------ AndNot
   1739 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn)
   1740 
   1741 // ------------------------------ Or
   1742 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
   1743 
   1744 // ------------------------------ Xor
   1745 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
   1746 
   1747 // ------------------------------ ExclusiveNeither
   1748 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)
   1749 
   1750 #undef HWY_RVV_RETM_ARGMM
   1751 
   1752 // ------------------------------ IfThenElse
   1753 
   1754 #define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
   1755                             SHIFT, MLEN, NAME, OP)                           \
   1756  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
   1757      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes,                 \
   1758           HWY_RVV_V(BASE, SEW, LMUL) no) {                                   \
   1759    return __riscv_v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m,                  \
   1760                                                 HWY_RVV_AVL(SEW, SHIFT));    \
   1761  }
   1762 
   1763 HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)
   1764 
   1765 #undef HWY_RVV_IF_THEN_ELSE
   1766 
   1767 // ------------------------------ IfThenElseZero
   1768 template <class M, class V>
   1769 HWY_API V IfThenElseZero(const M mask, const V yes) {
   1770  return IfThenElse(mask, yes, Zero(DFromV<V>()));
   1771 }
   1772 
   1773 // ------------------------------ IfThenZeroElse
   1774 
   1775 #define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
   1776                                  LMULH, SHIFT, MLEN, NAME, OP)             \
   1777  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
   1778      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) {              \
   1779    return __riscv_v##OP##_##CHAR##SEW##LMUL(no, 0, m,                      \
   1780                                             HWY_RVV_AVL(SEW, SHIFT));      \
   1781  }
   1782 
   1783 HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
   1784 HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
   1785 
   1786 #undef HWY_RVV_IF_THEN_ZERO_ELSE
   1787 
   1788 // ------------------------------ MaskFromVec
// Converts a vector to a mask: a lane is true iff its value is nonzero.
// (Callers typically pass vectors of all-ones / all-zeros lanes.)
template <class V>
HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
  return detail::NeS(v, 0);
}
   1793 
   1794 // ------------------------------ IsNegative (MFromD)
// Advertise a native IsNegative so generic_ops-inl.h does not define its own.
#ifdef HWY_NATIVE_IS_NEGATIVE
#undef HWY_NATIVE_IS_NEGATIVE
#else
#define HWY_NATIVE_IS_NEGATIVE
#endif

// Generic for all vector lengths
// True in lanes whose sign bit is set. Reinterpreting the bits as signed
// integers makes "< 0" equivalent to "sign bit set", which also covers
// floating-point inputs (e.g. -0.0 compares as negative here).
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API MFromD<DFromV<V>> IsNegative(V v) {
 const DFromV<decltype(v)> d;
 const RebindToSigned<decltype(d)> di;
 using TI = TFromD<decltype(di)>;

 return detail::LtS(BitCast(di, v), static_cast<TI>(0));
}
   1810 
   1811 // ------------------------------ MaskFalse
   1812 
   1813 // For mask ops including vmclr, elements past VL are tail-agnostic and cannot
   1814 // be relied upon, so define a variant of the generic_ops-inl implementation of
   1815 // MaskFalse that ensures all bits are zero as required by mask_test.
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif

// Returns an all-false mask with every bit cleared, including bits past VL
// (see comment above regarding tail-agnostic mask ops).
template <class D>
HWY_API MFromD<D> MaskFalse(D d) {
 // Use the full (uncapped) type so all mask bits are written, not just VL.
 const DFromV<VFromD<decltype(d)>> d_full;
 return MaskFromVec(Zero(d_full));
}
   1827 
   1828 // ------------------------------ RebindMask
// Reinterprets a mask for use with type D; on RVV the mask type depends only
// on MLEN, so for same-shape D this is an identity operation.
template <class D, typename MFrom>
HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
 // No need to check lane size/LMUL are the same: if not, casting MFrom to
 // MFromD<D> would fail.
 return mask;
}
   1835 
   1836 // ------------------------------ VecFromMask
   1837 
   1838 // Returns mask ? ~0 : 0. No longer use sub.vx(Zero(), 1, mask) because per the
   1839 // default mask-agnostic policy, the result of inactive lanes may also be ~0.
// vmerge(Zero, -1, m): lanes where m is set become all-ones (-1), others 0.
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
 template <size_t N>                                                          \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
     NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) {              \
   /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */   \
   const DFromV<VFromD<decltype(d)>> d_full;                                  \
   const RebindToSigned<decltype(d_full)> di;                                 \
   using TI = TFromD<decltype(di)>;                                           \
   return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m,   \
                                                       Lanes(d_full)));       \
 }

HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)

#undef HWY_RVV_VEC_FROM_MASK
   1856 
   1857 template <class D, HWY_IF_FLOAT_D(D)>
   1858 HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
   1859  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
   1860 }
   1861 
   1862 // ------------------------------ IfVecThenElse (MaskFromVec)
   1863 template <class V>
   1864 HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
   1865  return IfThenElse(MaskFromVec(mask), yes, no);
   1866 }
   1867 
   1868 // ------------------------------ BroadcastSignBit
   1869 template <class V, HWY_IF_SIGNED_V(V)>
   1870 HWY_API V BroadcastSignBit(const V v) {
   1871  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
   1872 }
   1873 
   1874 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
   1875 template <class V>
   1876 HWY_API V IfNegativeThenElse(V v, V yes, V no) {
   1877  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
   1878  return IfThenElse(IsNegative(v), yes, no);
   1879 }
   1880 
   1881 // ------------------------------ FindFirstTrue
   1882 
// FindFirstTrue returns the index of the first set mask bit, or -1 if none
// (vfirst semantics). FindKnownFirstTrue assumes at least one bit is set;
// otherwise the -1 result wraps to a huge size_t.
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP)            \
 template <class D>                                                   \
 HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) {             \
   static_assert(MLenFromD(d) == MLEN, "Type mismatch");              \
   return __riscv_vfirst_m_b##MLEN(m, Lanes(d));                      \
 }                                                                    \
 template <class D>                                                   \
 HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) {          \
   static_assert(MLenFromD(d) == MLEN, "Type mismatch");              \
   return static_cast<size_t>(__riscv_vfirst_m_b##MLEN(m, Lanes(d))); \
 }

HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _)
#undef HWY_RVV_FIND_FIRST_TRUE
   1897 
   1898 // ------------------------------ AllFalse
   1899 template <class D>
   1900 HWY_API bool AllFalse(D d, MFromD<D> m) {
   1901  return FindFirstTrue(d, m) < 0;
   1902 }
   1903 
   1904 // ------------------------------ AllTrue
   1905 
// All lanes are true iff the complement (vmnot) contains no true lane.
#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP)          \
 template <class D>                                          \
 HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) {              \
   static_assert(MLenFromD(d) == MLEN, "Type mismatch");     \
   return AllFalse(d, __riscv_vmnot_m_b##MLEN(m, Lanes(d))); \
 }

HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
#undef HWY_RVV_ALL_TRUE
   1915 
   1916 // ------------------------------ CountTrue
   1917 
// Number of set mask bits among the first Lanes(d) lanes (vcpop).
#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP)    \
 template <class D>                                      \
 HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) {      \
   static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
   return __riscv_vcpop_m_b##MLEN(m, Lanes(d));          \
 }

HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
#undef HWY_RVV_COUNT_TRUE
   1927 
   1928 // ------------------------------ PromoteMaskTo
   1929 
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif

// The EnableIf constraint requires the source and destination mask types to
// be identical, in which case promoting a mask is a no-op.
template <class DTo, class DFrom,
         HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
         hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
                                 MFromD<DFrom> m) {
 return m;
}
   1943 
   1944 // ------------------------------ DemoteMaskTo
   1945 
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif

// As for PromoteMaskTo: only enabled when the two mask types are identical,
// so demoting a mask is a no-op.
template <class DTo, class DFrom,
         HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
         hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
                                MFromD<DFrom> m) {
 return m;
}
   1959 
   1960 // ================================================== MEMORY
   1961 
   1962 // ------------------------------ Load
   1963 
// Unit-stride load (vle<SEW>) of Lanes(d) elements; p only requires element
// alignment.
#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
 template <size_t N>                                                        \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
     NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                 \
          const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                    \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(                         \
       detail::NativeLanePointer(p), Lanes(d));                             \
 }
HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
#undef HWY_RVV_LOAD
   1975 
   1976 template <class D, HWY_RVV_IF_EMULATED_D(D)>
   1977 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
   1978  const RebindToUnsigned<decltype(d)> du;
   1979  return BitCast(d, Load(du, detail::U16LanePointer(p)));
   1980 }
   1981 
   1982 // ------------------------------ LoadU
// Unaligned load: identical to Load on this target.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
 // RVV only requires element alignment, not vector alignment.
 return Load(d, p);
}
   1988 
   1989 // ------------------------------ MaskedLoad
   1990 
// MaskedLoad: inactive lanes are zero (merges into Zero(d)). MaskedLoadOr:
// inactive lanes take the corresponding lane of v. Both use the
// mask-undisturbed (_mu) form so inactive lanes keep the merge operand.
#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP)                           \
 template <size_t N>                                                        \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
     NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d,              \
          const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                    \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu(                    \
       m, Zero(d), detail::NativeLanePointer(p), Lanes(d));                 \
 }                                                                          \
 template <size_t N>                                                        \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
     NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m,              \
              HWY_RVV_D(BASE, SEW, N, SHIFT) d,                             \
              const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu(                    \
       m, v, detail::NativeLanePointer(p), Lanes(d));                       \
 }

HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
#undef HWY_RVV_MASKED_LOAD
   2011 
// Emulated lane types are loaded via their u16 bit representation.
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                            const TFromD<D>* HWY_RESTRICT p) {
 const RebindToUnsigned<decltype(d)> du;
 return BitCast(d,
                MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
}

// As above, but inactive lanes take the corresponding lane of `no`.
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> no, MFromD<D> m, D d,
                              const TFromD<D>* HWY_RESTRICT p) {
 const RebindToUnsigned<decltype(d)> du;
 return BitCast(d, MaskedLoadOr(BitCast(du, no), RebindMask(du, m), du,
                                detail::U16LanePointer(p)));
}
   2027 
   2028 // ------------------------------ LoadN
   2029 
   2030 // Native with avl is faster than the generic_ops using FirstN.
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

// LoadN: loads the first HWY_MIN(num_lanes, Lanes(d)) lanes, remaining lanes
// are zero. LoadNOr: remaining lanes are copied from `no` instead.
#define HWY_RVV_LOADN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP)                                         \
 template <size_t N>                                                         \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
     NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                  \
          const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) {   \
   /* Use a tail-undisturbed load in LoadN as the tail-undisturbed load */   \
   /* operation below will leave any lanes past the first */                 \
   /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes unchanged */       \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu(                     \
       Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes));    \
 }                                                                           \
 template <size_t N>                                                         \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or(                                \
     HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d,        \
     const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) {        \
   /* Use a tail-undisturbed load in LoadNOr as the tail-undisturbed load */ \
   /* operation below will set any lanes past the first */                   \
   /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes to the */          \
   /* corresponding lanes in no */                                           \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu(                     \
       no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes));         \
 }

HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
#undef HWY_RVV_LOADN
   2063 
// Emulated lane types: load the u16 bit representation, then reinterpret.
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                       size_t num_lanes) {
 const RebindToUnsigned<D> du;
 return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
}
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> v, D d, const TFromD<D>* HWY_RESTRICT p,
                         size_t num_lanes) {
 const RebindToUnsigned<D> du;
 return BitCast(
     d, LoadNOr(BitCast(du, v), du, detail::U16LanePointer(p), num_lanes));
}
   2077 
   2078 // ------------------------------ Store
   2079 
// Unit-stride store (vse<SEW>) of Lanes(d) elements; p only requires element
// alignment.
#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP)                                         \
 template <size_t N>                                                         \
 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                             \
                   HWY_RVV_D(BASE, SEW, N, SHIFT) d,                         \
                   HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                  \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(                          \
       detail::NativeLanePointer(p), v, Lanes(d));                           \
 }
HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
#undef HWY_RVV_STORE
   2091 
   2092 template <class D, HWY_RVV_IF_EMULATED_D(D)>
   2093 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   2094  const RebindToUnsigned<decltype(d)> du;
   2095  Store(BitCast(du, v), du, detail::U16LanePointer(p));
   2096 }
   2097 
   2098 // ------------------------------ BlendedStore
   2099 
// Masked store (_m): only lanes whose mask bit is set are written to memory;
// other memory locations are left untouched.
#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
 template <size_t N>                                                          \
 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m,           \
                   HWY_RVV_D(BASE, SEW, N, SHIFT) d,                          \
                   HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                   \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(                       \
       m, detail::NativeLanePointer(p), v, Lanes(d));                         \
 }
HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
#undef HWY_RVV_BLENDED_STORE

// Emulated lane types store their u16 bit representation.
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                         TFromD<D>* HWY_RESTRICT p) {
 const RebindToUnsigned<decltype(d)> du;
 BlendedStore(BitCast(du, v), RebindMask(du, m), du,
              detail::U16LanePointer(p));
}
   2119 
   2120 // ------------------------------ StoreN
   2121 
namespace detail {

// Stores the first `count` lanes of v; `count` is passed directly as the
// AVL, so callers must have already capped it (see the public StoreN below).
#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
 template <size_t N>                                                          \
 HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v,                \
                   HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */,                    \
                   HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                   \
   return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(                           \
       detail::NativeLanePointer(p), v, count);                               \
 }
HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
#undef HWY_RVV_STOREN

// Emulated lane types store their u16 bit representation.
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
 const RebindToUnsigned<decltype(d)> du;
 StoreN(count, BitCast(du, v), du, detail::U16LanePointer(p));
}

}  // namespace detail

#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif
   2149 
// Stores the first HWY_MIN(max_lanes_to_store, Lanes(d)) lanes of v to p.
template <class D>
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
                   size_t max_lanes_to_store) {
 // NOTE: Need to clamp max_lanes_to_store to Lanes(d), even if
 // MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible for
 // detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than
 // Lanes(DFromV<VFromD<D>>()) lanes to p if
 // max_lanes_to_store > Lanes(DFromV<VFromD<D>>()) and
 // max_lanes_to_store < 2 * Lanes(DFromV<VFromD<D>>()) are both true.

 // Also need to make sure that no more than Lanes(d) lanes are stored to p
 // if Lanes(d) < Lanes(DFromV<VFromD<D>>()) is true, which is possible if
 // MaxLanes(d) < MaxLanes(DFromV<VFromD<D>>()) or
 // d.Pow2() < DFromV<VFromD<D>>().Pow2() is true.
 detail::StoreN(CappedLanes(d, max_lanes_to_store), v, d, p);
}
   2166 
   2167 // ------------------------------ StoreU
// Unaligned store: identical to Store on this target.
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
 // RVV only requires element alignment, not vector alignment.
 Store(v, d, p);
}

// ------------------------------ Stream
// No non-temporal hint is applied here; Stream simply forwards to Store.
template <class V, class D, typename T>
HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
 Store(v, d, aligned);
}
   2179 
   2180 // ------------------------------ ScatterOffset
   2181 
#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif

// Indexed store (vsuxei): lane i of v is written to base + offset[i] bytes.
// Offsets are reinterpreted as unsigned for the intrinsic.
#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
                       SHIFT, MLEN, NAME, OP)                              \
 template <size_t N>                                                       \
 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                           \
                   HWY_RVV_D(BASE, SEW, N, SHIFT) d,                       \
                   HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,               \
                   HWY_RVV_V(int, SEW, LMUL) offset) {                     \
   const RebindToUnsigned<decltype(d)> du;                                 \
   return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                    \
       detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
 }
HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
#undef HWY_RVV_SCATTER
   2201 
   2202 // ------------------------------ ScatterIndex
   2203 template <class D>
   2204 HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
   2205                          VFromD<RebindToSigned<D>> indices) {
   2206  constexpr size_t kBits = CeilLog2(sizeof(TFromD<D>));
   2207  return ScatterOffset(v, d, base, ShiftLeft<kBits>(indices));
   2208 }
   2209 
   2210 // ------------------------------ MaskedScatterIndex
   2211 
// Masked indexed store: element indices (not byte offsets) are scaled by
// sizeof(T) via ShiftLeft; only lanes with a set mask bit are written.
#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                              LMULH, SHIFT, MLEN, NAME, OP)             \
 template <size_t N>                                                    \
 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m,     \
                   HWY_RVV_D(BASE, SEW, N, SHIFT) d,                    \
                   HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,            \
                   HWY_RVV_V(int, SEW, LMUL) indices) {                 \
   const RebindToUnsigned<decltype(d)> du;                              \
   constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>));      \
   return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m(             \
       m, detail::NativeLanePointer(base),                              \
       ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d));            \
 }
HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
#undef HWY_RVV_MASKED_SCATTER
   2227 
   2228 // ------------------------------ GatherOffset
   2229 
#ifdef HWY_NATIVE_GATHER
#undef HWY_NATIVE_GATHER
#else
#define HWY_NATIVE_GATHER
#endif

// Indexed load (vluxei): lane i is read from base + offset[i] bytes.
// Offsets are reinterpreted as unsigned for the intrinsic.
#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
 template <size_t N>                                                          \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
     NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                   \
          const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,                     \
          HWY_RVV_V(int, SEW, LMUL) offset) {                                 \
   const RebindToUnsigned<decltype(d)> du;                                    \
   return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                       \
       detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d));       \
 }
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
#undef HWY_RVV_GATHER
   2249 
   2250 // ------------------------------ GatherIndex
   2251 
   2252 template <class D>
   2253 HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
   2254                              const VFromD<RebindToSigned<D>> index) {
   2255  constexpr size_t kBits = CeilLog2(sizeof(TFromD<D>));
   2256  return GatherOffset(d, base, ShiftLeft<kBits>(index));
   2257 }
   2258 
   2259 // ------------------------------ MaskedGatherIndexOr
   2260 
// Masked indexed gather: lanes with a set mask bit load from
// base + indices[i] * sizeof(T); inactive lanes take the lane of `no`
// (mask-undisturbed _mu form). Indices must be non-negative (debug-checked).
#define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
 template <size_t N>                                                          \
 HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
     NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m,                   \
          HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                   \
          const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,                     \
          HWY_RVV_V(int, SEW, LMUL) indices) {                                \
   const RebindToUnsigned<decltype(d)> du;                                    \
   const RebindToSigned<decltype(d)> di;                                      \
   (void)di; /* for HWY_DASSERT */                                            \
   constexpr size_t kBits = CeilLog2(SEW / 8);                                \
   HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));                          \
   return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu(                  \
       m, no, detail::NativeLanePointer(base),                                \
       ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d));                     \
 }
HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndexOr, lux, _ALL_VIRT)
#undef HWY_RVV_MASKED_GATHER
   2280 
   2281 template <class D>
   2282 HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, const TFromD<D>* base,
   2283                                    VFromD<RebindToSigned<D>> indices) {
   2284  return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
   2285 }
   2286 
   2287 // ================================================== CONVERT
   2288 
   2289 // ------------------------------ PromoteTo
   2290 
   2291 // SEW is for the input.
// Widens each lane to double width (SEWD at LMULD): zero-extend for u,
// sign-extend for i, or f->f widening convert.
#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,     \
                       SHIFT, MLEN, NAME, OP)                               \
 template <size_t N>                                                        \
 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME(                                 \
     HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
   return __riscv_v##OP##CHAR##SEWD##LMULD(v, Lanes(d));                    \
 }

HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_, _EXT_VIRT)

#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C

HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_,
                                 _EXT_VIRT)

// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif
#endif  // HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C

#undef HWY_RVV_PROMOTE
   2322 
   2323 // The above X-macro cannot handle 4x promotion nor type switching.
   2324 // TODO(janwas): use BASE2 arg to allow the latter.
// Single PromoteTo overload for one (dest LMUL, source LMUL) pair; also
// supports type switching (BASE_IN differs from BASE).
#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
                       SHIFT, ADD)                                            \
 template <size_t N>                                                          \
 HWY_API HWY_RVV_V(BASE, BITS, LMUL)                                          \
     PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d,                       \
               HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) {                      \
   return __riscv_v##OP##CHAR##BITS##LMUL(v, Lanes(d));                       \
 }

// Instantiations over all supported LMUL pairs for 2x/4x/8x widening.
#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)        \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1)   \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1)   \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)

#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)        \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2)   \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)

#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
 HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)

#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)        \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf8, -3, 3) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf4, -2, 3) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, mf2, -1, 3) \
 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m1, 0, 3)

// 8x zero/sign extension: u8->u64, i8->i64.
HWY_RVV_PROMOTE_X8(zext_vf8_, uint, u, 64, uint, 8)
HWY_RVV_PROMOTE_X8(sext_vf8_, int, i, 64, int, 8)

// 4x zero/sign extension: 8->32 and 16->64 bits.
HWY_RVV_PROMOTE_X4_FROM_U8(zext_vf4_, uint, u, 32, uint, 8)
HWY_RVV_PROMOTE_X4_FROM_U8(sext_vf4_, int, i, 32, int, 8)
HWY_RVV_PROMOTE_X4(zext_vf4_, uint, u, 64, uint, 16)
HWY_RVV_PROMOTE_X4(sext_vf4_, int, i, 64, int, 16)

// i32 to f64
HWY_RVV_PROMOTE_X2(fwcvt_f_x_v_, float, f, 64, int, 32)

// u32 to f64
HWY_RVV_PROMOTE_X2(fwcvt_f_xu_v_, float, f, 64, uint, 32)

// f32 to i64
HWY_RVV_PROMOTE_X2(fwcvt_rtz_x_f_v_, int, i, 64, float, 32)

// f32 to u64
HWY_RVV_PROMOTE_X2(fwcvt_rtz_xu_f_v_, uint, u, 64, float, 32)

#undef HWY_RVV_PROMOTE_X8
#undef HWY_RVV_PROMOTE_X4_FROM_U8
#undef HWY_RVV_PROMOTE_X4
#undef HWY_RVV_PROMOTE_X2
#undef HWY_RVV_PROMOTE
   2382 
   2383 // I16->I64 or U16->U64 PromoteTo with virtual LMUL
// The result vector type for virtual LMUL (Pow2 = -1) matches the full
// ScalableTag type, so delegate to that overload.
template <size_t N>
HWY_API auto PromoteTo(Simd<int64_t, N, -1> d,
                      VFromD<Rebind<int16_t, decltype(d)>> v)
   -> VFromD<decltype(d)> {
 return PromoteTo(ScalableTag<int64_t>(), v);
}

template <size_t N>
HWY_API auto PromoteTo(Simd<uint64_t, N, -1> d,
                      VFromD<Rebind<uint16_t, decltype(d)>> v)
   -> VFromD<decltype(d)> {
 return PromoteTo(ScalableTag<uint64_t>(), v);
}
   2397 
   2398 // Unsigned to signed: cast for unsigned promote.
// Zero-extend in the unsigned domain, then reinterpret as signed; the result
// is identical because an unsigned source always fits in the wider signed
// destination.
template <class D, HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

// bf16 -> f32: a bf16 holds the upper 16 bits of an f32, so widen the u16
// bits to 32 and shift left by 16 to restore the f32 bit pattern.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<hwy::bfloat16_t, D>> v) {
 const RebindToSigned<decltype(d)> di32;
 const Rebind<uint16_t, decltype(d)> du16;
 return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
   2435 
// ------------------------------ DemoteTo U

// SEW is for the source so we can use _DEMOTE_VIRT.
// Expands to one DemoteTo overload per source type: a single narrowing clip
// (vnclipu) with shift amount 0, i.e. the value is saturated to the
// half-width unsigned range. With a zero shift the VXRM rounding mode has no
// effect, but the intrinsic still requires one (RDN here).
#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME(                                   \
      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) {   \
    return __riscv_v##OP##CHAR##SEWH##LMULH(                                   \
        v, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));                \
  }

// Unsigned -> unsigned
HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
HWY_RVV_FOREACH_U64(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
   2452 
// SEW is for the source so we can use _DEMOTE_VIRT.
// Signed -> unsigned DemoteTo: a direct BitCast of a negative lane would
// produce a huge unsigned value, so negatives are clamped to zero (MaxS)
// before reusing the unsigned->unsigned DemoteTo above.
#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME(                                   \
      HWY_RVV_D(uint, SEWH, N, SHIFT - 1) dn, HWY_RVV_V(int, SEW, LMUL) v) {   \
    const HWY_RVV_D(uint, SEW, N, SHIFT) du;                                   \
    /* First clamp negative numbers to zero to match x86 packus. */            \
    return DemoteTo(dn, BitCast(du, detail::MaxS(v, 0)));                      \
  }
HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
#undef HWY_RVV_DEMOTE_I_TO_U
   2467 
// 32-bit -> u8 requires two narrowing steps (32->16->8); one overload per
// (virtual) LMUL pairing. For i32 sources the first DemoteTo resolves to the
// signed->unsigned overload above, which clamps negatives to zero; the final
// vnclipu (shift 0, RDN) saturates 16-bit values to the u8 range.
template <size_t N>
HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
  return __riscv_vnclipu_wx_u8mf8(
      DemoteTo(Simd<uint16_t, N, -2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
  return __riscv_vnclipu_wx_u8mf4(
      DemoteTo(Simd<uint16_t, N, -1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
  return __riscv_vnclipu_wx_u8mf2(
      DemoteTo(Simd<uint16_t, N, 0>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
  return __riscv_vnclipu_wx_u8m1(
      DemoteTo(Simd<uint16_t, N, 1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
  return __riscv_vnclipu_wx_u8m2(
      DemoteTo(Simd<uint16_t, N, 2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}

// u32 -> u8: same two-step narrowing, starting from an unsigned source (no
// clamping needed; both vnclipu steps saturate).
template <size_t N>
HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vuint32mf2_t v) {
  return __riscv_vnclipu_wx_u8mf8(
      DemoteTo(Simd<uint16_t, N, -2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vuint32m1_t v) {
  return __riscv_vnclipu_wx_u8mf4(
      DemoteTo(Simd<uint16_t, N, -1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vuint32m2_t v) {
  return __riscv_vnclipu_wx_u8mf2(
      DemoteTo(Simd<uint16_t, N, 0>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vuint32m4_t v) {
  return __riscv_vnclipu_wx_u8m1(
      DemoteTo(Simd<uint16_t, N, 1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vuint32m8_t v) {
  return __riscv_vnclipu_wx_u8m2(
      DemoteTo(Simd<uint16_t, N, 2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
   2529 
   2530 template <class D, HWY_IF_U8_D(D)>
   2531 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
   2532  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
   2533 }
   2534 
   2535 template <class D, HWY_IF_U8_D(D)>
   2536 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
   2537  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
   2538 }
   2539 
   2540 template <class D, HWY_IF_U16_D(D)>
   2541 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
   2542  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
   2543 }
   2544 
   2545 template <class D, HWY_IF_U16_D(D)>
   2546 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
   2547  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
   2548 }
   2549 
// U8FromU32: narrow u32 lanes to u8 via two chained vnclipu steps
// (32->16->8). With a shift of 0 each step saturates values that exceed the
// narrower range. The AVL comes from the u8 tag whose LMUL shift is two
// smaller than the source's.
HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
  return __riscv_vnclipu_wx_u8mf8(
      __riscv_vnclipu_wx_u16mf4(v, 0,
                                HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
  return __riscv_vnclipu_wx_u8mf4(
      __riscv_vnclipu_wx_u16mf2(v, 0,
                                HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
  return __riscv_vnclipu_wx_u8mf2(
      __riscv_vnclipu_wx_u16m1(v, 0,
                               HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
  return __riscv_vnclipu_wx_u8m1(
      __riscv_vnclipu_wx_u16m2(v, 0,
                               HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
  return __riscv_vnclipu_wx_u8m2(
      __riscv_vnclipu_wx_u16m4(v, 0,
                               HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
   2585 
// ------------------------------ Truncations

// TruncateTo discards upper bits instead of saturating: the input is first
// masked to the destination width (0xFF here), after which the vnclipu steps
// (shift 0) cannot saturate and merely narrow the element type.
// u64 -> u8 takes three narrowing steps (64->32->16->8).
template <size_t N>
HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
                               const VFromD<Simd<uint64_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16mf4_t v3 = __riscv_vnclipu_wx_u16mf4(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf8(v3, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
                               const VFromD<Simd<uint64_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m2_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16mf2_t v3 = __riscv_vnclipu_wx_u16mf2(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf4(v3, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
                               const VFromD<Simd<uint64_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint64m4_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16m1_t v3 = __riscv_vnclipu_wx_u16m1(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf2(v3, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
                              const VFromD<Simd<uint64_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint64m8_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16m2_t v3 = __riscv_vnclipu_wx_u16m2(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8m1(v3, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
   2639 
// u64 -> u16: mask to the low 16 bits, then two narrowing steps (64->32->16).
template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -3> d,
                                const VFromD<Simd<uint64_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16mf4(v2, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
                                const VFromD<Simd<uint64_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16mf4(v2, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
                                const VFromD<Simd<uint64_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m2_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16mf2(v2, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
                               const VFromD<Simd<uint64_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint64m4_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16m1(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
                               const VFromD<Simd<uint64_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint64m8_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16m2(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
   2694 
// u64 -> u32: mask to the low 32 bits, then one narrowing step.
template <size_t N>
HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -2> d,
                                const VFromD<Simd<uint64_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32mf2(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -1> d,
                                const VFromD<Simd<uint64_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32mf2(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32m1_t TruncateTo(Simd<uint32_t, N, 0> d,
                               const VFromD<Simd<uint64_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m2_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32m1(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32m2_t TruncateTo(Simd<uint32_t, N, 1> d,
                               const VFromD<Simd<uint64_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint64m4_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32m2(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32m4_t TruncateTo(Simd<uint32_t, N, 2> d,
                               const VFromD<Simd<uint64_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint64m8_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32m4(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
   2739 
// u32 -> u8: mask to the low 8 bits, then two narrowing steps (32->16->8).
template <size_t N>
HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
                               const VFromD<Simd<uint32_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint32mf2_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16mf4_t v2 = __riscv_vnclipu_wx_u16mf4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf8(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
                               const VFromD<Simd<uint32_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint32m1_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16mf2_t v2 = __riscv_vnclipu_wx_u16mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf4(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
                               const VFromD<Simd<uint32_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint32m2_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16m1_t v2 = __riscv_vnclipu_wx_u16m1(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf2(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
                              const VFromD<Simd<uint32_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint32m4_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16m2_t v2 = __riscv_vnclipu_wx_u16m2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8m1(v2, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
                              const VFromD<Simd<uint32_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint32m8_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16m4_t v2 = __riscv_vnclipu_wx_u16m4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8m2(v2, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
   2794 
// u32 -> u16: mask to the low 16 bits, then one narrowing step.
template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -3> d,
                                const VFromD<Simd<uint32_t, N, -2>> v) {
  const size_t avl = Lanes(d);
  const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16mf4(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
                                const VFromD<Simd<uint32_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16mf4(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
                                const VFromD<Simd<uint32_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint32m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16mf2(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
                               const VFromD<Simd<uint32_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint32m2_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16m1(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
                               const VFromD<Simd<uint32_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint32m4_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16m2(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16m4_t TruncateTo(Simd<uint16_t, N, 2> d,
                               const VFromD<Simd<uint32_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint32m8_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16m4(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
   2848 
// u16 -> u8: mask to the low 8 bits, then one narrowing step.
template <size_t N>
HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
                               const VFromD<Simd<uint16_t, N, -2>> v) {
  const size_t avl = Lanes(d);
  const vuint16mf4_t v1 = __riscv_vand(v, 0xFF, avl);
  return __riscv_vnclipu_wx_u8mf8(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
                               const VFromD<Simd<uint16_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint16mf2_t v1 = __riscv_vand(v, 0xFF, avl);
  return __riscv_vnclipu_wx_u8mf4(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
                               const VFromD<Simd<uint16_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint16m1_t v1 = __riscv_vand(v, 0xFF, avl);
  return __riscv_vnclipu_wx_u8mf2(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
                              const VFromD<Simd<uint16_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint16m2_t v1 = __riscv_vand(v, 0xFF, avl);
  return __riscv_vnclipu_wx_u8m1(v1, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
                              const VFromD<Simd<uint16_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint16m4_t v1 = __riscv_vand(v, 0xFF, avl);
  return __riscv_vnclipu_wx_u8m2(v1, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m4_t TruncateTo(Simd<uint8_t, N, 2> d,
                              const VFromD<Simd<uint16_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint16m8_t v1 = __riscv_vand(v, 0xFF, avl);
  return __riscv_vnclipu_wx_u8m4(v1, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
   2902 
// ------------------------------ DemoteTo I

// Signed -> signed: vnclip (signed narrowing clip) with shift 0 saturates to
// the half-width signed range.
HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT)
HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT)
HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT)

// i32 -> i8: two saturating steps via i16; one overload per (virtual) LMUL
// pairing.
template <size_t N>
HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) {
  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v));
}
template <size_t N>
HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) {
  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v));
}
template <size_t N>
HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) {
  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v));
}
template <size_t N>
HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) {
  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v));
}
template <size_t N>
HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
}
   2929 
   2930 template <class D, HWY_IF_I8_D(D)>
   2931 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
   2932  return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
   2933 }
   2934 
   2935 template <class D, HWY_IF_I16_D(D)>
   2936 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
   2937  return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v));
   2938 }
   2939 
   2940 #undef HWY_RVV_DEMOTE
   2941 
// ------------------------------ DemoteTo F

// SEW is for the source so we can use _DEMOTE_VIRT.
// Expands to DemoteTo overloads that narrow a float vector to the half-width
// float type via a vfncvt.f.f.w narrowing conversion.
#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
                         SHIFT, MLEN, NAME, OP)                              \
  template <size_t N>                                                        \
  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME(                                 \
      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##SEWH##LMULH(v, Lanes(d));                          \
  }

// f32 -> f16 only when a 16-bit float type is available.
#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)
#endif
HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT)

namespace detail {
// Round-to-odd (vfncvt.rod) f64 -> f32, used by the two-step f64 -> f16
// DemoteTo below.
HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteToF32WithRoundToOdd,
                    fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
}  // namespace detail

#undef HWY_RVV_DEMOTE_F
   2964 
// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
// f64 -> i32: vfncvt.rtz (round-toward-zero) narrowing float-to-int
// conversions, one overload per (virtual) LMUL pairing.
template <size_t N>
HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) {
  return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) {
  return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) {
  return __riscv_vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
}
template <size_t N>
HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) {
  return __riscv_vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
}
template <size_t N>
HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) {
  return __riscv_vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
}
   2986 
// f64 -> u32: unsigned vfncvt.rtz (round-toward-zero) narrowing conversions.
template <size_t N>
HWY_API vuint32mf2_t DemoteTo(Simd<uint32_t, N, -2> d, const vfloat64m1_t v) {
  return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vuint32mf2_t DemoteTo(Simd<uint32_t, N, -1> d, const vfloat64m1_t v) {
  return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vuint32m1_t DemoteTo(Simd<uint32_t, N, 0> d, const vfloat64m2_t v) {
  return __riscv_vfncvt_rtz_xu_f_w_u32m1(v, Lanes(d));
}
template <size_t N>
HWY_API vuint32m2_t DemoteTo(Simd<uint32_t, N, 1> d, const vfloat64m4_t v) {
  return __riscv_vfncvt_rtz_xu_f_w_u32m2(v, Lanes(d));
}
template <size_t N>
HWY_API vuint32m4_t DemoteTo(Simd<uint32_t, N, 2> d, const vfloat64m8_t v) {
  return __riscv_vfncvt_rtz_xu_f_w_u32m4(v, Lanes(d));
}
   3007 
// i64 -> f32: narrowing signed int-to-float conversions (vfncvt.f.x.w).
template <size_t N>
HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -2> d, const vint64m1_t v) {
  return __riscv_vfncvt_f_x_w_f32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -1> d, const vint64m1_t v) {
  return __riscv_vfncvt_f_x_w_f32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32m1_t DemoteTo(Simd<float, N, 0> d, const vint64m2_t v) {
  return __riscv_vfncvt_f_x_w_f32m1(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32m2_t DemoteTo(Simd<float, N, 1> d, const vint64m4_t v) {
  return __riscv_vfncvt_f_x_w_f32m2(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vint64m8_t v) {
  return __riscv_vfncvt_f_x_w_f32m4(v, Lanes(d));
}

// u64 -> f32: narrowing unsigned int-to-float conversions (vfncvt.f.xu.w).
template <size_t N>
HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -2> d, const vuint64m1_t v) {
  return __riscv_vfncvt_f_xu_w_f32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -1> d, const vuint64m1_t v) {
  return __riscv_vfncvt_f_xu_w_f32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32m1_t DemoteTo(Simd<float, N, 0> d, const vuint64m2_t v) {
  return __riscv_vfncvt_f_xu_w_f32m1(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32m2_t DemoteTo(Simd<float, N, 1> d, const vuint64m4_t v) {
  return __riscv_vfncvt_f_xu_w_f32m2(v, Lanes(d));
}
template <size_t N>
HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vuint64m8_t v) {
  return __riscv_vfncvt_f_xu_w_f32m4(v, Lanes(d));
}
   3049 
// Narrows f32 bits to bf16 using round to even.
// SEW is for the source so we can use _DEMOTE_VIRT.
// Two implementations: if VXRM writes must be avoided (HWY_RVV_AVOID_VXRM),
// perform round-to-nearest-even manually in integer arithmetic before the
// 16-bit narrowing shift; otherwise let vnclipu round via VXRM = RNE.
#ifdef HWY_RVV_AVOID_VXRM
#define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL,    \
                                       LMULD, LMULH, SHIFT, MLEN, NAME, OP)  \
  template <size_t N>                                                        \
  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME(                                 \
      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
    /* Round-to-nearest-even: add 0x7FFF plus the LSB of the kept bits. */   \
    const auto round =                                                       \
        detail::AddS(detail::AndS(ShiftRight<16>(v), 1u), 0x7FFFu);          \
    v = Add(v, round);                                                       \
    /* The default rounding mode appears to be RNU=0, which adds the LSB. */ \
    /* Prevent further rounding by clearing the bits we want to truncate. */ \
    v = detail::AndS(v, 0xFFFF0000u);                                        \
    return __riscv_v##OP##CHAR##SEWH##LMULH(v, 16, Lanes(d));                \
  }

#else
#define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL,    \
                                       LMULD, LMULH, SHIFT, MLEN, NAME, OP)  \
  template <size_t N>                                                        \
  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME(                                 \
      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
    /* Shift out 16 bits with VXRM round-to-nearest-even. */                 \
    return __riscv_v##OP##CHAR##SEWH##LMULH(                                 \
        v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNE, Lanes(d)));             \
  }
#endif  // HWY_RVV_AVOID_VXRM
namespace detail {
HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_16_NEAREST_EVEN, DemoteTo16NearestEven,
                    nclipu_wx_, _DEMOTE_VIRT)
}
#undef HWY_RVV_DEMOTE_16_NEAREST_EVEN
   3082 
// Signal that this target provides its own F32 -> BF16 DemoteTo (toggle is
// consumed outside this file).
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

// f32 -> bf16: take the upper 16 bits of the f32 representation, rounded to
// nearest-even, after canonicalizing NaNs (see below).
template <class DBF16, HWY_IF_BF16_D(DBF16)>
HWY_API VFromD<DBF16> DemoteTo(DBF16 d, VFromD<Rebind<float, DBF16>> v) {
  const DFromV<decltype(v)> df;
  const RebindToUnsigned<decltype(df)> du32;
  const RebindToUnsigned<decltype(d)> du16;
  // Consider an f32 mantissa with the upper 7 bits set, followed by a 1-bit
  // and at least one other bit set. This will round to 0 and increment the
  // exponent. If the exponent was already 0xFF (NaN), then the result is -inf;
  // there no wraparound because nclipu saturates. Note that in this case, the
  // input cannot have been inf because its mantissa bits are zero. To avoid
  // converting NaN to inf, we canonicalize the NaN to prevent the rounding.
  // (Eq(v, v) is false exactly for NaN lanes; 0x7F800000 has an all-zero
  // mantissa, so no rounding can occur.)
  const decltype(v) canonicalized =
      IfThenElse(Eq(v, v), v, BitCast(df, Set(du32, 0x7F800000)));
  return BitCast(
      d, detail::DemoteTo16NearestEven(du16, BitCast(du32, canonicalized)));
}
   3105 
// Signal to generic_ops that this target provides a native f64 -> f16
// DemoteTo (defined below).
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif

// f64 -> f16 via an intermediate f32. The first step rounds to odd, which
// prevents double-rounding errors in the final f32 -> f16 rounding step.
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const Rebind<float, decltype(df16)> df32;
  return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
}
   3117 
// ------------------------------ ConvertTo F

// Same-width int <-> float conversions for one (SEW, LMUL) combination:
// i -> f and u -> f use the current rounding mode, while f -> i and f -> u
// use the explicitly truncating (_rtz_) intrinsics, as the Highway API
// requires round-toward-zero for float-to-int ConvertTo.
#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,       \
                        SHIFT, MLEN, NAME, OP)                                 \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo(                                \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) {         \
    return __riscv_vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d));                      \
  }                                                                            \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo(                                \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) {        \
    return __riscv_vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d));                     \
  }                                                                            \
  /* Truncates (rounds toward zero). */                                        \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
                                              HWY_RVV_V(BASE, SEW, LMUL) v) {  \
    return __riscv_vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d));                  \
  }                                                                            \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(uint, SEW, LMUL) ConvertTo(                                \
      HWY_RVV_D(uint, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v) {        \
    return __riscv_vfcvt_rtz_xu_f_v_u##SEW##LMUL(v, Lanes(d));                 \
  }

HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
#undef HWY_RVV_CONVERT
   3146 
// Uses default rounding mode. Must be separate because there is no D arg.
// NearestInt: f -> i conversion via vfcvt.x.f.v, instantiated for all float
// types and LMULs.
#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,       \
                        SHIFT, MLEN, NAME, OP)                                 \
  HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT));       \
  }
HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
#undef HWY_RVV_NEAREST
   3155 
// DemoteToNearestInt: f64 -> i32 narrowing conversion using the current
// (default) rounding mode. One overload per supported LMUL pairing; both
// fractional i32 tags (-2 and -1) narrow a full vfloat64m1_t into mf2.
template <size_t N>
HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -2> d,
                                       const vfloat64m1_t v) {
  return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -1> d,
                                       const vfloat64m1_t v) {
  return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d));
}
template <size_t N>
HWY_API vint32m1_t DemoteToNearestInt(Simd<int32_t, N, 0> d,
                                      const vfloat64m2_t v) {
  return __riscv_vfncvt_x_f_w_i32m1(v, Lanes(d));
}
template <size_t N>
HWY_API vint32m2_t DemoteToNearestInt(Simd<int32_t, N, 1> d,
                                      const vfloat64m4_t v) {
  return __riscv_vfncvt_x_f_w_i32m2(v, Lanes(d));
}
template <size_t N>
HWY_API vint32m4_t DemoteToNearestInt(Simd<int32_t, N, 2> d,
                                      const vfloat64m8_t v) {
  return __riscv_vfncvt_x_f_w_i32m4(v, Lanes(d));
}
   3181 
   3182 // ================================================== COMBINE
   3183 
   3184 namespace detail {
   3185 
   3186 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes
   3187 // offsets are implicitly relative to the start of their 128-bit block.
   3188 template <typename T, size_t N, int kPow2>
   3189 HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) {
   3190  // kMinVecBytes is the minimum size of VFromD<decltype(d)> in bytes
   3191  constexpr size_t kMinVecBytes =
   3192      ScaleByPower(16, HWY_MAX(HWY_MIN(kPow2, 3), -3));
   3193  // kMinVecLanes is the minimum number of lanes in VFromD<decltype(d)>
   3194  constexpr size_t kMinVecLanes = (kMinVecBytes + sizeof(T) - 1) / sizeof(T);
   3195  // kMaxLpb is the maximum number of lanes per block
   3196  constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), MaxLanes(d));
   3197 
   3198  // If kMaxLpb <= kMinVecLanes is true, then kMaxLpb <= Lanes(d) is true
   3199  if (kMaxLpb <= kMinVecLanes) return kMaxLpb;
   3200 
   3201  // Fractional LMUL: Lanes(d) may be smaller than kMaxLpb, so honor that.
   3202  const size_t lanes_per_vec = Lanes(d);
   3203  return HWY_MIN(lanes_per_vec, kMaxLpb);
   3204 }
   3205 
   3206 template <class D, class V>
   3207 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
   3208  using T = MakeUnsigned<TFromV<V>>;
   3209  return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
   3210 }
   3211 
// Returns a mask that is true for the first kLanes lanes of each 128-bit
// block.
template <size_t kLanes, class D>
HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
  const RebindToUnsigned<D> du;
  const RebindToSigned<D> di;
  using TU = TFromD<decltype(du)>;
  // Lane index modulo the block size (LanesPerBlock is a power of two).
  const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1));
  return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
}
   3220 
// SlideUp: shifts src up by `lanes`; the vacated lower lanes are taken from
// dst (vslideup semantics).
#define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
                         SHIFT, MLEN, NAME, OP)                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
           size_t lanes) {                                                 \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes,           \
                                                HWY_RVV_AVL(SEW, SHIFT));  \
  }

// SlideDown: shifts src down by `lanes` (vslidedown semantics).
#define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) src, size_t lanes) {                  \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(src, lanes,                 \
                                                HWY_RVV_AVL(SEW, SHIFT));   \
  }

HWY_RVV_FOREACH(HWY_RVV_SLIDE_UP, SlideUp, slideup, _ALL)
HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL)

#undef HWY_RVV_SLIDE_UP
#undef HWY_RVV_SLIDE_DOWN
   3243 
// Native vget: extracts half kIndex of a register group (LMUL -> LMULH).
#define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
  template <size_t kIndex>                                                  \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {  \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(        \
        v, kIndex); /* no AVL */                                            \
  }
// Fractional ("virtual") LMUL: no native vget. The lower half is Trunc; the
// upper half is slid down by the half tag's lane count, then truncated.
#define HWY_RVV_GET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
                         SHIFT, MLEN, NAME, OP)                            \
  template <size_t kIndex>                                                 \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1");    \
    HWY_IF_CONSTEXPR(kIndex == 0) { return Trunc(v); }                     \
    HWY_IF_CONSTEXPR(kIndex != 0) {                                        \
      return Trunc(SlideDown(                                              \
          v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)),   \
                             SHIFT - 1){})));                              \
    }                                                                      \
  }
// Smallest supported LMUL for this lane type: no narrower vector type
// exists, so the result keeps the same type; the upper half is slid down by
// half the lane count.
#define HWY_RVV_GET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {     \
    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1");       \
    HWY_IF_CONSTEXPR(kIndex == 0) { return v; }                               \
    HWY_IF_CONSTEXPR(kIndex != 0) {                                           \
      return SlideDown(                                                       \
          v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)),      \
                             SHIFT){}) /                                      \
                 2);                                                          \
    }                                                                         \
  }
HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _GET_SET)
HWY_RVV_FOREACH(HWY_RVV_GET_VIRT, Get, get, _GET_SET_VIRT)
HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST)
#undef HWY_RVV_GET
#undef HWY_RVV_GET_VIRT
#undef HWY_RVV_GET_SMALLEST
   3282 
// Get with a d tag: like Get<kIndex>(v), but also supports partial vectors,
// whose upper half starts at Lanes(d)/2 rather than at half the register.
template <size_t kIndex, class D>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<AdjustSimdTagToMinVecPow2<Half<D>>>
Get(D d, VFromD<D> v) {
  static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1");
  HWY_IF_CONSTEXPR(kIndex == 0 || detail::IsFull(d)) { return Get<kIndex>(v); }
  HWY_IF_CONSTEXPR(kIndex != 0 && !detail::IsFull(d)) {
    const AdjustSimdTagToMinVecPow2<Half<decltype(d)>> dh;
    // If the half tag could shrink (its Pow2 is below v's), the upper half
    // begins after Lanes(dh); otherwise it begins at Lanes(d) / 2.
    const size_t slide_down_amt =
        (dh.Pow2() < DFromV<decltype(v)>().Pow2()) ? Lanes(dh) : (Lanes(d) / 2);
    return ResizeBitCast(dh, SlideDown(v, slide_down_amt));
  }
}
   3295 
// Writes half-vector v into half kIndex of dest when only half_N lanes are
// valid. kIndex == 0 uses a tail-undisturbed vmv (writes the first half_N
// lanes, leaving dest's remaining lanes intact); kIndex == 1 slides v up by
// half_N over dest.
#define HWY_RVV_PARTIAL_VEC_SET_HALF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                     LMULH, SHIFT, MLEN, NAME, OP)             \
  template <size_t kIndex>                                                     \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v,     \
           size_t half_N) {                                                    \
    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1");        \
    const DFromV<decltype(dest)> d;                                            \
    HWY_IF_CONSTEXPR(kIndex == 0) {                                            \
      return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v),       \
                                                        half_N);               \
    }                                                                          \
    HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, Ext(d, v), half_N); } \
  }
// Smallest LMUL: v already has the same type as dest, so no Ext is needed.
#define HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST(                              \
    BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
  template <size_t kIndex>                                                  \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v,   \
           size_t half_N) {                                                 \
    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1");     \
    HWY_IF_CONSTEXPR(kIndex == 0) {                                         \
      return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, v, half_N);   \
    }                                                                       \
    HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, v, half_N); }      \
  }
HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv, _GET_SET)
HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv,
                _GET_SET_VIRT)
HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST, PartialVecSetHalf, mv,
                _GET_SET_SMALLEST)
#undef HWY_RVV_PARTIAL_VEC_SET_HALF
#undef HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST
   3329 
// Set: writes half-width vector v into half kIndex of dest. For full
// vectors, uses the native vset; partial vectors go through
// PartialVecSetHalf with the half tag's lane count.
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,   \
                    MLEN, NAME, OP)                                           \
  template <size_t kIndex, size_t N>                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
           HWY_RVV_V(BASE, SEW, LMULH) v) {                                   \
    HWY_IF_CONSTEXPR(detail::IsFull(d)) {                                     \
      return __riscv_v##OP##_v_##CHAR##SEW##LMULH##_##CHAR##SEW##LMUL(        \
          dest, kIndex, v); /* no AVL */                                      \
    }                                                                         \
    HWY_IF_CONSTEXPR(!detail::IsFull(d)) {                                    \
      const Half<decltype(d)> dh;                                             \
      return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh));                   \
    }                                                                         \
  }
// Fractional LMUL: no native vset; always go through PartialVecSetHalf.
#define HWY_RVV_SET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,     \
                         SHIFT, MLEN, NAME, OP)                               \
  template <size_t kIndex, size_t N>                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
           HWY_RVV_V(BASE, SEW, LMULH) v) {                                   \
    const Half<decltype(d)> dh;                                               \
    return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh));                     \
  }
// Smallest LMUL: v has the same vector type as dest; the half occupies
// Lanes(d) / 2 lanes.
#define HWY_RVV_SET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
  template <size_t kIndex, size_t N>                                          \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
           HWY_RVV_V(BASE, SEW, LMUL) v) {                                    \
    return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2);                  \
  }
// Same as above, but for the virtual tag one Pow2 below the smallest LMUL.
#define HWY_RVV_SET_SMALLEST_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                  LMULH, SHIFT, MLEN, NAME, OP)             \
  template <size_t kIndex, size_t N>                                        \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT - 1) d,                            \
           HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2);                \
  }
HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _GET_SET)
HWY_RVV_FOREACH(HWY_RVV_SET_VIRT, Set, set, _GET_SET_VIRT)
HWY_RVV_FOREACH(HWY_RVV_SET_SMALLEST, Set, set, _GET_SET_SMALLEST)
HWY_RVV_FOREACH_UI163264(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST)
HWY_RVV_FOREACH_F(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST)
#undef HWY_RVV_SET
#undef HWY_RVV_SET_VIRT
#undef HWY_RVV_SET_SMALLEST
#undef HWY_RVV_SET_SMALLEST_VIRT
   3379 
// Set for lane types this target emulates via their unsigned bit pattern
// (HWY_RVV_IF_EMULATED_D): BitCast both operands to unsigned, Set there,
// then BitCast the result back.
template <size_t kIndex, class D, HWY_RVV_IF_EMULATED_D(D)>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<D> Set(
    D d, VFromD<D> dest, VFromD<AdjustSimdTagToMinVecPow2<Half<D>>> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Set<kIndex>(du, BitCast(du, dest),
                     BitCast(RebindToUnsigned<DFromV<decltype(v)>>(), v)));
}
   3388 
   3389 }  // namespace detail
   3390 
   3391 // ------------------------------ SlideUpLanes
   3392 template <class D>
   3393 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
   3394  return detail::SlideUp(Zero(d), v, amt);
   3395 }
   3396 
   3397 // ------------------------------ SlideDownLanes
   3398 template <class D>
   3399 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
   3400  v = detail::SlideDown(v, amt);
   3401  // Zero out upper lanes if v is a partial vector
   3402  if (MaxLanes(d) < MaxLanes(DFromV<decltype(v)>())) {
   3403    v = detail::SlideUp(v, Zero(d), Lanes(d) - amt);
   3404  }
   3405  return v;
   3406 }
   3407 
   3408 // ------------------------------ ConcatUpperLower
   3409 template <class D, class V>
   3410 HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
   3411  const auto lo_lower = detail::Get<0>(d, lo);
   3412  return detail::Set<0>(d, hi, lo_lower);
   3413 }
   3414 
   3415 // ------------------------------ ConcatLowerLower
   3416 template <class D, class V>
   3417 HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
   3418  const auto hi_lower = detail::Get<0>(d, hi);
   3419  return detail::Set<1>(d, lo, hi_lower);
   3420 }
   3421 
   3422 // ------------------------------ ConcatUpperUpper
   3423 template <class D, class V>
   3424 HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
   3425  const auto lo_upper = detail::Get<1>(d, lo);
   3426  return detail::Set<0>(d, hi, lo_upper);
   3427 }
   3428 
   3429 // ------------------------------ ConcatLowerUpper
   3430 template <class D, class V>
   3431 HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
   3432  const auto lo_upper = detail::Get<1>(d, lo);
   3433  const auto hi_lower = detail::Get<0>(d, hi);
   3434  return detail::Set<1>(d, ResizeBitCast(d, lo_upper), hi_lower);
   3435 }
   3436 
   3437 // ------------------------------ Combine
   3438 template <class D2, class V>
   3439 HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
   3440  return detail::Set<1>(d2, ResizeBitCast(d2, lo), hi);
   3441 }
   3442 
   3443 // ------------------------------ ZeroExtendVector
   3444 template <class D2, class V>
   3445 HWY_API VFromD<D2> ZeroExtendVector(D2 d2, const V lo) {
   3446  return Combine(d2, Xor(lo, lo), lo);
   3447 }
   3448 
// ------------------------------ Lower/UpperHalf

namespace detail {

// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
// that SEW = sizeof(T)*8 and LMUL = 1 << d.Pow2(). Add 3 to Pow2 to avoid
// negative shift counts.
template <class D>
constexpr bool IsSupportedLMUL(D d) {
  // Equivalent to LMUL >= SEW/64, i.e. 2^(Pow2 + 3) >= sizeof(T).
  return (size_t{1} << (d.Pow2() + 3)) >= sizeof(TFromD<D>);
}

}  // namespace detail
   3462 
// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL; the lower half of v
// already occupies the lower part of the register group.
template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
  return detail::Trunc(v);
}
   3468 
// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
// the hardware may set "vill" if we attempt such an LMUL. However, the V
// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
// still makes sense to have half of an SEW=64 vector. We instead just return
// the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
template <class DH, class V,
          hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
HWY_API V LowerHalf(const DH /* tag */, const V v) {
  return v;
}
   3479 
// Same, but without D arg: derive the half tag from V and dispatch above.
template <class V>
HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
  return LowerHalf(Half<DFromV<V>>(), v);
}
   3485 
   3486 template <class DH>
   3487 HWY_API VFromD<DH> UpperHalf(const DH /*d2*/, const VFromD<Twice<DH>> v) {
   3488  const Twice<DH> d;
   3489  return detail::Get<1>(d, v);
   3490 }
   3491 
   3492 // ================================================== SWIZZLE
   3493 
namespace detail {
// Special instruction for 1 lane is presumably faster?
// Slide1Up/Slide1Down shift by one lane and insert a scalar 0 (integer
// vslide1up/down) or 0.0 (float vfslide1up/down) into the vacated lane.
#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {      \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT));   \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
#undef HWY_RVV_SLIDE1
}  // namespace detail
   3508 
// ------------------------------ Slide1Up and Slide1Down
// Signal to generic_ops that this target provides native one-lane slides.
#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
#undef HWY_NATIVE_SLIDE1_UP_DOWN
#else
#define HWY_NATIVE_SLIDE1_UP_DOWN
#endif
   3515 
// Shifts all lanes up by one; lane 0 is filled with the zero scalar passed
// by detail::Slide1Up.
template <class D>
HWY_API VFromD<D> Slide1Up(D /*d*/, VFromD<D> v) {
  return detail::Slide1Up(v);
}
   3520 
   3521 template <class D>
   3522 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
   3523  v = detail::Slide1Down(v);
   3524  // Zero out upper lanes if v is a partial vector
   3525  if (MaxLanes(d) < MaxLanes(DFromV<decltype(v)>())) {
   3526    v = detail::SlideUp(v, Zero(d), Lanes(d) - 1);
   3527  }
   3528  return v;
   3529 }
   3530 
// ------------------------------ GetLane

// Reads lane 0 into a scalar; separate integer (vmv.x.s) and float
// (vfmv.f.s) instantiations follow.
#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,     \
                         SHIFT, MLEN, NAME, OP)                               \
  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {           \
    return __riscv_v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL)
#undef HWY_RVV_GET_LANE
   3542 
// ------------------------------ ExtractLane
// Extracts lane i by sliding it down into lane 0 and reading that out.
template <class V>
HWY_API TFromV<V> ExtractLane(const V v, size_t i) {
  return GetLane(detail::SlideDown(v, i));
}
   3548 
// ------------------------------ Additional mask logical operations

// Native mask-set ops: only-first (vmsof), before-first (vmsbf) and
// including-first (vmsif) set bit.
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof)
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetBeforeFirst, sbf)
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetAtOrBeforeFirst, sif)

// No dedicated instruction: complement of SetBeforeFirst.
#define HWY_RVV_SET_AT_OR_AFTER_FIRST(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) SetAtOrAfterFirst(HWY_RVV_M(MLEN) m) {  \
    return Not(SetBeforeFirst(m));                                \
  }

HWY_RVV_FOREACH_B(HWY_RVV_SET_AT_OR_AFTER_FIRST, _, _)
#undef HWY_RVV_SET_AT_OR_AFTER_FIRST
   3562 
   3563 // ------------------------------ InsertLane
   3564 
   3565 // T template arg because TFromV<V> might not match the hwy::float16_t argument.
   3566 template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
   3567 HWY_API V InsertLane(const V v, size_t i, T t) {
   3568  const Rebind<T, DFromV<V>> d;
   3569  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
   3570  using TU = TFromD<decltype(du)>;
   3571  const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
   3572  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
   3573 }
   3574 
   3575 // For 8-bit lanes, Iota0 might overflow.
   3576 template <class V, typename T, HWY_IF_T_SIZE_V(V, 1)>
   3577 HWY_API V InsertLane(const V v, size_t i, T t) {
   3578  const Rebind<T, DFromV<V>> d;
   3579  const auto zero = Zero(d);
   3580  const auto one = Set(d, 1);
   3581  const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
   3582  const auto is_i = SetOnlyFirst(ge_i);
   3583  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
   3584 }
   3585 
// ------------------------------ OddEven

namespace detail {

// Faster version using a wide constant instead of Iota0 + AndS.
// Set(duw, 1) writes 1 into each wide lane; reinterpreted as narrow lanes
// (little-endian lane order), even-indexed lanes hold 1 and odd-indexed
// lanes hold 0, so NeS(..., 0) marks the even lanes.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> IsEven(D d) {
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> duw;
  return RebindMask(d, detail::NeS(BitCast(du, Set(duw, 1)), 0u));
}

// 64-bit lanes have no wider type; test the LSB of the lane index instead.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> IsEven(D d) {
  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
  return detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
}

// Also provide the negated form because there is no native CompressNot.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> IsOdd(D d) {
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> duw;
  return RebindMask(d, detail::EqS(BitCast(du, Set(duw, 1)), 0u));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> IsOdd(D d) {
  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
  return detail::NeS(detail::AndS(detail::Iota0(du), 1), 0);
}

}  // namespace detail
   3619 
   3620 template <class V>
   3621 HWY_API V OddEven(const V a, const V b) {
   3622  return IfThenElse(detail::IsEven(DFromV<V>()), b, a);
   3623 }
   3624 
   3625 // ------------------------------ DupEven (OddEven)
   3626 template <class V>
   3627 HWY_API V DupEven(const V v) {
   3628  const V up = detail::Slide1Up(v);
   3629  return OddEven(up, v);
   3630 }
   3631 
   3632 // ------------------------------ DupOdd (OddEven)
   3633 template <class V>
   3634 HWY_API V DupOdd(const V v) {
   3635  const V down = detail::Slide1Down(v);
   3636  return OddEven(v, down);
   3637 }
   3638 
   3639 // ------------------------------ InterleaveEven (OddEven)
   3640 template <class D>
   3641 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3642  return OddEven(detail::Slide1Up(b), a);
   3643 }
   3644 
   3645 // ------------------------------ InterleaveOdd (OddEven)
   3646 template <class D>
   3647 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
   3648  return OddEven(b, detail::Slide1Down(a));
   3649 }
   3650 
// ------------------------------ OddEvenBlocks
// Returns b in even-numbered 128-bit blocks and a in odd-numbered blocks.
template <class V>
HWY_API V OddEvenBlocks(const V a, const V b) {
  const RebindToUnsigned<DFromV<V>> du;  // Iota0 is unsigned only
  // log2 of the number of lanes per 16-byte block.
  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
  // 128-bit block index of each lane.
  const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
  const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
  return IfThenElse(is_even, b, a);
}
   3660 
   3661 // ------------------------------ SwapAdjacentBlocks
   3662 template <class V>
   3663 HWY_API V SwapAdjacentBlocks(const V v) {
   3664  const DFromV<V> d;
   3665  const size_t lpb = detail::LanesPerBlock(d);
   3666  const V down = detail::SlideDown(v, lpb);
   3667  const V up = detail::SlideUp(v, v, lpb);
   3668  return OddEvenBlocks(up, down);
   3669 }
   3670 
   3671 // ------------------------------ InterleaveEvenBlocks
   3672 // (SlideUpLanes, OddEvenBlocks)
   3673 
   3674 template <class D, class V = VFromD<D>>
   3675 HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
   3676  const size_t lpb = detail::LanesPerBlock(d);
   3677  return OddEvenBlocks(SlideUpLanes(d, b, lpb), a);
   3678 }
   3679 
   3680 // ------------------------------ InterleaveOddBlocks
   3681 // (SlideDownLanes, OddEvenBlocks)
   3682 
   3683 template <class D, class V = VFromD<D>>
   3684 HWY_API V InterleaveOddBlocks(D d, V a, V b) {
   3685  const size_t lpb = detail::LanesPerBlock(d);
   3686  return OddEvenBlocks(b, SlideDownLanes(d, a, lpb));
   3687 }
   3688 
// ------------------------------ TableLookupLanes

// Validates (debug builds only) and bit-casts lane indices to unsigned.
template <class D, class VI>
HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
  static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
  const RebindToUnsigned<decltype(d)> du;  // instead of <D>: avoids unused d.
  const auto indices = BitCast(du, vec);
#if HWY_IS_DEBUG_BUILD
  using TU = TFromD<decltype(du)>;
  // Indices below twice the lane count are accepted, which presumably
  // allows selecting from a second table vector — TODO confirm with callers.
  const size_t twice_num_of_lanes = Lanes(d) * 2;
  HWY_DASSERT(AllTrue(
      du, Eq(indices,
             detail::AndS(indices, static_cast<TU>(twice_num_of_lanes - 1)))));
#endif
  return indices;
}
   3705 
// Loads lane indices from memory, then validates/bit-casts them via
// IndicesFromVec.
template <class D, typename TI>
HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
}
   3711 
// Per-type wrapper around the vrgather.vv gather instruction: result lane i is
// v[idx[i]].
#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) {    \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx,                       \
                                                HWY_RVV_AVL(SEW, SHIFT));     \
  }

// TableLookupLanes is supported for all types, but beware that indices are
// likely to wrap around for 8-bit lanes. When using TableLookupLanes inside
// this file, ensure that it is safe or use TableLookupLanes16 instead.
HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
#undef HWY_RVV_TABLE
   3725 
namespace detail {

// Gather with u16 indices (doubled SEW/LMUL for the index vector): avoids the
// 8-bit index wraparound mentioned above for TableLookupLanes.
#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,     \
                        SHIFT, MLEN, NAME, OP)                               \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx,                      \
                                                HWY_RVV_AVL(SEW, SHIFT));    \
  }

// NOTE(review): _EXT presumably excludes the largest LMUL (idx needs LMULD);
// confirm against the HWY_RVV_FOREACH definitions.
HWY_RVV_FOREACH_UI08(HWY_RVV_TABLE16, TableLookupLanes16, rgatherei16, _EXT)
#undef HWY_RVV_TABLE16

// Used by Expand.
// Masked gather: lanes whose mask bit is false come from `maskedoff`
// (_mu suffix: masked, tail/mask-undisturbed policy).
#define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
                             SHIFT, MLEN, NAME, OP)                            \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff,         \
           HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) {     \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx,  \
                                                     HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_MASKED_TABLE, MaskedTableLookupLanes, rgather, _ALL)
#undef HWY_RVV_MASKED_TABLE

// Masked variant of TableLookupLanes16 (u16 indices for 8-bit lanes).
#define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD,       \
                               LMULH, SHIFT, MLEN, NAME, OP)                   \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
      NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff,         \
           HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) {   \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx,  \
                                                     HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI08(HWY_RVV_MASKED_TABLE16, MaskedTableLookupLanes16,
                     rgatherei16, _EXT)
#undef HWY_RVV_MASKED_TABLE16

}  // namespace detail
   3766 
   3767 // ------------------------------ Reverse (TableLookupLanes)
   3768 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_LE_D(D, 2)>
   3769 HWY_API VFromD<D> Reverse(D d, VFromD<D> v) {
   3770  const Rebind<uint16_t, decltype(d)> du16;
   3771  const size_t N = Lanes(d);
   3772  const auto idx =
   3773      detail::ReverseSubS(detail::Iota0(du16), static_cast<uint16_t>(N - 1));
   3774  return detail::TableLookupLanes16(v, idx);
   3775 }
   3776 
   3777 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_GT_D(D, 2)>
   3778 HWY_API VFromD<D> Reverse(D d, VFromD<D> v) {
   3779  const Half<decltype(d)> dh;
   3780  const Rebind<uint16_t, decltype(dh)> du16;
   3781  const size_t half_n = Lanes(dh);
   3782  const auto idx = detail::ReverseSubS(detail::Iota0(du16),
   3783                                       static_cast<uint16_t>(half_n - 1));
   3784  const auto reversed_lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
   3785  const auto reversed_hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
   3786  return Combine(d, reversed_lo, reversed_hi);
   3787 }
   3788 
   3789 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
   3790 HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
   3791  const RebindToUnsigned<D> du;
   3792  using TU = TFromD<decltype(du)>;
   3793  const size_t N = Lanes(du);
   3794  const auto idx =
   3795      detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
   3796  return TableLookupLanes(v, idx);
   3797 }
   3798 
// ------------------------------ ResizeBitCast

// Extends or truncates a vector to match the given d.
namespace detail {

// Base case: source and destination already have the same LMUL.
template <class D>
HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
  return v;
}

// Sanity check: when calling ChangeLMUL, the caller (ResizeBitCast) already
// BitCast to the same lane type. Note that V may use the native lane type for
// f16, so convert D to that before checking.
#define HWY_RVV_IF_SAME_T_DV(D, V) \
  hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr

// LMUL of VFromD<D> < LMUL of V: need to truncate v
// Recurses via Trunc, halving the LMUL each step until it matches D.
template <class D, class V,  // HWY_RVV_IF_SAME_T_DV(D, V),
          HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
  const DFromV<V> d_from;
  const Half<decltype(d_from)> dh_from;
  static_assert(
      DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
      "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
  static_assert(
      DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
      "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
      "VFromD<decltype(dh_from)>");
  return ChangeLMUL(d, Trunc(v));
}

// LMUL of VFromD<D> > LMUL of V: need to extend v
// Recurses via Ext, doubling the LMUL each step until it matches D.
template <class D, class V,  // HWY_RVV_IF_SAME_T_DV(D, V),
          HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
  const DFromV<V> d_from;
  const Twice<decltype(d_from)> dt_from;
  static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
                "The LMUL of VFromD<decltype(dt_from)> must be greater than "
                "the LMUL of V");
  static_assert(
      DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
      "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
      "VFromD<decltype(dt_from)>");
  return ChangeLMUL(d, Ext(dt_from, v));
}

#undef HWY_RVV_IF_SAME_T_DV

}  // namespace detail
   3850 
   3851 template <class DTo, class VFrom>
   3852 HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) {
   3853  const DFromV<decltype(v)> d_from;
   3854  const Repartition<uint8_t, decltype(d_from)> du8_from;
   3855  const DFromV<VFromD<DTo>> d_to;
   3856  const Repartition<uint8_t, decltype(d_to)> du8_to;
   3857  return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
   3858 }
   3859 
   3860 // ------------------------------ Reverse2 (RotateRight, OddEven)
   3861 
   3862 // Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
   3863 #ifdef HWY_NATIVE_REVERSE2_8
   3864 #undef HWY_NATIVE_REVERSE2_8
   3865 #else
   3866 #define HWY_NATIVE_REVERSE2_8
   3867 #endif
   3868 
   3869 // Shifting and adding requires fewer instructions than blending, but casting to
   3870 // u32 only works for LMUL in [1/2, 8].
   3871 
   3872 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   3873 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
   3874  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16;
   3875  return ResizeBitCast(d, RotateRight<8>(ResizeBitCast(du16, v)));
   3876 }
   3877 
   3878 template <class D, HWY_IF_T_SIZE_D(D, 2)>
   3879 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
   3880  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
   3881  return ResizeBitCast(d, RotateRight<16>(ResizeBitCast(du32, v)));
   3882 }
   3883 
   3884 // Shifting and adding requires fewer instructions than blending, but casting to
   3885 // u64 does not work for LMUL < 1.
   3886 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   3887 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
   3888  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
   3889  return ResizeBitCast(d, RotateRight<32>(ResizeBitCast(du64, v)));
   3890 }
   3891 
   3892 template <class D, class V = VFromD<D>, HWY_IF_T_SIZE_D(D, 8)>
   3893 HWY_API V Reverse2(D /* tag */, const V v) {
   3894  const V up = detail::Slide1Up(v);
   3895  const V down = detail::Slide1Down(v);
   3896  return OddEven(up, down);
   3897 }
   3898 
   3899 // ------------------------------ Reverse4 (TableLookupLanes)
   3900 
   3901 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   3902 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
   3903  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16;
   3904  return ResizeBitCast(d, Reverse2(du16, ResizeBitCast(du16, Reverse2(d, v))));
   3905 }
   3906 
   3907 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   3908 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
   3909  const RebindToUnsigned<D> du;
   3910  const auto idx = detail::XorS(detail::Iota0(du), 3);
   3911  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
   3912 }
   3913 
   3914 // ------------------------------ Reverse8 (TableLookupLanes)
   3915 
   3916 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   3917 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
   3918  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
   3919  return ResizeBitCast(d, Reverse2(du32, ResizeBitCast(du32, Reverse4(d, v))));
   3920 }
   3921 
   3922 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   3923 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
   3924  const RebindToUnsigned<D> du;
   3925  const auto idx = detail::XorS(detail::Iota0(du), 7);
   3926  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
   3927 }
   3928 
   3929 // ------------------------------ ReverseBlocks (Reverse, Shuffle01)
   3930 template <class D, class V = VFromD<D>>
   3931 HWY_API V ReverseBlocks(D d, V v) {
   3932  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
   3933  const size_t N = Lanes(du64);
   3934  const auto rev =
   3935      detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
   3936  // Swap lo/hi u64 within each block
   3937  const auto idx = detail::XorS(rev, 1);
   3938  return ResizeBitCast(d, TableLookupLanes(ResizeBitCast(du64, v), idx));
   3939 }
   3940 
// ------------------------------ Compress

// RVV supports all lane types natively.
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

// value = 0: vcompress packs the selected lanes to the front but makes no
// guarantee about the remaining lanes, so Compress is not a partition.
template <typename T>
struct CompressIsPartition {
  enum { value = 0 };
};
   3954 
// Wrapper around vcompress.vm: packs the lanes selected by `mask` to the
// front of the result.
#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP)                           \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) {          \
    return __riscv_v##OP##_vm_##CHAR##SEW##LMUL(v, mask,                  \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_COMPRESS, Compress, compress, _ALL)
#undef HWY_RVV_COMPRESS
   3965 
   3966 // ------------------------------ Expand
   3967 
   3968 #ifdef HWY_NATIVE_EXPAND
   3969 #undef HWY_NATIVE_EXPAND
   3970 #else
   3971 #define HWY_NATIVE_EXPAND
   3972 #endif
   3973 
   3974 // >= 2-byte lanes: idx lanes will not overflow.
   3975 template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 1)>
   3976 HWY_API V Expand(V v, const M mask) {
   3977  const DFromV<V> d;
   3978  const RebindToUnsigned<decltype(d)> du;
   3979  const auto idx = detail::MaskedIota(du, RebindMask(du, mask));
   3980  const V zero = Zero(d);
   3981  return detail::MaskedTableLookupLanes(mask, zero, v, idx);
   3982 }
   3983 
   3984 // 1-byte lanes, LMUL < 8: promote idx to u16.
   3985 template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>,
   3986          HWY_IF_POW2_LE_D(D, 2)>
   3987 HWY_API V Expand(V v, const M mask) {
   3988  const D d;
   3989  const Rebind<uint16_t, decltype(d)> du16;
   3990  const auto idx = detail::MaskedIota(du16, RebindMask(du16, mask));
   3991  const V zero = Zero(d);
   3992  return detail::MaskedTableLookupLanes16(mask, zero, v, idx);
   3993 }
   3994 
// 1-byte lanes, max LMUL: unroll 2x.
// Expands each half independently; the upper half's inputs begin right after
// the `count` input lanes consumed by the lower half.
template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>,
          HWY_IF_POW2_GT_D(DFromV<V>, 2)>
HWY_API V Expand(V v, const M mask) {
  const D d;
  const Half<D> dh;
  const auto v0 = LowerHalf(dh, v);
  // TODO(janwas): skip vec<->mask if we can cast masks.
  const V vmask = VecFromMask(d, mask);
  const auto m0 = MaskFromVec(LowerHalf(dh, vmask));

  // Cannot just use UpperHalf, must shift by the number of inputs consumed.
  const size_t count = CountTrue(dh, m0);
  const auto v1 = detail::Trunc(detail::SlideDown(v, count));
  const auto m1 = MaskFromVec(UpperHalf(dh, vmask));
  return Combine(d, Expand(v1, m1), Expand(v0, m0));
}
   4012 
   4013 // ------------------------------ LoadExpand
   4014 template <class D>
   4015 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
   4016                             const TFromD<D>* HWY_RESTRICT unaligned) {
   4017  return Expand(LoadU(d, unaligned), mask);
   4018 }
   4019 
   4020 // ------------------------------ CompressNot
   4021 template <class V, class M>
   4022 HWY_API V CompressNot(V v, const M mask) {
   4023  return Compress(v, Not(mask));
   4024 }
   4025 
   4026 // ------------------------------ CompressBlocksNot
   4027 template <class V, class M>
   4028 HWY_API V CompressBlocksNot(V v, const M mask) {
   4029  return CompressNot(v, mask);
   4030 }
   4031 
   4032 // ------------------------------ CompressStore
   4033 template <class V, class M, class D>
   4034 HWY_API size_t CompressStore(const V v, const M mask, const D d,
   4035                             TFromD<D>* HWY_RESTRICT unaligned) {
   4036  StoreU(Compress(v, mask), d, unaligned);
   4037  return CountTrue(d, mask);
   4038 }
   4039 
   4040 // ------------------------------ CompressBlendedStore
   4041 template <class V, class M, class D>
   4042 HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
   4043                                    TFromD<D>* HWY_RESTRICT unaligned) {
   4044  const size_t count = CountTrue(d, mask);
   4045  StoreN(Compress(v, mask), d, unaligned, count);
   4046  return count;
   4047 }
   4048 
   4049 // ================================================== COMPARE (2)
   4050 
   4051 // ------------------------------ FindLastTrue
   4052 
   4053 template <class D>
   4054 HWY_API intptr_t FindLastTrue(D d, MFromD<D> m) {
   4055  const RebindToSigned<decltype(d)> di;
   4056  const intptr_t fft_rev_idx =
   4057      FindFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m))));
   4058  return (fft_rev_idx >= 0)
   4059             ? (static_cast<intptr_t>(Lanes(d) - 1) - fft_rev_idx)
   4060             : intptr_t{-1};
   4061 }
   4062 
   4063 template <class D>
   4064 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> m) {
   4065  const RebindToSigned<decltype(d)> di;
   4066  const size_t fft_rev_idx =
   4067      FindKnownFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m))));
   4068  return Lanes(d) - 1 - fft_rev_idx;
   4069 }
   4070 
// ------------------------------ ConcatOdd (Compress)

namespace detail {

// Narrowing right shift (vnsrl): from a double-width vector, returns the low
// SEW bits of each wide lane after shifting it right by kShift.
#define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <size_t kShift>                                                     \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEWD, LMULD) v) {    \
    return __riscv_v##OP##_wx_##CHAR##SEW##LMUL(v, kShift,                     \
                                                HWY_RVV_AVL(SEWD, SHIFT + 1)); \
  }

HWY_RVV_FOREACH_U08(HWY_RVV_NARROW, Narrow, nsrl, _EXT)
HWY_RVV_FOREACH_U16(HWY_RVV_NARROW, Narrow, nsrl, _EXT)
HWY_RVV_FOREACH_U32(HWY_RVV_NARROW, Narrow, nsrl, _EXT)
#undef HWY_RVV_NARROW

}  // namespace detail
   4089 
   4090 // Casting to wider and narrowing is the fastest for < 64-bit lanes.
   4091 template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
   4092 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   4093  constexpr size_t kBits = sizeof(TFromD<D>) * 8;
   4094  const Twice<decltype(d)> dt;
   4095  const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw;
   4096  const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo));
   4097  return BitCast(d, detail::Narrow<kBits>(hl));
   4098 }
   4099 
   4100 // 64-bit: Combine+Compress.
   4101 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
   4102 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   4103  const Twice<decltype(d)> dt;
   4104  const VFromD<decltype(dt)> hl = Combine(dt, hi, lo);
   4105  return LowerHalf(d, Compress(hl, detail::IsOdd(dt)));
   4106 }
   4107 
   4108 // Any type, max LMUL: Compress both, then Combine.
   4109 template <class D, HWY_IF_POW2_GT_D(D, 2)>
   4110 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   4111  const Half<decltype(d)> dh;
   4112  const MFromD<D> is_odd = detail::IsOdd(d);
   4113  const VFromD<decltype(d)> hi_odd = Compress(hi, is_odd);
   4114  const VFromD<decltype(d)> lo_odd = Compress(lo, is_odd);
   4115  return Combine(d, LowerHalf(dh, hi_odd), LowerHalf(dh, lo_odd));
   4116 }
   4117 
   4118 // ------------------------------ ConcatEven (Compress)
   4119 
   4120 // Casting to wider and narrowing is the fastest for < 64-bit lanes.
   4121 template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
   4122 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   4123  const Twice<decltype(d)> dt;
   4124  const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw;
   4125  const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo));
   4126  return BitCast(d, detail::Narrow<0>(hl));
   4127 }
   4128 
   4129 // 64-bit: Combine+Compress.
   4130 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
   4131 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   4132  const Twice<decltype(d)> dt;
   4133  const VFromD<decltype(dt)> hl = Combine(dt, hi, lo);
   4134  return LowerHalf(d, Compress(hl, detail::IsEven(dt)));
   4135 }
   4136 
   4137 // Any type, max LMUL: Compress both, then Combine.
   4138 template <class D, HWY_IF_POW2_GT_D(D, 2)>
   4139 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
   4140  const Half<decltype(d)> dh;
   4141  const MFromD<D> is_even = detail::IsEven(d);
   4142  const VFromD<decltype(d)> hi_even = Compress(hi, is_even);
   4143  const VFromD<decltype(d)> lo_even = Compress(lo, is_even);
   4144  return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even));
   4145 }
   4146 
   4147 // ------------------------------ PromoteEvenTo/PromoteOddTo
   4148 #include "hwy/ops/inside-inl.h"
   4149 
   4150 // ================================================== BLOCKWISE
   4151 
   4152 // ------------------------------ CombineShiftRightBytes
   4153 template <size_t kBytes, class D, class V = VFromD<D>>
   4154 HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
   4155  const Repartition<uint8_t, decltype(d)> d8;
   4156  const auto hi8 = BitCast(d8, hi);
   4157  const auto lo8 = BitCast(d8, lo);
   4158  const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
   4159  const auto lo_down = detail::SlideDown(lo8, kBytes);
   4160  const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
   4161  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
   4162 }
   4163 
   4164 // ------------------------------ CombineShiftRightLanes
   4165 template <size_t kLanes, class D, class V = VFromD<D>>
   4166 HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
   4167  constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
   4168  const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
   4169  const auto lo_down = detail::SlideDown(lo, kLanes);
   4170  const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
   4171  return IfThenElse(is_lo, lo_down, hi_up);
   4172 }
   4173 
   4174 // ------------------------------ Shuffle2301 (ShiftLeft)
   4175 template <class V>
   4176 HWY_API V Shuffle2301(const V v) {
   4177  const DFromV<V> d;
   4178  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
   4179  const Repartition<uint64_t, decltype(d)> du64;
   4180  const auto v64 = BitCast(du64, v);
   4181  return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
   4182 }
   4183 
   4184 // ------------------------------ Shuffle2103
   4185 template <class V>
   4186 HWY_API V Shuffle2103(const V v) {
   4187  const DFromV<V> d;
   4188  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
   4189  return CombineShiftRightLanes<3>(d, v, v);
   4190 }
   4191 
   4192 // ------------------------------ Shuffle0321
   4193 template <class V>
   4194 HWY_API V Shuffle0321(const V v) {
   4195  const DFromV<V> d;
   4196  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
   4197  return CombineShiftRightLanes<1>(d, v, v);
   4198 }
   4199 
   4200 // ------------------------------ Shuffle1032
   4201 template <class V>
   4202 HWY_API V Shuffle1032(const V v) {
   4203  const DFromV<V> d;
   4204  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
   4205  return CombineShiftRightLanes<2>(d, v, v);
   4206 }
   4207 
   4208 // ------------------------------ Shuffle01
   4209 template <class V>
   4210 HWY_API V Shuffle01(const V v) {
   4211  const DFromV<V> d;
   4212  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
   4213  return CombineShiftRightLanes<1>(d, v, v);
   4214 }
   4215 
   4216 // ------------------------------ Shuffle0123
   4217 template <class V>
   4218 HWY_API V Shuffle0123(const V v) {
   4219  return Shuffle2301(Shuffle1032(v));
   4220 }
   4221 
// ------------------------------ TableLookupBytes

// Per-block byte-table lookup: result byte i is vt[block(i) * 16 + vi[i]].
template <class VT, class VI>
HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
  const DFromV<VT> dt;  // T=table, I=index.
  const DFromV<VI> di;
  const Repartition<uint8_t, decltype(dt)> dt8;
  const Repartition<uint8_t, decltype(di)> di8;
  // Required for producing half-vectors with table lookups from a full vector.
  // If we instead run at the LMUL of the index vector, lookups into the table
  // would be truncated. Thus we run at the larger of the two LMULs and truncate
  // the result vector to the original index LMUL.
  constexpr int kPow2T = dt8.Pow2();
  constexpr int kPow2I = di8.Pow2();
  const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8;  // m=max
  const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt));
  const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi));
  // Indices are relative to each 128-bit block; add the block's byte offset.
  auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
  // If the table is shorter, wrap around offsets so they do not reference
  // undefined lanes in the newly extended vmt.
  if (kPow2T < kPow2I) {
    offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1));
  }
  const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
  return BitCast(di, detail::ChangeLMUL(di8, out));
}
   4248 
   4249 template <class VT, class VI>
   4250 HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
   4251  const DFromV<VI> di;
   4252  const Repartition<int8_t, decltype(di)> di8;
   4253  const auto idx8 = BitCast(di8, idx);
   4254  const auto lookup = TableLookupBytes(vt, idx8);
   4255  return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup));
   4256 }
   4257 
   4258 // ------------------------------ TwoTablesLookupLanes
   4259 
   4260 // WARNING: 8-bit lanes may lead to unexpected results because idx is the same
   4261 // size and may overflow.
   4262 template <class D, HWY_IF_POW2_LE_D(D, 2)>
   4263 HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
   4264                                       VFromD<RebindToUnsigned<D>> idx) {
   4265  const Twice<decltype(d)> dt;
   4266  const RebindToUnsigned<decltype(dt)> dt_u;
   4267  const auto combined_tbl = Combine(dt, b, a);
   4268  const auto combined_idx = Combine(dt_u, idx, idx);
   4269  return LowerHalf(d, TableLookupLanes(combined_tbl, combined_idx));
   4270 }
   4271 
   4272 template <class D, HWY_IF_POW2_GT_D(D, 2)>
   4273 HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
   4274                                       VFromD<RebindToUnsigned<D>> idx) {
   4275  const RebindToUnsigned<decltype(d)> du;
   4276  using TU = TFromD<decltype(du)>;
   4277 
   4278  const size_t num_of_lanes = Lanes(d);
   4279  const auto idx_mod = detail::AndS(idx, static_cast<TU>(num_of_lanes - 1));
   4280  const auto sel_a_mask = Ne(idx, idx_mod);  // FALSE if a
   4281 
   4282  const auto a_lookup_result = TableLookupLanes(a, idx_mod);
   4283  return detail::MaskedTableLookupLanes(sel_a_mask, a_lookup_result, b,
   4284                                        idx_mod);
   4285 }
   4286 
   4287 template <class V>
   4288 HWY_API V TwoTablesLookupLanes(V a, V b,
   4289                               VFromD<RebindToUnsigned<DFromV<V>>> idx) {
   4290  const DFromV<decltype(a)> d;
   4291  return TwoTablesLookupLanes(d, a, b, idx);
   4292 }
   4293 
// ------------------------------ Broadcast

// 8-bit requires 16-bit tables.
template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
          HWY_IF_POW2_LE_D(D, 2)>
HWY_API V Broadcast(const V v) {
  const D d;
  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));

  // idx[i] = (offset of i's 128-bit block) + kLane: all lanes of a block read
  // the same source lane within that block.
  const Rebind<uint16_t, decltype(d)> du16;
  VFromD<decltype(du16)> idx =
      detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
  if (kLane != 0) {
    idx = detail::AddS(idx, kLane);
  }
  return detail::TableLookupLanes16(v, idx);
}
   4311 
// 8-bit and max LMUL: split into halves. (A u16 index vector covering the full
// input would presumably exceed the maximum LMUL — hence per-half lookups.)
template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
          HWY_IF_POW2_GT_D(D, 2)>
HWY_API V Broadcast(const V v) {
  const D d;
  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));

  const Half<decltype(d)> dh;
  using VH = VFromD<decltype(dh)>;
  const Rebind<uint16_t, decltype(dh)> du16;
  // idx[i] = (offset of i's 128-bit block) + kLane; the same indices apply to
  // both halves.
  VFromD<decltype(du16)> idx =
      detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
  if (kLane != 0) {
    idx = detail::AddS(idx, kLane);
  }
  const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
  const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
  return Combine(d, hi, lo);
}
   4331 
// >= 16-bit lanes: same-width indices suffice.
template <int kLane, class V, class D = DFromV<V>,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V Broadcast(const V v) {
  const D d;
  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));

  // idx[i] = (offset of i's 128-bit block) + kLane.
  const RebindToUnsigned<decltype(d)> du;
  auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
  if (kLane != 0) {
    idx = detail::AddS(idx, kLane);
  }
  return TableLookupLanes(v, idx);
}
   4345 
// ------------------------------ BroadcastLane
#ifdef HWY_NATIVE_BROADCASTLANE
#undef HWY_NATIVE_BROADCASTLANE
#else
#define HWY_NATIVE_BROADCASTLANE
#endif

namespace detail {

// vrgather.vx with a scalar index: every result lane receives v[idx].
#define HWY_RVV_BROADCAST_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD,  \
                               LMULH, SHIFT, MLEN, NAME, OP)              \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t idx) {                    \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, idx,                   \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_BROADCAST_LANE, BroadcastLane, rgather, _ALL)
#undef HWY_RVV_BROADCAST_LANE

}  // namespace detail
   4367 
   4368 template <int kLane, class V>
   4369 HWY_API V BroadcastLane(V v) {
   4370  static_assert(0 <= kLane && kLane < HWY_MAX_LANES_V(V), "Invalid lane");
   4371  return detail::BroadcastLane(v, static_cast<size_t>(kLane));
   4372 }
   4373 
   4374 // ------------------------------ InsertBlock
   4375 #ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
   4376 #undef HWY_NATIVE_BLK_INSERT_EXTRACT
   4377 #else
   4378 #define HWY_NATIVE_BLK_INSERT_EXTRACT
   4379 #endif
   4380 
// Replaces 128-bit block kBlockIdx of v with blk_to_insert; all other blocks
// are returned unchanged.
template <int kBlockIdx, class V>
HWY_API V InsertBlock(V v, VFromD<BlockDFromD<DFromV<V>>> blk_to_insert) {
  const DFromV<decltype(v)> d;
  // For 8-bit lanes with LMUL >= 1/4, operate on u16 lanes; otherwise reuse
  // the unsigned counterpart of the lane type.
  using TU = If<(sizeof(TFromV<V>) == 1 && DFromV<V>().Pow2() >= -2), uint16_t,
                MakeUnsigned<TFromV<V>>>;
  // Index computations use at least 16-bit lanes so Iota0 cannot wrap.
  using TIdx = If<sizeof(TU) == 1, uint16_t, TU>;

  const Repartition<TU, decltype(d)> du;
  const Rebind<TIdx, decltype(du)> d_idx;
  static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
                "Invalid block index");
  constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU);

  // NOTE(review): despite the name, this is an offset in TU *lanes*
  // (kBlockIdx * lanes-per-block), not in bytes.
  constexpr size_t kBlkByteOffset =
      static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock;
  const auto vu = BitCast(du, v);
  const auto vblk = ResizeBitCast(du, blk_to_insert);
  // Slide the block contents up so they line up with block kBlockIdx.
  const auto vblk_shifted = detail::SlideUp(vblk, vblk, kBlkByteOffset);
  // True exactly for the lanes of the target block: (i - offset) < lanes
  // per block, relying on unsigned wraparound to exclude lanes i < offset.
  const auto insert_mask = RebindMask(
      du, detail::LtS(detail::SubS(detail::Iota0(d_idx),
                                   static_cast<TIdx>(kBlkByteOffset)),
                      static_cast<TIdx>(kMaxLanesPerBlock)));

  return BitCast(d, IfThenElse(insert_mask, vblk_shifted, vu));
}
   4406 
   4407 // ------------------------------ BroadcastBlock
// Broadcasts 128-bit block kBlockIdx of v to all blocks. This overload
// handles Pow2 <= -3 (LMUL 1/8) by gathering bytes with 16-bit indices.
template <int kBlockIdx, class V, HWY_IF_POW2_LE_D(DFromV<V>, -3)>
HWY_API V BroadcastBlock(V v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const Rebind<uint16_t, decltype(d)> du16;

  static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
                "Invalid block index");

  // Byte position within a block (i & 15) plus the source block's byte
  // offset (kBlockIdx * 16).
  const auto idx = detail::AddS(detail::AndS(detail::Iota0(du16), uint16_t{15}),
                                static_cast<uint16_t>(kBlockIdx * 16));
  return BitCast(d, detail::TableLookupLanes16(BitCast(du8, v), idx));
}
   4421 
// Broadcasts 128-bit block kBlockIdx of v to all blocks (Pow2 > -3).
template <int kBlockIdx, class V, HWY_IF_POW2_GT_D(DFromV<V>, -3)>
HWY_API V BroadcastBlock(V v) {
  const DFromV<decltype(v)> d;
  // For 8-bit lanes, gather as u16 lanes so the index vector cannot wrap.
  using TU = If<sizeof(TFromV<V>) == 1, uint16_t, MakeUnsigned<TFromV<V>>>;
  const Repartition<TU, decltype(d)> du;

  static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
                "Invalid block index");
  constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU);

  // Lane position within a block plus the source block's starting lane.
  const auto idx = detail::AddS(
      detail::AndS(detail::Iota0(du), static_cast<TU>(kMaxLanesPerBlock - 1)),
      static_cast<TU>(static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock));
  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
}
   4437 
   4438 // ------------------------------ ExtractBlock
// Returns 128-bit block kBlockIdx of v as a block-sized vector.
template <int kBlockIdx, class V>
HWY_API VFromD<BlockDFromD<DFromV<V>>> ExtractBlock(V v) {
  const DFromV<decltype(v)> d;
  const BlockDFromD<decltype(d)> d_block;

  static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
                "Invalid block index");
  constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TFromD<decltype(d)>);
  // NOTE(review): despite the name, this is an offset in lanes, not bytes.
  constexpr size_t kBlkByteOffset =
      static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock;

  // Slide the desired block down to lane 0, then truncate to one block.
  return ResizeBitCast(d_block, detail::SlideDown(v, kBlkByteOffset));
}
   4452 
   4453 // ------------------------------ ShiftLeftLanes
   4454 
   4455 template <size_t kLanes, class D, class V = VFromD<D>>
   4456 HWY_API V ShiftLeftLanes(const D d, const V v) {
   4457  const RebindToSigned<decltype(d)> di;
   4458  const RebindToUnsigned<decltype(d)> du;
   4459  using TI = TFromD<decltype(di)>;
   4460  const auto shifted = detail::SlideUp(v, v, kLanes);
   4461  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
   4462  const auto idx_mod =
   4463      detail::AndS(BitCast(di, detail::Iota0(du)),
   4464                   static_cast<TI>(detail::LanesPerBlock(di) - 1));
   4465  const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes));
   4466  return IfThenZeroElse(clear, shifted);
   4467 }
   4468 
   4469 template <size_t kLanes, class V>
   4470 HWY_API V ShiftLeftLanes(const V v) {
   4471  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
   4472 }
   4473 
   4474 // ------------------------------ ShiftLeftBytes
   4475 
   4476 template <int kBytes, class D>
   4477 HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) {
   4478  const Repartition<uint8_t, decltype(d)> d8;
   4479  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
   4480 }
   4481 
   4482 template <int kBytes, class V>
   4483 HWY_API V ShiftLeftBytes(const V v) {
   4484  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
   4485 }
   4486 
   4487 // ------------------------------ ShiftRightLanes
// Shifts lanes towards lower indices within each 128-bit block, zeroing the
// vacated upper lanes (x86 semantics).
template <size_t kLanes, typename T, size_t N, int kPow2,
          class V = VFromD<Simd<T, N, kPow2>>>
HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  using TI = TFromD<decltype(di)>;
  // For partial vectors, clear upper lanes so we shift in zeros.
  if (N <= 16 / sizeof(T)) {
    v = detail::SlideUp(v, Zero(d), N);
  }

  const auto shifted = detail::SlideDown(v, kLanes);
  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
  const size_t lpb = detail::LanesPerBlock(di);
  const auto idx_mod =
      detail::AndS(BitCast(di, detail::Iota0(du)), static_cast<TI>(lpb - 1));
  // Keep lanes whose in-block index is below lpb - kLanes; the remaining
  // (vacated) lanes of each block become zero.
  const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes));
  return IfThenElseZero(keep, shifted);
}
   4507 
   4508 // ------------------------------ ShiftRightBytes
   4509 template <int kBytes, class D, class V = VFromD<D>>
   4510 HWY_API V ShiftRightBytes(const D d, const V v) {
   4511  const Repartition<uint8_t, decltype(d)> d8;
   4512  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
   4513 }
   4514 
   4515 // ------------------------------ InterleaveWholeLower
   4516 #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
   4517 #undef HWY_NATIVE_INTERLEAVE_WHOLE
   4518 #else
   4519 #define HWY_NATIVE_INTERLEAVE_WHOLE
   4520 #endif
   4521 
   4522 namespace detail {
// Returns double-length vector with interleaved lanes: a in even positions,
// b in odd. Implemented by zero-extending both halves to the wide lane type
// and OR-ing in b slid up by one narrow lane.
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_POW2_GT_D(D, -3)>
HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
  const RebindToUnsigned<decltype(d)> du;
  using TW = MakeWide<TFromD<decltype(du)>>;
  const Rebind<TW, Half<decltype(du)>> dw;
  const Half<decltype(du)> duh;  // cast inputs to unsigned so we zero-extend

  const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a));
  const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b));
  return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw)))));
}
// 64-bit: no wider lane type exists so we cannot PromoteTo, but we can Ext
// the inputs to the double-size tag and gather each source lane twice
// (idx = i / 2), picking a for even result lanes and b for odd ones.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
  const RebindToUnsigned<decltype(d)> du;
  const auto idx = ShiftRight<1>(detail::Iota0(du));
  return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
                 TableLookupLanes(detail::Ext(d, a), idx));
}
   4544 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
   4545 HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
   4546  const Half<D> dh;
   4547  const Half<decltype(dh)> dq;
   4548  const VFromD<decltype(dh)> i0 =
   4549      InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
   4550  const VFromD<decltype(dh)> i1 =
   4551      InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
   4552  return Combine(d, i1, i0);
   4553 }
   4554 
   4555 }  // namespace detail
   4556 
// Interleaves the lower halves of a and b across the whole vector: result
// lane 2i = a[i], lane 2i+1 = b[i].
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  // Widened tag, adjusted so its Pow2 stays within the supported range.
  const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
  const RepartitionToNarrow<decltype(dw)> du_src;

  // Zero-extend (unsigned source) the lower halves into wide lanes, which
  // places a's lanes at even narrow positions...
  const VFromD<D> aw =
      ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
  const VFromD<D> bw =
      ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
  // ...then slide b's lanes up by one so they land in the odd positions.
  return Or(aw, detail::Slide1Up(bw));
}
   4569 
   4570 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   4571 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
   4572  const RebindToUnsigned<decltype(d)> du;
   4573  const auto idx = ShiftRight<1>(detail::Iota0(du));
   4574  return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
   4575 }
   4576 
   4577 // ------------------------------ InterleaveWholeUpper
   4578 
   4579 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
   4580 HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   4581  // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
   4582  // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
   4583  // true and and as the results of InterleaveWholeUpper are
   4584  // implementation-defined if Lanes(d) is less than 2.
   4585  const size_t half_N = Lanes(d) / 2;
   4586  return InterleaveWholeLower(d, detail::SlideDown(a, half_N),
   4587                              detail::SlideDown(b, half_N));
   4588 }
   4589 
   4590 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   4591 HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
   4592  // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
   4593  // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
   4594  // true and as the results of InterleaveWholeUpper are implementation-defined
   4595  // if Lanes(d) is less than 2.
   4596  const size_t half_N = Lanes(d) / 2;
   4597  const RebindToUnsigned<decltype(d)> du;
   4598  const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
   4599                                static_cast<uint64_t>(half_N));
   4600  return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
   4601 }
   4602 
   4603 // ------------------------------ InterleaveLower (InterleaveWholeLower)
   4604 
   4605 namespace detail {
   4606 
// Definitely at least 128 bit: match x86 semantics (independent blocks). Using
// InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
  const Twice<D> dt;
  const RebindToUnsigned<decltype(dt)> dt_u;
  // Fully interleave a and b into a double-length vector.
  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
  // Keep only even 128-bit blocks. This is faster than u64 ConcatEven
  // because we only have a single vector.
  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
  // Block index of each lane: lane index divided by lanes-per-block.
  const VFromD<decltype(dt_u)> idx_block =
      ShiftRight<kShift>(detail::Iota0(dt_u));
  const MFromD<decltype(dt_u)> is_even =
      detail::EqS(detail::AndS(idx_block, 1), 0);
  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
}
   4624 template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
   4625 HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
   4626  const Half<D> dh;
   4627  const VFromD<decltype(dh)> i0 =
   4628      InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
   4629  const VFromD<decltype(dh)> i1 =
   4630      InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
   4631  return Combine(d, i1, i0);
   4632 }
   4633 
// As above, for the upper half of blocks.
template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
  const Twice<D> dt;
  const RebindToUnsigned<decltype(dt)> dt_u;
  // Fully interleave a and b into a double-length vector.
  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
  // Keep only odd 128-bit blocks. This is faster than a u64 concat of the
  // odd blocks because we only have a single vector.
  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
  // Block index of each lane: lane index divided by lanes-per-block.
  const VFromD<decltype(dt_u)> idx_block =
      ShiftRight<kShift>(detail::Iota0(dt_u));
  const MFromD<decltype(dt_u)> is_odd =
      detail::EqS(detail::AndS(idx_block, 1), 1);
  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
}
   4650 template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
   4651 HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
   4652  const Half<D> dh;
   4653  const VFromD<decltype(dh)> i0 =
   4654      InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
   4655  const VFromD<decltype(dh)> i1 =
   4656      InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
   4657  return Combine(d, i1, i0);
   4658 }
   4659 
   4660 // RVV vectors are at least 128 bit when there is no fractional LMUL nor cap.
   4661 // Used by functions with per-block behavior such as InterleaveLower.
   4662 template <typename T, size_t N, int kPow2>
   4663 constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
   4664  return N * sizeof(T) >= 16 && kPow2 >= 0;
   4665 }
   4666 
   4667 // Definitely less than 128-bit only if there is a small cap; fractional LMUL
   4668 // might not be enough if vectors are large.
   4669 template <typename T, size_t N, int kPow2>
   4670 constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) {
   4671  return N * sizeof(T) < 16;
   4672 }
   4673 
   4674 }  // namespace detail
   4675 
// Enablers distinguishing vectors guaranteed to span >= 128 bits, those
// guaranteed smaller, and those that can only be decided at runtime.
#define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
#define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
#define HWY_RVV_IF_CAN128_D(D) \
  hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
   4680 
   4681 template <class D, class V, HWY_RVV_IF_GE128_D(D)>
   4682 HWY_API V InterleaveLower(D d, const V a, const V b) {
   4683  return detail::InterleaveLowerBlocks(d, a, b);
   4684 }
   4685 
   4686 // Single block: interleave without extra Compress.
   4687 template <class D, class V, HWY_RVV_IF_LT128_D(D)>
   4688 HWY_API V InterleaveLower(D d, const V a, const V b) {
   4689  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
   4690  return InterleaveWholeLower(d, a, b);
   4691 }
   4692 
// Could be either; branch at runtime.
template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
HWY_API V InterleaveLower(D d, const V a, const V b) {
  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
    // At most one block: no per-block fixup required.
    return InterleaveWholeLower(d, a, b);
  }
  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
  return ResizeBitCast(d, detail::InterleaveLowerBlocks(
                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
}
   4704 
   4705 template <class V>
   4706 HWY_API V InterleaveLower(const V a, const V b) {
   4707  return InterleaveLower(DFromV<V>(), a, b);
   4708 }
   4709 
   4710 // ------------------------------ InterleaveUpper (Compress)
   4711 
   4712 template <class D, class V, HWY_RVV_IF_GE128_D(D)>
   4713 HWY_API V InterleaveUpper(D d, const V a, const V b) {
   4714  return detail::InterleaveUpperBlocks(d, a, b);
   4715 }
   4716 
   4717 // Single block: interleave without extra Compress.
   4718 template <class D, class V, HWY_RVV_IF_LT128_D(D)>
   4719 HWY_API V InterleaveUpper(D d, const V a, const V b) {
   4720  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
   4721  return InterleaveWholeUpper(d, a, b);
   4722 }
   4723 
// Could be either; branch at runtime.
template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
HWY_API V InterleaveUpper(D d, const V a, const V b) {
  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
    // At most one block: no per-block fixup required.
    return InterleaveWholeUpper(d, a, b);
  }
  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
  return ResizeBitCast(d, detail::InterleaveUpperBlocks(
                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
}
   4735 
   4736 // ------------------------------ ZipLower
   4737 
   4738 template <class V, class DW = RepartitionToWide<DFromV<V>>>
   4739 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
   4740  const RepartitionToNarrow<DW> dn;
   4741  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
   4742  return BitCast(dw, InterleaveLower(dn, a, b));
   4743 }
   4744 
   4745 template <class V, class DW = RepartitionToWide<DFromV<V>>>
   4746 HWY_API VFromD<DW> ZipLower(V a, V b) {
   4747  return BitCast(DW(), InterleaveLower(a, b));
   4748 }
   4749 
   4750 // ------------------------------ ZipUpper
   4751 template <class DW, class V>
   4752 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
   4753  const RepartitionToNarrow<DW> dn;
   4754  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
   4755  return BitCast(dw, InterleaveUpper(dn, a, b));
   4756 }
   4757 
   4758 // ================================================== REDUCE
   4759 
   4760 // We have ReduceSum, generic_ops-inl.h defines SumOfLanes via Set.
   4761 #ifdef HWY_NATIVE_REDUCE_SCALAR
   4762 #undef HWY_NATIVE_REDUCE_SCALAR
   4763 #else
   4764 #define HWY_NATIVE_REDUCE_SCALAR
   4765 #endif
   4766 
// scalar = f(vector, v0), where v0 is an LMUL=1 vector that supplies the
// neutral/initial element; the reduction intrinsic writes its result into
// lane 0, which GetLane extracts.
#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_T(BASE, SEW)                                                 \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v,     \
           HWY_RVV_V(BASE, SEW, m1) v0) {                                      \
    return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(     \
        v, v0, Lanes(d)));                                                     \
  }
   4777 
// detail::RedSum, detail::RedMin, and detail::RedMax are more efficient
// for N=4 I8/U8 reductions on RVV than the default implementations of the
// N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in generic_ops-inl.h
#undef HWY_IF_REDUCE_D
#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr

#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif

#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#else
#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
#endif
   4795 
   4796 // ------------------------------ ReduceSum
   4797 
   4798 namespace detail {
   4799 HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL_VIRT)
   4800 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL_VIRT)
   4801 }  // namespace detail
   4802 
   4803 template <class D, HWY_IF_REDUCE_D(D)>
   4804 HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
   4805  const auto v0 = Zero(ScalableTag<TFromD<D>>());  // always m1
   4806  return detail::RedSum(d, v, v0);
   4807 }
   4808 
   4809 // ------------------------------ ReduceMin
   4810 namespace detail {
   4811 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL_VIRT)
   4812 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL_VIRT)
   4813 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL_VIRT)
   4814 }  // namespace detail
   4815 
   4816 template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
   4817 HWY_API T ReduceMin(D d, const VFromD<D> v) {
   4818  const ScalableTag<T> d1;  // always m1
   4819  return detail::RedMin(d, v, Set(d1, HighestValue<T>()));
   4820 }
   4821 
   4822 // ------------------------------ ReduceMax
   4823 namespace detail {
   4824 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL_VIRT)
   4825 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL_VIRT)
   4826 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL_VIRT)
   4827 }  // namespace detail
   4828 
   4829 template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
   4830 HWY_API T ReduceMax(D d, const VFromD<D> v) {
   4831  const ScalableTag<T> d1;  // always m1
   4832  return detail::RedMax(d, v, Set(d1, LowestValue<T>()));
   4833 }
   4834 
   4835 #undef HWY_RVV_REDUCE
   4836 
   4837 // TODO: add MaskedReduceSum/Min/Max
   4838 
   4839 // ------------------------------ SumOfLanes
   4840 
   4841 template <class D, HWY_IF_LANES_GT_D(D, 1)>
   4842 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
   4843  return Set(d, ReduceSum(d, v));
   4844 }
   4845 template <class D, HWY_IF_LANES_GT_D(D, 1)>
   4846 HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
   4847  return Set(d, ReduceMin(d, v));
   4848 }
   4849 template <class D, HWY_IF_LANES_GT_D(D, 1)>
   4850 HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
   4851  return Set(d, ReduceMax(d, v));
   4852 }
   4853 
   4854 // ================================================== Ops with dependencies
   4855 
   4856 // ------------------------------ LoadInterleaved2
   4857 
   4858 // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
   4859 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   4860 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   4861 #else
   4862 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
   4863 #endif
   4864 
   4865 // Requires Clang 16+, GCC 14+; otherwise emulated in generic_ops-inl.h.
   4866 #if HWY_HAVE_TUPLE
   4867 
// Tuple accessors: Get2/3/4<kIndex>(tup) returns vector kIndex of a
// 2/3/4-vector tuple (used by the segment load/store wrappers below).
#define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,   \
                    MLEN, NAME, OP)                                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME##2(HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup) {                          \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x2_##CHAR##SEW##LMUL(tup,     \
                                                                     kIndex); \
  }                                                                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME##3(HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup) {                          \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x3_##CHAR##SEW##LMUL(tup,     \
                                                                     kIndex); \
  }                                                                           \
  template <size_t kIndex>                                                    \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME##4(HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup) {                          \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x4_##CHAR##SEW##LMUL(tup,     \
                                                                     kIndex); \
  }

HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _LE2)
#undef HWY_RVV_GET

// Tuple mutators: Set2/3/4<kIndex>(tup, v) returns a copy of tup with vector
// kIndex replaced by v.
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP)                                         \
  template <size_t kIndex>                                                  \
  HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) NAME##2(                          \
      HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup, HWY_RVV_V(BASE, SEW, LMUL) v) {  \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x2(     \
        tup, kIndex, v);                                                    \
  }                                                                         \
  template <size_t kIndex>                                                  \
  HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3(                          \
      HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup, HWY_RVV_V(BASE, SEW, LMUL) v) {  \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x3(     \
        tup, kIndex, v);                                                    \
  }                                                                         \
  template <size_t kIndex>                                                  \
  HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) NAME##4(                          \
      HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup, HWY_RVV_V(BASE, SEW, LMUL) v) {  \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x4(     \
        tup, kIndex, v);                                                    \
  }

HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _LE2)
#undef HWY_RVV_SET
   4915 
// RVV does not provide vcreate, so implement using Set: build a
// value-initialized tuple and insert each vector in turn.
#define HWY_RVV_CREATE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2)                                      \
      NAME##2(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/,                            \
              HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1) {  \
    HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup{};                                     \
    tup = Set2<0>(tup, v0);                                                    \
    tup = Set2<1>(tup, v1);                                                    \
    return tup;                                                                \
  }                                                                            \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3(                             \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, HWY_RVV_V(BASE, SEW, LMUL) v0,     \
      HWY_RVV_V(BASE, SEW, LMUL) v1, HWY_RVV_V(BASE, SEW, LMUL) v2) {          \
    HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup{};                                     \
    tup = Set3<0>(tup, v0);                                                    \
    tup = Set3<1>(tup, v1);                                                    \
    tup = Set3<2>(tup, v2);                                                    \
    return tup;                                                                \
  }                                                                            \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4)                                      \
      NAME##4(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/,                            \
              HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,    \
              HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3) {  \
    HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup{};                                     \
    tup = Set4<0>(tup, v0);                                                    \
    tup = Set4<1>(tup, v1);                                                    \
    tup = Set4<2>(tup, v2);                                                    \
    tup = Set4<3>(tup, v3);                                                    \
    return tup;                                                                \
  }

HWY_RVV_FOREACH(HWY_RVV_CREATE, Create, xx, _LE2_VIRT)
#undef HWY_RVV_CREATE

// Tuple-of-vectors types as produced by Create2/3/4.
template <class D>
using Vec2 = decltype(Create2(D(), Zero(D()), Zero(D())));
template <class D>
using Vec3 = decltype(Create3(D(), Zero(D()), Zero(D()), Zero(D())));
template <class D>
using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));
   4960 
// Loads two interleaved streams (a0 b0 a1 b1 ...) into v0/v1 via a unit-stride
// segment load.
#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
  template <size_t N>                                                         \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                         \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned,      \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0,                          \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1) {                        \
    const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup =                               \
        __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, Lanes(d)); \
    v0 = Get2<0>(tup);                                                        \
    v1 = Get2<1>(tup);                                                        \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT)
#undef HWY_RVV_LOAD2
   4976 
   4977 // ------------------------------ LoadInterleaved3
   4978 
// Loads three interleaved streams (a0 b0 c0 a1 ...) into v0/v1/v2 via a
// unit-stride segment load.
#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
  template <size_t N>                                                         \
  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                         \
                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned,      \
                    HWY_RVV_V(BASE, SEW, LMUL) & v0,                          \
                    HWY_RVV_V(BASE, SEW, LMUL) & v1,                          \
                    HWY_RVV_V(BASE, SEW, LMUL) & v2) {                        \
    const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup =                               \
        __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, Lanes(d)); \
    v0 = Get3<0>(tup);                                                        \
    v1 = Get3<1>(tup);                                                        \
    v2 = Get3<2>(tup);                                                        \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT)
#undef HWY_RVV_LOAD3
   4996 
   4997 // ------------------------------ LoadInterleaved4
   4998 
// Loads four interleaved streams (a0 b0 c0 d0 a1 ...) into v0..v3 via a
// unit-stride segment load.
#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP)                                         \
  template <size_t N>                                                         \
  HWY_API void NAME(                                                          \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                       \
      const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned,                    \
      HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1,       \
      HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) {     \
    const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup =                               \
        __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, Lanes(d)); \
    v0 = Get4<0>(tup);                                                        \
    v1 = Get4<1>(tup);                                                        \
    v2 = Get4<2>(tup);                                                        \
    v3 = Get4<3>(tup);                                                        \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT)
#undef HWY_RVV_LOAD4
   5017 
   5018 // ------------------------------ StoreInterleaved2
   5019 
// Defines StoreInterleaved2 for one lane type / LMUL combination: packs
// (v0, v1) into a register tuple and issues a single vsseg2 segment store.
#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <size_t N>                                                          \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0,                             \
                    HWY_RVV_V(BASE, SEW, LMUL) v1,                             \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d,                          \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {           \
    const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = Create2(d, v0, v1);            \
    __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, tup, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT)
#undef HWY_RVV_STORE2
   5033 
   5034 // ------------------------------ StoreInterleaved3
   5035 
// Defines StoreInterleaved3 for one lane type / LMUL combination: packs
// (v0, v1, v2) into a register tuple and issues a single vsseg3 segment store.
#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <size_t N>                                                          \
  HWY_API void NAME(                                                           \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,            \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d,         \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {                         \
    const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = Create3(d, v0, v1, v2);        \
    __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, tup, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT)
#undef HWY_RVV_STORE3
   5049 
   5050 // ------------------------------ StoreInterleaved4
   5051 
// Defines StoreInterleaved4 for one lane type / LMUL combination: packs
// (v0..v3) into a register tuple and issues a single vsseg4 segment store.
#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP)                                         \
  template <size_t N>                                                          \
  HWY_API void NAME(                                                           \
      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,            \
      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3,            \
      HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                        \
      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {                         \
    const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = Create4(d, v0, v1, v2, v3);    \
    __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, tup, Lanes(d)); \
  }
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)
#undef HWY_RVV_STORE4
   5066 
   5067 #else  // !HWY_HAVE_TUPLE
   5068 
   5069 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
   5070 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
   5071                              VFromD<D>& v0, VFromD<D>& v1) {
   5072  const VFromD<D> A = LoadU(d, unaligned);  // v1[1] v0[1] v1[0] v0[0]
   5073  const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
   5074  v0 = ConcatEven(d, B, A);
   5075  v1 = ConcatOdd(d, B, A);
   5076 }
   5077 
namespace detail {
// Strided load (vlse): reads Lanes(d) elements spaced `stride` BYTES apart
// starting at p. Note the stride unit is bytes, not lanes.
#define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                         \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                  \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) {      \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(                          \
        p, static_cast<ptrdiff_t>(stride), Lanes(d));                         \
  }
HWY_RVV_FOREACH(HWY_RVV_LOAD_STRIDED, LoadStrided, lse, _ALL_VIRT)
#undef HWY_RVV_LOAD_STRIDED
}  // namespace detail
   5091 
   5092 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
   5093 HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   5094                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   5095  // Offsets are bytes, and this is not documented.
   5096  v0 = detail::LoadStrided(d, unaligned + 0, 3 * sizeof(T));
   5097  v1 = detail::LoadStrided(d, unaligned + 1, 3 * sizeof(T));
   5098  v2 = detail::LoadStrided(d, unaligned + 2, 3 * sizeof(T));
   5099 }
   5100 
   5101 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
   5102 HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   5103                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
   5104                              VFromD<D>& v3) {
   5105  // Offsets are bytes, and this is not documented.
   5106  v0 = detail::LoadStrided(d, unaligned + 0, 4 * sizeof(T));
   5107  v1 = detail::LoadStrided(d, unaligned + 1, 4 * sizeof(T));
   5108  v2 = detail::LoadStrided(d, unaligned + 2, 4 * sizeof(T));
   5109  v3 = detail::LoadStrided(d, unaligned + 3, 4 * sizeof(T));
   5110 }
   5111 
// Not 64-bit / max LMUL: interleave via promote, slide, OddEven.
template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
          HWY_IF_POW2_LE_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<D> du;
  const Twice<RepartitionToWide<decltype(du)>> duw;
  const Twice<decltype(d)> dt;
  // Interleave with zero by promoting to wider (unsigned) type.
  // Unsigned PromoteTo zero-extends, so the upper half of each wide lane is 0.
  const VFromD<decltype(dt)> w0 = BitCast(dt, PromoteTo(duw, BitCast(du, v0)));
  const VFromD<decltype(dt)> w1 = BitCast(dt, PromoteTo(duw, BitCast(du, v1)));
  // OR second vector into the zero-valued lanes (faster than OddEven).
  StoreU(Or(w0, detail::Slide1Up(w1)), dt, unaligned);
}
   5126 
   5127 // Can promote, max LMUL: two half-length
   5128 template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
   5129          HWY_IF_POW2_GT_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)>
   5130 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
   5131                               T* HWY_RESTRICT unaligned) {
   5132  const Half<decltype(d)> dh;
   5133  StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), d, unaligned);
   5134  StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), d,
   5135                    unaligned + Lanes(d));
   5136 }
   5137 
namespace detail {
// Strided store (vsse): writes Lanes(d) elements spaced `stride` BYTES apart
// starting at p. Note the stride unit is bytes, not lanes.
#define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP)                           \
  template <size_t N>                                                          \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                              \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d,                          \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) {    \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(                           \
        p, static_cast<ptrdiff_t>(stride), v, Lanes(d));                       \
  }
HWY_RVV_FOREACH(HWY_RVV_STORE_STRIDED, StoreStrided, sse, _ALL_VIRT)
#undef HWY_RVV_STORE_STRIDED
}  // namespace detail
   5151 
   5152 // 64-bit: strided
   5153 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8),
   5154          HWY_RVV_IF_NOT_EMULATED_D(D)>
   5155 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
   5156                               T* HWY_RESTRICT unaligned) {
   5157  // Offsets are bytes, and this is not documented.
   5158  detail::StoreStrided(v0, d, unaligned + 0, 2 * sizeof(T));
   5159  detail::StoreStrided(v1, d, unaligned + 1, 2 * sizeof(T));
   5160 }
   5161 
   5162 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
   5163 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   5164                               T* HWY_RESTRICT unaligned) {
   5165  // Offsets are bytes, and this is not documented.
   5166  detail::StoreStrided(v0, d, unaligned + 0, 3 * sizeof(T));
   5167  detail::StoreStrided(v1, d, unaligned + 1, 3 * sizeof(T));
   5168  detail::StoreStrided(v2, d, unaligned + 2, 3 * sizeof(T));
   5169 }
   5170 
   5171 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
   5172 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
   5173                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   5174  // Offsets are bytes, and this is not documented.
   5175  detail::StoreStrided(v0, d, unaligned + 0, 4 * sizeof(T));
   5176  detail::StoreStrided(v1, d, unaligned + 1, 4 * sizeof(T));
   5177  detail::StoreStrided(v2, d, unaligned + 2, 4 * sizeof(T));
   5178  detail::StoreStrided(v3, d, unaligned + 3, 4 * sizeof(T));
   5179 }
   5180 
   5181 #endif  // HWY_HAVE_TUPLE
   5182 
   5183 // Rely on generic Load/StoreInterleaved[234] for any emulated types.
   5184 // Requires HWY_GENERIC_IF_EMULATED_D mirrors HWY_RVV_IF_EMULATED_D.
   5185 
   5186 // ------------------------------ Dup128VecFromValues (ResizeBitCast)
   5187 
   5188 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
   5189 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
   5190  return Set(d, t0);
   5191 }
   5192 
// >= 2 64-bit lanes: broadcast t0 to even lanes and t1 to odd lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
  const auto even_lanes = Set(d, t0);
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  // If the compiler can prove at compile time that the bit patterns of t0 and
  // t1 are equal (bitwise compare so float -0/NaN cases are handled exactly),
  // the single broadcast already has the right value in every lane.
  if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) ==
                           BitCastScalar<uint64_t>(t1)) &&
      (BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) {
    return even_lanes;
  }
#endif

  const auto odd_lanes = Set(d, t1);
  return OddEven(odd_lanes, even_lanes);
}
   5207 
namespace detail {

#pragma pack(push, 1)

// Wrapper filling exactly 8 bytes with lanes of T so its bit pattern can be
// reinterpreted as one uint64_t via BitCastScalar (see Dup128VecFromValues).
// pack(1) + alignas(8) ensure size and alignment are both exactly 8.
template <class T>
struct alignas(8) Vec64ValsWrapper {
  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
  T vals[8 / sizeof(T)];
};

#pragma pack(pop)

}  // namespace detail
   5222 
// 8-bit lanes: pack each group of eight values into a uint64_t and defer to
// the two-lane u64 overload, then bitcast back to 8-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
  return ResizeBitCast(
      d, Dup128VecFromValues(
             du64,
             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
                 {t0, t1, t2, t3, t4, t5, t6, t7}}),
             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
                 {t8, t9, t10, t11, t12, t13, t14, t15}})));
}
   5240 
// 16-bit lanes: same approach as the 8-bit overload, four values per u64.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
  return ResizeBitCast(
      d, Dup128VecFromValues(
             du64,
             BitCastScalar<uint64_t>(
                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}),
             BitCastScalar<uint64_t>(
                 detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}})));
}
   5255 
// 32-bit lanes: same approach as the 8-bit overload, two values per u64.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
  return ResizeBitCast(
      d,
      Dup128VecFromValues(du64,
                          BitCastScalar<uint64_t>(
                              detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}),
                          BitCastScalar<uint64_t>(
                              detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}})));
}
   5268 
   5269 // ------------------------------ LoadDup128
   5270 
// Loads one 128-bit block from p and, if the vector spans multiple blocks,
// broadcasts that block to all of them.
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;

  // Make sure that no more than 16 bytes are loaded from p
  constexpr int kLoadPow2 = d.Pow2();
  constexpr size_t kMaxLanesToLoad =
      HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>));
  constexpr size_t kLoadN = D::template NewN<kLoadPow2, kMaxLanesToLoad>();
  const Simd<TFromD<D>, kLoadN, kLoadPow2> d_load;
  static_assert(d_load.MaxBytes() <= 16,
                "d_load.MaxBytes() <= 16 must be true");
  static_assert((d.MaxBytes() < 16) || (d_load.MaxBytes() == 16),
                "d_load.MaxBytes() == 16 must be true if d.MaxBytes() >= 16 is "
                "true");
  static_assert((d.MaxBytes() >= 16) || (d_load.MaxBytes() == d.MaxBytes()),
                "d_load.MaxBytes() == d.MaxBytes() must be true if "
                "d.MaxBytes() < 16 is true");

  const VFromD<D> loaded = Load(d_load, p);
  // At most one block: the loaded vector is already the result.
  if (d.MaxBytes() <= 16) return loaded;

  // idx must be unsigned for TableLookupLanes.
  using TU = TFromD<decltype(du)>;
  const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1);
  // Broadcast the first block.
  const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(du), mask);
  // Safe even for 8-bit lanes because indices never exceed 15.
  return TableLookupLanes(loaded, idx);
}
   5301 
   5302 // ------------------------------ LoadMaskBits
   5303 
   5304 // Support all combinations of T and SHIFT(LMUL) without explicit overloads for
   5305 // each. First overload for MLEN=1..64.
namespace detail {

// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN
// increases with lane size and decreases for increasing LMUL. Cap at 64, the
// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL
// e.g. vuint16mf8_t: (8*2 << 3) == 128.
template <class D>
using MaskTag = hwy::SizeTag<HWY_MIN(
    64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -D().Pow2()))>;

// One overload per MLEN; the SizeTag argument selects the vlm intrinsic for
// the matching vboolMLEN_t type.
#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)                \
  HWY_INLINE HWY_RVV_M(MLEN)                                              \
      NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \
    return __riscv_v##OP##_v_b##MLEN(bits, N);                            \
  }
HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, lm)
#undef HWY_RVV_LOAD_MASK_BITS
}  // namespace detail
   5324 
   5325 template <class D, class MT = detail::MaskTag<D>>
   5326 HWY_API auto LoadMaskBits(D d, const uint8_t* bits)
   5327    -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) {
   5328  return detail::LoadMaskBits(MT(), bits, Lanes(d));
   5329 }
   5330 
   5331 // ------------------------------ StoreMaskBits
// Writes one bit per lane of mask m to `bits` and returns the number of
// bytes written, i.e. (N + 7) / 8.
#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)               \
  template <class D>                                                      \
  HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) {            \
    const size_t N = Lanes(d);                                            \
    __riscv_v##OP##_v_b##MLEN(bits, m, N);                                \
    /* Non-full byte, need to clear the undefined upper bits. */          \
    /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \
    constexpr bool kLessThan8 =                                           \
        detail::ScaleByPower(16 / sizeof(TFromD<D>), d.Pow2()) < 8;       \
    if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) {                       \
      const int mask = (1 << N) - 1;                                      \
      bits[0] = static_cast<uint8_t>(bits[0] & mask);                     \
    }                                                                     \
    return (N + 7) / 8;                                                   \
  }
HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, sm)
#undef HWY_RVV_STORE_MASK_BITS
   5349 
   5350 // ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
   5351 
   5352 template <class V>
   5353 HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
   5354  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
   5355 }
   5356 
   5357 template <class D>
   5358 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
   5359                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
   5360  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
   5361 }
   5362 
   5363 // ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
   5364 
   5365 // NOTE: do not use this as a building block within rvv-inl - it is likely more
   5366 // efficient to use avl or detail::SlideUp.
   5367 
   5368 // Disallow for 8-bit because Iota is likely to overflow.
   5369 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   5370 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
   5371  const RebindToUnsigned<D> du;
   5372  using TU = TFromD<decltype(du)>;
   5373  return RebindMask(d, detail::LtS(detail::Iota0(du), static_cast<TU>(n)));
   5374 }
   5375 
   5376 template <class D, HWY_IF_T_SIZE_D(D, 1)>
   5377 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
   5378  const auto zero = Zero(d);
   5379  const auto one = Set(d, 1);
   5380  return Eq(detail::SlideUp(one, zero, n), one);
   5381 }
   5382 
   5383 // ------------------------------ LowerHalfOfMask/UpperHalfOfMask
   5384 
   5385 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   5386 
   5387 // Target-specific implementations of LowerHalfOfMask, UpperHalfOfMask,
   5388 // CombineMasks, OrderedDemote2MasksTo, and Dup128MaskFromMaskBits are possible
   5389 // on RVV if the __riscv_vreinterpret_v_b*_u8m1 and
   5390 // __riscv_vreinterpret_v_u8m1_b* intrinsics are available.
   5391 
   5392 // The __riscv_vreinterpret_v_b*_u8m1 and __riscv_vreinterpret_v_u8m1_b*
   5393 // intrinsics available with Clang 17 and later and GCC 14 and later.
   5394 
namespace detail {

// Reinterprets a mask register as a vector of bytes holding the raw mask bits
// (bit i of the byte vector = lane i). One overload per vboolN_t type.
HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) {
  return __riscv_vreinterpret_v_b1_u8m1(m);
}

HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) {
  return __riscv_vreinterpret_v_b2_u8m1(m);
}

HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) {
  return __riscv_vreinterpret_v_b4_u8m1(m);
}

HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) {
  return __riscv_vreinterpret_v_b8_u8m1(m);
}

HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) {
  return __riscv_vreinterpret_v_b16_u8m1(m);
}

HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) {
  return __riscv_vreinterpret_v_b32_u8m1(m);
}

HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) {
  return __riscv_vreinterpret_v_b64_u8m1(m);
}

// Inverse of MaskToU8MaskBitsVec: reinterprets raw mask-bit bytes as the mask
// type of D. EnableIf selects the overload whose vboolN_t matches MFromD<D>.
template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool1_t>()>* = nullptr>
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
  return __riscv_vreinterpret_v_u8m1_b1(v);
}

template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool2_t>()>* = nullptr>
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
  return __riscv_vreinterpret_v_u8m1_b2(v);
}

template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool4_t>()>* = nullptr>
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
  return __riscv_vreinterpret_v_u8m1_b4(v);
}

template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool8_t>()>* = nullptr>
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
  return __riscv_vreinterpret_v_u8m1_b8(v);
}

template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool16_t>()>* = nullptr>
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
  return __riscv_vreinterpret_v_u8m1_b16(v);
}

template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool32_t>()>* = nullptr>
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
  return __riscv_vreinterpret_v_u8m1_b32(v);
}

template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool64_t>()>* = nullptr>
HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
  return __riscv_vreinterpret_v_u8m1_b64(v);
}

}  // namespace detail
   5461 
   5462 #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
   5463 #undef HWY_NATIVE_LOWER_HALF_OF_MASK
   5464 #else
   5465 #define HWY_NATIVE_LOWER_HALF_OF_MASK
   5466 #endif
   5467 
   5468 template <class D>
   5469 HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
   5470  return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m));
   5471 }
   5472 
   5473 #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
   5474 #undef HWY_NATIVE_UPPER_HALF_OF_MASK
   5475 #else
   5476 #define HWY_NATIVE_UPPER_HALF_OF_MASK
   5477 #endif
   5478 
// Returns the upper half of mask m by shifting its mask bits down by N
// (= Lanes(d)): a sub-byte bit shift for N%8 plus a byte slide for N/8.
template <class D>
HWY_API MFromD<D> UpperHalfOfMask(D d, MFromD<Twice<D>> m) {
  const size_t N = Lanes(d);

  vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m);
  mask_bits = ShiftRightSame(mask_bits, static_cast<int>(N & 7));
  // Only possible (and needed) when at least one whole byte of mask bits
  // exists; otherwise N < 8 and the bit shift above already did the work.
  if (HWY_MAX_LANES_D(D) >= 8) {
    mask_bits = SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8);
  }

  return detail::U8MaskBitsVecToMask(d, mask_bits);
}
   5491 
   5492 // ------------------------------ CombineMasks
   5493 
   5494 #ifdef HWY_NATIVE_COMBINE_MASKS
   5495 #undef HWY_NATIVE_COMBINE_MASKS
   5496 #else
   5497 #define HWY_NATIVE_COMBINE_MASKS
   5498 #endif
   5499 
// Concatenates two half-width masks: `lo` supplies lanes [0, half_N) and
// `hi` supplies lanes [half_N, 2*half_N).
template <class D>
HWY_API MFromD<D> CombineMasks(D d, MFromD<Half<D>> hi, MFromD<Half<D>> lo) {
  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);

  // Keep only lo's first half_N mask bits (bits past half_N may be garbage).
  const auto ext_lo_mask =
      And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)),
          FirstN(d, half_N));
  // Move hi's mask bits up by half_N: sub-byte shift plus whole-byte slide.
  vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi);
  hi_mask_bits = ShiftLeftSame(hi_mask_bits, static_cast<int>(half_N & 7));
  if (HWY_MAX_LANES_D(D) >= 8) {
    hi_mask_bits =
        SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8);
  }

  return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits));
}
   5517 
   5518 // ------------------------------ OrderedDemote2MasksTo
   5519 
   5520 #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
   5521 #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
   5522 #else
   5523 #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
   5524 #endif
   5525 
   5526 template <class DTo, class DFrom,
   5527          HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
   5528          class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
   5529          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
   5530 HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
   5531                                          MFromD<DFrom> a, MFromD<DFrom> b) {
   5532  return CombineMasks(d_to, b, a);
   5533 }
   5534 
   5535 #endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   5536 
   5537 // ------------------------------ Dup128MaskFromMaskBits
   5538 
namespace detail {
// Even though this is only used after checking if (kN < X), this helper
// function prevents "shift count exceeded" errors.
template <size_t kN, HWY_IF_LANES_LE(kN, 31)>
constexpr unsigned MaxMaskBits() {
  return (1u << kN) - 1;
}
// kN >= 32: all bits of `unsigned` may be set.
template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
constexpr unsigned MaxMaskBits() {
  return ~0u;
}

// Pow2 of a u8 ScalableTag used below to broadcast mask-bit bytes; clamped to
// -3 (the smallest fractional LMUL, mf8).
template <class D>
constexpr int SufficientPow2ForMask() {
  return HWY_MAX(
      D().Pow2() - 3 - static_cast<int>(FloorLog2(sizeof(TFromD<D>))), -3);
}

template <class M>
static HWY_INLINE HWY_MAYBE_UNUSED M RvvVmmv(M mask) {
  // The below And operation is equivalent to the RVV vmmv instruction and
  // ensures that mask is not in the same register as a vector operand when used
  // in RVV instructions that take both a vector operand and a mask operand.
  return And(mask, mask);
}

}  // namespace detail
   5566 
// 8-bit lanes, at most 8 lanes: the mask bits fit in one byte.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  // Ignore bits of lanes that do not exist.
  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();

#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
  // Broadcast the byte of mask bits, then reinterpret bytes as a mask.
  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  return detail::RvvVmmv(detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
                            Set(du8, static_cast<uint8_t>(mask_bits)))));
#else
  // Fallback: AND byte i (within each u64 group) with 1 << i via the constant
  // 0x8040201008040201, then compare non-zero to get one bool per lane.
  const RebindToUnsigned<decltype(d)> du8;
  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
      du64;

  const auto bytes = ResizeBitCast(
      du8, detail::AndS(
               ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))),
               uint64_t{0x8040201008040201u}));
  return detail::NeS(bytes, uint8_t{0});
#endif
}
   5589 
// 8-bit lanes, more than 8 lanes: 16 mask bits per 128-bit block.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  const ScalableTag<uint16_t, detail::SufficientPow2ForMask<D>()> du16;
  // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
  return detail::RvvVmmv(detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(
             ScalableTag<uint8_t>(),
             BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))))));
#else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  const RebindToUnsigned<decltype(d)> du8;
  const Repartition<uint16_t, decltype(du8)> du16;
  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
      du64;

  // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
  // and then bitcast the replicated mask_bits to a u8 vector
  const auto bytes = BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
  // Replicate bytes 8x such that each byte contains the bit that governs it.
  const auto rep8 = TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8)));

  // 0x8040201008040201 isolates bit (i % 8) in byte i of each u64 group.
  const auto masked_out_rep8 = ResizeBitCast(
      du8,
      detail::AndS(ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u}));
  return detail::NeS(masked_out_rep8, uint8_t{0});
#endif
}
   5619 
// 16-bit lanes: 8 mask bits per 128-bit block.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  // Ignore bits of lanes that do not exist.
  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();

#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
  return detail::RvvVmmv(detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
                            Set(du8, static_cast<uint8_t>(mask_bits)))));
#else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  // TestBit of 1 << (lane % 8) against the broadcast mask_bits.
  const RebindToUnsigned<D> du;
  const VFromD<decltype(du)> bits =
      Shl(Set(du, uint16_t{1}), detail::AndS(detail::Iota0(du), 7));
  return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
#endif
}
   5639 
// 32-bit lanes: 4 mask bits per 128-bit block.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  // Ignore bits of lanes that do not exist.
  if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();

#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  // * 0x11 copies the 4 mask bits into both nibbles of the broadcast byte so
  // every block sees the same 4 bits.
  return detail::RvvVmmv(detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
                            Set(du8, static_cast<uint8_t>(mask_bits * 0x11)))));
#else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  const RebindToUnsigned<D> du;
  const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2, 4, 8);
  return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
#endif
}
   5657 
// 64-bit lanes: expands the low 2 bits of `mask_bits` (one bit per lane of a
// 128-bit chunk) into a mask, replicated across every 128-bit chunk.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();

#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  // * 0x55 replicates the 2 mask bits four times, so each mask byte covers
  // four consecutive 128-bit chunks (2 lanes each).
  return detail::RvvVmmv(detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
                            Set(du8, static_cast<uint8_t>(mask_bits * 0x55)))));
#else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  const RebindToUnsigned<D> du;
  const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2);
  return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
#endif
}
   5675 
   5676 // ------------------------------ SetMask
   5677 
   5678 #ifdef HWY_NATIVE_SET_MASK
   5679 #undef HWY_NATIVE_SET_MASK
   5680 #else
   5681 #define HWY_NATIVE_SET_MASK
   5682 #endif
   5683 
// Returns a mask that is all-true when `val` is true, else all-false.
template <class D>
HWY_API MFromD<D> SetMask(D d, bool val) {
  // 0xFF when val, 0x00 otherwise; each u8 then holds 8 (un)set mask bits.
  const uint8_t u8_mask_val = static_cast<uint8_t>(-static_cast<int>(val));
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
  // Splat the byte and reinterpret the byte vector directly as mask bits.
  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  return detail::RvvVmmv(detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(ScalableTag<uint8_t>(), Set(du8, u8_mask_val))));
#else
  // Fallback: build a 0/0xFF byte vector and convert per-lane via MaskFromVec.
  const Rebind<uint8_t, DFromV<VFromD<decltype(d)>>> du8;
  return MaskFromVec(Set(du8, u8_mask_val));
#endif
}
   5696 
   5697 // ------------------------------ Abs (Max, Neg)
   5698 
   5699 template <class V, HWY_IF_SIGNED_V(V)>
   5700 HWY_API V Abs(const V v) {
   5701  return Max(v, Neg(v));
   5702 }
   5703 
// Floating-point Abs via vfsgnjx (sign-injection XOR): XORing the sign bit
// with itself clears it, yielding |v| without affecting NaN payloads.
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)

#undef HWY_RVV_RETV_ARGV2
   5707 
   5708 // ------------------------------ AbsDiff (Abs, Sub)
   5709 template <class V, HWY_IF_FLOAT_V(V)>
   5710 HWY_API V AbsDiff(const V a, const V b) {
   5711  return Abs(Sub(a, b));
   5712 }
   5713 
   5714 // ------------------------------ Round  (NearestInt, ConvertTo, CopySign)
   5715 
   5716 // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
   5717 // a dedicated instruction for that. Rounding to integer and converting back to
   5718 // float is correct except when the input magnitude is large, in which case the
   5719 // input was already an integer (because mantissa >> exponent is zero).
   5720 
namespace detail {
// Rounding-mode tags for the Round/Trunc/Floor/Ceil family.
enum RoundingModes { kNear, kTrunc, kDown, kUp };

// True for lanes whose magnitude is below MantissaEnd, i.e. small enough to
// possibly have a fractional part. Larger values are necessarily integral
// (mantissa >> exponent is zero), so callers keep the input unchanged there.
template <class V>
HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
  return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
}

}  // namespace detail
   5730 
   5731 template <class V>
   5732 HWY_API V Round(const V v) {
   5733  const DFromV<V> df;
   5734 
   5735  const auto integer = NearestInt(v);  // round using current mode
   5736  const auto int_f = ConvertTo(df, integer);
   5737 
   5738  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
   5739 }
   5740 
   5741 // ------------------------------ Trunc (ConvertTo)
   5742 template <class V>
   5743 HWY_API V Trunc(const V v) {
   5744  const DFromV<V> df;
   5745  const RebindToSigned<decltype(df)> di;
   5746 
   5747  const auto integer = ConvertTo(di, v);  // round toward 0
   5748  const auto int_f = ConvertTo(df, integer);
   5749 
   5750  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
   5751 }
   5752 
// ------------------------------ Ceil
// Newer toolchains expose vfcvt intrinsics with a static rounding-mode
// argument, allowing a direct round-toward-+inf conversion.
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
namespace detail {
// CeilInt: float -> int conversion rounding toward +inf (FRM = RUP).
#define HWY_RVV_CEIL_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                         SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(int, SEW, LMUL) CeilInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RUP,        \
                                                 HWY_RVV_AVL(SEW, SHIFT));  \
  }
HWY_RVV_FOREACH_F(HWY_RVV_CEIL_INT, _, _, _ALL)
#undef HWY_RVV_CEIL_INT

}  // namespace detail

// Rounds toward +infinity.
template <class V>
HWY_API V Ceil(const V v) {
  const DFromV<V> df;

  // Round up in the integer domain, then convert back to float.
  const auto integer = detail::CeilInt(v);
  const auto int_f = ConvertTo(df, integer);

  // Large-magnitude lanes are already integral; keep the input there.
  // CopySign preserves -0.
  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
}

#else  // GCC 13 or earlier or Clang 16 or earlier

// Rounds toward +infinity; emulated via truncate + conditional add of 1.
template <class V>
HWY_API V Ceil(const V v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  using T = TFromD<decltype(df)>;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a positive non-integer ends up smaller; if so, add 1.
  const auto pos1 =
      IfThenElseZero(Lt(int_f, v), Set(df, ConvertScalarTo<T>(1.0)));

  return IfThenElse(detail::UseInt(v), Add(int_f, pos1), v);
}

#endif  // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
        // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
   5799 
// ------------------------------ Floor
// Same structure as Ceil above, with round-toward--inf (FRM = RDN).
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
namespace detail {
// FloorInt: float -> int conversion rounding toward -inf (FRM = RDN).
#define HWY_RVV_FLOOR_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                          SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(int, SEW, LMUL) FloorInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RDN,         \
                                                 HWY_RVV_AVL(SEW, SHIFT));   \
  }
HWY_RVV_FOREACH_F(HWY_RVV_FLOOR_INT, _, _, _ALL)
#undef HWY_RVV_FLOOR_INT

}  // namespace detail

// Rounds toward -infinity.
template <class V>
HWY_API V Floor(const V v) {
  const DFromV<V> df;

  // Round down in the integer domain, then convert back to float.
  const auto integer = detail::FloorInt(v);
  const auto int_f = ConvertTo(df, integer);

  // Large-magnitude lanes are already integral; keep the input there.
  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
}

#else  // GCC 13 or earlier or Clang 16 or earlier

// Rounds toward -infinity; emulated via truncate + conditional subtract of 1.
template <class V>
HWY_API V Floor(const V v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  using T = TFromD<decltype(df)>;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  const auto neg1 =
      IfThenElseZero(Gt(int_f, v), Set(df, ConvertScalarTo<T>(-1.0)));

  return IfThenElse(detail::UseInt(v), Add(int_f, neg1), v);
}

#endif  // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
        // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
   5846 
   5847 // ------------------------------ Floating-point classification (Ne)
   5848 
   5849 // vfclass does not help because it would require 3 instructions (to AND and
   5850 // then compare the bits), whereas these are just 1-3 integer instructions.
   5851 
// True for NaN lanes: NaN is the only value that compares unequal to itself.
template <class V>
HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
  return Ne(v, v);
}
   5856 
   5857 // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
   5858 // We use a fused Set/comparison for IsFinite.
   5859 #ifdef HWY_NATIVE_ISINF
   5860 #undef HWY_NATIVE_ISINF
   5861 #else
   5862 #define HWY_NATIVE_ISINF
   5863 #endif
   5864 
// True for +/-infinity lanes (exponent all-ones, mantissa zero).
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsInf(const V v) {
  const D d;
  const RebindToSigned<decltype(d)> di;
  using T = TFromD<D>;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  // Add(vi, vi) is a 1-bit left shift; the unique result for +/-inf is
  // MaxExponentTimes2 exactly.
  return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
}
   5874 
// Returns whether normal/subnormal/zero, i.e. neither infinity nor NaN.
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) {
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  using T = TFromD<D>;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  // Finite lanes have an exponent field strictly below the all-ones value.
  return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
}
   5890 
   5891 // ------------------------------ Iota (ConvertTo)
   5892 
   5893 template <class D, typename T2, HWY_IF_UNSIGNED_D(D)>
   5894 HWY_API VFromD<D> Iota(const D d, T2 first) {
   5895  return detail::AddS(detail::Iota0(d), static_cast<TFromD<D>>(first));
   5896 }
   5897 
   5898 template <class D, typename T2, HWY_IF_SIGNED_D(D)>
   5899 HWY_API VFromD<D> Iota(const D d, T2 first) {
   5900  const RebindToUnsigned<D> du;
   5901  return detail::AddS(BitCast(d, detail::Iota0(du)),
   5902                      static_cast<TFromD<D>>(first));
   5903 }
   5904 
// Float Iota: lane i holds first + i. Unsigned indices are bitcast to signed
// before the int->float ConvertTo.
template <class D, typename T2, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> Iota(const D d, T2 first) {
  const RebindToUnsigned<D> du;
  const RebindToSigned<D> di;
  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
                      ConvertScalarTo<TFromD<D>>(first));
}
   5912 
   5913 // ------------------------------ BitShuffle (PromoteTo, Rol, SumsOf8)
   5914 
   5915 // Native implementation required to avoid 8-bit wraparound on long vectors.
   5916 #ifdef HWY_NATIVE_BITSHUFFLE
   5917 #undef HWY_NATIVE_BITSHUFFLE
   5918 #else
   5919 #define HWY_NATIVE_BITSHUFFLE
   5920 #endif
   5921 
// Cannot handle LMUL=8 because we promote indices.
// For each u64 lane: output byte i holds the bit of `values` selected by the
// corresponding `idx` byte (one result bit per index byte, summed into u64).
template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
          HWY_IF_UI64_D(D64), HWY_IF_POW2_LE_D(D64, 2)>
HWY_API V64 BitShuffle(V64 values, VI idx) {
  const RebindToUnsigned<D64> du64;
  const Repartition<uint8_t, D64> du8;
  const Rebind<uint16_t, decltype(du8)> du16;
  using VU8 = VFromD<decltype(du8)>;
  using VU16 = VFromD<decltype(du16)>;
  // For each 16-bit (to avoid wraparound for long vectors) index of an output
  // byte: offset of the u64 lane to which it belongs.
  const VU16 byte_offsets =
      detail::AndS(detail::Iota0(du16), static_cast<uint16_t>(~7u));
  // idx is for a bit; shifting makes that bytes. Promote so we can add
  // byte_offsets, then we have the u8 lane index within the whole vector.
  const VU16 idx16 =
      Add(byte_offsets, PromoteTo(du16, ShiftRight<3>(BitCast(du8, idx))));
  const VU8 bytes = detail::TableLookupLanes16(BitCast(du8, values), idx16);

  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
  // and left by iota & 7 to put it in the correct output bit. To correctly
  // handle shift counts from -7 to 7, we rotate (unfortunately not natively
  // supported on RVV).
  const VU8 rotate_left_bits = Sub(detail::Iota0(du8), BitCast(du8, idx));
  // One target bit per byte position within each u64: 0x01, 0x02, ... 0x80.
  const VU8 extracted_bits_mask =
      BitCast(du8, Set(du64, static_cast<uint64_t>(0x8040201008040201u)));
  const VU8 extracted_bits =
      And(Rol(bytes, rotate_left_bits), extracted_bits_mask);
  // Combine bit-sliced (one bit per byte) into one 64-bit sum.
  return BitCast(D64(), SumsOf8(extracted_bits));
}
   5953 
   5954 template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
   5955          HWY_IF_UI64_D(D64), HWY_IF_POW2_GT_D(D64, 2)>
   5956 HWY_API V64 BitShuffle(V64 values, VI idx) {
   5957  const Half<D64> dh;
   5958  const Half<DFromV<VI>> dih;
   5959  using V64H = VFromD<decltype(dh)>;
   5960  const V64H r0 = BitShuffle(LowerHalf(dh, values), LowerHalf(dih, idx));
   5961  const V64H r1 = BitShuffle(UpperHalf(dh, values), UpperHalf(dih, idx));
   5962  return Combine(D64(), r1, r0);
   5963 }
   5964 
   5965 // ------------------------------ MulEven/Odd (Mul, OddEven)
   5966 
// Full-width products of even-indexed lanes, returned as lanes of twice the
// width: low halves are the low product (masked Mul writes even lanes only),
// high halves are the high product slid up from the even lane below.
template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
          class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> MulEven(const V a, const V b) {
  // One mask bit per lane of a 128-bit chunk: select the even lanes
  // (4, 8 or 16 lanes per chunk depending on lane size).
  constexpr int maskVal = sizeof(TFromD<D>) == 4   ? 5
                          : sizeof(TFromD<D>) == 2 ? 0x55
                                                   : 0x5555;
  const auto mask = Dup128MaskFromMaskBits(D(), maskVal);
  const auto hi = Slide1Up(D(), MulHigh(a, b));
  // Even lanes: a*b (low half); odd lanes keep `hi` (the high half).
  const auto res = MaskedMulOr(hi, mask, a, b);
  return BitCast(DW(), res);
}
   5978 
   5979 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
   5980          class D = DFromV<V>, class DW = RepartitionToWide<D>>
   5981 HWY_API VFromD<DW> MulOdd(const V a, const V b) {
   5982  const auto lo = Mul(a, b);
   5983  const auto hi = MulHigh(a, b);
   5984  return BitCast(DW(), OddEven(hi, detail::Slide1Down(lo)));
   5985 }
   5986 
// There is no 64x64 vwmul, so the 128-bit products stay in pairs of u64
// lanes: even lane = low 64 bits, odd lane = high 64 bits.
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V MulEven(const V a, const V b) {
  // Mask bit 1 selects the even lane of each 128-bit pair.
  const auto mask = Dup128MaskFromMaskBits(DFromV<V>(), 1);
  const auto hi = Slide1Up(DFromV<V>(), MulHigh(a, b));
  return MaskedMulOr(hi, mask, a, b);
}
   5994 
   5995 template <class V, HWY_IF_T_SIZE_V(V, 8)>
   5996 HWY_INLINE V MulOdd(const V a, const V b) {
   5997  const auto lo = Mul(a, b);
   5998  const auto hi = MulHigh(a, b);
   5999  return OddEven(hi, detail::Slide1Down(lo));
   6000 }
   6001 
   6002 // ------------------------------ ReorderDemote2To (OddEven, Combine)
   6003 
// Demotes two f32 vectors to bf16 and interleaves them: `b` lands in the even
// u16 lanes, `a` in the odd lanes.
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<RepartitionToWide<D>> a,
                                   VFromD<RepartitionToWide<D>> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Half<decltype(du16)> du16_half;
  const RebindToUnsigned<DFromV<decltype(a)>> du32;
  // Demote (round to nearest even) then zero-extend so each bf16 occupies the
  // low 16 bits of a u32 lane.
  const VFromD<decltype(du32)> a_in_even = PromoteTo(
      du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, a)));
  const VFromD<decltype(du32)> b_in_even = PromoteTo(
      du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, b)));
  // Equivalent to InterleaveEven, but because the upper 16 bits are zero, we
  // can OR instead of OddEven.
  const VFromD<decltype(du16)> a_in_odd =
      detail::Slide1Up(BitCast(du16, a_in_even));
  return BitCast(dbf16, Or(a_in_odd, BitCast(du16, b_in_even)));
}
   6020 
   6021 // If LMUL is not the max, Combine first to avoid another DemoteTo.
   6022 template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
   6023          HWY_IF_POW2_LE_D(DN, 2), class V, HWY_IF_SIGNED_V(V),
   6024          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   6025          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
   6026          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
   6027 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   6028  const Rebind<TFromV<V>, DN> dt;
   6029  const VFromD<decltype(dt)> ab = Combine(dt, b, a);
   6030  return DemoteTo(dn, ab);
   6031 }
   6032 
   6033 template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_LE_D(DN, 2), class V,
   6034          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   6035          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
   6036          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
   6037 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   6038  const Rebind<TFromV<V>, DN> dt;
   6039  const VFromD<decltype(dt)> ab = Combine(dt, b, a);
   6040  return DemoteTo(dn, ab);
   6041 }
   6042 
   6043 // Max LMUL: must DemoteTo first, then Combine.
   6044 template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
   6045          HWY_IF_POW2_GT_D(DN, 2), class V, HWY_IF_SIGNED_V(V),
   6046          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   6047          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
   6048          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
   6049 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   6050  const Half<decltype(dn)> dnh;
   6051  const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a);
   6052  const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b);
   6053  return Combine(dn, demoted_b, demoted_a);
   6054 }
   6055 
   6056 template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_GT_D(DN, 2), class V,
   6057          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
   6058          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
   6059          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
   6060 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
   6061  const Half<decltype(dn)> dnh;
   6062  const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a);
   6063  const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b);
   6064  return Combine(dn, demoted_b, demoted_a);
   6065 }
   6066 
   6067 // If LMUL is not the max, Combine first to avoid another DemoteTo.
   6068 template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_LE_D(DN, 2),
   6069          class V, HWY_IF_F32_D(DFromV<V>),
   6070          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
   6071          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
   6072 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
   6073  const Rebind<TFromV<V>, DN> dt;
   6074  const VFromD<decltype(dt)> ab = Combine(dt, b, a);
   6075  return DemoteTo(dn, ab);
   6076 }
   6077 
// Max LMUL: must DemoteTo first, then Combine.
template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_GT_D(DN, 2),
          class V, HWY_IF_F32_D(DFromV<V>),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  const RebindToUnsigned<decltype(dn)> dn_u;
  const RebindToUnsigned<decltype(dnh)> dnh_u;
  // Combine via the unsigned (u16) representation of the special float type.
  const auto demoted_a = BitCast(dnh_u, DemoteTo(dnh, a));
  const auto demoted_b = BitCast(dnh_u, DemoteTo(dnh, b));
  return BitCast(dn, Combine(dn_u, demoted_b, demoted_a));
}
   6091 
// For integers, ReorderDemote2To (above) already produces the natural ordered
// concatenation on RVV, so forward directly.
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  return ReorderDemote2To(dn, a, b);
}
   6100 
   6101 // ------------------------------ WidenMulPairwiseAdd
   6102 
   6103 template <class DF, HWY_IF_F32_D(DF),
   6104          class VBF = VFromD<Repartition<hwy::bfloat16_t, DF>>>
   6105 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
   6106  const VFromD<DF> ae = PromoteEvenTo(df, a);
   6107  const VFromD<DF> be = PromoteEvenTo(df, b);
   6108  const VFromD<DF> ao = PromoteOddTo(df, a);
   6109  const VFromD<DF> bo = PromoteOddTo(df, b);
   6110  return MulAdd(ae, be, Mul(ao, bo));
   6111 }
   6112 
   6113 template <class D, HWY_IF_UI32_D(D), class V16 = VFromD<RepartitionToNarrow<D>>>
   6114 HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
   6115  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
   6116                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
   6117 }
   6118 
   6119 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
   6120 
namespace detail {

// WidenMulAcc: sum += widen(a) * widen(b) via vwmacc (signed) / vwmaccu
// (unsigned). The accumulator has twice the SEW and LMUL of the inputs.
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
                           SHIFT, MLEN, NAME, OP)                              \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME(                                   \
      HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
      HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {            \
    return __riscv_v##OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d));              \
  }

HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmacc_vv_, _EXT_VIRT)
HWY_RVV_FOREACH_U16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmaccu_vv_, _EXT_VIRT)
#undef HWY_RVV_WIDEN_MACC

// If LMUL is not the max, we can WidenMul first (3 instructions).
template <class D32, HWY_IF_POW2_LE_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  // Accumulate into the concatenation of sum0/sum1 at twice the LMUL, then
  // split the widened accumulator back into the two outputs.
  const Twice<decltype(d32)> d32t;
  using V32T = VFromD<decltype(d32t)>;
  V32T sum = Combine(d32t, sum1, sum0);
  sum = detail::WidenMulAcc(d32t, sum, a, b);
  sum1 = UpperHalf(d32, sum);
  return LowerHalf(d32, sum);
}

// Max LMUL: must LowerHalf first (4 instructions).
template <class D32, HWY_IF_POW2_GT_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  // Cannot double LMUL; instead split the 16-bit inputs and accumulate each
  // half into its own 32-bit accumulator.
  const Half<D16> d16h;
  using V16H = VFromD<decltype(d16h)>;
  const V16H a0 = LowerHalf(d16h, a);
  const V16H a1 = UpperHalf(d16h, a);
  const V16H b0 = LowerHalf(d16h, b);
  const V16H b1 = UpperHalf(d16h, b);
  sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
  return detail::WidenMulAcc(d32, sum0, a0, b0);
}

// Unsigned counterpart of the above. If LMUL is not the max, we can WidenMul
// first (3 instructions).
template <class D32, HWY_IF_POW2_LE_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  const Twice<decltype(d32)> d32t;
  using V32T = VFromD<decltype(d32t)>;
  V32T sum = Combine(d32t, sum1, sum0);
  sum = detail::WidenMulAcc(d32t, sum, a, b);
  sum1 = UpperHalf(d32, sum);
  return LowerHalf(d32, sum);
}

// Max LMUL: must LowerHalf first (4 instructions).
template <class D32, HWY_IF_POW2_GT_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  const Half<D16> d16h;
  using V16H = VFromD<decltype(d16h)>;
  const V16H a0 = LowerHalf(d16h, a);
  const V16H a1 = UpperHalf(d16h, a);
  const V16H b0 = LowerHalf(d16h, b);
  const V16H b1 = UpperHalf(d16h, b);
  sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
  return detail::WidenMulAcc(d32, sum0, a0, b0);
}

}  // namespace detail
   6197 
// Dispatch to the signed i16 -> i32 implementation.
template <class D, HWY_IF_I32_D(D), class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
                                     VW& sum1) {
  return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
}
   6203 
// Dispatch to the unsigned u16 -> u32 implementation.
template <class D, HWY_IF_U32_D(D), class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
                                     VW& sum1) {
  return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1);
}
   6209 
   6210 // ------------------------------ RearrangeToOddPlusEven
   6211 
// Pairwise-adds adjacent i32 lanes of the concatenation of sum0 and sum1.
template <class VW, HWY_IF_SIGNED_V(VW)>  // vint32_t*
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  // vwmacc doubles LMUL, so we require a pairwise sum here. This op is
  // expected to be less frequent than ReorderWidenMulAccumulate, hence it's
  // preferable to do the extra work here rather than do manual odd/even
  // extraction there.
  const DFromV<VW> di32;
  const RebindToUnsigned<decltype(di32)> du32;
  const Twice<decltype(di32)> di32x2;
  const RepartitionToWide<decltype(di32x2)> di64x2;
  const RebindToUnsigned<decltype(di64x2)> du64x2;
  const auto combined = BitCast(di64x2, Combine(di32x2, sum1, sum0));
  // Isolate odd/even int32 in int64 lanes.
  const auto even = ShiftRight<32>(ShiftLeft<32>(combined));  // sign extend
  const auto odd = ShiftRight<32>(combined);
  // Truncate the 64-bit pairwise sums back to i32.
  return BitCast(di32, TruncateTo(du32, BitCast(du64x2, Add(even, odd))));
}
   6229 
// For max LMUL, we cannot Combine again and instead manually unroll.
HWY_API vint32m8_t RearrangeToOddPlusEven(vint32m8_t sum0, vint32m8_t sum1) {
  const DFromV<vint32m8_t> d;
  const Half<decltype(d)> dh;
  // Recurse on m4 halves (which take the Combine-based path), then reassemble.
  const vint32m4_t lo =
      RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0));
  const vint32m4_t hi =
      RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1));
  return Combine(d, hi, lo);
}
   6240 
// Unsigned variant: pairwise-adds adjacent u32 lanes of sum0 ++ sum1.
template <class VW, HWY_IF_UNSIGNED_V(VW)>  // vuint32_t*
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  // vwmacc doubles LMUL, so we require a pairwise sum here. This op is
  // expected to be less frequent than ReorderWidenMulAccumulate, hence it's
  // preferable to do the extra work here rather than do manual odd/even
  // extraction there.
  const DFromV<VW> du32;
  const Twice<decltype(du32)> du32x2;
  const RepartitionToWide<decltype(du32x2)> du64x2;
  const auto combined = BitCast(du64x2, Combine(du32x2, sum1, sum0));
  // Isolate odd/even int32 in int64 lanes. Unsigned: mask instead of
  // sign-extending shifts.
  const auto even = detail::AndS(combined, uint64_t{0xFFFFFFFFu});
  const auto odd = ShiftRight<32>(combined);
  return TruncateTo(du32, Add(even, odd));
}
   6256 
// For max LMUL, we cannot Combine again and instead manually unroll.
HWY_API vuint32m8_t RearrangeToOddPlusEven(vuint32m8_t sum0, vuint32m8_t sum1) {
  const DFromV<vuint32m8_t> d;
  const Half<decltype(d)> dh;
  // Recurse on m4 halves (which take the Combine-based path), then reassemble.
  const vuint32m4_t lo =
      RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0));
  const vuint32m4_t hi =
      RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1));
  return Combine(d, hi, lo);
}
   6267 
// Float accumulators are not pair-swapped by the accumulate step, so a plain
// lane-wise sum already satisfies the odd+even contract.
template <class VW, HWY_IF_FLOAT_V(VW)>  // vfloat*
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  return Add(sum0, sum1);  // invariant already holds
}
   6272 
   6273 // ------------------------------ Lt128
   6274 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6275 
   6276 template <class D>
   6277 HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
   6278  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
   6279  // The subsequent computations are performed using e8mf8 (8-bit elements with
   6280  // a fractional LMUL of 1/8) for the following reasons:
   6281  // 1. It is correct for the possible input vector types e64m<1,2,4,8>. This is
   6282  //    because the resulting mask can occupy at most 1/8 of a full vector when
   6283  //    using e64m8.
   6284  // 2. It can be more efficient than using a full vector or a vector group.
   6285  //
   6286  // The algorithm computes the result as follows:
   6287  // 1. Compute cH | (=H & cL) in the high bits, where cH and cL represent the
   6288  //    comparison results for the high and low 64-bit elements, respectively.
   6289  // 2. Shift the result right by 1 to duplicate the comparison results for the
   6290  //    low bits.
   6291  // 3. Obtain the final result by performing a bitwise OR on the high and low
   6292  //    bits.
   6293  auto du8mf8 = ScalableTag<uint8_t, -3>{};
   6294  const vuint8mf8_t ltHL0 =
   6295      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
   6296  const vuint8mf8_t eqHL0 =
   6297      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
   6298  const vuint8mf8_t ltLx0 = Add(ltHL0, ltHL0);
   6299  const vuint8mf8_t resultHx = detail::AndS(OrAnd(ltHL0, ltLx0, eqHL0), 0xaa);
   6300  const vuint8mf8_t resultxL = ShiftRight<1>(resultHx);
   6301  const vuint8mf8_t result = Or(resultHx, resultxL);
   6302  auto du8m1 = ScalableTag<uint8_t>{};
   6303  return detail::U8MaskBitsVecToMask(d, detail::ChangeLMUL(du8m1, result));
   6304 }
   6305 
   6306 #else
   6307 
// Fallback: compute the pairwise result in the u64 lane domain using slides.
template <class D>
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Truth table of Eq and Compare for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL  | out = cH | (=H & cL)
  //  0  0  0  0  |  0
  //  0  0  0  1  |  0
  //  0  0  1  0  |  1
  //  0  0  1  1  |  1
  //  0  1  0  0  |  0
  //  0  1  0  1  |  0
  //  0  1  1  0  |  1
  //  1  0  0  0  |  0
  //  1  0  0  1  |  1
  //  1  1  0  0  |  0
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  // Shift leftward so L can influence H.
  const VFromD<D> ltLx = detail::Slide1Up(ltHL);
  const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
  // Replicate H to its neighbor.
  return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
}
   6332 
   6333 #endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6334 
   6335 // ------------------------------ Lt128Upper
   6336 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6337 
template <class D>
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Operate on the packed mask bits (one bit per u64 lane) in a u8 vector;
  // each 128-bit pair occupies two adjacent bits (low lane = even bit).
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t ltHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
  // Keep only the high-lane (odd) comparison bits: 0xaa = 0b10101010.
  const vuint8mf8_t ltHx = detail::AndS(ltHL, 0xaa);
  // Duplicate each high-lane bit into its low-lane (even) neighbor position.
  const vuint8mf8_t ltxL = ShiftRight<1>(ltHx);
  auto du8m1 = ScalableTag<uint8_t>{};
  // Or combines both positions so the whole pair carries the high-lane result.
  return detail::U8MaskBitsVecToMask(d,
                                     detail::ChangeLMUL(du8m1, Or(ltHx, ltxL)));
}
   6350 
   6351 #else
   6352 
template <class D>
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Per-lane u64 comparison; only the high (odd) lane of each pair matters.
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  const VFromD<D> down = detail::Slide1Down(ltHL);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(down)) : "memory");
  // Replicate H to its neighbor.
  return MaskFromVec(OddEven(ltHL, down));
}
   6363 
   6364 #endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6365 
   6366 // ------------------------------ Eq128
   6367 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6368 
template <class D>
HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Operate on the packed mask bits (one bit per u64 lane) in a u8 vector;
  // each 128-bit pair occupies two adjacent bits (low lane = even bit).
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t eqHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
  // Align each high-lane bit with its low-lane neighbor's position.
  const vuint8mf8_t eqxH = ShiftRight<1>(eqHL);
  // Both halves equal; keep only the low (even) positions: 0x55 = 0b01010101.
  const vuint8mf8_t result0L = detail::AndS(And(eqHL, eqxH), 0x55);
  // Add(x, x) == x << 1: replicate the result into the high (odd) positions.
  const vuint8mf8_t resultH0 = Add(result0L, result0L);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
}
   6382 
   6383 #else
   6384 
template <class D>
HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  // Swap the two lanes of each pair so each lane sees its neighbor's result.
  const VFromD<D> eqLH = Reverse2(d, eqHL);
  // 128-bit equality requires both u64 halves to be equal.
  const VFromD<D> eq = And(eqHL, eqLH);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(eq)) : "memory");
  return MaskFromVec(eq);
}
   6395 
   6396 #endif
   6397 
   6398 // ------------------------------ Eq128Upper
   6399 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6400 
template <class D>
HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Operate on the packed mask bits (one bit per u64 lane) in a u8 vector;
  // each 128-bit pair occupies two adjacent bits (low lane = even bit).
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t eqHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
  // Keep only the high-lane (odd) equality bits: 0xaa = 0b10101010.
  const vuint8mf8_t eqHx = detail::AndS(eqHL, 0xaa);
  // Duplicate each high-lane bit into its low-lane (even) neighbor position.
  const vuint8mf8_t eqxL = ShiftRight<1>(eqHx);
  auto du8m1 = ScalableTag<uint8_t>{};
  // Or combines both positions so the whole pair carries the high-lane result.
  return detail::U8MaskBitsVecToMask(d,
                                     detail::ChangeLMUL(du8m1, Or(eqHx, eqxL)));
}
   6413 
   6414 #else
   6415 
   6416 template <class D>
   6417 HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
   6418  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
   6419  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
   6420  // Replicate H to its neighbor.
   6421  return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
   6422 }
   6423 
   6424 #endif
   6425 
   6426 // ------------------------------ Ne128
   6427 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6428 
template <class D>
HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Operate on the packed mask bits (one bit per u64 lane) in a u8 vector;
  // each 128-bit pair occupies two adjacent bits (low lane = even bit).
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t neHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
  // Align each high-lane bit with its low-lane neighbor's position.
  const vuint8mf8_t nexH = ShiftRight<1>(neHL);
  // Either half differs; keep only the low (even) positions: 0x55 = 0b01010101.
  const vuint8mf8_t result0L = detail::AndS(Or(neHL, nexH), 0x55);
  // Add(x, x) == x << 1: replicate the result into the high (odd) positions.
  const vuint8mf8_t resultH0 = Add(result0L, result0L);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
}
   6442 
   6443 #else
   6444 
template <class D>
HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  // Swap the two lanes of each pair so each lane sees its neighbor's result.
  const VFromD<D> neLH = Reverse2(d, neHL);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(neLH)) : "memory");
  // 128-bit inequality: either u64 half differs.
  return MaskFromVec(Or(neHL, neLH));
}
   6454 
   6455 #endif
   6456 
   6457 // ------------------------------ Ne128Upper
   6458 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   6459 
template <class D>
HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Operate on the packed mask bits (one bit per u64 lane) in a u8 vector;
  // each 128-bit pair occupies two adjacent bits (low lane = even bit).
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t neHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
  // Keep only the high-lane (odd) inequality bits: 0xaa = 0b10101010.
  const vuint8mf8_t neHx = detail::AndS(neHL, 0xaa);
  // Duplicate each high-lane bit into its low-lane (even) neighbor position.
  const vuint8mf8_t nexL = ShiftRight<1>(neHx);
  auto du8m1 = ScalableTag<uint8_t>{};
  // Or combines both positions so the whole pair carries the high-lane result.
  return detail::U8MaskBitsVecToMask(d,
                                     detail::ChangeLMUL(du8m1, Or(neHx, nexL)));
}
   6472 
   6473 #else
   6474 
template <class D>
HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Per-lane u64 comparison; only the high (odd) lane of each pair matters.
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  const VFromD<D> down = detail::Slide1Down(neHL);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(down)) : "memory");
  // Replicate H to its neighbor.
  return MaskFromVec(OddEven(neHL, down));
}
   6485 
   6486 #endif
   6487 
   6488 // ------------------------------ Min128, Max128 (Lt128)
   6489 
   6490 template <class D>
   6491 HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
   6492  const VFromD<D> aXH = detail::Slide1Down(a);
   6493  const VFromD<D> bXH = detail::Slide1Down(b);
   6494  const VFromD<D> minHL = Min(a, b);
   6495  const MFromD<D> ltXH = Lt(aXH, bXH);
   6496  const MFromD<D> eqXH = Eq(aXH, bXH);
   6497  // If the upper lane is the decider, take lo from the same reg.
   6498  const VFromD<D> lo = IfThenElse(ltXH, a, b);
   6499  // The upper lane is just minHL; if they are equal, we also need to use the
   6500  // actual min of the lower lanes.
   6501  return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
   6502 }
   6503 
template <class D>
HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
  // Broadcast each pair's high u64 down so it is visible in the low lane.
  const VFromD<D> aXH = detail::Slide1Down(a);
  const VFromD<D> bXH = detail::Slide1Down(b);
  const VFromD<D> maxHL = Max(a, b);
  const MFromD<D> ltXH = Lt(aXH, bXH);
  const MFromD<D> eqXH = Eq(aXH, bXH);
  // If the upper lane is the decider, take lo from the same reg.
  const VFromD<D> lo = IfThenElse(ltXH, b, a);
  // The upper lane is just maxHL; if they are equal, we also need to use the
  // actual max of the lower lanes.
  return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
}
   6517 
   6518 template <class D>
   6519 HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
   6520  return IfThenElse(Lt128Upper(d, a, b), a, b);
   6521 }
   6522 
   6523 template <class D>
   6524 HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
   6525  return IfThenElse(Lt128Upper(d, b, a), a, b);
   6526 }
   6527 
   6528 // ================================================== END MACROS
   6529 #undef HWY_RVV_AVL
   6530 #undef HWY_RVV_D
   6531 #undef HWY_RVV_FOREACH
   6532 #undef HWY_RVV_FOREACH_08_ALL
   6533 #undef HWY_RVV_FOREACH_08_ALL_VIRT
   6534 #undef HWY_RVV_FOREACH_08_DEMOTE
   6535 #undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
   6536 #undef HWY_RVV_FOREACH_08_EXT
   6537 #undef HWY_RVV_FOREACH_08_EXT_VIRT
   6538 #undef HWY_RVV_FOREACH_08_TRUNC
   6539 #undef HWY_RVV_FOREACH_08_VIRT
   6540 #undef HWY_RVV_FOREACH_16_ALL
   6541 #undef HWY_RVV_FOREACH_16_ALL_VIRT
   6542 #undef HWY_RVV_FOREACH_16_DEMOTE
   6543 #undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
   6544 #undef HWY_RVV_FOREACH_16_EXT
   6545 #undef HWY_RVV_FOREACH_16_EXT_VIRT
   6546 #undef HWY_RVV_FOREACH_16_TRUNC
   6547 #undef HWY_RVV_FOREACH_16_VIRT
   6548 #undef HWY_RVV_FOREACH_32_ALL
   6549 #undef HWY_RVV_FOREACH_32_ALL_VIRT
   6550 #undef HWY_RVV_FOREACH_32_DEMOTE
   6551 #undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
   6552 #undef HWY_RVV_FOREACH_32_EXT
   6553 #undef HWY_RVV_FOREACH_32_EXT_VIRT
   6554 #undef HWY_RVV_FOREACH_32_TRUNC
   6555 #undef HWY_RVV_FOREACH_32_VIRT
   6556 #undef HWY_RVV_FOREACH_64_ALL
   6557 #undef HWY_RVV_FOREACH_64_ALL_VIRT
   6558 #undef HWY_RVV_FOREACH_64_DEMOTE
   6559 #undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
   6560 #undef HWY_RVV_FOREACH_64_EXT
   6561 #undef HWY_RVV_FOREACH_64_EXT_VIRT
   6562 #undef HWY_RVV_FOREACH_64_TRUNC
   6563 #undef HWY_RVV_FOREACH_64_VIRT
   6564 #undef HWY_RVV_FOREACH_B
   6565 #undef HWY_RVV_FOREACH_F
   6566 #undef HWY_RVV_FOREACH_F16
   6567 #undef HWY_RVV_FOREACH_F32
   6568 #undef HWY_RVV_FOREACH_F3264
   6569 #undef HWY_RVV_FOREACH_F64
   6570 #undef HWY_RVV_FOREACH_I
   6571 #undef HWY_RVV_FOREACH_I08
   6572 #undef HWY_RVV_FOREACH_I16
   6573 #undef HWY_RVV_FOREACH_I163264
   6574 #undef HWY_RVV_FOREACH_I32
   6575 #undef HWY_RVV_FOREACH_I64
   6576 #undef HWY_RVV_FOREACH_U
   6577 #undef HWY_RVV_FOREACH_U08
   6578 #undef HWY_RVV_FOREACH_U16
   6579 #undef HWY_RVV_FOREACH_U163264
   6580 #undef HWY_RVV_FOREACH_U32
   6581 #undef HWY_RVV_FOREACH_U64
   6582 #undef HWY_RVV_FOREACH_UI
   6583 #undef HWY_RVV_FOREACH_UI08
   6584 #undef HWY_RVV_FOREACH_UI16
   6585 #undef HWY_RVV_FOREACH_UI163264
   6586 #undef HWY_RVV_FOREACH_UI32
   6587 #undef HWY_RVV_FOREACH_UI3264
   6588 #undef HWY_RVV_FOREACH_UI64
   6589 #undef HWY_RVV_IF_EMULATED_D
   6590 #undef HWY_RVV_IF_CAN128_D
   6591 #undef HWY_RVV_IF_GE128_D
   6592 #undef HWY_RVV_IF_LT128_D
   6593 #undef HWY_RVV_INSERT_VXRM
   6594 #undef HWY_RVV_M
   6595 #undef HWY_RVV_RETM_ARGM
   6596 #undef HWY_RVV_RETV_ARGMVV
   6597 #undef HWY_RVV_RETV_ARGV
   6598 #undef HWY_RVV_RETV_ARGVS
   6599 #undef HWY_RVV_RETV_ARGVV
   6600 #undef HWY_RVV_T
   6601 #undef HWY_RVV_V
   6602 // NOLINTNEXTLINE(google-readability-namespace-comments)
   6603 }  // namespace HWY_NAMESPACE
   6604 }  // namespace hwy
   6605 HWY_AFTER_NAMESPACE();