// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// RISC-V V vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#pragma push_macro("__riscv_v_elen")

// Workaround that ensures that all of the __riscv_vsetvl_* and
// __riscv_vsetvlmax_* macros in riscv_vector.h are defined when compiling with
// Clang 20 with dynamic dispatch and a baseline target of SCALAR or EMU128
#if HWY_COMPILER_CLANG >= 2000 && HWY_COMPILER_CLANG < 2100 && \
    (!defined(__riscv_v_elen) || __riscv_v_elen < 64)
#undef __riscv_v_elen
#define __riscv_v_elen 64
#endif

#include <riscv_vector.h>

#pragma pop_macro("__riscv_v_elen")

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Support for vfloat16m*_t and PromoteTo/DemoteTo.
#ifdef __riscv_zvfhmin
#define HWY_RVV_HAVE_F16C 1
#else
#define HWY_RVV_HAVE_F16C 0
#endif

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

template <typename T, size_t N, int kPow2>
constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
  // argument enables fractional LMUL < 1. Limit to 64 because that is the
  // largest value for which vbool##_t are defined.
  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
}

namespace detail {

template <class D>
class AdjustSimdTagToMinVecPow2_t {};

template <typename T, size_t N, int kPow2>
class AdjustSimdTagToMinVecPow2_t<Simd<T, N, kPow2>> {
 private:
  using D = Simd<T, N, kPow2>;
  static constexpr int kMinVecPow2 =
      -3 + static_cast<int>(FloorLog2(sizeof(T)));
  static constexpr size_t kNumMaxLanes = HWY_MAX_LANES_D(D);
  static constexpr int kNewPow2 = HWY_MAX(kPow2, kMinVecPow2);
  static constexpr size_t kNewN = D::template NewN<kNewPow2, kNumMaxLanes>();

 public:
  using type = Simd<T, kNewN, kNewPow2>;
};

template <class D>
using AdjustSimdTagToMinVecPow2 =
    typename AdjustSimdTagToMinVecPow2_t<RemoveConst<D>>::type;

}  // namespace detail

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// For all mask sizes MLEN (1/Nth of a register, one bit per lane).
// The first two arguments are an arbitrary SEW and SHIFT (the lg2 of LMUL)
// such that SEW >> SHIFT = MLEN, which is the third argument.
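// For example, X_MACRO(8, 3, 1, ...) below covers vbool1_t (SEW=8 at LMUL=8,
// i.e. 8 >> 3 = 1), and X_MACRO(64, 0, 64, ...) covers vbool64_t (SEW=64 at
// LMUL=1). Several SEW/SHIFT pairs map to the same MLEN; one representative
// is listed per mask type.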
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
  X_MACRO(64, 0, 64, NAME, OP) \
  X_MACRO(32, 0, 32, NAME, OP) \
  X_MACRO(16, 0, 16, NAME, OP) \
  X_MACRO(8, 0, 8, NAME, OP) \
  X_MACRO(8, 1, 4, NAME, OP) \
  X_MACRO(8, 2, 2, NAME, OP) \
  X_MACRO(8, 3, 1, NAME, OP)

// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
//
// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for
// m8).
// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP

// LMULS = _TRUNC: truncatable (not the smallest LMUL)
#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_08_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// LMULS = _LE2: <= 2
#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)
// LMULS = _EXT: not the largest LMUL
#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)

// LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
// one less than should be supported, with all other parameters (vector type
// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
// returns half of what it usually would.
//
// Notice that we can only add overloads whenever there is a D argument: those
// are unique with respect to non-virtual-LMUL overloads because their kPow2
// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
// defining another overload with the same LMUL would be an error. Thus we have
// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
// functions that take a D.
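// For example (assuming VLEN=128): for ScalableTag<uint64_t, -1>, the VIRT
// overload below stores lanes in a vuint64m1_t (there is no vuint64mf2_t),
// but Lanes() reports half of the m1 VLMAX, i.e. a single lane, so that
// LowerHalf/Combine behave as if LMUL=1/2 existed.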
#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)

// ALL + VIRT
#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// LE2 + VIRT
#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// GET/SET + VIRT
#define HWY_RVV_FOREACH_08_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)

#define HWY_RVV_FOREACH_16_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)

#define HWY_RVV_FOREACH_32_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)

#define HWY_RVV_FOREACH_64_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// For the smallest LMUL for each SEW, similar to the LowerHalf operator, we
// provide Get and Set operators that return the same vector type.
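// For example, for SEW=8 the smallest LMUL is mf8, so Get/Set on a
// vuint8mf8_t below simply return/accept another vuint8mf8_t rather than a
// further-halved type (which does not exist).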
#define HWY_RVV_FOREACH_08_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_16_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_32_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP)

#define HWY_RVV_FOREACH_64_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP)

// EXT + VIRT
#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// DEMOTE + VIRT
#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

// SEW for unsigned:
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)

// SEW for signed:
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)

// SEW for float:

// Used for conversion instructions if HWY_RVV_HAVE_F16C.
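// (Note: Zvfhmin provides only f16 <-> f32 conversion instructions, not f16
// arithmetic, which is why this list backs PromoteTo/DemoteTo even when
// HWY_HAVE_FLOAT16 is 0.)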
#define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)

#if HWY_HAVE_FLOAT16
// Full support for f16 in all ops
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
// Only BF16 is emulated.
#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
#else
#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
#endif
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)

// Commonly used type/SEW groups:
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)

// For all combinations of SEW:
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)

// Commonly used type categories:
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)

#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)

// Assemble types for use in x-macros
#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
#define HWY_RVV_TUP(BASE, SEW, LMUL, TUP) v##BASE##SEW##LMUL##x##TUP##_t
#define HWY_RVV_M(MLEN) vbool##MLEN##_t

}  // namespace detail

// Until we have full intrinsic support for fractional LMUL, mixed-precision
// code can use LMUL 1..8 (adequate unless it needs many registers).
#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <> \
  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
    using Lane = HWY_RVV_T(BASE, SEW); \
    using type = ScalableTag<Lane, SHIFT>; \
  };

HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
#undef HWY_SPECIALIZE

// ------------------------------ Lanes

// WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!

#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
// HWY_RVV_CAPPED_LANES_SPECIAL_CASES provides some additional optimizations
// to CappedLanes in non-debug builds
#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
  if (__builtin_constant_p(cap >= kMaxLanes) && (cap >= kMaxLanes)) { \
    /* If cap is known to be greater than or equal to MaxLanes(d), */ \
    /* HWY_MIN(cap, Lanes(d)) will be equal to Lanes(d) */ \
    return Lanes(d); \
  } \
  \
  if ((__builtin_constant_p((cap & (cap - 1)) == 0) && \
       ((cap & (cap - 1)) == 0)) || \
      (__builtin_constant_p(cap <= HWY_MAX(kMinLanesPerFullVec, 4)) && \
       (cap <= HWY_MAX(kMinLanesPerFullVec, 4)))) { \
    /* If cap is known to be a power of 2, then */ \
    /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
    /* result as HWY_MIN(cap, Lanes(d)) as kMaxLanes is a power of 2 and */ \
    /* as (cap > VLMAX && cap < 2 * VLMAX) can only be true if cap is not */ \
    /* a power of 2 since VLMAX is always a power of 2 */ \
    \
    /* If cap is known to be less than or equal to 4, then */ \
    /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
    /* result as HWY_MIN(cap, Lanes(d)) as HWY_MIN(cap, kMaxLanes) <= 4 is */ \
    /* true if cap <= 4 and as vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
    /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) */ \
    /* if HWY_MIN(cap, kMaxLanes) <= 4 is true */ \
    \
    /* If cap is known to be less than or equal to kMinLanesPerFullVec, */ \
    /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
    /* same result as HWY_MIN(cap, Lanes(d)) as */ \
    /* HWY_MIN(cap, kMaxLanes) <= kMinLanesPerFullVec is true if */ \
    /* cap <= kMinLanesPerFullVec is true */ \
    \
    /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then either */ \
    /* cap <= 4 or cap <= kMinLanesPerFullVec must be true */ \
    \
    /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is known to be true, */ \
    /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
    /* same result as HWY_MIN(cap, Lanes(d)) */ \
    \
    /* If no cap, avoid the HWY_MIN. */ \
    return detail::IsFull(d) \
               ? __riscv_vsetvl_e##SEW##LMUL(cap) \
               : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
  }
#else
#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)
#endif

#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
    constexpr size_t kCap = MaxLanes(d); \
    /* If no cap, avoid generating a constant by using VLMAX. */ \
    return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
                      : __riscv_vsetvl_e##SEW##LMUL(kCap); \
  } \
  template <size_t N> \
  HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
    /* NOTE: Section 6.3 of the RVV specification, which can be found at */ \
    /* https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc, */ \
    /* allows vsetvl to return a result less than Lanes(d) but greater */ \
    /* than or equal to ((cap + 1) / 2) if */ \
    /* (Lanes(d) > 2 && cap > HWY_MAX(Lanes(d), 4) && cap < 2 * Lanes(d)) */ \
    /* is true */ \
    \
    /* VLMAX is the number of lanes in a vector of type */ \
    /* VFromD<decltype(d)>, which is returned by */ \
    /* Lanes(DFromV<VFromD<decltype(d)>>()) */ \
    \
    /* VLMAX is guaranteed to be a power of 2 under Section 2 of the RVV */ \
    /* specification */ \
    \
    /* The VLMAX of a vector of type VFromD<decltype(d)> is at least 2 as */ \
    /* the HWY_RVV target requires support for the RVV Zvl128b extension, */ \
    /* which guarantees that vectors with LMUL=1 are at least 16 bytes */ \
    \
    /* If VLMAX == 2 is true, then vsetvl(cap) is equal to HWY_MIN(cap, 2) */ \
    /* as cap == 3 is the only value such that */ \
    /* (cap > VLMAX && cap < 2 * VLMAX) if VLMAX == 2 and as */ \
    /* ((3 + 1) / 2) is equal to 2 */ \
    \
    /* If cap <= 4 is true, then vsetvl(cap) must be equal to */ \
    /* HWY_MIN(cap, VLMAX) as cap <= VLMAX is true if VLMAX >= 4 is true */ \
    /* and as vsetvl(cap) is guaranteed to be equal to */ \
    /* HWY_MIN(cap, VLMAX) if VLMAX == 2 */ \
    \
    /* We want CappedLanes(d, cap) to return Lanes(d) if cap > Lanes(d) */ \
    /* as LoadN(d, p, cap) expects to load exactly HWY_MIN(cap, Lanes(d)) */ \
    /* lanes and StoreN(v, d, p, cap) expects to store exactly */ \
    /* HWY_MIN(cap, Lanes(d)) lanes, even in the case where vsetvl */ \
    /* returns a result that is less than HWY_MIN(cap, Lanes(d)) */ \
    \
    /* kMinLanesPerFullVec is the minimum value of VLMAX for a vector of */ \
    /* type VFromD<decltype(d)> */ \
    constexpr size_t kMinLanesPerFullVec = \
        detail::ScaleByPower(16 / (SEW / 8), SHIFT); \
    /* kMaxLanes is the maximum number of lanes returned by Lanes(d) */ \
    constexpr size_t kMaxLanes = MaxLanes(d); \
    \
    HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
    \
    if (kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4)) { \
      /* If kMaxLanes <= kMinLanesPerFullVec is true, then */ \
      /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return */ \
      /* HWY_MIN(cap, Lanes(d)) as */ \
      /* HWY_MIN(cap, kMaxLanes) <= kMaxLanes <= VLMAX is true if */ \
      /* kMaxLanes <= kMinLanesPerFullVec is true */ \
      \
      /* If kMaxLanes <= 4 is true, then vsetvl(HWY_MIN(cap, kMaxLanes)) */ \
      /* is guaranteed to return the same result as */ \
      /* HWY_MIN(cap, Lanes(d)) as HWY_MIN(cap, kMaxLanes) <= 4 is true */ \
      /* if kMaxLanes <= 4 is true */ \
      \
      /* If kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then */ \
      /* either kMaxLanes <= 4 or kMaxLanes <= kMinLanesPerFullVec must */ \
      /* be true */ \
      \
      return __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
    } else { \
      /* If kMaxLanes > HWY_MAX(kMinLanesPerFullVec, 4) is true, need to */ \
      /* obtain the actual number of lanes using Lanes(d) and clamp cap */ \
      /* to the result of Lanes(d) */ \
      const size_t actual = Lanes(d); \
      return HWY_MIN(actual, cap); \
    } \
  }

#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    constexpr size_t kCap = MaxLanes(d); \
    /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */ \
    /* vsetvl may or may not be correct, so do it ourselves. */ \
    const size_t actual = \
        detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT); \
    return HWY_MIN(actual, kCap); \
  } \
  template <size_t N> \
  HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
    /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */ \
    /* vsetvl may or may not be correct, so do it ourselves. */ \
    const size_t actual = \
        detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT); \
    /* If no cap, avoid an extra HWY_MIN. */ \
    return detail::IsFull(d) ? HWY_MIN(actual, cap) \
                             : HWY_MIN(HWY_MIN(actual, cap), MaxLanes(d)); \
  }

HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT)
#undef HWY_RVV_LANES
#undef HWY_RVV_LANES_VIRT
#undef HWY_RVV_CAPPED_LANES_SPECIAL_CASES

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API size_t Lanes(D /* tag */) {
  return Lanes(RebindToUnsigned<D>());
}

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API size_t CappedLanes(D /* tag */, size_t cap) {
  return CappedLanes(RebindToUnsigned<D>(), cap);
}

// ------------------------------ Common x-macros

// Last argument to most intrinsics. Use when the op has no d arg of its own,
// which means there is no user-specified cap.
#define HWY_RVV_AVL(SEW, SHIFT) \
  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())

// vector = f(vector), e.g. Not
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, scalar), e.g. detail::AddS
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(a, b, \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

// vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
       HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu( \
        m, no, a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(mask)
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
    return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// ================================================== INIT

// ------------------------------ Set

#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
#undef HWY_RVV_SET

// Treat bfloat16_t as int16_t (using the previously defined Set overloads);
// required for Zero and VFromD.
template <class D, HWY_IF_BF16_D(D)>
decltype(Set(RebindToSigned<D>(), 0)) Set(D d, hwy::bfloat16_t arg) {
  return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
}
#if !HWY_HAVE_FLOAT16  // Otherwise already defined above.
// WARNING: returns a different type than emulated bfloat16_t so that we can
// implement PromoteTo overloads for both bfloat16_t and float16_t, and also
// provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
template <class D, HWY_IF_F16_D(D)>
decltype(Set(RebindToUnsigned<D>(), 0)) Set(D d, hwy::float16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
}
#endif

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// ------------------------------ Zero

template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Cast to support bfloat16_t.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}

// ------------------------------ Undefined

// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
// by it gives unpredictable results. It should only be used for maskoff, so
// keep it internal. For the Highway op, just use Zero (single instruction).
namespace detail {
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                          SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \
  }

HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
#undef HWY_RVV_UNDEFINED
}  // namespace detail

template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ BitCast

namespace detail {

// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
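// For example, detail::Trunc(vuint32m2_t) returns a vuint32m1_t via
// __riscv_vlmul_trunc_v_u32m2_u32m1. This keeps the lower half of the
// register group without moving lanes, hence no AVL argument is needed.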
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \
        v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
#undef HWY_RVV_TRUNC

// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
       HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD( \
        v); /* no AVL */ \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
#undef HWY_RVV_EXT

// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which
// is the same as the actual input type.
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
       HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return v; \
  }
HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
#undef HWY_RVV_EXT_VIRT

template <class D, HWY_RVV_IF_EMULATED_D(D)>
VFromD<D> Ext(D d, VFromD<Half<D>> v) {
  const RebindToUnsigned<decltype(d)> du;
  const Half<decltype(du)> duh;
  return BitCast(d, Ext(du, BitCast(duh, v)));
}

// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
// by _ALL_VIRT.

// There is no reinterpret from u8 <-> u8, so just return.
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vuint8##LMUL##_t v) { \
    return v; \
  } \
  template <size_t N> \
  HWY_API vuint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return v; \
  }

// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         vint8##LMUL##_t v) { \
    return __riscv_vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API vint8##LMUL##_t BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return __riscv_vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
  }

// Separate u/i because clang only provides signed <-> unsigned reinterpret for
// the same SEW.
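// For example, BitCastToByte of a vint16m1_t below takes two reinterprets,
// i16 -> u16 -> u8 (HWY_RVV_CAST_IF), whereas a vuint16m1_t only needs the
// single u16 -> u8 reinterpret in HWY_RVV_CAST_U.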
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
  }

// Additional versions for virtual LMUL using LMULH for byte vectors.
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(__riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
  }

// Signed/Float: first cast to/from unsigned
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  template <typename T, size_t N> \
  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
                                          HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return detail::Trunc(__riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
        __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
    const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
    return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
        __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
  }

HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
#if HWY_HAVE_FLOAT16  // HWY_RVV_FOREACH_F already covered float16_t.
#elif HWY_RVV_HAVE_F16C  // zvfhmin provides reinterpret* intrinsics:
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
#else
template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<RebindToUnsigned<D>> BitCastFromByte(
    D /* d */, VFromD<Repartition<uint8_t, D>> v) {
  return BitCastFromByte(RebindToUnsigned<D>(), v);
}
#endif

#undef HWY_RVV_CAST_U8
#undef HWY_RVV_CAST_I8
#undef HWY_RVV_CAST_U
#undef HWY_RVV_CAST_IF
#undef HWY_RVV_CAST_VIRT_U
#undef HWY_RVV_CAST_VIRT_IF

template <class D, HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<RebindToSigned<D>> BitCastFromByte(
    D d, VFromD<Repartition<uint8_t, D>> v) {
  return BitCastFromByte(RebindToSigned<decltype(d)>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
}

// ------------------------------ Iota

namespace detail {

#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
  }

// For i8 lanes, this may well wrap around. Unsigned only is less error-prone.
HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
#undef HWY_RVV_IOTA

// Used by Expand.
#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) mask) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(mask, Lanes(d)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_MASKED_IOTA, MaskedIota, iota_m, _ALL_VIRT)
#undef HWY_RVV_MASKED_IOTA

}  // namespace detail

// ================================================== LOGICAL

// ------------------------------ Not

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Not(const V v) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Not(BitCast(DU(), v)));
}

// ------------------------------ And

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Or

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ Xor

// Non-vector version (ideally immediate) for use with Iota0
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  using DF = DFromV<V>;
  using DU = RebindToUnsigned<DF>;
  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
}

// ------------------------------ AndNot
template <class V>
HWY_API V AndNot(const V not_a, const V b) {
  return And(Not(not_a), b);
}

// ------------------------------ Xor3
template <class V>
HWY_API V Xor3(V x1, V x2, V x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <class V>
HWY_API V Or3(V o1, V o2, V o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ CopySign

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  // RVV can also handle abs < 0, so no extra action needed.
  return CopySign(abs, sign);
}

// ================================================== ARITHMETIC

// Per-target flags to prevent generic_ops-inl.h defining Add etc.
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

// ------------------------------ Add

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)

// ------------------------------ Sub
namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL)
}  // namespace detail

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)

// ------------------------------ Neg (ReverseSubS, Xor)

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V Neg(const V v) {
  return detail::ReverseSubS(v, 0);
}

// vector = f(vector), but argument is repeated
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)

#if !HWY_HAVE_FLOAT16

template <class V, HWY_IF_U16_D(DFromV<V>)>  // hwy::float16_t
HWY_API V Neg(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
}

#endif  // !HWY_HAVE_FLOAT16

// ------------------------------ SaturatedAdd

#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)

// ------------------------------ SaturatedSub

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)

// ------------------------------ AverageRound

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI64
#endif

// Define this to opt out of the default behavior, which is AVOID on certain
// compiler versions. You can define only this to use VXRM, or define both this
// and HWY_RVV_AVOID_VXRM to always avoid VXRM.
#ifndef HWY_RVV_CHOOSE_VXRM

// Assume that GCC-13 defaults to 'avoid VXRM'. Tested with GCC 13.1.0.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
#define HWY_RVV_AVOID_VXRM
// Clang 16 with __riscv_v_intrinsic == 11000 may either require VXRM or avoid.
// Assume that Clang 16 and earlier avoid VXRM.
#elif HWY_COMPILER_CLANG && \
    (HWY_COMPILER_CLANG < 1700 || __riscv_v_intrinsic < 11000)
#define HWY_RVV_AVOID_VXRM
#endif

#endif  // HWY_RVV_CHOOSE_VXRM

// Adding __RISCV_VXRM_* was a backwards-incompatible change and it is not
// clear how to detect whether it is supported or required.
// #ifdef __RISCV_VXRM_RDN does not work because it seems to be a compiler
// built-in, but neither does __has_builtin(__RISCV_VXRM_RDN). The intrinsics
// version was also not updated, so we require a macro to opt out of the new
// intrinsics.
#ifdef HWY_RVV_AVOID_VXRM
#define HWY_RVV_INSERT_VXRM(vxrm, avl) avl
#define __RISCV_VXRM_RNU
#define __RISCV_VXRM_RDN
#else  // default: use new vxrm arguments
#define HWY_RVV_INSERT_VXRM(vxrm, avl) vxrm, avl
#endif

// Extra rounding mode = up argument.
#define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
        a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }

HWY_RVV_FOREACH_I(HWY_RVV_RETV_AVERAGE, AverageRound, aadd, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)

#undef HWY_RVV_RETV_AVERAGE

// ------------------------------ ShiftLeft[Same]

// Intrinsics do not define .vi forms, so use .vx instead.
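// For example, ShiftLeft<2>(v) on a vuint32m1_t expands to
// __riscv_vsll_vx_u32m1(v, 2, avl); because the shift amount is a compile-time
// constant, compilers can still emit the immediate form vsll.vi.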
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  } \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL( \
        v, static_cast<uint8_t>(bits), HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)

// ------------------------------ ShiftRight[Same]

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)

#undef HWY_RVV_SHIFT

// ------------------------------ RoundingShiftRight[Same]

#ifdef HWY_NATIVE_ROUNDING_SHR
#undef HWY_NATIVE_ROUNDING_SHR
#else
#define HWY_NATIVE_ROUNDING_SHR
#endif

// Intrinsics do not define .vi forms, so use .vx instead.
#define HWY_RVV_ROUNDING_SHR(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL( \
        v, kBits, \
        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  } \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
  NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL( \
        v, static_cast<uint8_t>(bits), \
        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssrl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssra, _ALL)

#undef HWY_RVV_ROUNDING_SHR

// ------------------------------ SumsOf8 (ShiftRight, Add)
template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
  const DFromV<VU8> du8;
  const RepartitionToWide<decltype(du8)> du16;
  const RepartitionToWide<decltype(du16)> du32;
  const RepartitionToWide<decltype(du32)> du64;
  using VU16 = VFromD<decltype(du16)>;

  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
}

template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
  const DFromV<VI8> di8;
  const RepartitionToWide<decltype(di8)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const RepartitionToWide<decltype(di32)> di64;
  const RebindToUnsigned<decltype(di32)> du32;
  const RebindToUnsigned<decltype(di64)> du64;
  using VI16 = VFromD<decltype(di16)>;
template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
HWY_API VFromD<Repartition<int64_t, DFromV<VI8>>> SumsOf8(const VI8 v) {
  const DFromV<VI8> di8;
  const RepartitionToWide<decltype(di8)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const RepartitionToWide<decltype(di32)> di64;
  const RebindToUnsigned<decltype(di32)> du32;
  const RebindToUnsigned<decltype(di64)> du64;
  using VI16 = VFromD<decltype(di16)>;

  const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
  const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
  const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  const VI16 sDC_zz_98_zz_54_zz_10_zz =
      BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VI16 sFC_xx_B8_xx_74_xx_30_xx =
      Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
  const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
      BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
  const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
      Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
  return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
}

// ------------------------------ RotateRight
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RotateRight(const V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// ------------------------------ Shl
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, bits, \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)

#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                         SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, BitCast(du, bits), \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)

// ------------------------------ Shr

HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)

#undef HWY_RVV_SHIFT_II
#undef HWY_RVV_SHIFT_VV
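// Example (illustrative): RotateRight<4> applied to u8 lanes of 0x12 yields
// 0x21: the low nibble wraps around to the top. The HWY_MIN in the ShiftLeft
// count only exists to keep the shift amount in range when kBits == 0, a case
// that has already returned early.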
// ------------------------------ RoundingShr
#define HWY_RVV_ROUNDING_SHR_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                LMULH, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
        v, bits, \
        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }

HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR_VV, RoundingShr, ssrl, _ALL)

#define HWY_RVV_ROUNDING_SHR_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                LMULH, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
    const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
        v, BitCast(du, bits), \
        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }

HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR_II, RoundingShr, ssra, _ALL)

#undef HWY_RVV_ROUNDING_SHR_VV
#undef HWY_RVV_ROUNDING_SHR_II

// ------------------------------ Min

namespace detail {

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MinS, minu_vx, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MinS, min_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MinS, fmin_vf, _ALL)

}  // namespace detail

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)

// ------------------------------ Max

namespace detail {

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)

}  // namespace detail

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)

// ------------------------------ Mul

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)

// ------------------------------ MulHigh

HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)

// ------------------------------ MulFixedPoint15

// Extra rounding mode = up argument.
#define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
        a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }

HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL)

#undef HWY_RVV_MUL15

// ------------------------------ Div
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Div, divu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Div, div, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Mod, remu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Mod, rem, _ALL)
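// Illustrative note (semantics sketch, not library documentation): vsmul with
// RNU rounding computes a Q15 product, roughly (a * b + (1 << 14)) >> 15,
// with the single overflowing case -32768 * -32768 saturated to +32767. For
// example, lanes of 16384 (0.5 in Q15) multiplied by themselves yield 8192
// (0.25 in Q15).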
// ------------------------------ MaskedAddOr etc.

#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMinOr, minu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMinOr, min, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMinOr, fmin, _ALL)

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, maxu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, max, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMaxOr, fmax, _ALL)

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedAddOr, add, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedAddOr, fadd, _ALL)

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedSubOr, sub, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedSubOr, fsub, _ALL)

HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGMVV, MaskedMulOr, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedMulOr, fmul, _ALL)

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedDivOr, divu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedDivOr, div, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGMVV, MaskedDivOr, fdiv, _ALL)

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedModOr, remu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedModOr, rem, _ALL)

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, saddu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatAddOr, sadd, _ALL)

HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssubu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGMVV, MaskedSatSubOr, ssub, _ALL)

// ------------------------------ ApproximateReciprocal
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)

// ------------------------------ Sqrt
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)

// ------------------------------ ApproximateReciprocalSqrt
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)
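// Illustrative sketch (an example, not library code): vfrec7/vfrsqrt7
// provide estimates with about 7 bits of accuracy, so callers wanting more
// precision typically refine with Newton-Raphson using ops from this file:
//   auto r = ApproximateReciprocal(x);          // ~7-bit estimate of 1/x
//   r = Mul(r, NegMulAdd(r, x, Set(d, 2.0f)));  // one step: r *= (2 - r*x)
// Roughly doubling the accurate bits per step, f32 usually needs two steps.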
// ------------------------------ MulAdd

// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif

// Note: op is still named vv, not vvv.
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                    MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
           HWY_RVV_V(BASE, SEW, LMUL) add) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, \
                                                HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_FMA, MulAdd, macc, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)

// ------------------------------ NegMulAdd
HWY_RVV_FOREACH_UI(HWY_RVV_FMA, NegMulAdd, nmsac, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)

// ------------------------------ MulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)

// ------------------------------ NegMulSub
HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)

#undef HWY_RVV_FMA
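// Operand-order note (illustrative): MulAdd(mul, x, add) returns mul * x +
// add, but vmacc/vfmacc overwrite their first (accumulator) operand, i.e.
// vd = vs1 * vs2 + vd, hence the intrinsic is called with (add, mul, x). For
// example, MulAdd(Set(d, 2.0f), Set(d, 3.0f), Set(d, 1.0f)) yields 7 per
// lane, and NegMulAdd computes add - mul * x.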
// ================================================== COMPARE

// ------------------------------ MClear

// mask = f()
#define HWY_RVV_RETM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME##MLEN() { \
    return __riscv_vm##OP##_m_b##MLEN(HWY_RVV_AVL(SEW, SHIFT)); \
  }

namespace detail {
HWY_RVV_FOREACH_B(HWY_RVV_RETM, MClear, clr)  // with ##MLEN suffix
}  // namespace detail

#undef HWY_RVV_RETM

// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
// vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
// of all bits; SEW=8 / LMUL=4 = half of all bits.

// mask = f(vector, vector)
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN( \
        a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(mask, vector, vector)
#define HWY_RVV_RETM_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) a, \
           HWY_RVV_V(BASE, SEW, LMUL) b) { \
    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN##_mu( \
        m, detail::MClear##MLEN(), a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

// mask = f(vector, scalar)
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                           SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL##_b##MLEN( \
        a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

#ifdef HWY_NATIVE_MASKED_COMP
#undef HWY_NATIVE_MASKED_COMP
#else
#define HWY_NATIVE_MASKED_COMP
#endif

// ------------------------------ Eq
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedEq, mseq, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedEq, mfeq, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
}  // namespace detail

// ------------------------------ Ne
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedNe, msne, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedNe, mfne, _ALL)

namespace detail {
HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
}  // namespace detail

// ------------------------------ Lt
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLt, msltu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLt, mslt, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLt, mflt, _ALL)

namespace detail {
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
}  // namespace detail

// ------------------------------ Le
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Le, msleu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Le, msle, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLe, msleu, _ALL)
HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLe, msle, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLe, mfle, _ALL)

template <class D>
using MFromD = decltype(Eq(Zero(D()), Zero(D())));

template <class V, class M, class D = DFromV<V>>
HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
  return MaskedNe(m, v, v);
}

#undef HWY_RVV_RETM_ARGMVV
#undef HWY_RVV_RETM_ARGVV
#undef HWY_RVV_RETM_ARGVS

// ------------------------------ Gt/Ge (Lt, Le)

// Swap args to reverse comparisons:
template <class V>
HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
  return Lt(b, a);
}

template <class V>
HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);
}

template <class V, class M, class D = DFromV<V>>
HWY_API MFromD<D> MaskedGt(M m, V a, V b) {
  return MaskedLt(m, b, a);
}

template <class V, class M, class D = DFromV<V>>
HWY_API MFromD<D> MaskedGe(M m, V a, V b) {
  return MaskedLe(m, b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
  return detail::NeS(And(a, bit), 0);
}

// ------------------------------ Not
// NOLINTNEXTLINE
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
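// Usage sketch (illustrative): comparisons return a mask (vboolXX_t), not a
// vector, e.g.
//   const auto m = Lt(v, Set(d, 10));  // one mask bit per lane
// TestBit(a, bit) is true wherever (a & bit) != 0; `bit` is typically a
// single set bit such as Set(d, 1 << 3) to test bit 3 of every lane.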
// ------------------------------ And

// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
    return __riscv_vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)

// ------------------------------ AndNot
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn)

// ------------------------------ Or
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)

// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)

// ------------------------------ ExclusiveNeither
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)

#undef HWY_RVV_RETM_ARGMM

// ------------------------------ IfThenElse

#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                             SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
           HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return __riscv_v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m, \
                                                 HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)

#undef HWY_RVV_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class M, class V>
HWY_API V IfThenElseZero(const M mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse

#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                                  LMULH, SHIFT, MLEN, NAME, OP) \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
    return __riscv_v##OP##_##CHAR##SEW##LMUL(no, 0, m, \
                                             HWY_RVV_AVL(SEW, SHIFT)); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)

#undef HWY_RVV_IF_THEN_ZERO_ELSE

// ------------------------------ MaskFromVec
template <class V>
HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
  return detail::NeS(v, 0);
}

// ------------------------------ IsNegative (MFromD)
#ifdef HWY_NATIVE_IS_NEGATIVE
#undef HWY_NATIVE_IS_NEGATIVE
#else
#define HWY_NATIVE_IS_NEGATIVE
#endif

// Generic for all vector lengths
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API MFromD<DFromV<V>> IsNegative(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;

  return detail::LtS(BitCast(di, v), static_cast<TI>(0));
}
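// Illustrative example (a sketch, not library code): per-lane selection via
// masks. Neg is assumed to be defined elsewhere in this file.
//   const auto m = IsNegative(v);               // true for lanes < 0
//   const auto abs = IfThenElse(m, Neg(v), v);  // select per lane
//   const auto pos = IfThenZeroElse(m, v);      // negative lanes -> 0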
// ------------------------------ MaskFalse

// For mask ops including vmclr, elements past VL are tail-agnostic and cannot
// be relied upon, so define a variant of the generic_ops-inl implementation
// of MaskFalse that ensures all bits are zero as required by mask_test.
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif

template <class D>
HWY_API MFromD<D> MaskFalse(D d) {
  const DFromV<VFromD<decltype(d)>> d_full;
  return MaskFromVec(Zero(d_full));
}

// ------------------------------ RebindMask
template <class D, typename MFrom>
HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
  // No need to check lane size/LMUL are the same: if not, casting MFrom to
  // MFromD<D> would fail.
  return mask;
}

// ------------------------------ VecFromMask

// Returns mask ? ~0 : 0. No longer use sub.vx(Zero(), 1, mask) because per
// the default mask-agnostic policy, the result of inactive lanes may also be
// ~0.
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
    /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */ \
    const DFromV<VFromD<decltype(d)>> d_full; \
    const RebindToSigned<decltype(d_full)> di; \
    using TI = TFromD<decltype(di)>; \
    return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, \
                                                        Lanes(d_full))); \
  }

HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)

#undef HWY_RVV_VEC_FROM_MASK

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
}

// ------------------------------ IfVecThenElse (MaskFromVec)
template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ BroadcastSignBit
template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V BroadcastSignBit(const V v) {
  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
}

// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
template <class V>
HWY_API V IfNegativeThenElse(V v, V yes, V no) {
  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
  return IfThenElse(IsNegative(v), yes, no);
}

// ------------------------------ FindFirstTrue

#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return __riscv_vfirst_m_b##MLEN(m, Lanes(d)); \
  } \
  template <class D> \
  HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return static_cast<size_t>(__riscv_vfirst_m_b##MLEN(m, Lanes(d))); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _)
#undef HWY_RVV_FIND_FIRST_TRUE

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, MFromD<D> m) {
  return FindFirstTrue(d, m) < 0;
}
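// Illustrative example: FindFirstTrue returns the index of the first set mask
// bit, or a negative value if the mask is all-false, so a search can be:
//   const intptr_t idx = FindFirstTrue(d, Eq(v, Set(d, key)));
//   if (idx >= 0) { /* lane idx of v equals key */ }
// FindKnownFirstTrue skips the sign handling and requires at least one true
// bit; `key` above is a caller-supplied scalar (a sketch, not library code).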
// ------------------------------ AllTrue

#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return AllFalse(d, __riscv_vmnot_m_b##MLEN(m, Lanes(d))); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
#undef HWY_RVV_ALL_TRUE

// ------------------------------ CountTrue

#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
  template <class D> \
  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
    return __riscv_vcpop_m_b##MLEN(m, Lanes(d)); \
  }

HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
#undef HWY_RVV_COUNT_TRUE

// ------------------------------ PromoteMaskTo

#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif

template <class DTo, class DFrom,
          HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
                                  MFromD<DFrom> m) {
  return m;
}

// ------------------------------ DemoteMaskTo

#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif

template <class DTo, class DFrom,
          HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
          hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
                                 MFromD<DFrom> m) {
  return m;
}

// ================================================== MEMORY

// ------------------------------ Load

#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                     MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
        detail::NativeLanePointer(p), Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
#undef HWY_RVV_LOAD

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Load(du, detail::U16LanePointer(p)));
}
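// Usage sketch (an example, not library code): RVV vectors are sized at
// runtime, so loops advance by Lanes(d) rather than a compile-time constant:
//   const ScalableTag<float> d;
//   for (size_t i = 0; i + Lanes(d) <= n; i += Lanes(d)) {
//     Store(Add(Load(d, in + i), Load(d, in2 + i)), d, out + i);
//   }
// Remainders can use LoadN/StoreN below; in, in2, out, n are caller-defined.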
// ------------------------------ LoadU
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  // RVV only requires element alignment, not vector alignment.
  return Load(d, p);
}

// ------------------------------ MaskedLoad

#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
        m, Zero(d), detail::NativeLanePointer(p), Lanes(d)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
               HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
               const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
        m, v, detail::NativeLanePointer(p), Lanes(d)); \
  }

HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
#undef HWY_RVV_MASKED_LOAD

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d,
                 MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
}

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> no, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, MaskedLoadOr(BitCast(du, no), RebindMask(du, m), du,
                                 detail::U16LanePointer(p)));
}

// ------------------------------ LoadN

// Native with avl is faster than the generic_ops using FirstN.
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

#define HWY_RVV_LOADN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
    /* The tail-undisturbed load below leaves any lanes past the first */ \
    /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes unchanged, */ \
    /* i.e. zero from the Zero(d) merge operand. */ \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
        Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
  } \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or( \
      HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
      const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
    /* The tail-undisturbed load below sets any lanes past the first */ \
    /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes to the */ \
    /* corresponding lanes of no. */ \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
        no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
  }

HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
#undef HWY_RVV_LOADN
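// Semantics sketch (illustrative): LoadN(d, p, n) reads HWY_MIN(n, Lanes(d))
// elements from p and zeros the remaining lanes, whereas LoadNOr(no, d, p, n)
// fills them from `no` instead; both rely on the tail-undisturbed (_tu) load
// whose merge operand supplies the tail.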
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
}
template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> v, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(
      d, LoadNOr(BitCast(du, v), du, detail::U16LanePointer(p), num_lanes));
}

// ------------------------------ Store

#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                      MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
        detail::NativeLanePointer(p), v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
#undef HWY_RVV_STORE

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  Store(BitCast(du, v), du, detail::U16LanePointer(p));
}

// ------------------------------ BlendedStore

#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m( \
        m, detail::NativeLanePointer(p), v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
#undef HWY_RVV_BLENDED_STORE

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  BlendedStore(BitCast(du, v), RebindMask(du, m), du,
               detail::U16LanePointer(p));
}

// ------------------------------ StoreN

namespace detail {

#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
    return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
        detail::NativeLanePointer(p), v, count); \
  }
HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
#undef HWY_RVV_STOREN

template <class D, HWY_RVV_IF_EMULATED_D(D)>
HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  StoreN(count, BitCast(du, v), du, detail::U16LanePointer(p));
}

}  // namespace detail

#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif
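// Usage sketch (illustrative, not library code): BlendedStore writes only
// the lanes whose mask bit is set and leaves the other memory untouched, e.g.
//   BlendedStore(v, Lt(v, limit), d, out);  // overwrite only lanes < limit
// where `limit` and `out` are assumed to be defined by the caller.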
template <class D>
HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  // NOTE: Need to clamp max_lanes_to_store to Lanes(d), even if
  // MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible
  // for detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than
  // Lanes(DFromV<VFromD<D>>()) lanes to p if
  // max_lanes_to_store > Lanes(DFromV<VFromD<D>>()) and
  // max_lanes_to_store < 2 * Lanes(DFromV<VFromD<D>>()) are both true.

  // Also need to make sure that no more than Lanes(d) lanes are stored to p
  // if Lanes(d) < Lanes(DFromV<VFromD<D>>()) is true, which is possible if
  // MaxLanes(d) < MaxLanes(DFromV<VFromD<D>>()) or
  // d.Pow2() < DFromV<VFromD<D>>().Pow2() is true.
  detail::StoreN(CappedLanes(d, max_lanes_to_store), v, d, p);
}

// ------------------------------ StoreU
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  // RVV only requires element alignment, not vector alignment.
  Store(v, d, p);
}

// ------------------------------ Stream
template <class V, class D, typename T>
HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ ScatterOffset

#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif

#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
                    HWY_RVV_V(int, SEW, LMUL) offset) { \
    const RebindToUnsigned<decltype(d)> du; \
    return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
#undef HWY_RVV_SCATTER

// ------------------------------ ScatterIndex
template <class D>
HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
                          VFromD<RebindToSigned<D>> indices) {
  constexpr size_t kBits = CeilLog2(sizeof(TFromD<D>));
  return ScatterOffset(v, d, base, ShiftLeft<kBits>(indices));
}

// ------------------------------ MaskedScatterIndex

#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
                               LMULH, SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
                    HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
                    HWY_RVV_V(int, SEW, LMUL) indices) { \
    const RebindToUnsigned<decltype(d)> du; \
    constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
    return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
        m, detail::NativeLanePointer(base), \
        ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT)
#undef HWY_RVV_MASKED_SCATTER

// ------------------------------ GatherOffset

#ifdef HWY_NATIVE_GATHER
#undef HWY_NATIVE_GATHER
#else
#define HWY_NATIVE_GATHER
#endif

#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
           HWY_RVV_V(int, SEW, LMUL) offset) { \
    const RebindToUnsigned<decltype(d)> du; \
    return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
        detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
#undef HWY_RVV_GATHER
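// Note (illustrative): RVV indexed loads/stores take byte offsets, so
// ScatterIndex/GatherIndex convert lane indices via
// offset = index << CeilLog2(sizeof(T)); e.g. for u32 lanes, index 3 becomes
// byte offset 12, and ScatterIndex writes base[3] = v[3] for each lane.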
// ------------------------------ GatherIndex

template <class D>
HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
                              const VFromD<RebindToSigned<D>> index) {
  constexpr size_t kBits = CeilLog2(sizeof(TFromD<D>));
  return GatherOffset(d, base, ShiftLeft<kBits>(index));
}

// ------------------------------ MaskedGatherIndexOr

#define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
      NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
           HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
           HWY_RVV_V(int, SEW, LMUL) indices) { \
    const RebindToUnsigned<decltype(d)> du; \
    const RebindToSigned<decltype(d)> di; \
    (void)di; /* for HWY_DASSERT */ \
    constexpr size_t kBits = CeilLog2(SEW / 8); \
    HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
    return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \
        m, no, detail::NativeLanePointer(base), \
        ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d)); \
  }
HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndexOr, lux, _ALL_VIRT)
#undef HWY_RVV_MASKED_GATHER

template <class D>
HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, const TFromD<D>* base,
                                    VFromD<RebindToSigned<D>> indices) {
  return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
}

// ================================================== CONVERT

// ------------------------------ PromoteTo

// SEW is for the input.
#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                        SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
      HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
  }

HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_, _EXT_VIRT)

#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C

HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_,
                                  _EXT_VIRT)

// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif
#endif  // HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C

#undef HWY_RVV_PROMOTE
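// Example (illustrative): each 2x promotion doubles both SEW and LMUL, e.g.
// promoting vuint8mf2_t lanes to u16 zero-extends via vzext_vf2 into
// vuint16m1_t; signed inputs use vsext_vf2 instead. The AVL comes from the
// destination tag, i.e. Lanes(d).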
// The above X-macro cannot handle 4x promotion nor type switching.
// TODO(janwas): use BASE2 arg to allow the latter.
#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
                        SHIFT, ADD) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
      PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
                HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
    return __riscv_v##OP##CHAR##BITS##LMUL(v, Lanes(d)); \
  }

#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)

#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)

#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
  HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)

#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf8, -3, 3) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf4, -2, 3) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, mf2, -1, 3) \
  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m1, 0, 3)

HWY_RVV_PROMOTE_X8(zext_vf8_, uint, u, 64, uint, 8)
HWY_RVV_PROMOTE_X8(sext_vf8_, int, i, 64, int, 8)

HWY_RVV_PROMOTE_X4_FROM_U8(zext_vf4_, uint, u, 32, uint, 8)
HWY_RVV_PROMOTE_X4_FROM_U8(sext_vf4_, int, i, 32, int, 8)
HWY_RVV_PROMOTE_X4(zext_vf4_, uint, u, 64, uint, 16)
HWY_RVV_PROMOTE_X4(sext_vf4_, int, i, 64, int, 16)

// i32 to f64
HWY_RVV_PROMOTE_X2(fwcvt_f_x_v_, float, f, 64, int, 32)

// u32 to f64
HWY_RVV_PROMOTE_X2(fwcvt_f_xu_v_, float, f, 64, uint, 32)

// f32 to i64
HWY_RVV_PROMOTE_X2(fwcvt_rtz_x_f_v_, int, i, 64, float, 32)

// f32 to u64
HWY_RVV_PROMOTE_X2(fwcvt_rtz_xu_f_v_, uint, u, 64, float, 32)

#undef HWY_RVV_PROMOTE_X8
#undef HWY_RVV_PROMOTE_X4_FROM_U8
#undef HWY_RVV_PROMOTE_X4
#undef HWY_RVV_PROMOTE_X2
#undef HWY_RVV_PROMOTE

// I16->I64 or U16->U64 PromoteTo with virtual LMUL
template <size_t N>
HWY_API auto PromoteTo(Simd<int64_t, N, -1> d,
                       VFromD<Rebind<int16_t, decltype(d)>> v)
    -> VFromD<decltype(d)> {
  return PromoteTo(ScalableTag<int64_t>(), v);
}

template <size_t N>
HWY_API auto PromoteTo(Simd<uint64_t, N, -1> d,
                       VFromD<Rebind<uint16_t, decltype(d)>> v)
    -> VFromD<decltype(d)> {
  return PromoteTo(ScalableTag<uint64_t>(), v);
}
// Unsigned to signed: cast for unsigned promote.
template <class D, HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<hwy::bfloat16_t, D>> v) {
  const RebindToSigned<decltype(d)> di32;
  const Rebind<uint16_t, decltype(d)> du16;
  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
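// Why the shift works (illustrative): bfloat16 is the upper 16 bits of an
// IEEE f32, so widening the u16 bit pattern and shifting left by 16 recreates
// the original f32 bits exactly; e.g. bf16 0x3F80 becomes f32 0x3F800000,
// which is 1.0f. The i32 path is used because the shift operates on integers.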
// ------------------------------ DemoteTo U

// SEW is for the source so we can use _DEMOTE_VIRT.
#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
                       MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_v##OP##CHAR##SEWH##LMULH( \
        v, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \
  }

// Unsigned -> unsigned
HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
HWY_RVV_FOREACH_U64(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)

// SEW is for the source so we can use _DEMOTE_VIRT.
#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                              SHIFT, MLEN, NAME, OP) \
  template <size_t N> \
  HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
      HWY_RVV_D(uint, SEWH, N, SHIFT - 1) dn, HWY_RVV_V(int, SEW, LMUL) v) { \
    const HWY_RVV_D(uint, SEW, N, SHIFT) du; \
    /* First clamp negative numbers to zero to match x86 packus. */ \
    return DemoteTo(dn, BitCast(du, detail::MaxS(v, 0))); \
  }
HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
#undef HWY_RVV_DEMOTE_I_TO_U

template <size_t N>
HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
  return __riscv_vnclipu_wx_u8mf8(
      DemoteTo(Simd<uint16_t, N, -2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
  return __riscv_vnclipu_wx_u8mf4(
      DemoteTo(Simd<uint16_t, N, -1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
  return __riscv_vnclipu_wx_u8mf2(
      DemoteTo(Simd<uint16_t, N, 0>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
  return __riscv_vnclipu_wx_u8m1(
      DemoteTo(Simd<uint16_t, N, 1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
  return __riscv_vnclipu_wx_u8m2(
      DemoteTo(Simd<uint16_t, N, 2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}

template <size_t N>
HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vuint32mf2_t v) {
  return __riscv_vnclipu_wx_u8mf8(
      DemoteTo(Simd<uint16_t, N, -2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vuint32m1_t v) {
  return __riscv_vnclipu_wx_u8mf4(
      DemoteTo(Simd<uint16_t, N, -1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vuint32m2_t v) {
  return __riscv_vnclipu_wx_u8mf2(
      DemoteTo(Simd<uint16_t, N, 0>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vuint32m4_t v) {
  return __riscv_vnclipu_wx_u8m1(
      DemoteTo(Simd<uint16_t, N, 1>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}
template <size_t N>
HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vuint32m8_t v) {
  return __riscv_vnclipu_wx_u8m2(
      DemoteTo(Simd<uint16_t, N, 2>(), v), 0,
      HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
}

template <class D, HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
}

template <class D, HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
}

template <class D, HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) {
  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
}

template <class D, HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<uint64_t, D>> v) {
  return DemoteTo(d, DemoteTo(Rebind<uint32_t, D>(), v));
}
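// Worked values (illustrative): DemoteTo saturates rather than truncates.
// For i32 -> u8, -5 is first clamped to 0 (MaxS, matching x86 packus), 300 is
// clipped to 255 by vnclipu, and 200 passes through unchanged. Contrast
// TruncateTo below, which instead keeps the low bits (300 -> 44).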
HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
  return __riscv_vnclipu_wx_u8mf8(
      __riscv_vnclipu_wx_u16mf4(v, 0,
                                HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
  return __riscv_vnclipu_wx_u8mf4(
      __riscv_vnclipu_wx_u16mf2(v, 0,
                                HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
  return __riscv_vnclipu_wx_u8mf2(
      __riscv_vnclipu_wx_u16m1(v, 0,
                               HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
  return __riscv_vnclipu_wx_u8m1(
      __riscv_vnclipu_wx_u16m2(v, 0,
                               HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
  const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
  return __riscv_vnclipu_wx_u8m2(
      __riscv_vnclipu_wx_u16m4(v, 0,
                               HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
      0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
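// Note (illustrative): U8FromU32 is a saturating two-step narrowing
// (u32 -> u16 -> u8 via vnclipu), using the u8 tag's lane count as AVL.
// Values above 255 clamp to 255, unlike the masking TruncateTo below, which
// first ANDs away the upper bits so the clips can never saturate.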
// ------------------------------ Truncations

template <size_t N>
HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
                               const VFromD<Simd<uint64_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16mf4_t v3 = __riscv_vnclipu_wx_u16mf4(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf8(v3, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
                               const VFromD<Simd<uint64_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m2_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16mf2_t v3 = __riscv_vnclipu_wx_u16mf2(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf4(v3, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
                               const VFromD<Simd<uint64_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint64m4_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16m1_t v3 = __riscv_vnclipu_wx_u16m1(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf2(v3, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
                              const VFromD<Simd<uint64_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint64m8_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  const vuint16m2_t v3 = __riscv_vnclipu_wx_u16m2(
      v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8m1(v3, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -3> d,
                                const VFromD<Simd<uint64_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16mf4(v2, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
                                const VFromD<Simd<uint64_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16mf4(v2, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
                                const VFromD<Simd<uint64_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m2_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16mf2(v2, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
                               const VFromD<Simd<uint64_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint64m4_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16m1(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
                               const VFromD<Simd<uint64_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint64m8_t v1 = __riscv_vand(v, 0xFFFF, avl);
  const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u16m2(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -2> d,
                                const VFromD<Simd<uint64_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32mf2(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -1> d,
                                const VFromD<Simd<uint64_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32mf2(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32m1_t TruncateTo(Simd<uint32_t, N, 0> d,
                               const VFromD<Simd<uint64_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint64m2_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32m1(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
template <size_t N>
HWY_API vuint32m2_t TruncateTo(Simd<uint32_t, N, 1> d,
                               const VFromD<Simd<uint64_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint64m4_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32m2(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint32m4_t TruncateTo(Simd<uint32_t, N, 2> d,
                               const VFromD<Simd<uint64_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint64m8_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
  return __riscv_vnclipu_wx_u32m4(v1, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
                               const VFromD<Simd<uint32_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint32mf2_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16mf4_t v2 = __riscv_vnclipu_wx_u16mf4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf8(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
                               const VFromD<Simd<uint32_t, N, 0>> v) {
  const size_t avl = Lanes(d);
  const vuint32m1_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16mf2_t v2 = __riscv_vnclipu_wx_u16mf2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf4(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
                               const VFromD<Simd<uint32_t, N, 1>> v) {
  const size_t avl = Lanes(d);
  const vuint32m2_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16m1_t v2 = __riscv_vnclipu_wx_u16m1(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8mf2(v2, 0,
                                  HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
                              const VFromD<Simd<uint32_t, N, 2>> v) {
  const size_t avl = Lanes(d);
  const vuint32m4_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16m2_t v2 = __riscv_vnclipu_wx_u16m2(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8m1(v2, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
                              const VFromD<Simd<uint32_t, N, 3>> v) {
  const size_t avl = Lanes(d);
  const vuint32m8_t v1 = __riscv_vand(v, 0xFF, avl);
  const vuint16m4_t v2 = __riscv_vnclipu_wx_u16m4(
      v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
  return __riscv_vnclipu_wx_u8m2(v2, 0,
                                 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -3> d,
                                const VFromD<Simd<uint32_t, N, -2>> v) {
  const size_t avl = Lanes(d);
  const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16mf4(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}

template <size_t N>
HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
                                const VFromD<Simd<uint32_t, N, -1>> v) {
  const size_t avl = Lanes(d);
  const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
  return __riscv_vnclipu_wx_u16mf4(v1, 0,
                                   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
}
VFromD<Simd<uint32_t, N, 0>> v) { 2816 const size_t avl = Lanes(d); 2817 const vuint32m1_t v1 = __riscv_vand(v, 0xFFFF, avl); 2818 return __riscv_vnclipu_wx_u16mf2(v1, 0, 2819 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2820 } 2821 2822 template <size_t N> 2823 HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d, 2824 const VFromD<Simd<uint32_t, N, 1>> v) { 2825 const size_t avl = Lanes(d); 2826 const vuint32m2_t v1 = __riscv_vand(v, 0xFFFF, avl); 2827 return __riscv_vnclipu_wx_u16m1(v1, 0, 2828 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2829 } 2830 2831 template <size_t N> 2832 HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d, 2833 const VFromD<Simd<uint32_t, N, 2>> v) { 2834 const size_t avl = Lanes(d); 2835 const vuint32m4_t v1 = __riscv_vand(v, 0xFFFF, avl); 2836 return __riscv_vnclipu_wx_u16m2(v1, 0, 2837 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2838 } 2839 2840 template <size_t N> 2841 HWY_API vuint16m4_t TruncateTo(Simd<uint16_t, N, 2> d, 2842 const VFromD<Simd<uint32_t, N, 3>> v) { 2843 const size_t avl = Lanes(d); 2844 const vuint32m8_t v1 = __riscv_vand(v, 0xFFFF, avl); 2845 return __riscv_vnclipu_wx_u16m4(v1, 0, 2846 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2847 } 2848 2849 template <size_t N> 2850 HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d, 2851 const VFromD<Simd<uint16_t, N, -2>> v) { 2852 const size_t avl = Lanes(d); 2853 const vuint16mf4_t v1 = __riscv_vand(v, 0xFF, avl); 2854 return __riscv_vnclipu_wx_u8mf8(v1, 0, 2855 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2856 } 2857 2858 template <size_t N> 2859 HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d, 2860 const VFromD<Simd<uint16_t, N, -1>> v) { 2861 const size_t avl = Lanes(d); 2862 const vuint16mf2_t v1 = __riscv_vand(v, 0xFF, avl); 2863 return __riscv_vnclipu_wx_u8mf4(v1, 0, 2864 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2865 } 2866 2867 template <size_t N> 2868 HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d, 2869 const VFromD<Simd<uint16_t, N, 0>> v) { 2870 const size_t avl = Lanes(d); 2871 const vuint16m1_t v1 = __riscv_vand(v, 0xFF, avl); 2872 return __riscv_vnclipu_wx_u8mf2(v1, 0, 2873 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2874 } 2875 2876 template <size_t N> 2877 HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d, 2878 const VFromD<Simd<uint16_t, N, 1>> v) { 2879 const size_t avl = Lanes(d); 2880 const vuint16m2_t v1 = __riscv_vand(v, 0xFF, avl); 2881 return __riscv_vnclipu_wx_u8m1(v1, 0, 2882 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2883 } 2884 2885 template <size_t N> 2886 HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d, 2887 const VFromD<Simd<uint16_t, N, 2>> v) { 2888 const size_t avl = Lanes(d); 2889 const vuint16m4_t v1 = __riscv_vand(v, 0xFF, avl); 2890 return __riscv_vnclipu_wx_u8m2(v1, 0, 2891 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2892 } 2893 2894 template <size_t N> 2895 HWY_API vuint8m4_t TruncateTo(Simd<uint8_t, N, 2> d, 2896 const VFromD<Simd<uint16_t, N, 3>> v) { 2897 const size_t avl = Lanes(d); 2898 const vuint16m8_t v1 = __riscv_vand(v, 0xFF, avl); 2899 return __riscv_vnclipu_wx_u8m4(v1, 0, 2900 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); 2901 } 2902 2903 // ------------------------------ DemoteTo I 2904 2905 HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) 2906 HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) 2907 HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) 2908 2909 template <size_t N> 2910 HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t 
v) { 2911 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v)); 2912 } 2913 template <size_t N> 2914 HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) { 2915 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v)); 2916 } 2917 template <size_t N> 2918 HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) { 2919 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v)); 2920 } 2921 template <size_t N> 2922 HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) { 2923 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v)); 2924 } 2925 template <size_t N> 2926 HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) { 2927 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v)); 2928 } 2929 2930 template <class D, HWY_IF_I8_D(D)> 2931 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) { 2932 return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v)); 2933 } 2934 2935 template <class D, HWY_IF_I16_D(D)> 2936 HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<int64_t, D>> v) { 2937 return DemoteTo(d, DemoteTo(Rebind<int32_t, D>(), v)); 2938 } 2939 2940 #undef HWY_RVV_DEMOTE 2941 2942 // ------------------------------ DemoteTo F 2943 2944 // SEW is for the source so we can use _DEMOTE_VIRT. 2945 #define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 2946 SHIFT, MLEN, NAME, OP) \ 2947 template <size_t N> \ 2948 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ 2949 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 2950 return __riscv_v##OP##SEWH##LMULH(v, Lanes(d)); \ 2951 } 2952 2953 #if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C 2954 HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT) 2955 #endif 2956 HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_f_f_w_f, _DEMOTE_VIRT) 2957 2958 namespace detail { 2959 HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteToF32WithRoundToOdd, 2960 fncvt_rod_f_f_w_f, _DEMOTE_VIRT) 2961 } // namespace detail 2962 2963 #undef HWY_RVV_DEMOTE_F 2964 2965 // TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F. 
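// Example of the f64 -> i32/u32 overloads below (a sketch, not part of the API surface; the variable names are ours). Because these use the rtz intrinsics, the conversion rounds toward zero:
//   const ScalableTag<double> df64;
//   const Rebind<int32_t, decltype(df64)> di32;
//   const auto demoted = DemoteTo(di32, Set(df64, -3.9));  // all lanes = -3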
2966 template <size_t N> 2967 HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) { 2968 return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); 2969 } 2970 template <size_t N> 2971 HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) { 2972 return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); 2973 } 2974 template <size_t N> 2975 HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) { 2976 return __riscv_vfncvt_rtz_x_f_w_i32m1(v, Lanes(d)); 2977 } 2978 template <size_t N> 2979 HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) { 2980 return __riscv_vfncvt_rtz_x_f_w_i32m2(v, Lanes(d)); 2981 } 2982 template <size_t N> 2983 HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) { 2984 return __riscv_vfncvt_rtz_x_f_w_i32m4(v, Lanes(d)); 2985 } 2986 2987 template <size_t N> 2988 HWY_API vuint32mf2_t DemoteTo(Simd<uint32_t, N, -2> d, const vfloat64m1_t v) { 2989 return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v, Lanes(d)); 2990 } 2991 template <size_t N> 2992 HWY_API vuint32mf2_t DemoteTo(Simd<uint32_t, N, -1> d, const vfloat64m1_t v) { 2993 return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v, Lanes(d)); 2994 } 2995 template <size_t N> 2996 HWY_API vuint32m1_t DemoteTo(Simd<uint32_t, N, 0> d, const vfloat64m2_t v) { 2997 return __riscv_vfncvt_rtz_xu_f_w_u32m1(v, Lanes(d)); 2998 } 2999 template <size_t N> 3000 HWY_API vuint32m2_t DemoteTo(Simd<uint32_t, N, 1> d, const vfloat64m4_t v) { 3001 return __riscv_vfncvt_rtz_xu_f_w_u32m2(v, Lanes(d)); 3002 } 3003 template <size_t N> 3004 HWY_API vuint32m4_t DemoteTo(Simd<uint32_t, N, 2> d, const vfloat64m8_t v) { 3005 return __riscv_vfncvt_rtz_xu_f_w_u32m4(v, Lanes(d)); 3006 } 3007 3008 template <size_t N> 3009 HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -2> d, const vint64m1_t v) { 3010 return __riscv_vfncvt_f_x_w_f32mf2(v, Lanes(d)); 3011 } 3012 template <size_t N> 3013 HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -1> d, const vint64m1_t v) { 3014 return __riscv_vfncvt_f_x_w_f32mf2(v, Lanes(d)); 3015 } 3016 template <size_t N> 3017 HWY_API vfloat32m1_t DemoteTo(Simd<float, N, 0> d, const vint64m2_t v) { 3018 return __riscv_vfncvt_f_x_w_f32m1(v, Lanes(d)); 3019 } 3020 template <size_t N> 3021 HWY_API vfloat32m2_t DemoteTo(Simd<float, N, 1> d, const vint64m4_t v) { 3022 return __riscv_vfncvt_f_x_w_f32m2(v, Lanes(d)); 3023 } 3024 template <size_t N> 3025 HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vint64m8_t v) { 3026 return __riscv_vfncvt_f_x_w_f32m4(v, Lanes(d)); 3027 } 3028 3029 template <size_t N> 3030 HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -2> d, const vuint64m1_t v) { 3031 return __riscv_vfncvt_f_xu_w_f32mf2(v, Lanes(d)); 3032 } 3033 template <size_t N> 3034 HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -1> d, const vuint64m1_t v) { 3035 return __riscv_vfncvt_f_xu_w_f32mf2(v, Lanes(d)); 3036 } 3037 template <size_t N> 3038 HWY_API vfloat32m1_t DemoteTo(Simd<float, N, 0> d, const vuint64m2_t v) { 3039 return __riscv_vfncvt_f_xu_w_f32m1(v, Lanes(d)); 3040 } 3041 template <size_t N> 3042 HWY_API vfloat32m2_t DemoteTo(Simd<float, N, 1> d, const vuint64m4_t v) { 3043 return __riscv_vfncvt_f_xu_w_f32m2(v, Lanes(d)); 3044 } 3045 template <size_t N> 3046 HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vuint64m8_t v) { 3047 return __riscv_vfncvt_f_xu_w_f32m4(v, Lanes(d)); 3048 } 3049 3050 // Narrows f32 bits to bf16 using round to even. 3051 // SEW is for the source so we can use _DEMOTE_VIRT. 
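// Worked example of the round-to-nearest-even below (our values): f32 bits 0x3F808000 (1.00390625f) discard 0x8000, an exact tie; the kept bit 16 is 0, so the result is bf16 0x3F80 (1.0f). For input 0x3F818000, bit 16 is 1, so the tie rounds up to 0x3F82.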
3052 #ifdef HWY_RVV_AVOID_VXRM 3053 #define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \ 3054 LMULD, LMULH, SHIFT, MLEN, NAME, OP) \ 3055 template <size_t N> \ 3056 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ 3057 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3058 const auto round = \ 3059 detail::AddS(detail::AndS(ShiftRight<16>(v), 1u), 0x7FFFu); \ 3060 v = Add(v, round); \ 3061 /* The default rounding mode appears to be RNU=0, which adds the LSB. */ \ 3062 /* Prevent further rounding by clearing the bits we want to truncate. */ \ 3063 v = detail::AndS(v, 0xFFFF0000u); \ 3064 return __riscv_v##OP##CHAR##SEWH##LMULH(v, 16, Lanes(d)); \ 3065 } 3066 3067 #else 3068 #define HWY_RVV_DEMOTE_16_NEAREST_EVEN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, \ 3069 LMULD, LMULH, SHIFT, MLEN, NAME, OP) \ 3070 template <size_t N> \ 3071 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ 3072 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3073 return __riscv_v##OP##CHAR##SEWH##LMULH( \ 3074 v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNE, Lanes(d))); \ 3075 } 3076 #endif  // HWY_RVV_AVOID_VXRM 3077 namespace detail { 3078 HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_16_NEAREST_EVEN, DemoteTo16NearestEven, 3079 nclipu_wx_, _DEMOTE_VIRT) 3080 } 3081 #undef HWY_RVV_DEMOTE_16_NEAREST_EVEN 3082 3083 #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16 3084 #undef HWY_NATIVE_DEMOTE_F32_TO_BF16 3085 #else 3086 #define HWY_NATIVE_DEMOTE_F32_TO_BF16 3087 #endif 3088 3089 template <class DBF16, HWY_IF_BF16_D(DBF16)> 3090 HWY_API VFromD<DBF16> DemoteTo(DBF16 d, VFromD<Rebind<float, DBF16>> v) { 3091 const DFromV<decltype(v)> df; 3092 const RebindToUnsigned<decltype(df)> du32; 3093 const RebindToUnsigned<decltype(d)> du16; 3094 // Consider an f32 mantissa with the upper 7 bits set, followed by a 1-bit 3095 // and at least one other bit set. This will round to 0 and increment the 3096 // exponent. If the exponent was already 0xFF (NaN), then the result is -inf; 3097 // there is no wraparound because nclipu saturates. Note that in this case, 3098 // the input cannot have been inf because its mantissa bits are zero. To avoid 3099 // converting NaN to inf, we canonicalize the NaN to prevent the rounding. 3100 const decltype(v) canonicalized = 3101 IfThenElse(Eq(v, v), v, BitCast(df, Set(du32, 0x7F800000))); 3102 return BitCast( 3103 d, detail::DemoteTo16NearestEven(du16, BitCast(du32, canonicalized))); 3104 } 3105 3106 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 3107 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 3108 #else 3109 #define HWY_NATIVE_DEMOTE_F64_TO_F16 3110 #endif 3111 3112 template <class D, HWY_IF_F16_D(D)> 3113 HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) { 3114 const Rebind<float, decltype(df16)> df32; 3115 return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v)); 3116 } 3117 3118 // ------------------------------ ConvertTo F 3119 3120 #define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3121 SHIFT, MLEN, NAME, OP) \ 3122 template <size_t N> \ 3123 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ 3124 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \ 3125 return __riscv_vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \ 3126 } \ 3127 template <size_t N> \ 3128 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ 3129 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) { \ 3130 return __riscv_vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \ 3131 } \ 3132 /* Truncates (rounds toward zero).
*/ \ 3133 template <size_t N> \ 3134 HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \ 3135 HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3136 return __riscv_vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \ 3137 } \ 3138 template <size_t N> \ 3139 HWY_API HWY_RVV_V(uint, SEW, LMUL) ConvertTo( \ 3140 HWY_RVV_D(uint, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3141 return __riscv_vfcvt_rtz_xu_f_v_u##SEW##LMUL(v, Lanes(d)); \ 3142 } 3143 3144 HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT) 3145 #undef HWY_RVV_CONVERT 3146 3147 // Uses default rounding mode. Must be separate because there is no D arg. 3148 #define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3149 SHIFT, MLEN, NAME, OP) \ 3150 HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3151 return __riscv_vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \ 3152 } 3153 HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL) 3154 #undef HWY_RVV_NEAREST 3155 3156 template <size_t N> 3157 HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -2> d, 3158 const vfloat64m1_t v) { 3159 return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d)); 3160 } 3161 template <size_t N> 3162 HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -1> d, 3163 const vfloat64m1_t v) { 3164 return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d)); 3165 } 3166 template <size_t N> 3167 HWY_API vint32m1_t DemoteToNearestInt(Simd<int32_t, N, 0> d, 3168 const vfloat64m2_t v) { 3169 return __riscv_vfncvt_x_f_w_i32m1(v, Lanes(d)); 3170 } 3171 template <size_t N> 3172 HWY_API vint32m2_t DemoteToNearestInt(Simd<int32_t, N, 1> d, 3173 const vfloat64m4_t v) { 3174 return __riscv_vfncvt_x_f_w_i32m2(v, Lanes(d)); 3175 } 3176 template <size_t N> 3177 HWY_API vint32m4_t DemoteToNearestInt(Simd<int32_t, N, 2> d, 3178 const vfloat64m8_t v) { 3179 return __riscv_vfncvt_x_f_w_i32m4(v, Lanes(d)); 3180 } 3181 3182 // ================================================== COMBINE 3183 3184 namespace detail { 3185 3186 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes 3187 // offsets are implicitly relative to the start of their 128-bit block. 3188 template <typename T, size_t N, int kPow2> 3189 HWY_INLINE size_t LanesPerBlock(Simd<T, N, kPow2> d) { 3190 // kMinVecBytes is the minimum size of VFromD<decltype(d)> in bytes 3191 constexpr size_t kMinVecBytes = 3192 ScaleByPower(16, HWY_MAX(HWY_MIN(kPow2, 3), -3)); 3193 // kMinVecLanes is the minimum number of lanes in VFromD<decltype(d)> 3194 constexpr size_t kMinVecLanes = (kMinVecBytes + sizeof(T) - 1) / sizeof(T); 3195 // kMaxLpb is the maximum number of lanes per block 3196 constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), MaxLanes(d)); 3197 3198 // If kMaxLpb <= kMinVecLanes is true, then kMaxLpb <= Lanes(d) is true 3199 if (kMaxLpb <= kMinVecLanes) return kMaxLpb; 3200 3201 // Fractional LMUL: Lanes(d) may be smaller than kMaxLpb, so honor that. 
3202 const size_t lanes_per_vec = Lanes(d); 3203 return HWY_MIN(lanes_per_vec, kMaxLpb); 3204 } 3205 3206 template <class D, class V> 3207 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) { 3208 using T = MakeUnsigned<TFromV<V>>; 3209 return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1))); 3210 } 3211 3212 template <size_t kLanes, class D> 3213 HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) { 3214 const RebindToUnsigned<D> du; 3215 const RebindToSigned<D> di; 3216 using TU = TFromD<decltype(du)>; 3217 const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1)); 3218 return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes)); 3219 } 3220 3221 #define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3222 SHIFT, MLEN, NAME, OP) \ 3223 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3224 NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \ 3225 size_t lanes) { \ 3226 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \ 3227 HWY_RVV_AVL(SEW, SHIFT)); \ 3228 } 3229 3230 #define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3231 SHIFT, MLEN, NAME, OP) \ 3232 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3233 NAME(HWY_RVV_V(BASE, SEW, LMUL) src, size_t lanes) { \ 3234 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(src, lanes, \ 3235 HWY_RVV_AVL(SEW, SHIFT)); \ 3236 } 3237 3238 HWY_RVV_FOREACH(HWY_RVV_SLIDE_UP, SlideUp, slideup, _ALL) 3239 HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL) 3240 3241 #undef HWY_RVV_SLIDE_UP 3242 #undef HWY_RVV_SLIDE_DOWN 3243 3244 #define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 3245 MLEN, NAME, OP) \ 3246 template <size_t kIndex> \ 3247 HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3248 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \ 3249 v, kIndex); /* no AVL */ \ 3250 } 3251 #define HWY_RVV_GET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3252 SHIFT, MLEN, NAME, OP) \ 3253 template <size_t kIndex> \ 3254 HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3255 static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ 3256 HWY_IF_CONSTEXPR(kIndex == 0) { return Trunc(v); } \ 3257 HWY_IF_CONSTEXPR(kIndex != 0) { \ 3258 return Trunc(SlideDown( \ 3259 v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \ 3260 SHIFT - 1){}))); \ 3261 } \ 3262 } 3263 #define HWY_RVV_GET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3264 SHIFT, MLEN, NAME, OP) \ 3265 template <size_t kIndex> \ 3266 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3267 static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ 3268 HWY_IF_CONSTEXPR(kIndex == 0) { return v; } \ 3269 HWY_IF_CONSTEXPR(kIndex != 0) { \ 3270 return SlideDown( \ 3271 v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \ 3272 SHIFT){}) / \ 3273 2); \ 3274 } \ 3275 } 3276 HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _GET_SET) 3277 HWY_RVV_FOREACH(HWY_RVV_GET_VIRT, Get, get, _GET_SET_VIRT) 3278 HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST) 3279 #undef HWY_RVV_GET 3280 #undef HWY_RVV_GET_VIRT 3281 #undef HWY_RVV_GET_SMALLEST 3282 3283 template <size_t kIndex, class D> 3284 static HWY_INLINE HWY_MAYBE_UNUSED VFromD<AdjustSimdTagToMinVecPow2<Half<D>>> 3285 Get(D d, VFromD<D> v) { 3286 static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); 3287 HWY_IF_CONSTEXPR(kIndex == 0 || detail::IsFull(d)) { return 
Get<kIndex>(v); } 3288 HWY_IF_CONSTEXPR(kIndex != 0 && !detail::IsFull(d)) { 3289 const AdjustSimdTagToMinVecPow2<Half<decltype(d)>> dh; 3290 const size_t slide_down_amt = 3291 (dh.Pow2() < DFromV<decltype(v)>().Pow2()) ? Lanes(dh) : (Lanes(d) / 2); 3292 return ResizeBitCast(dh, SlideDown(v, slide_down_amt)); 3293 } 3294 } 3295 3296 #define HWY_RVV_PARTIAL_VEC_SET_HALF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ 3297 LMULH, SHIFT, MLEN, NAME, OP) \ 3298 template <size_t kIndex> \ 3299 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3300 NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v, \ 3301 size_t half_N) { \ 3302 static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ 3303 const DFromV<decltype(dest)> d; \ 3304 HWY_IF_CONSTEXPR(kIndex == 0) { \ 3305 return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v), \ 3306 half_N); \ 3307 } \ 3308 HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, Ext(d, v), half_N); } \ 3309 } 3310 #define HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST( \ 3311 BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP) \ 3312 template <size_t kIndex> \ 3313 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3314 NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v, \ 3315 size_t half_N) { \ 3316 static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ 3317 HWY_IF_CONSTEXPR(kIndex == 0) { \ 3318 return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, v, half_N); \ 3319 } \ 3320 HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, v, half_N); } \ 3321 } 3322 HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv, _GET_SET) 3323 HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv, 3324 _GET_SET_VIRT) 3325 HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST, PartialVecSetHalf, mv, 3326 _GET_SET_SMALLEST) 3327 #undef HWY_RVV_PARTIAL_VEC_SET_HALF 3328 #undef HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST 3329 3330 #define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 3331 MLEN, NAME, OP) \ 3332 template <size_t kIndex, size_t N> \ 3333 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3334 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \ 3335 HWY_RVV_V(BASE, SEW, LMULH) v) { \ 3336 HWY_IF_CONSTEXPR(detail::IsFull(d)) { \ 3337 return __riscv_v##OP##_v_##CHAR##SEW##LMULH##_##CHAR##SEW##LMUL( \ 3338 dest, kIndex, v); /* no AVL */ \ 3339 } \ 3340 HWY_IF_CONSTEXPR(!detail::IsFull(d)) { \ 3341 const Half<decltype(d)> dh; \ 3342 return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh)); \ 3343 } \ 3344 } 3345 #define HWY_RVV_SET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3346 SHIFT, MLEN, NAME, OP) \ 3347 template <size_t kIndex, size_t N> \ 3348 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3349 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \ 3350 HWY_RVV_V(BASE, SEW, LMULH) v) { \ 3351 const Half<decltype(d)> dh; \ 3352 return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh)); \ 3353 } 3354 #define HWY_RVV_SET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3355 SHIFT, MLEN, NAME, OP) \ 3356 template <size_t kIndex, size_t N> \ 3357 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3358 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \ 3359 HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3360 return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2); \ 3361 } 3362 #define HWY_RVV_SET_SMALLEST_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ 3363 LMULH, SHIFT, MLEN, NAME, OP) \ 3364 template <size_t kIndex, size_t N> \ 3365 HWY_API 
HWY_RVV_V(BASE, SEW, LMUL) \ 3366 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT - 1) d, \ 3367 HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3368 return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2); \ 3369 } 3370 HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _GET_SET) 3371 HWY_RVV_FOREACH(HWY_RVV_SET_VIRT, Set, set, _GET_SET_VIRT) 3372 HWY_RVV_FOREACH(HWY_RVV_SET_SMALLEST, Set, set, _GET_SET_SMALLEST) 3373 HWY_RVV_FOREACH_UI163264(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST) 3374 HWY_RVV_FOREACH_F(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST) 3375 #undef HWY_RVV_SET 3376 #undef HWY_RVV_SET_VIRT 3377 #undef HWY_RVV_SET_SMALLEST 3378 #undef HWY_RVV_SET_SMALLEST_VIRT 3379 3380 template <size_t kIndex, class D, HWY_RVV_IF_EMULATED_D(D)> 3381 static HWY_INLINE HWY_MAYBE_UNUSED VFromD<D> Set( 3382 D d, VFromD<D> dest, VFromD<AdjustSimdTagToMinVecPow2<Half<D>>> v) { 3383 const RebindToUnsigned<decltype(d)> du; 3384 return BitCast( 3385 d, Set<kIndex>(du, BitCast(du, dest), 3386 BitCast(RebindToUnsigned<DFromV<decltype(v)>>(), v))); 3387 } 3388 3389 } // namespace detail 3390 3391 // ------------------------------ SlideUpLanes 3392 template <class D> 3393 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 3394 return detail::SlideUp(Zero(d), v, amt); 3395 } 3396 3397 // ------------------------------ SlideDownLanes 3398 template <class D> 3399 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 3400 v = detail::SlideDown(v, amt); 3401 // Zero out upper lanes if v is a partial vector 3402 if (MaxLanes(d) < MaxLanes(DFromV<decltype(v)>())) { 3403 v = detail::SlideUp(v, Zero(d), Lanes(d) - amt); 3404 } 3405 return v; 3406 } 3407 3408 // ------------------------------ ConcatUpperLower 3409 template <class D, class V> 3410 HWY_API V ConcatUpperLower(D d, const V hi, const V lo) { 3411 const auto lo_lower = detail::Get<0>(d, lo); 3412 return detail::Set<0>(d, hi, lo_lower); 3413 } 3414 3415 // ------------------------------ ConcatLowerLower 3416 template <class D, class V> 3417 HWY_API V ConcatLowerLower(D d, const V hi, const V lo) { 3418 const auto hi_lower = detail::Get<0>(d, hi); 3419 return detail::Set<1>(d, lo, hi_lower); 3420 } 3421 3422 // ------------------------------ ConcatUpperUpper 3423 template <class D, class V> 3424 HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) { 3425 const auto lo_upper = detail::Get<1>(d, lo); 3426 return detail::Set<0>(d, hi, lo_upper); 3427 } 3428 3429 // ------------------------------ ConcatLowerUpper 3430 template <class D, class V> 3431 HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) { 3432 const auto lo_upper = detail::Get<1>(d, lo); 3433 const auto hi_lower = detail::Get<0>(d, hi); 3434 return detail::Set<1>(d, ResizeBitCast(d, lo_upper), hi_lower); 3435 } 3436 3437 // ------------------------------ Combine 3438 template <class D2, class V> 3439 HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) { 3440 return detail::Set<1>(d2, ResizeBitCast(d2, lo), hi); 3441 } 3442 3443 // ------------------------------ ZeroExtendVector 3444 template <class D2, class V> 3445 HWY_API VFromD<D2> ZeroExtendVector(D2 d2, const V lo) { 3446 return Combine(d2, Xor(lo, lo), lo); 3447 } 3448 3449 // ------------------------------ Lower/UpperHalf 3450 3451 namespace detail { 3452 3453 // RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note 3454 // that SEW = sizeof(T)*8 and LMUL = 1 << d.Pow2(). Add 3 to Pow2 to avoid 3455 // negative shift counts. 
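// For example (our numbers): uint64_t with kPow2 = -1 (LMUL = 1/2) gives
// (1 << (-1 + 3)) = 4 < sizeof(T) = 8, i.e. vuint64mf2_t does not exist,
// whereas kPow2 = 0 (LMUL = 1) gives 8 >= 8 and is supported.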
3456 template <class D> 3457 constexpr bool IsSupportedLMUL(D d) { 3458 return (size_t{1} << (d.Pow2() + 3)) >= sizeof(TFromD<D>); 3459 } 3460 3461 } // namespace detail 3462 3463 // If IsSupportedLMUL, just 'truncate' i.e. halve LMUL. 3464 template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr> 3465 HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) { 3466 return detail::Trunc(v); 3467 } 3468 3469 // Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and 3470 // the hardware may set "vill" if we attempt such an LMUL. However, the V 3471 // extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it 3472 // still makes sense to have half of an SEW=64 vector. We instead just return 3473 // the vector, and rely on the kPow2 in DH to halve the return value of Lanes(). 3474 template <class DH, class V, 3475 hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr> 3476 HWY_API V LowerHalf(const DH /* tag */, const V v) { 3477 return v; 3478 } 3479 3480 // Same, but without D arg 3481 template <class V> 3482 HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) { 3483 return LowerHalf(Half<DFromV<V>>(), v); 3484 } 3485 3486 template <class DH> 3487 HWY_API VFromD<DH> UpperHalf(const DH /*d2*/, const VFromD<Twice<DH>> v) { 3488 const Twice<DH> d; 3489 return detail::Get<1>(d, v); 3490 } 3491 3492 // ================================================== SWIZZLE 3493 3494 namespace detail { 3495 // Special instruction for 1 lane is presumably faster? 3496 #define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 3497 MLEN, NAME, OP) \ 3498 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3499 return __riscv_v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \ 3500 } 3501 3502 HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL) 3503 HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL) 3504 HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL) 3505 HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL) 3506 #undef HWY_RVV_SLIDE1 3507 } // namespace detail 3508 3509 // ------------------------------ Slide1Up and Slide1Down 3510 #ifdef HWY_NATIVE_SLIDE1_UP_DOWN 3511 #undef HWY_NATIVE_SLIDE1_UP_DOWN 3512 #else 3513 #define HWY_NATIVE_SLIDE1_UP_DOWN 3514 #endif 3515 3516 template <class D> 3517 HWY_API VFromD<D> Slide1Up(D /*d*/, VFromD<D> v) { 3518 return detail::Slide1Up(v); 3519 } 3520 3521 template <class D> 3522 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) { 3523 v = detail::Slide1Down(v); 3524 // Zero out upper lanes if v is a partial vector 3525 if (MaxLanes(d) < MaxLanes(DFromV<decltype(v)>())) { 3526 v = detail::SlideUp(v, Zero(d), Lanes(d) - 1); 3527 } 3528 return v; 3529 } 3530 3531 // ------------------------------ GetLane 3532 3533 #define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3534 SHIFT, MLEN, NAME, OP) \ 3535 HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ 3536 return __riscv_v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \ 3537 } 3538 3539 HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL) 3540 HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL) 3541 #undef HWY_RVV_GET_LANE 3542 3543 // ------------------------------ ExtractLane 3544 template <class V> 3545 HWY_API TFromV<V> ExtractLane(const V v, size_t i) { 3546 return GetLane(detail::SlideDown(v, i)); 3547 } 3548 3549 // ------------------------------ Additional mask logical 
operations 3550 3551 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof) 3552 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetBeforeFirst, sbf) 3553 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetAtOrBeforeFirst, sif) 3554 3555 #define HWY_RVV_SET_AT_OR_AFTER_FIRST(SEW, SHIFT, MLEN, NAME, OP) \ 3556 HWY_API HWY_RVV_M(MLEN) SetAtOrAfterFirst(HWY_RVV_M(MLEN) m) { \ 3557 return Not(SetBeforeFirst(m)); \ 3558 } 3559 3560 HWY_RVV_FOREACH_B(HWY_RVV_SET_AT_OR_AFTER_FIRST, _, _) 3561 #undef HWY_RVV_SET_AT_OR_AFTER_FIRST 3562 3563 // ------------------------------ InsertLane 3564 3565 // T template arg because TFromV<V> might not match the hwy::float16_t argument. 3566 template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)> 3567 HWY_API V InsertLane(const V v, size_t i, T t) { 3568 const Rebind<T, DFromV<V>> d; 3569 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only 3570 using TU = TFromD<decltype(du)>; 3571 const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i)); 3572 return IfThenElse(RebindMask(d, is_i), Set(d, t), v); 3573 } 3574 3575 // For 8-bit lanes, Iota0 might overflow. 3576 template <class V, typename T, HWY_IF_T_SIZE_V(V, 1)> 3577 HWY_API V InsertLane(const V v, size_t i, T t) { 3578 const Rebind<T, DFromV<V>> d; 3579 const auto zero = Zero(d); 3580 const auto one = Set(d, 1); 3581 const auto ge_i = Eq(detail::SlideUp(zero, one, i), one); 3582 const auto is_i = SetOnlyFirst(ge_i); 3583 return IfThenElse(RebindMask(d, is_i), Set(d, t), v); 3584 } 3585 3586 // ------------------------------ OddEven 3587 3588 namespace detail { 3589 3590 // Faster version using a wide constant instead of Iota0 + AndS. 3591 template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)> 3592 HWY_INLINE MFromD<D> IsEven(D d) { 3593 const RebindToUnsigned<decltype(d)> du; 3594 const RepartitionToWide<decltype(du)> duw; 3595 return RebindMask(d, detail::NeS(BitCast(du, Set(duw, 1)), 0u)); 3596 } 3597 3598 template <class D, HWY_IF_T_SIZE_D(D, 8)> 3599 HWY_INLINE MFromD<D> IsEven(D d) { 3600 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only 3601 return detail::EqS(detail::AndS(detail::Iota0(du), 1), 0); 3602 } 3603 3604 // Also provide the negated form because there is no native CompressNot. 
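// Illustration of the wide-constant trick (our example, u16 lanes via a u32
// wide constant): Set(duw, 1) is 0x00000001 in each u32 lane; reinterpreted
// as little-endian u16 lanes this is 1,0,1,0,..., so NeS(.., 0) above
// selects the even lanes and EqS(.., 0) below selects the odd lanes.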
3605 template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)> 3606 HWY_INLINE MFromD<D> IsOdd(D d) { 3607 const RebindToUnsigned<decltype(d)> du; 3608 const RepartitionToWide<decltype(du)> duw; 3609 return RebindMask(d, detail::EqS(BitCast(du, Set(duw, 1)), 0u)); 3610 } 3611 3612 template <class D, HWY_IF_T_SIZE_D(D, 8)> 3613 HWY_INLINE MFromD<D> IsOdd(D d) { 3614 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only 3615 return detail::NeS(detail::AndS(detail::Iota0(du), 1), 0); 3616 } 3617 3618 } // namespace detail 3619 3620 template <class V> 3621 HWY_API V OddEven(const V a, const V b) { 3622 return IfThenElse(detail::IsEven(DFromV<V>()), b, a); 3623 } 3624 3625 // ------------------------------ DupEven (OddEven) 3626 template <class V> 3627 HWY_API V DupEven(const V v) { 3628 const V up = detail::Slide1Up(v); 3629 return OddEven(up, v); 3630 } 3631 3632 // ------------------------------ DupOdd (OddEven) 3633 template <class V> 3634 HWY_API V DupOdd(const V v) { 3635 const V down = detail::Slide1Down(v); 3636 return OddEven(v, down); 3637 } 3638 3639 // ------------------------------ InterleaveEven (OddEven) 3640 template <class D> 3641 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 3642 return OddEven(detail::Slide1Up(b), a); 3643 } 3644 3645 // ------------------------------ InterleaveOdd (OddEven) 3646 template <class D> 3647 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 3648 return OddEven(b, detail::Slide1Down(a)); 3649 } 3650 3651 // ------------------------------ OddEvenBlocks 3652 template <class V> 3653 HWY_API V OddEvenBlocks(const V a, const V b) { 3654 const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only 3655 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>)); 3656 const auto idx_block = ShiftRight<kShift>(detail::Iota0(du)); 3657 const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0); 3658 return IfThenElse(is_even, b, a); 3659 } 3660 3661 // ------------------------------ SwapAdjacentBlocks 3662 template <class V> 3663 HWY_API V SwapAdjacentBlocks(const V v) { 3664 const DFromV<V> d; 3665 const size_t lpb = detail::LanesPerBlock(d); 3666 const V down = detail::SlideDown(v, lpb); 3667 const V up = detail::SlideUp(v, v, lpb); 3668 return OddEvenBlocks(up, down); 3669 } 3670 3671 // ------------------------------ InterleaveEvenBlocks 3672 // (SlideUpLanes, OddEvenBlocks) 3673 3674 template <class D, class V = VFromD<D>> 3675 HWY_API V InterleaveEvenBlocks(D d, V a, V b) { 3676 const size_t lpb = detail::LanesPerBlock(d); 3677 return OddEvenBlocks(SlideUpLanes(d, b, lpb), a); 3678 } 3679 3680 // ------------------------------ InterleaveOddBlocks 3681 // (SlideDownLanes, OddEvenBlocks) 3682 3683 template <class D, class V = VFromD<D>> 3684 HWY_API V InterleaveOddBlocks(D d, V a, V b) { 3685 const size_t lpb = detail::LanesPerBlock(d); 3686 return OddEvenBlocks(b, SlideDownLanes(d, a, lpb)); 3687 } 3688 3689 // ------------------------------ TableLookupLanes 3690 3691 template <class D, class VI> 3692 HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) { 3693 static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane"); 3694 const RebindToUnsigned<decltype(d)> du; // instead of <D>: avoids unused d. 
3695 const auto indices = BitCast(du, vec); 3696 #if HWY_IS_DEBUG_BUILD 3697 using TU = TFromD<decltype(du)>; 3698 const size_t twice_num_of_lanes = Lanes(d) * 2; 3699 HWY_DASSERT(AllTrue( 3700 du, Eq(indices, 3701 detail::AndS(indices, static_cast<TU>(twice_num_of_lanes - 1))))); 3702 #endif 3703 return indices; 3704 } 3705 3706 template <class D, typename TI> 3707 HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) { 3708 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane"); 3709 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx)); 3710 } 3711 3712 #define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 3713 MLEN, NAME, OP) \ 3714 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3715 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ 3716 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \ 3717 HWY_RVV_AVL(SEW, SHIFT)); \ 3718 } 3719 3720 // TableLookupLanes is supported for all types, but beware that indices are 3721 // likely to wrap around for 8-bit lanes. When using TableLookupLanes inside 3722 // this file, ensure that it is safe or use TableLookupLanes16 instead. 3723 HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL) 3724 #undef HWY_RVV_TABLE 3725 3726 namespace detail { 3727 3728 #define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3729 SHIFT, MLEN, NAME, OP) \ 3730 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3731 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \ 3732 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \ 3733 HWY_RVV_AVL(SEW, SHIFT)); \ 3734 } 3735 3736 HWY_RVV_FOREACH_UI08(HWY_RVV_TABLE16, TableLookupLanes16, rgatherei16, _EXT) 3737 #undef HWY_RVV_TABLE16 3738 3739 // Used by Expand. 3740 #define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3741 SHIFT, MLEN, NAME, OP) \ 3742 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3743 NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \ 3744 HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ 3745 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \ 3746 HWY_RVV_AVL(SEW, SHIFT)); \ 3747 } 3748 3749 HWY_RVV_FOREACH(HWY_RVV_MASKED_TABLE, MaskedTableLookupLanes, rgather, _ALL) 3750 #undef HWY_RVV_MASKED_TABLE 3751 3752 #define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ 3753 LMULH, SHIFT, MLEN, NAME, OP) \ 3754 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3755 NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \ 3756 HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \ 3757 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \ 3758 HWY_RVV_AVL(SEW, SHIFT)); \ 3759 } 3760 3761 HWY_RVV_FOREACH_UI08(HWY_RVV_MASKED_TABLE16, MaskedTableLookupLanes16, 3762 rgatherei16, _EXT) 3763 #undef HWY_RVV_MASKED_TABLE16 3764 3765 } // namespace detail 3766 3767 // ------------------------------ Reverse (TableLookupLanes) 3768 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_LE_D(D, 2)> 3769 HWY_API VFromD<D> Reverse(D d, VFromD<D> v) { 3770 const Rebind<uint16_t, decltype(d)> du16; 3771 const size_t N = Lanes(d); 3772 const auto idx = 3773 detail::ReverseSubS(detail::Iota0(du16), static_cast<uint16_t>(N - 1)); 3774 return detail::TableLookupLanes16(v, idx); 3775 } 3776 3777 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_GT_D(D, 2)> 3778 HWY_API VFromD<D> Reverse(D d, VFromD<D> v) { 3779 const Half<decltype(d)> dh; 3780 const Rebind<uint16_t, decltype(dh)> du16; 3781 
const size_t half_n = Lanes(dh); 3782 const auto idx = detail::ReverseSubS(detail::Iota0(du16), 3783 static_cast<uint16_t>(half_n - 1)); 3784 const auto reversed_lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx); 3785 const auto reversed_hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx); 3786 return Combine(d, reversed_lo, reversed_hi); 3787 } 3788 3789 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))> 3790 HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) { 3791 const RebindToUnsigned<D> du; 3792 using TU = TFromD<decltype(du)>; 3793 const size_t N = Lanes(du); 3794 const auto idx = 3795 detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1)); 3796 return TableLookupLanes(v, idx); 3797 } 3798 3799 // ------------------------------ ResizeBitCast 3800 3801 // Extends or truncates a vector to match the given d. 3802 namespace detail { 3803 3804 template <class D> 3805 HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) { 3806 return v; 3807 } 3808 3809 // Sanity check: when calling ChangeLMUL, the caller (ResizeBitCast) already 3810 // BitCast to the same lane type. Note that V may use the native lane type for 3811 // f16, so convert D to that before checking. 3812 #define HWY_RVV_IF_SAME_T_DV(D, V) \ 3813 hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr 3814 3815 // LMUL of VFromD<D> < LMUL of V: need to truncate v 3816 template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V), 3817 HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)> 3818 HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) { 3819 const DFromV<V> d_from; 3820 const Half<decltype(d_from)> dh_from; 3821 static_assert( 3822 DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(), 3823 "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V"); 3824 static_assert( 3825 DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(), 3826 "The LMUL of VFromD<D> must be less than or equal to the LMUL of " 3827 "VFromD<decltype(dh_from)>"); 3828 return ChangeLMUL(d, Trunc(v)); 3829 } 3830 3831 // LMUL of VFromD<D> > LMUL of V: need to extend v 3832 template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V), 3833 HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())> 3834 HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) { 3835 const DFromV<V> d_from; 3836 const Twice<decltype(d_from)> dt_from; 3837 static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(), 3838 "The LMUL of VFromD<decltype(dt_from)> must be greater than " 3839 "the LMUL of V"); 3840 static_assert( 3841 DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(), 3842 "The LMUL of VFromD<D> must be greater than or equal to the LMUL of " 3843 "VFromD<decltype(dt_from)>"); 3844 return ChangeLMUL(d, Ext(dt_from, v)); 3845 } 3846 3847 #undef HWY_RVV_IF_SAME_T_DV 3848 3849 } // namespace detail 3850 3851 template <class DTo, class VFrom> 3852 HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) { 3853 const DFromV<decltype(v)> d_from; 3854 const Repartition<uint8_t, decltype(d_from)> du8_from; 3855 const DFromV<VFromD<DTo>> d_to; 3856 const Repartition<uint8_t, decltype(d_to)> du8_to; 3857 return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v))); 3858 } 3859 3860 // ------------------------------ Reverse2 (RotateRight, OddEven) 3861 3862 // Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
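// Sketch of the 8-bit Reverse2 below (our values): bytes {0,1,2,3} viewed as
// little-endian u16 lanes are {0x0100, 0x0302}; RotateRight<8> yields
// {0x0001, 0x0203}, i.e. bytes {1,0,3,2}, swapping each adjacent pair.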
3863 #ifdef HWY_NATIVE_REVERSE2_8 3864 #undef HWY_NATIVE_REVERSE2_8 3865 #else 3866 #define HWY_NATIVE_REVERSE2_8 3867 #endif 3868 3869 // Shifting and adding requires fewer instructions than blending, but casting to 3870 // u32 only works for LMUL in [1/2, 8]. 3871 3872 template <class D, HWY_IF_T_SIZE_D(D, 1)> 3873 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { 3874 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16; 3875 return ResizeBitCast(d, RotateRight<8>(ResizeBitCast(du16, v))); 3876 } 3877 3878 template <class D, HWY_IF_T_SIZE_D(D, 2)> 3879 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { 3880 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32; 3881 return ResizeBitCast(d, RotateRight<16>(ResizeBitCast(du32, v))); 3882 } 3883 3884 // Shifting and adding requires fewer instructions than blending, but casting to 3885 // u64 does not work for LMUL < 1. 3886 template <class D, HWY_IF_T_SIZE_D(D, 4)> 3887 HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { 3888 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64; 3889 return ResizeBitCast(d, RotateRight<32>(ResizeBitCast(du64, v))); 3890 } 3891 3892 template <class D, class V = VFromD<D>, HWY_IF_T_SIZE_D(D, 8)> 3893 HWY_API V Reverse2(D /* tag */, const V v) { 3894 const V up = detail::Slide1Up(v); 3895 const V down = detail::Slide1Down(v); 3896 return OddEven(up, down); 3897 } 3898 3899 // ------------------------------ Reverse4 (TableLookupLanes) 3900 3901 template <class D, HWY_IF_T_SIZE_D(D, 1)> 3902 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { 3903 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16; 3904 return ResizeBitCast(d, Reverse2(du16, ResizeBitCast(du16, Reverse2(d, v)))); 3905 } 3906 3907 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> 3908 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) { 3909 const RebindToUnsigned<D> du; 3910 const auto idx = detail::XorS(detail::Iota0(du), 3); 3911 return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); 3912 } 3913 3914 // ------------------------------ Reverse8 (TableLookupLanes) 3915 3916 template <class D, HWY_IF_T_SIZE_D(D, 1)> 3917 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { 3918 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32; 3919 return ResizeBitCast(d, Reverse2(du32, ResizeBitCast(du32, Reverse4(d, v)))); 3920 } 3921 3922 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> 3923 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { 3924 const RebindToUnsigned<D> du; 3925 const auto idx = detail::XorS(detail::Iota0(du), 7); 3926 return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); 3927 } 3928 3929 // ------------------------------ ReverseBlocks (Reverse, Shuffle01) 3930 template <class D, class V = VFromD<D>> 3931 HWY_API V ReverseBlocks(D d, V v) { 3932 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64; 3933 const size_t N = Lanes(du64); 3934 const auto rev = 3935 detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1)); 3936 // Swap lo/hi u64 within each block 3937 const auto idx = detail::XorS(rev, 1); 3938 return ResizeBitCast(d, TableLookupLanes(ResizeBitCast(du64, v), idx)); 3939 } 3940 3941 // ------------------------------ Compress 3942 3943 // RVV supports all lane types natively. 
3944 #ifdef HWY_NATIVE_COMPRESS8 3945 #undef HWY_NATIVE_COMPRESS8 3946 #else 3947 #define HWY_NATIVE_COMPRESS8 3948 #endif 3949 3950 template <typename T> 3951 struct CompressIsPartition { 3952 enum { value = 0 }; 3953 }; 3954 3955 #define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 3956 SHIFT, MLEN, NAME, OP) \ 3957 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 3958 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ 3959 return __riscv_v##OP##_vm_##CHAR##SEW##LMUL(v, mask, \ 3960 HWY_RVV_AVL(SEW, SHIFT)); \ 3961 } 3962 3963 HWY_RVV_FOREACH(HWY_RVV_COMPRESS, Compress, compress, _ALL) 3964 #undef HWY_RVV_COMPRESS 3965 3966 // ------------------------------ Expand 3967 3968 #ifdef HWY_NATIVE_EXPAND 3969 #undef HWY_NATIVE_EXPAND 3970 #else 3971 #define HWY_NATIVE_EXPAND 3972 #endif 3973 3974 // >= 2-byte lanes: idx lanes will not overflow. 3975 template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 1)> 3976 HWY_API V Expand(V v, const M mask) { 3977 const DFromV<V> d; 3978 const RebindToUnsigned<decltype(d)> du; 3979 const auto idx = detail::MaskedIota(du, RebindMask(du, mask)); 3980 const V zero = Zero(d); 3981 return detail::MaskedTableLookupLanes(mask, zero, v, idx); 3982 } 3983 3984 // 1-byte lanes, LMUL < 8: promote idx to u16. 3985 template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>, 3986 HWY_IF_POW2_LE_D(D, 2)> 3987 HWY_API V Expand(V v, const M mask) { 3988 const D d; 3989 const Rebind<uint16_t, decltype(d)> du16; 3990 const auto idx = detail::MaskedIota(du16, RebindMask(du16, mask)); 3991 const V zero = Zero(d); 3992 return detail::MaskedTableLookupLanes16(mask, zero, v, idx); 3993 } 3994 3995 // 1-byte lanes, max LMUL: unroll 2x. 3996 template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>, 3997 HWY_IF_POW2_GT_D(DFromV<V>, 2)> 3998 HWY_API V Expand(V v, const M mask) { 3999 const D d; 4000 const Half<D> dh; 4001 const auto v0 = LowerHalf(dh, v); 4002 // TODO(janwas): skip vec<->mask if we can cast masks. 4003 const V vmask = VecFromMask(d, mask); 4004 const auto m0 = MaskFromVec(LowerHalf(dh, vmask)); 4005 4006 // Cannot just use UpperHalf, must shift by the number of inputs consumed. 
4007 const size_t count = CountTrue(dh, m0); 4008 const auto v1 = detail::Trunc(detail::SlideDown(v, count)); 4009 const auto m1 = MaskFromVec(UpperHalf(dh, vmask)); 4010 return Combine(d, Expand(v1, m1), Expand(v0, m0)); 4011 } 4012 4013 // ------------------------------ LoadExpand 4014 template <class D> 4015 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, 4016 const TFromD<D>* HWY_RESTRICT unaligned) { 4017 return Expand(LoadU(d, unaligned), mask); 4018 } 4019 4020 // ------------------------------ CompressNot 4021 template <class V, class M> 4022 HWY_API V CompressNot(V v, const M mask) { 4023 return Compress(v, Not(mask)); 4024 } 4025 4026 // ------------------------------ CompressBlocksNot 4027 template <class V, class M> 4028 HWY_API V CompressBlocksNot(V v, const M mask) { 4029 return CompressNot(v, mask); 4030 } 4031 4032 // ------------------------------ CompressStore 4033 template <class V, class M, class D> 4034 HWY_API size_t CompressStore(const V v, const M mask, const D d, 4035 TFromD<D>* HWY_RESTRICT unaligned) { 4036 StoreU(Compress(v, mask), d, unaligned); 4037 return CountTrue(d, mask); 4038 } 4039 4040 // ------------------------------ CompressBlendedStore 4041 template <class V, class M, class D> 4042 HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d, 4043 TFromD<D>* HWY_RESTRICT unaligned) { 4044 const size_t count = CountTrue(d, mask); 4045 StoreN(Compress(v, mask), d, unaligned, count); 4046 return count; 4047 } 4048 4049 // ================================================== COMPARE (2) 4050 4051 // ------------------------------ FindLastTrue 4052 4053 template <class D> 4054 HWY_API intptr_t FindLastTrue(D d, MFromD<D> m) { 4055 const RebindToSigned<decltype(d)> di; 4056 const intptr_t fft_rev_idx = 4057 FindFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m)))); 4058 return (fft_rev_idx >= 0) 4059 ? (static_cast<intptr_t>(Lanes(d) - 1) - fft_rev_idx) 4060 : intptr_t{-1}; 4061 } 4062 4063 template <class D> 4064 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> m) { 4065 const RebindToSigned<decltype(d)> di; 4066 const size_t fft_rev_idx = 4067 FindKnownFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m)))); 4068 return Lanes(d) - 1 - fft_rev_idx; 4069 } 4070 4071 // ------------------------------ ConcatOdd (Compress) 4072 4073 namespace detail { 4074 4075 #define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 4076 MLEN, NAME, OP) \ 4077 template <size_t kShift> \ 4078 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEWD, LMULD) v) { \ 4079 return __riscv_v##OP##_wx_##CHAR##SEW##LMUL(v, kShift, \ 4080 HWY_RVV_AVL(SEWD, SHIFT + 1)); \ 4081 } 4082 4083 HWY_RVV_FOREACH_U08(HWY_RVV_NARROW, Narrow, nsrl, _EXT) 4084 HWY_RVV_FOREACH_U16(HWY_RVV_NARROW, Narrow, nsrl, _EXT) 4085 HWY_RVV_FOREACH_U32(HWY_RVV_NARROW, Narrow, nsrl, _EXT) 4086 #undef HWY_RVV_NARROW 4087 4088 } // namespace detail 4089 4090 // Casting to wider and narrowing is the fastest for < 64-bit lanes. 4091 template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> 4092 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 4093 constexpr size_t kBits = sizeof(TFromD<D>) * 8; 4094 const Twice<decltype(d)> dt; 4095 const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw; 4096 const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo)); 4097 return BitCast(d, detail::Narrow<kBits>(hl)); 4098 } 4099 4100 // 64-bit: Combine+Compress. 
4101 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> 4102 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 4103 const Twice<decltype(d)> dt; 4104 const VFromD<decltype(dt)> hl = Combine(dt, hi, lo); 4105 return LowerHalf(d, Compress(hl, detail::IsOdd(dt))); 4106 } 4107 4108 // Any type, max LMUL: Compress both, then Combine. 4109 template <class D, HWY_IF_POW2_GT_D(D, 2)> 4110 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 4111 const Half<decltype(d)> dh; 4112 const MFromD<D> is_odd = detail::IsOdd(d); 4113 const VFromD<decltype(d)> hi_odd = Compress(hi, is_odd); 4114 const VFromD<decltype(d)> lo_odd = Compress(lo, is_odd); 4115 return Combine(d, LowerHalf(dh, hi_odd), LowerHalf(dh, lo_odd)); 4116 } 4117 4118 // ------------------------------ ConcatEven (Compress) 4119 4120 // Casting to wider and narrowing is the fastest for < 64-bit lanes. 4121 template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> 4122 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 4123 const Twice<decltype(d)> dt; 4124 const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw; 4125 const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo)); 4126 return BitCast(d, detail::Narrow<0>(hl)); 4127 } 4128 4129 // 64-bit: Combine+Compress. 4130 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)> 4131 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 4132 const Twice<decltype(d)> dt; 4133 const VFromD<decltype(dt)> hl = Combine(dt, hi, lo); 4134 return LowerHalf(d, Compress(hl, detail::IsEven(dt))); 4135 } 4136 4137 // Any type, max LMUL: Compress both, then Combine. 4138 template <class D, HWY_IF_POW2_GT_D(D, 2)> 4139 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 4140 const Half<decltype(d)> dh; 4141 const MFromD<D> is_even = detail::IsEven(d); 4142 const VFromD<decltype(d)> hi_even = Compress(hi, is_even); 4143 const VFromD<decltype(d)> lo_even = Compress(lo, is_even); 4144 return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even)); 4145 } 4146 4147 // ------------------------------ PromoteEvenTo/PromoteOddTo 4148 #include "hwy/ops/inside-inl.h" 4149 4150 // ================================================== BLOCKWISE 4151 4152 // ------------------------------ CombineShiftRightBytes 4153 template <size_t kBytes, class D, class V = VFromD<D>> 4154 HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) { 4155 const Repartition<uint8_t, decltype(d)> d8; 4156 const auto hi8 = BitCast(d8, hi); 4157 const auto lo8 = BitCast(d8, lo); 4158 const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes); 4159 const auto lo_down = detail::SlideDown(lo8, kBytes); 4160 const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8); 4161 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up)); 4162 } 4163 4164 // ------------------------------ CombineShiftRightLanes 4165 template <size_t kLanes, class D, class V = VFromD<D>> 4166 HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) { 4167 constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes; 4168 const auto hi_up = detail::SlideUp(hi, hi, kLanesUp); 4169 const auto lo_down = detail::SlideDown(lo, kLanes); 4170 const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d); 4171 return IfThenElse(is_lo, lo_down, hi_up); 4172 } 4173 4174 // ------------------------------ Shuffle2301 (ShiftLeft) 4175 template <class V> 4176 HWY_API V Shuffle2301(const V v) { 4177 const DFromV<V> d; 4178 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined 
for 32-bit types"); 4179 const Repartition<uint64_t, decltype(d)> du64; 4180 const auto v64 = BitCast(du64, v); 4181 return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64))); 4182 } 4183 4184 // ------------------------------ Shuffle2103 4185 template <class V> 4186 HWY_API V Shuffle2103(const V v) { 4187 const DFromV<V> d; 4188 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); 4189 return CombineShiftRightLanes<3>(d, v, v); 4190 } 4191 4192 // ------------------------------ Shuffle0321 4193 template <class V> 4194 HWY_API V Shuffle0321(const V v) { 4195 const DFromV<V> d; 4196 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); 4197 return CombineShiftRightLanes<1>(d, v, v); 4198 } 4199 4200 // ------------------------------ Shuffle1032 4201 template <class V> 4202 HWY_API V Shuffle1032(const V v) { 4203 const DFromV<V> d; 4204 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types"); 4205 return CombineShiftRightLanes<2>(d, v, v); 4206 } 4207 4208 // ------------------------------ Shuffle01 4209 template <class V> 4210 HWY_API V Shuffle01(const V v) { 4211 const DFromV<V> d; 4212 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types"); 4213 return CombineShiftRightLanes<1>(d, v, v); 4214 } 4215 4216 // ------------------------------ Shuffle0123 4217 template <class V> 4218 HWY_API V Shuffle0123(const V v) { 4219 return Shuffle2301(Shuffle1032(v)); 4220 } 4221 4222 // ------------------------------ TableLookupBytes 4223 4224 template <class VT, class VI> 4225 HWY_API VI TableLookupBytes(const VT vt, const VI vi) { 4226 const DFromV<VT> dt; // T=table, I=index. 4227 const DFromV<VI> di; 4228 const Repartition<uint8_t, decltype(dt)> dt8; 4229 const Repartition<uint8_t, decltype(di)> di8; 4230 // Required for producing half-vectors with table lookups from a full vector. 4231 // If we instead run at the LMUL of the index vector, lookups into the table 4232 // would be truncated. Thus we run at the larger of the two LMULs and truncate 4233 // the result vector to the original index LMUL. 4234 constexpr int kPow2T = dt8.Pow2(); 4235 constexpr int kPow2I = di8.Pow2(); 4236 const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8; // m=max 4237 const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt)); 4238 const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi)); 4239 auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8)); 4240 // If the table is shorter, wrap around offsets so they do not reference 4241 // undefined lanes in the newly extended vmt. 4242 if (kPow2T < kPow2I) { 4243 offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1)); 4244 } 4245 const auto out = TableLookupLanes(vmt, Add(vmi, offsets)); 4246 return BitCast(di, detail::ChangeLMUL(di8, out)); 4247 } 4248 4249 template <class VT, class VI> 4250 HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) { 4251 const DFromV<VI> di; 4252 const Repartition<int8_t, decltype(di)> di8; 4253 const auto idx8 = BitCast(di8, idx); 4254 const auto lookup = TableLookupBytes(vt, idx8); 4255 return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup)); 4256 } 4257 4258 // ------------------------------ TwoTablesLookupLanes 4259 4260 // WARNING: 8-bit lanes may lead to unexpected results because idx is the same 4261 // size and may overflow. 
4262 template <class D, HWY_IF_POW2_LE_D(D, 2)>
4263 HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
4264 VFromD<RebindToUnsigned<D>> idx) {
4265 const Twice<decltype(d)> dt;
4266 const RebindToUnsigned<decltype(dt)> dt_u;
4267 const auto combined_tbl = Combine(dt, b, a);
4268 const auto combined_idx = Combine(dt_u, idx, idx);
4269 return LowerHalf(d, TableLookupLanes(combined_tbl, combined_idx));
4270 }
4271
4272 template <class D, HWY_IF_POW2_GT_D(D, 2)>
4273 HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
4274 VFromD<RebindToUnsigned<D>> idx) {
4275 const RebindToUnsigned<decltype(d)> du;
4276 using TU = TFromD<decltype(du)>;
4277
4278 const size_t num_of_lanes = Lanes(d);
4279 const auto idx_mod = detail::AndS(idx, static_cast<TU>(num_of_lanes - 1));
4280 const auto sel_a_mask = Ne(idx, idx_mod);  // false where the lane is from a
4281
4282 const auto a_lookup_result = TableLookupLanes(a, idx_mod);
4283 return detail::MaskedTableLookupLanes(sel_a_mask, a_lookup_result, b,
4284 idx_mod);
4285 }
4286
4287 template <class V>
4288 HWY_API V TwoTablesLookupLanes(V a, V b,
4289 VFromD<RebindToUnsigned<DFromV<V>>> idx) {
4290 const DFromV<decltype(a)> d;
4291 return TwoTablesLookupLanes(d, a, b, idx);
4292 }
4293
4294 // ------------------------------ Broadcast
4295
4296 // 8-bit requires 16-bit tables.
4297 template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
4298 HWY_IF_POW2_LE_D(D, 2)>
4299 HWY_API V Broadcast(const V v) {
4300 const D d;
4301 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
4302
4303 const Rebind<uint16_t, decltype(d)> du16;
4304 VFromD<decltype(du16)> idx =
4305 detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
4306 if (kLane != 0) {
4307 idx = detail::AddS(idx, kLane);
4308 }
4309 return detail::TableLookupLanes16(v, idx);
4310 }
4311
4312 // 8-bit and max LMUL: split into halves.
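// (A u16 index vector covering the whole 8-bit input would require twice the
// registers, which is not representable at the maximum LMUL; hence we look up
// each half separately and recombine.)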
4313 template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1), 4314 HWY_IF_POW2_GT_D(D, 2)> 4315 HWY_API V Broadcast(const V v) { 4316 const D d; 4317 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d)); 4318 4319 const Half<decltype(d)> dh; 4320 using VH = VFromD<decltype(dh)>; 4321 const Rebind<uint16_t, decltype(dh)> du16; 4322 VFromD<decltype(du16)> idx = 4323 detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16)); 4324 if (kLane != 0) { 4325 idx = detail::AddS(idx, kLane); 4326 } 4327 const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx); 4328 const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx); 4329 return Combine(d, hi, lo); 4330 } 4331 4332 template <int kLane, class V, class D = DFromV<V>, 4333 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))> 4334 HWY_API V Broadcast(const V v) { 4335 const D d; 4336 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d)); 4337 4338 const RebindToUnsigned<decltype(d)> du; 4339 auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du)); 4340 if (kLane != 0) { 4341 idx = detail::AddS(idx, kLane); 4342 } 4343 return TableLookupLanes(v, idx); 4344 } 4345 4346 // ------------------------------ BroadcastLane 4347 #ifdef HWY_NATIVE_BROADCASTLANE 4348 #undef HWY_NATIVE_BROADCASTLANE 4349 #else 4350 #define HWY_NATIVE_BROADCASTLANE 4351 #endif 4352 4353 namespace detail { 4354 4355 #define HWY_RVV_BROADCAST_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ 4356 LMULH, SHIFT, MLEN, NAME, OP) \ 4357 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 4358 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t idx) { \ 4359 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, idx, \ 4360 HWY_RVV_AVL(SEW, SHIFT)); \ 4361 } 4362 4363 HWY_RVV_FOREACH(HWY_RVV_BROADCAST_LANE, BroadcastLane, rgather, _ALL) 4364 #undef HWY_RVV_BROADCAST_LANE 4365 4366 } // namespace detail 4367 4368 template <int kLane, class V> 4369 HWY_API V BroadcastLane(V v) { 4370 static_assert(0 <= kLane && kLane < HWY_MAX_LANES_V(V), "Invalid lane"); 4371 return detail::BroadcastLane(v, static_cast<size_t>(kLane)); 4372 } 4373 4374 // ------------------------------ InsertBlock 4375 #ifdef HWY_NATIVE_BLK_INSERT_EXTRACT 4376 #undef HWY_NATIVE_BLK_INSERT_EXTRACT 4377 #else 4378 #define HWY_NATIVE_BLK_INSERT_EXTRACT 4379 #endif 4380 4381 template <int kBlockIdx, class V> 4382 HWY_API V InsertBlock(V v, VFromD<BlockDFromD<DFromV<V>>> blk_to_insert) { 4383 const DFromV<decltype(v)> d; 4384 using TU = If<(sizeof(TFromV<V>) == 1 && DFromV<V>().Pow2() >= -2), uint16_t, 4385 MakeUnsigned<TFromV<V>>>; 4386 using TIdx = If<sizeof(TU) == 1, uint16_t, TU>; 4387 4388 const Repartition<TU, decltype(d)> du; 4389 const Rebind<TIdx, decltype(du)> d_idx; 4390 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), 4391 "Invalid block index"); 4392 constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU); 4393 4394 constexpr size_t kBlkByteOffset = 4395 static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock; 4396 const auto vu = BitCast(du, v); 4397 const auto vblk = ResizeBitCast(du, blk_to_insert); 4398 const auto vblk_shifted = detail::SlideUp(vblk, vblk, kBlkByteOffset); 4399 const auto insert_mask = RebindMask( 4400 du, detail::LtS(detail::SubS(detail::Iota0(d_idx), 4401 static_cast<TIdx>(kBlkByteOffset)), 4402 static_cast<TIdx>(kMaxLanesPerBlock))); 4403 4404 return BitCast(d, IfThenElse(insert_mask, vblk_shifted, vu)); 4405 } 4406 4407 // ------------------------------ BroadcastBlock 4408 template <int kBlockIdx, class V, HWY_IF_POW2_LE_D(DFromV<V>, -3)> 4409 HWY_API V 
BroadcastBlock(V v) { 4410 const DFromV<decltype(v)> d; 4411 const Repartition<uint8_t, decltype(d)> du8; 4412 const Rebind<uint16_t, decltype(d)> du16; 4413 4414 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), 4415 "Invalid block index"); 4416 4417 const auto idx = detail::AddS(detail::AndS(detail::Iota0(du16), uint16_t{15}), 4418 static_cast<uint16_t>(kBlockIdx * 16)); 4419 return BitCast(d, detail::TableLookupLanes16(BitCast(du8, v), idx)); 4420 } 4421 4422 template <int kBlockIdx, class V, HWY_IF_POW2_GT_D(DFromV<V>, -3)> 4423 HWY_API V BroadcastBlock(V v) { 4424 const DFromV<decltype(v)> d; 4425 using TU = If<sizeof(TFromV<V>) == 1, uint16_t, MakeUnsigned<TFromV<V>>>; 4426 const Repartition<TU, decltype(d)> du; 4427 4428 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), 4429 "Invalid block index"); 4430 constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU); 4431 4432 const auto idx = detail::AddS( 4433 detail::AndS(detail::Iota0(du), static_cast<TU>(kMaxLanesPerBlock - 1)), 4434 static_cast<TU>(static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock)); 4435 return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); 4436 } 4437 4438 // ------------------------------ ExtractBlock 4439 template <int kBlockIdx, class V> 4440 HWY_API VFromD<BlockDFromD<DFromV<V>>> ExtractBlock(V v) { 4441 const DFromV<decltype(v)> d; 4442 const BlockDFromD<decltype(d)> d_block; 4443 4444 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), 4445 "Invalid block index"); 4446 constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TFromD<decltype(d)>); 4447 constexpr size_t kBlkByteOffset = 4448 static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock; 4449 4450 return ResizeBitCast(d_block, detail::SlideDown(v, kBlkByteOffset)); 4451 } 4452 4453 // ------------------------------ ShiftLeftLanes 4454 4455 template <size_t kLanes, class D, class V = VFromD<D>> 4456 HWY_API V ShiftLeftLanes(const D d, const V v) { 4457 const RebindToSigned<decltype(d)> di; 4458 const RebindToUnsigned<decltype(d)> du; 4459 using TI = TFromD<decltype(di)>; 4460 const auto shifted = detail::SlideUp(v, v, kLanes); 4461 // Match x86 semantics by zeroing lower lanes in 128-bit blocks 4462 const auto idx_mod = 4463 detail::AndS(BitCast(di, detail::Iota0(du)), 4464 static_cast<TI>(detail::LanesPerBlock(di) - 1)); 4465 const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes)); 4466 return IfThenZeroElse(clear, shifted); 4467 } 4468 4469 template <size_t kLanes, class V> 4470 HWY_API V ShiftLeftLanes(const V v) { 4471 return ShiftLeftLanes<kLanes>(DFromV<V>(), v); 4472 } 4473 4474 // ------------------------------ ShiftLeftBytes 4475 4476 template <int kBytes, class D> 4477 HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) { 4478 const Repartition<uint8_t, decltype(d)> d8; 4479 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v))); 4480 } 4481 4482 template <int kBytes, class V> 4483 HWY_API V ShiftLeftBytes(const V v) { 4484 return ShiftLeftBytes<kBytes>(DFromV<V>(), v); 4485 } 4486 4487 // ------------------------------ ShiftRightLanes 4488 template <size_t kLanes, typename T, size_t N, int kPow2, 4489 class V = VFromD<Simd<T, N, kPow2>>> 4490 HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) { 4491 const RebindToSigned<decltype(d)> di; 4492 const RebindToUnsigned<decltype(d)> du; 4493 using TI = TFromD<decltype(di)>; 4494 // For partial vectors, clear upper lanes so we shift in zeros. 
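// (i.e. lanes [N, VL) are overwritten with zero, so that SlideDown shifts in
// zeros rather than whatever the register happens to hold there)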
4495 if (N <= 16 / sizeof(T)) { 4496 v = detail::SlideUp(v, Zero(d), N); 4497 } 4498 4499 const auto shifted = detail::SlideDown(v, kLanes); 4500 // Match x86 semantics by zeroing upper lanes in 128-bit blocks 4501 const size_t lpb = detail::LanesPerBlock(di); 4502 const auto idx_mod = 4503 detail::AndS(BitCast(di, detail::Iota0(du)), static_cast<TI>(lpb - 1)); 4504 const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes)); 4505 return IfThenElseZero(keep, shifted); 4506 } 4507 4508 // ------------------------------ ShiftRightBytes 4509 template <int kBytes, class D, class V = VFromD<D>> 4510 HWY_API V ShiftRightBytes(const D d, const V v) { 4511 const Repartition<uint8_t, decltype(d)> d8; 4512 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v))); 4513 } 4514 4515 // ------------------------------ InterleaveWholeLower 4516 #ifdef HWY_NATIVE_INTERLEAVE_WHOLE 4517 #undef HWY_NATIVE_INTERLEAVE_WHOLE 4518 #else 4519 #define HWY_NATIVE_INTERLEAVE_WHOLE 4520 #endif 4521 4522 namespace detail { 4523 // Returns double-length vector with interleaved lanes. 4524 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), 4525 HWY_IF_POW2_GT_D(D, -3)> 4526 HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) { 4527 const RebindToUnsigned<decltype(d)> du; 4528 using TW = MakeWide<TFromD<decltype(du)>>; 4529 const Rebind<TW, Half<decltype(du)>> dw; 4530 const Half<decltype(du)> duh; // cast inputs to unsigned so we zero-extend 4531 4532 const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a)); 4533 const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b)); 4534 return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw))))); 4535 } 4536 // 64-bit: cannot PromoteTo, but can Ext. 
4537 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
4538 HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4539 const RebindToUnsigned<decltype(d)> du;
4540 const auto idx = ShiftRight<1>(detail::Iota0(du));
4541 return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
4542 TableLookupLanes(detail::Ext(d, a), idx));
4543 }
4544 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
4545 HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4546 const Half<D> dh;
4547 const Half<decltype(dh)> dq;
4548 const VFromD<decltype(dh)> i0 =
4549 InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
4550 const VFromD<decltype(dh)> i1 =
4551 InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
4552 return Combine(d, i1, i0);
4553 }
4554
4555 }  // namespace detail
4556
4557 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4558 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4559 const RebindToUnsigned<decltype(d)> du;
4560 const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
4561 const RepartitionToNarrow<decltype(dw)> du_src;
4562
4563 const VFromD<D> aw =
4564 ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
4565 const VFromD<D> bw =
4566 ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
4567 return Or(aw, detail::Slide1Up(bw));
4568 }
4569
4570 template <class D, HWY_IF_T_SIZE_D(D, 8)>
4571 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4572 const RebindToUnsigned<decltype(d)> du;
4573 const auto idx = ShiftRight<1>(detail::Iota0(du));
4574 return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
4575 }
4576
4577 // ------------------------------ InterleaveWholeUpper
4578
4579 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4580 HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4581 // Use Lanes(d) / 2 instead of Lanes(Half<D>()) because Lanes(Half<D>()) may
4582 // only be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2())
4583 // is true, and because the results of InterleaveWholeUpper are
4584 // implementation-defined if Lanes(d) is less than 2.
4585 const size_t half_N = Lanes(d) / 2;
4586 return InterleaveWholeLower(d, detail::SlideDown(a, half_N),
4587 detail::SlideDown(b, half_N));
4588 }
4589
4590 template <class D, HWY_IF_T_SIZE_D(D, 8)>
4591 HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4592 // Use Lanes(d) / 2 instead of Lanes(Half<D>()) because Lanes(Half<D>()) may
4593 // only be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2())
4594 // is true, and because the results of InterleaveWholeUpper are
4595 // implementation-defined if Lanes(d) is less than 2.
4596 const size_t half_N = Lanes(d) / 2;
4597 const RebindToUnsigned<decltype(d)> du;
4598 const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
4599 static_cast<uint64_t>(half_N));
4600 return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
4601 }
4602
4603 // ------------------------------ InterleaveLower (InterleaveWholeLower)
4604
4605 namespace detail {
4606
4607 // Definitely at least 128 bits: match x86 semantics (independent blocks). Using
4608 // InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
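// For example, with u32 lanes, each 128-bit block of the result is
// {a0, b0, a1, b1}, taken from the lower half of the corresponding input
// block.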
4609 template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
4610 HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
4611 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4612 const Twice<D> dt;
4613 const RebindToUnsigned<decltype(dt)> dt_u;
4614 const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4615 // Keep only even 128-bit blocks. This is faster than u64 ConcatEven
4616 // because we only have a single vector.
4617 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
4618 const VFromD<decltype(dt_u)> idx_block =
4619 ShiftRight<kShift>(detail::Iota0(dt_u));
4620 const MFromD<decltype(dt_u)> is_even =
4621 detail::EqS(detail::AndS(idx_block, 1), 0);
4622 return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
4623 }
4624 template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
4625 HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
4626 const Half<D> dh;
4627 const VFromD<decltype(dh)> i0 =
4628 InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
4629 const VFromD<decltype(dh)> i1 =
4630 InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
4631 return Combine(d, i1, i0);
4632 }
4633
4634 // As above, for the upper half of blocks.
4635 template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
4636 HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
4637 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4638 const Twice<D> dt;
4639 const RebindToUnsigned<decltype(dt)> dt_u;
4640 const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4641 // Keep only odd 128-bit blocks. This is faster than u64 ConcatOdd
4642 // because we only have a single vector.
4643 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
4644 const VFromD<decltype(dt_u)> idx_block =
4645 ShiftRight<kShift>(detail::Iota0(dt_u));
4646 const MFromD<decltype(dt_u)> is_odd =
4647 detail::EqS(detail::AndS(idx_block, 1), 1);
4648 return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
4649 }
4650 template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
4651 HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
4652 const Half<D> dh;
4653 const VFromD<decltype(dh)> i0 =
4654 InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
4655 const VFromD<decltype(dh)> i1 =
4656 InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
4657 return Combine(d, i1, i0);
4658 }
4659
4660 // RVV vectors are at least 128 bits when there is no fractional LMUL nor cap.
4661 // Used by functions with per-block behavior such as InterleaveLower.
4662 template <typename T, size_t N, int kPow2>
4663 constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
4664 return N * sizeof(T) >= 16 && kPow2 >= 0;
4665 }
4666
4667 // Definitely less than 128 bits only if there is a small cap; fractional LMUL
4668 // might not be enough if vectors are large.
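// For example, Simd<uint32_t, 2, 0> is capped at 8 bytes, hence definitely
// less than one 128-bit block regardless of VLEN.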
4669 template <typename T, size_t N, int kPow2> 4670 constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) { 4671 return N * sizeof(T) < 16; 4672 } 4673 4674 } // namespace detail 4675 4676 #define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr 4677 #define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr 4678 #define HWY_RVV_IF_CAN128_D(D) \ 4679 hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr 4680 4681 template <class D, class V, HWY_RVV_IF_GE128_D(D)> 4682 HWY_API V InterleaveLower(D d, const V a, const V b) { 4683 return detail::InterleaveLowerBlocks(d, a, b); 4684 } 4685 4686 // Single block: interleave without extra Compress. 4687 template <class D, class V, HWY_RVV_IF_LT128_D(D)> 4688 HWY_API V InterleaveLower(D d, const V a, const V b) { 4689 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch"); 4690 return InterleaveWholeLower(d, a, b); 4691 } 4692 4693 // Could be either; branch at runtime. 4694 template <class D, class V, HWY_RVV_IF_CAN128_D(D)> 4695 HWY_API V InterleaveLower(D d, const V a, const V b) { 4696 if (Lanes(d) * sizeof(TFromD<D>) <= 16) { 4697 return InterleaveWholeLower(d, a, b); 4698 } 4699 // Fractional LMUL: use LMUL=1 to ensure we can cast to u64. 4700 const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1; 4701 return ResizeBitCast(d, detail::InterleaveLowerBlocks( 4702 d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b))); 4703 } 4704 4705 template <class V> 4706 HWY_API V InterleaveLower(const V a, const V b) { 4707 return InterleaveLower(DFromV<V>(), a, b); 4708 } 4709 4710 // ------------------------------ InterleaveUpper (Compress) 4711 4712 template <class D, class V, HWY_RVV_IF_GE128_D(D)> 4713 HWY_API V InterleaveUpper(D d, const V a, const V b) { 4714 return detail::InterleaveUpperBlocks(d, a, b); 4715 } 4716 4717 // Single block: interleave without extra Compress. 4718 template <class D, class V, HWY_RVV_IF_LT128_D(D)> 4719 HWY_API V InterleaveUpper(D d, const V a, const V b) { 4720 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch"); 4721 return InterleaveWholeUpper(d, a, b); 4722 } 4723 4724 // Could be either; branch at runtime. 4725 template <class D, class V, HWY_RVV_IF_CAN128_D(D)> 4726 HWY_API V InterleaveUpper(D d, const V a, const V b) { 4727 if (Lanes(d) * sizeof(TFromD<D>) <= 16) { 4728 return InterleaveWholeUpper(d, a, b); 4729 } 4730 // Fractional LMUL: use LMUL=1 to ensure we can cast to u64. 
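// (d1 has kPow2 >= 0, so per the comment above IsGE128, its vectors span at
// least one full 128-bit block and the u64 casts inside the helper are valid.)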
4731 const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
4732 return ResizeBitCast(d, detail::InterleaveUpperBlocks(
4733 d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
4734 }
4735
4736 // ------------------------------ ZipLower
4737
4738 template <class V, class DW = RepartitionToWide<DFromV<V>>>
4739 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4740 const RepartitionToNarrow<DW> dn;
4741 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
4742 return BitCast(dw, InterleaveLower(dn, a, b));
4743 }
4744
4745 template <class V, class DW = RepartitionToWide<DFromV<V>>>
4746 HWY_API VFromD<DW> ZipLower(V a, V b) {
4747 return BitCast(DW(), InterleaveLower(a, b));
4748 }
4749
4750 // ------------------------------ ZipUpper
4751 template <class DW, class V>
4752 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4753 const RepartitionToNarrow<DW> dn;
4754 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
4755 return BitCast(dw, InterleaveUpper(dn, a, b));
4756 }
4757
4758 // ================================================== REDUCE
4759
4760 // We have ReduceSum, generic_ops-inl.h defines SumOfLanes via Set.
4761 #ifdef HWY_NATIVE_REDUCE_SCALAR
4762 #undef HWY_NATIVE_REDUCE_SCALAR
4763 #else
4764 #define HWY_NATIVE_REDUCE_SCALAR
4765 #endif
4766
4767 // scalar = f(vector, zero_m1)
4768 #define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4769 MLEN, NAME, OP) \
4770 template <size_t N> \
4771 HWY_API HWY_RVV_T(BASE, SEW) \
4772 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v, \
4773 HWY_RVV_V(BASE, SEW, m1) v0) { \
4774 return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
4775 v, v0, Lanes(d))); \
4776 }
4777
4778 // detail::RedSum, detail::RedMin, and detail::RedMax are more efficient
4779 // for N=4 I8/U8 reductions on RVV than the default implementations of the
4780 // N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in generic_ops-inl.h.
4781 #undef HWY_IF_REDUCE_D
4782 #define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
4783
4784 #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
4785 #undef HWY_NATIVE_REDUCE_SUM_4_UI8
4786 #else
4787 #define HWY_NATIVE_REDUCE_SUM_4_UI8
4788 #endif
4789
4790 #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4791 #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4792 #else
4793 #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
4794 #endif
4795
4796 // ------------------------------ ReduceSum
4797
4798 namespace detail {
4799 HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL_VIRT)
4800 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL_VIRT)
4801 }  // namespace detail
4802
4803 template <class D, HWY_IF_REDUCE_D(D)>
4804 HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
4805 const auto v0 = Zero(ScalableTag<TFromD<D>>());  // always m1
4806 return detail::RedSum(d, v, v0);
4807 }
4808
4809 // ------------------------------ ReduceMin
4810 namespace detail {
4811 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL_VIRT)
4812 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL_VIRT)
4813 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL_VIRT)
4814 }  // namespace detail
4815
4816 template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
4817 HWY_API T ReduceMin(D d, const VFromD<D> v) {
4818 const ScalableTag<T> d1;  // always m1
4819 return detail::RedMin(d, v, Set(d1, HighestValue<T>()));
4820 }
4821
4822 // ------------------------------ ReduceMax
4823 namespace detail {
4824 HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu,
_ALL_VIRT) 4825 HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL_VIRT) 4826 HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL_VIRT) 4827 } // namespace detail 4828 4829 template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)> 4830 HWY_API T ReduceMax(D d, const VFromD<D> v) { 4831 const ScalableTag<T> d1; // always m1 4832 return detail::RedMax(d, v, Set(d1, LowestValue<T>())); 4833 } 4834 4835 #undef HWY_RVV_REDUCE 4836 4837 // TODO: add MaskedReduceSum/Min/Max 4838 4839 // ------------------------------ SumOfLanes 4840 4841 template <class D, HWY_IF_LANES_GT_D(D, 1)> 4842 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { 4843 return Set(d, ReduceSum(d, v)); 4844 } 4845 template <class D, HWY_IF_LANES_GT_D(D, 1)> 4846 HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) { 4847 return Set(d, ReduceMin(d, v)); 4848 } 4849 template <class D, HWY_IF_LANES_GT_D(D, 1)> 4850 HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) { 4851 return Set(d, ReduceMax(d, v)); 4852 } 4853 4854 // ================================================== Ops with dependencies 4855 4856 // ------------------------------ LoadInterleaved2 4857 4858 // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. 4859 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED 4860 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED 4861 #else 4862 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED 4863 #endif 4864 4865 // Requires Clang 16+, GCC 14+; otherwise emulated in generic_ops-inl.h. 4866 #if HWY_HAVE_TUPLE 4867 4868 #define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 4869 MLEN, NAME, OP) \ 4870 template <size_t kIndex> \ 4871 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 4872 NAME##2(HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup) { \ 4873 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x2_##CHAR##SEW##LMUL(tup, \ 4874 kIndex); \ 4875 } \ 4876 template <size_t kIndex> \ 4877 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 4878 NAME##3(HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup) { \ 4879 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x3_##CHAR##SEW##LMUL(tup, \ 4880 kIndex); \ 4881 } \ 4882 template <size_t kIndex> \ 4883 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 4884 NAME##4(HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup) { \ 4885 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x4_##CHAR##SEW##LMUL(tup, \ 4886 kIndex); \ 4887 } 4888 4889 HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _LE2) 4890 #undef HWY_RVV_GET 4891 4892 #define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 4893 MLEN, NAME, OP) \ 4894 template <size_t kIndex> \ 4895 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) NAME##2( \ 4896 HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 4897 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x2( \ 4898 tup, kIndex, v); \ 4899 } \ 4900 template <size_t kIndex> \ 4901 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \ 4902 HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 4903 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x3( \ 4904 tup, kIndex, v); \ 4905 } \ 4906 template <size_t kIndex> \ 4907 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) NAME##4( \ 4908 HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \ 4909 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x4( \ 4910 tup, kIndex, v); \ 4911 } 4912 4913 HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _LE2) 4914 #undef HWY_RVV_SET 4915 4916 // RVV does not provide vcreate, so implement using Set. 
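// Usage sketch (illustrative): a Create/Get round trip, where v0 and v1 are
// assumed to be vectors of the lane type of d:
//   const Vec2<decltype(d)> tup = Create2(d, v0, v1);
//   const VFromD<decltype(d)> first = Get2<0>(tup);  // == v0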
4917 #define HWY_RVV_CREATE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 4918 MLEN, NAME, OP) \ 4919 template <size_t N> \ 4920 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) \ 4921 NAME##2(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, \ 4922 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1) { \ 4923 HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup{}; \ 4924 tup = Set2<0>(tup, v0); \ 4925 tup = Set2<1>(tup, v1); \ 4926 return tup; \ 4927 } \ 4928 template <size_t N> \ 4929 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \ 4930 HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, HWY_RVV_V(BASE, SEW, LMUL) v0, \ 4931 HWY_RVV_V(BASE, SEW, LMUL) v1, HWY_RVV_V(BASE, SEW, LMUL) v2) { \ 4932 HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup{}; \ 4933 tup = Set3<0>(tup, v0); \ 4934 tup = Set3<1>(tup, v1); \ 4935 tup = Set3<2>(tup, v2); \ 4936 return tup; \ 4937 } \ 4938 template <size_t N> \ 4939 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) \ 4940 NAME##4(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, \ 4941 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ 4942 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3) { \ 4943 HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup{}; \ 4944 tup = Set4<0>(tup, v0); \ 4945 tup = Set4<1>(tup, v1); \ 4946 tup = Set4<2>(tup, v2); \ 4947 tup = Set4<3>(tup, v3); \ 4948 return tup; \ 4949 } 4950 4951 HWY_RVV_FOREACH(HWY_RVV_CREATE, Create, xx, _LE2_VIRT) 4952 #undef HWY_RVV_CREATE 4953 4954 template <class D> 4955 using Vec2 = decltype(Create2(D(), Zero(D()), Zero(D()))); 4956 template <class D> 4957 using Vec3 = decltype(Create3(D(), Zero(D()), Zero(D()), Zero(D()))); 4958 template <class D> 4959 using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D()))); 4960 4961 #define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 4962 MLEN, NAME, OP) \ 4963 template <size_t N> \ 4964 HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 4965 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ 4966 HWY_RVV_V(BASE, SEW, LMUL) & v0, \ 4967 HWY_RVV_V(BASE, SEW, LMUL) & v1) { \ 4968 const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = \ 4969 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, Lanes(d)); \ 4970 v0 = Get2<0>(tup); \ 4971 v1 = Get2<1>(tup); \ 4972 } 4973 // Segments are limited to 8 registers, so we can only go up to LMUL=2. 4974 HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT) 4975 #undef HWY_RVV_LOAD2 4976 4977 // ------------------------------ LoadInterleaved3 4978 4979 #define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 4980 MLEN, NAME, OP) \ 4981 template <size_t N> \ 4982 HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 4983 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ 4984 HWY_RVV_V(BASE, SEW, LMUL) & v0, \ 4985 HWY_RVV_V(BASE, SEW, LMUL) & v1, \ 4986 HWY_RVV_V(BASE, SEW, LMUL) & v2) { \ 4987 const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = \ 4988 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, Lanes(d)); \ 4989 v0 = Get3<0>(tup); \ 4990 v1 = Get3<1>(tup); \ 4991 v2 = Get3<2>(tup); \ 4992 } 4993 // Segments are limited to 8 registers, so we can only go up to LMUL=2. 
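// (At LMUL=2, a 3-field segment already occupies 3 * 2 = 6 registers and a
// 4-field segment all 8; LMUL=4 would exceed the limit.)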
4994 HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT) 4995 #undef HWY_RVV_LOAD3 4996 4997 // ------------------------------ LoadInterleaved4 4998 4999 #define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 5000 MLEN, NAME, OP) \ 5001 template <size_t N> \ 5002 HWY_API void NAME( \ 5003 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 5004 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ 5005 HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \ 5006 HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \ 5007 const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = \ 5008 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, Lanes(d)); \ 5009 v0 = Get4<0>(tup); \ 5010 v1 = Get4<1>(tup); \ 5011 v2 = Get4<2>(tup); \ 5012 v3 = Get4<3>(tup); \ 5013 } 5014 // Segments are limited to 8 registers, so we can only go up to LMUL=2. 5015 HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT) 5016 #undef HWY_RVV_LOAD4 5017 5018 // ------------------------------ StoreInterleaved2 5019 5020 #define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 5021 MLEN, NAME, OP) \ 5022 template <size_t N> \ 5023 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \ 5024 HWY_RVV_V(BASE, SEW, LMUL) v1, \ 5025 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 5026 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ 5027 const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = Create2(d, v0, v1); \ 5028 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, tup, Lanes(d)); \ 5029 } 5030 // Segments are limited to 8 registers, so we can only go up to LMUL=2. 5031 HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT) 5032 #undef HWY_RVV_STORE2 5033 5034 // ------------------------------ StoreInterleaved3 5035 5036 #define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 5037 MLEN, NAME, OP) \ 5038 template <size_t N> \ 5039 HWY_API void NAME( \ 5040 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ 5041 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 5042 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ 5043 const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = Create3(d, v0, v1, v2); \ 5044 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, tup, Lanes(d)); \ 5045 } 5046 // Segments are limited to 8 registers, so we can only go up to LMUL=2. 5047 HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT) 5048 #undef HWY_RVV_STORE3 5049 5050 // ------------------------------ StoreInterleaved4 5051 5052 #define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ 5053 MLEN, NAME, OP) \ 5054 template <size_t N> \ 5055 HWY_API void NAME( \ 5056 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ 5057 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \ 5058 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 5059 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ 5060 const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = Create4(d, v0, v1, v2, v3); \ 5061 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, tup, Lanes(d)); \ 5062 } 5063 // Segments are limited to 8 registers, so we can only go up to LMUL=2. 
5064 HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT) 5065 #undef HWY_RVV_STORE4 5066 5067 #else // !HWY_HAVE_TUPLE 5068 5069 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)> 5070 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, 5071 VFromD<D>& v0, VFromD<D>& v1) { 5072 const VFromD<D> A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] 5073 const VFromD<D> B = LoadU(d, unaligned + Lanes(d)); 5074 v0 = ConcatEven(d, B, A); 5075 v1 = ConcatOdd(d, B, A); 5076 } 5077 5078 namespace detail { 5079 #define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 5080 SHIFT, MLEN, NAME, OP) \ 5081 template <size_t N> \ 5082 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ 5083 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 5084 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \ 5085 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \ 5086 p, static_cast<ptrdiff_t>(stride), Lanes(d)); \ 5087 } 5088 HWY_RVV_FOREACH(HWY_RVV_LOAD_STRIDED, LoadStrided, lse, _ALL_VIRT) 5089 #undef HWY_RVV_LOAD_STRIDED 5090 } // namespace detail 5091 5092 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)> 5093 HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned, 5094 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) { 5095 // Offsets are bytes, and this is not documented. 5096 v0 = detail::LoadStrided(d, unaligned + 0, 3 * sizeof(T)); 5097 v1 = detail::LoadStrided(d, unaligned + 1, 3 * sizeof(T)); 5098 v2 = detail::LoadStrided(d, unaligned + 2, 3 * sizeof(T)); 5099 } 5100 5101 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)> 5102 HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned, 5103 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2, 5104 VFromD<D>& v3) { 5105 // Offsets are bytes, and this is not documented. 5106 v0 = detail::LoadStrided(d, unaligned + 0, 4 * sizeof(T)); 5107 v1 = detail::LoadStrided(d, unaligned + 1, 4 * sizeof(T)); 5108 v2 = detail::LoadStrided(d, unaligned + 2, 4 * sizeof(T)); 5109 v3 = detail::LoadStrided(d, unaligned + 3, 4 * sizeof(T)); 5110 } 5111 5112 // Not 64-bit / max LMUL: interleave via promote, slide, OddEven. 5113 template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8), 5114 HWY_IF_POW2_LE_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)> 5115 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, 5116 T* HWY_RESTRICT unaligned) { 5117 const RebindToUnsigned<D> du; 5118 const Twice<RepartitionToWide<decltype(du)>> duw; 5119 const Twice<decltype(d)> dt; 5120 // Interleave with zero by promoting to wider (unsigned) type. 5121 const VFromD<decltype(dt)> w0 = BitCast(dt, PromoteTo(duw, BitCast(du, v0))); 5122 const VFromD<decltype(dt)> w1 = BitCast(dt, PromoteTo(duw, BitCast(du, v1))); 5123 // OR second vector into the zero-valued lanes (faster than OddEven). 
5124 StoreU(Or(w0, detail::Slide1Up(w1)), dt, unaligned); 5125 } 5126 5127 // Can promote, max LMUL: two half-length 5128 template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8), 5129 HWY_IF_POW2_GT_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)> 5130 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, 5131 T* HWY_RESTRICT unaligned) { 5132 const Half<decltype(d)> dh; 5133 StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), d, unaligned); 5134 StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), d, 5135 unaligned + Lanes(d)); 5136 } 5137 5138 namespace detail { 5139 #define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ 5140 SHIFT, MLEN, NAME, OP) \ 5141 template <size_t N> \ 5142 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ 5143 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ 5144 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \ 5145 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \ 5146 p, static_cast<ptrdiff_t>(stride), v, Lanes(d)); \ 5147 } 5148 HWY_RVV_FOREACH(HWY_RVV_STORE_STRIDED, StoreStrided, sse, _ALL_VIRT) 5149 #undef HWY_RVV_STORE_STRIDED 5150 } // namespace detail 5151 5152 // 64-bit: strided 5153 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8), 5154 HWY_RVV_IF_NOT_EMULATED_D(D)> 5155 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, 5156 T* HWY_RESTRICT unaligned) { 5157 // Offsets are bytes, and this is not documented. 5158 detail::StoreStrided(v0, d, unaligned + 0, 2 * sizeof(T)); 5159 detail::StoreStrided(v1, d, unaligned + 1, 2 * sizeof(T)); 5160 } 5161 5162 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)> 5163 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d, 5164 T* HWY_RESTRICT unaligned) { 5165 // Offsets are bytes, and this is not documented. 5166 detail::StoreStrided(v0, d, unaligned + 0, 3 * sizeof(T)); 5167 detail::StoreStrided(v1, d, unaligned + 1, 3 * sizeof(T)); 5168 detail::StoreStrided(v2, d, unaligned + 2, 3 * sizeof(T)); 5169 } 5170 5171 template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)> 5172 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, 5173 VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) { 5174 // Offsets are bytes, and this is not documented. 5175 detail::StoreStrided(v0, d, unaligned + 0, 4 * sizeof(T)); 5176 detail::StoreStrided(v1, d, unaligned + 1, 4 * sizeof(T)); 5177 detail::StoreStrided(v2, d, unaligned + 2, 4 * sizeof(T)); 5178 detail::StoreStrided(v3, d, unaligned + 3, 4 * sizeof(T)); 5179 } 5180 5181 #endif // HWY_HAVE_TUPLE 5182 5183 // Rely on generic Load/StoreInterleaved[234] for any emulated types. 5184 // Requires HWY_GENERIC_IF_EMULATED_D mirrors HWY_RVV_IF_EMULATED_D. 
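// Usage sketch (illustrative; `in`, `out` and `n` are hypothetical): swap the
// two channels of an interleaved stream, whole vectors at a time.
//   const ScalableTag<uint8_t> d;
//   VFromD<decltype(d)> c0, c1;
//   for (size_t i = 0; i + 2 * Lanes(d) <= n; i += 2 * Lanes(d)) {
//     LoadInterleaved2(d, in + i, c0, c1);
//     StoreInterleaved2(c1, c0, d, out + i);
//   }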
5185 5186 // ------------------------------ Dup128VecFromValues (ResizeBitCast) 5187 5188 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)> 5189 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) { 5190 return Set(d, t0); 5191 } 5192 5193 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)> 5194 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) { 5195 const auto even_lanes = Set(d, t0); 5196 #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD 5197 if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) == 5198 BitCastScalar<uint64_t>(t1)) && 5199 (BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) { 5200 return even_lanes; 5201 } 5202 #endif 5203 5204 const auto odd_lanes = Set(d, t1); 5205 return OddEven(odd_lanes, even_lanes); 5206 } 5207 5208 namespace detail { 5209 5210 #pragma pack(push, 1) 5211 5212 template <class T> 5213 struct alignas(8) Vec64ValsWrapper { 5214 static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true"); 5215 static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true"); 5216 T vals[8 / sizeof(T)]; 5217 }; 5218 5219 #pragma pack(pop) 5220 5221 } // namespace detail 5222 5223 template <class D, HWY_IF_T_SIZE_D(D, 1)> 5224 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, 5225 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, 5226 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7, 5227 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10, 5228 TFromD<D> t11, TFromD<D> t12, 5229 TFromD<D> t13, TFromD<D> t14, 5230 TFromD<D> t15) { 5231 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64; 5232 return ResizeBitCast( 5233 d, Dup128VecFromValues( 5234 du64, 5235 BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{ 5236 {t0, t1, t2, t3, t4, t5, t6, t7}}), 5237 BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{ 5238 {t8, t9, t10, t11, t12, t13, t14, t15}}))); 5239 } 5240 5241 template <class D, HWY_IF_T_SIZE_D(D, 2)> 5242 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, 5243 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, 5244 TFromD<D> t5, TFromD<D> t6, 5245 TFromD<D> t7) { 5246 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64; 5247 return ResizeBitCast( 5248 d, Dup128VecFromValues( 5249 du64, 5250 BitCastScalar<uint64_t>( 5251 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}), 5252 BitCastScalar<uint64_t>( 5253 detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}}))); 5254 } 5255 5256 template <class D, HWY_IF_T_SIZE_D(D, 4)> 5257 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, 5258 TFromD<D> t2, TFromD<D> t3) { 5259 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64; 5260 return ResizeBitCast( 5261 d, 5262 Dup128VecFromValues(du64, 5263 BitCastScalar<uint64_t>( 5264 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}), 5265 BitCastScalar<uint64_t>( 5266 detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}}))); 5267 } 5268 5269 // ------------------------------ LoadDup128 5270 5271 template <class D> 5272 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) { 5273 const RebindToUnsigned<decltype(d)> du; 5274 5275 // Make sure that no more than 16 bytes are loaded from p 5276 constexpr int kLoadPow2 = d.Pow2(); 5277 constexpr size_t kMaxLanesToLoad = 5278 HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>)); 5279 constexpr size_t kLoadN = D::template NewN<kLoadPow2, kMaxLanesToLoad>(); 5280 const Simd<TFromD<D>, kLoadN, kLoadPow2> d_load; 5281 static_assert(d_load.MaxBytes() <= 16, 
5282 "d_load.MaxBytes() <= 16 must be true"); 5283 static_assert((d.MaxBytes() < 16) || (d_load.MaxBytes() == 16), 5284 "d_load.MaxBytes() == 16 must be true if d.MaxBytes() >= 16 is " 5285 "true"); 5286 static_assert((d.MaxBytes() >= 16) || (d_load.MaxBytes() == d.MaxBytes()), 5287 "d_load.MaxBytes() == d.MaxBytes() must be true if " 5288 "d.MaxBytes() < 16 is true"); 5289 5290 const VFromD<D> loaded = Load(d_load, p); 5291 if (d.MaxBytes() <= 16) return loaded; 5292 5293 // idx must be unsigned for TableLookupLanes. 5294 using TU = TFromD<decltype(du)>; 5295 const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1); 5296 // Broadcast the first block. 5297 const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(du), mask); 5298 // Safe even for 8-bit lanes because indices never exceed 15. 5299 return TableLookupLanes(loaded, idx); 5300 } 5301 5302 // ------------------------------ LoadMaskBits 5303 5304 // Support all combinations of T and SHIFT(LMUL) without explicit overloads for 5305 // each. First overload for MLEN=1..64. 5306 namespace detail { 5307 5308 // Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN 5309 // increases with lane size and decreases for increasing LMUL. Cap at 64, the 5310 // largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL 5311 // e.g. vuint16mf8_t: (8*2 << 3) == 128. 5312 template <class D> 5313 using MaskTag = hwy::SizeTag<HWY_MIN( 5314 64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -D().Pow2()))>; 5315 5316 #define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ 5317 HWY_INLINE HWY_RVV_M(MLEN) \ 5318 NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \ 5319 return __riscv_v##OP##_v_b##MLEN(bits, N); \ 5320 } 5321 HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, lm) 5322 #undef HWY_RVV_LOAD_MASK_BITS 5323 } // namespace detail 5324 5325 template <class D, class MT = detail::MaskTag<D>> 5326 HWY_API auto LoadMaskBits(D d, const uint8_t* bits) 5327 -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) { 5328 return detail::LoadMaskBits(MT(), bits, Lanes(d)); 5329 } 5330 5331 // ------------------------------ StoreMaskBits 5332 #define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ 5333 template <class D> \ 5334 HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \ 5335 const size_t N = Lanes(d); \ 5336 __riscv_v##OP##_v_b##MLEN(bits, m, N); \ 5337 /* Non-full byte, need to clear the undefined upper bits. */ \ 5338 /* Use MaxLanes and sizeof(T) to move some checks to compile-time. 
*/ \
5339 constexpr bool kLessThan8 = \
5340 detail::ScaleByPower(16 / sizeof(TFromD<D>), d.Pow2()) < 8; \
5341 if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \
5342 const int mask = (1 << N) - 1; \
5343 bits[0] = static_cast<uint8_t>(bits[0] & mask); \
5344 } \
5345 return (N + 7) / 8; \
5346 }
5347 HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, sm)
5348 #undef HWY_RVV_STORE_MASK_BITS
5349
5350 // ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
5351
5352 template <class V>
5353 HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
5354 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
5355 }
5356
5357 template <class D>
5358 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
5359 D d, TFromD<D>* HWY_RESTRICT unaligned) {
5360 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
5361 }
5362
5363 // ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
5364
5365 // NOTE: do not use this as a building block within rvv-inl; it is likely more
5366 // efficient to use avl or detail::SlideUp.
5367
5368 // Disallow for 8-bit because Iota is likely to overflow.
5369 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
5370 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
5371 const RebindToUnsigned<D> du;
5372 using TU = TFromD<decltype(du)>;
5373 return RebindMask(d, detail::LtS(detail::Iota0(du), static_cast<TU>(n)));
5374 }
5375
5376 template <class D, HWY_IF_T_SIZE_D(D, 1)>
5377 HWY_API MFromD<D> FirstN(const D d, const size_t n) {
5378 const auto zero = Zero(d);
5379 const auto one = Set(d, 1);
5380 return Eq(detail::SlideUp(one, zero, n), one);
5381 }
5382
5383 // ------------------------------ LowerHalfOfMask/UpperHalfOfMask
5384
5385 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5386
5387 // Target-specific implementations of LowerHalfOfMask, UpperHalfOfMask,
5388 // CombineMasks, OrderedDemote2MasksTo, and Dup128MaskFromMaskBits are possible
5389 // on RVV if the __riscv_vreinterpret_v_b*_u8m1 and
5390 // __riscv_vreinterpret_v_u8m1_b* intrinsics are available.
5391
5392 // The __riscv_vreinterpret_v_b*_u8m1 and __riscv_vreinterpret_v_u8m1_b*
5393 // intrinsics are available with Clang 17 and later and with GCC 14 and later.
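// (These reinterprets view the mask register as bytes: mask bit i, i.e. the
// bit for lane i, maps to bit (i % 8) of byte i / 8 of the vuint8m1_t.)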
5394 5395 namespace detail { 5396 5397 HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) { 5398 return __riscv_vreinterpret_v_b1_u8m1(m); 5399 } 5400 5401 HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) { 5402 return __riscv_vreinterpret_v_b2_u8m1(m); 5403 } 5404 5405 HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) { 5406 return __riscv_vreinterpret_v_b4_u8m1(m); 5407 } 5408 5409 HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) { 5410 return __riscv_vreinterpret_v_b8_u8m1(m); 5411 } 5412 5413 HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) { 5414 return __riscv_vreinterpret_v_b16_u8m1(m); 5415 } 5416 5417 HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) { 5418 return __riscv_vreinterpret_v_b32_u8m1(m); 5419 } 5420 5421 HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) { 5422 return __riscv_vreinterpret_v_b64_u8m1(m); 5423 } 5424 5425 template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool1_t>()>* = nullptr> 5426 HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) { 5427 return __riscv_vreinterpret_v_u8m1_b1(v); 5428 } 5429 5430 template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool2_t>()>* = nullptr> 5431 HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) { 5432 return __riscv_vreinterpret_v_u8m1_b2(v); 5433 } 5434 5435 template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool4_t>()>* = nullptr> 5436 HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) { 5437 return __riscv_vreinterpret_v_u8m1_b4(v); 5438 } 5439 5440 template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool8_t>()>* = nullptr> 5441 HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) { 5442 return __riscv_vreinterpret_v_u8m1_b8(v); 5443 } 5444 5445 template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool16_t>()>* = nullptr> 5446 HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) { 5447 return __riscv_vreinterpret_v_u8m1_b16(v); 5448 } 5449 5450 template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool32_t>()>* = nullptr> 5451 HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) { 5452 return __riscv_vreinterpret_v_u8m1_b32(v); 5453 } 5454 5455 template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool64_t>()>* = nullptr> 5456 HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) { 5457 return __riscv_vreinterpret_v_u8m1_b64(v); 5458 } 5459 5460 } // namespace detail 5461 5462 #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK 5463 #undef HWY_NATIVE_LOWER_HALF_OF_MASK 5464 #else 5465 #define HWY_NATIVE_LOWER_HALF_OF_MASK 5466 #endif 5467 5468 template <class D> 5469 HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) { 5470 return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m)); 5471 } 5472 5473 #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK 5474 #undef HWY_NATIVE_UPPER_HALF_OF_MASK 5475 #else 5476 #define HWY_NATIVE_UPPER_HALF_OF_MASK 5477 #endif 5478 5479 template <class D> 5480 HWY_API MFromD<D> UpperHalfOfMask(D d, MFromD<Twice<D>> m) { 5481 const size_t N = Lanes(d); 5482 5483 vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m); 5484 mask_bits = ShiftRightSame(mask_bits, static_cast<int>(N & 7)); 5485 if (HWY_MAX_LANES_D(D) >= 8) { 5486 mask_bits = SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8); 5487 } 5488 5489 return detail::U8MaskBitsVecToMask(d, mask_bits); 5490 } 5491 5492 // ------------------------------ CombineMasks 5493 5494 #ifdef HWY_NATIVE_COMBINE_MASKS 5495 #undef HWY_NATIVE_COMBINE_MASKS 5496 #else 5497 #define HWY_NATIVE_COMBINE_MASKS 5498 #endif 5499 5500 template <class D> 5501 
HWY_API MFromD<D> CombineMasks(D d, MFromD<Half<D>> hi, MFromD<Half<D>> lo) { 5502 const Half<decltype(d)> dh; 5503 const size_t half_N = Lanes(dh); 5504 5505 const auto ext_lo_mask = 5506 And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)), 5507 FirstN(d, half_N)); 5508 vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi); 5509 hi_mask_bits = ShiftLeftSame(hi_mask_bits, static_cast<int>(half_N & 7)); 5510 if (HWY_MAX_LANES_D(D) >= 8) { 5511 hi_mask_bits = 5512 SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8); 5513 } 5514 5515 return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits)); 5516 } 5517 5518 // ------------------------------ OrderedDemote2MasksTo 5519 5520 #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO 5521 #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO 5522 #else 5523 #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO 5524 #endif 5525 5526 template <class DTo, class DFrom, 5527 HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2), 5528 class DTo_2 = Repartition<TFromD<DTo>, DFrom>, 5529 hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr> 5530 HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/, 5531 MFromD<DFrom> a, MFromD<DFrom> b) { 5532 return CombineMasks(d_to, b, a); 5533 } 5534 5535 #endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 5536 5537 // ------------------------------ Dup128MaskFromMaskBits 5538 5539 namespace detail { 5540 // Even though this is only used after checking if (kN < X), this helper 5541 // function prevents "shift count exceeded" errors. 5542 template <size_t kN, HWY_IF_LANES_LE(kN, 31)> 5543 constexpr unsigned MaxMaskBits() { 5544 return (1u << kN) - 1; 5545 } 5546 template <size_t kN, HWY_IF_LANES_GT(kN, 31)> 5547 constexpr unsigned MaxMaskBits() { 5548 return ~0u; 5549 } 5550 5551 template <class D> 5552 constexpr int SufficientPow2ForMask() { 5553 return HWY_MAX( 5554 D().Pow2() - 3 - static_cast<int>(FloorLog2(sizeof(TFromD<D>))), -3); 5555 } 5556 5557 template <class M> 5558 static HWY_INLINE HWY_MAYBE_UNUSED M RvvVmmv(M mask) { 5559 // The below And operation is equivalent to the RVV vmmv instruction and 5560 // ensures that mask is not in the same register as a vector operand when used 5561 // in RVV instructions that take both a vector operand and a mask operand. 
5562 return And(mask, mask); 5563 } 5564 5565 } // namespace detail 5566 5567 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)> 5568 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { 5569 constexpr size_t kN = MaxLanes(d); 5570 if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>(); 5571 5572 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 5573 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; 5574 return detail::RvvVmmv(detail::U8MaskBitsVecToMask( 5575 d, detail::ChangeLMUL(ScalableTag<uint8_t>(), 5576 Set(du8, static_cast<uint8_t>(mask_bits))))); 5577 #else 5578 const RebindToUnsigned<decltype(d)> du8; 5579 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>> 5580 du64; 5581 5582 const auto bytes = ResizeBitCast( 5583 du8, detail::AndS( 5584 ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))), 5585 uint64_t{0x8040201008040201u})); 5586 return detail::NeS(bytes, uint8_t{0}); 5587 #endif 5588 } 5589 5590 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)> 5591 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { 5592 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 5593 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; 5594 const ScalableTag<uint16_t, detail::SufficientPow2ForMask<D>()> du16; 5595 // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes. 5596 return detail::RvvVmmv(detail::U8MaskBitsVecToMask( 5597 d, detail::ChangeLMUL( 5598 ScalableTag<uint8_t>(), 5599 BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)))))); 5600 #else 5601 // Slow fallback for completeness; the above bits to mask cast is preferred. 5602 const RebindToUnsigned<decltype(d)> du8; 5603 const Repartition<uint16_t, decltype(du8)> du16; 5604 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>> 5605 du64; 5606 5607 // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector, 5608 // and then bitcast the replicated mask_bits to a u8 vector 5609 const auto bytes = BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))); 5610 // Replicate bytes 8x such that each byte contains the bit that governs it. 5611 const auto rep8 = TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8))); 5612 5613 const auto masked_out_rep8 = ResizeBitCast( 5614 du8, 5615 detail::AndS(ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u})); 5616 return detail::NeS(masked_out_rep8, uint8_t{0}); 5617 #endif 5618 } 5619 5620 template <class D, HWY_IF_T_SIZE_D(D, 2)> 5621 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { 5622 constexpr size_t kN = MaxLanes(d); 5623 if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>(); 5624 5625 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 5626 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; 5627 // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes. 5628 return detail::RvvVmmv(detail::U8MaskBitsVecToMask( 5629 d, detail::ChangeLMUL(ScalableTag<uint8_t>(), 5630 Set(du8, static_cast<uint8_t>(mask_bits))))); 5631 #else 5632 // Slow fallback for completeness; the above bits to mask cast is preferred. 
5633 const RebindToUnsigned<D> du; 5634 const VFromD<decltype(du)> bits = 5635 Shl(Set(du, uint16_t{1}), detail::AndS(detail::Iota0(du), 7)); 5636 return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits); 5637 #endif 5638 } 5639 5640 template <class D, HWY_IF_T_SIZE_D(D, 4)> 5641 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { 5642 constexpr size_t kN = MaxLanes(d); 5643 if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>(); 5644 5645 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 5646 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; 5647 return detail::RvvVmmv(detail::U8MaskBitsVecToMask( 5648 d, detail::ChangeLMUL(ScalableTag<uint8_t>(), 5649 Set(du8, static_cast<uint8_t>(mask_bits * 0x11))))); 5650 #else 5651 // Slow fallback for completeness; the above bits to mask cast is preferred. 5652 const RebindToUnsigned<D> du; 5653 const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2, 4, 8); 5654 return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits); 5655 #endif 5656 } 5657 5658 template <class D, HWY_IF_T_SIZE_D(D, 8)> 5659 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { 5660 constexpr size_t kN = MaxLanes(d); 5661 if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>(); 5662 5663 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 5664 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; 5665 return detail::RvvVmmv(detail::U8MaskBitsVecToMask( 5666 d, detail::ChangeLMUL(ScalableTag<uint8_t>(), 5667 Set(du8, static_cast<uint8_t>(mask_bits * 0x55))))); 5668 #else 5669 // Slow fallback for completeness; the above bits to mask cast is preferred. 5670 const RebindToUnsigned<D> du; 5671 const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 1, 2); 5672 return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits); 5673 #endif 5674 } 5675 5676 // ------------------------------ SetMask 5677 5678 #ifdef HWY_NATIVE_SET_MASK 5679 #undef HWY_NATIVE_SET_MASK 5680 #else 5681 #define HWY_NATIVE_SET_MASK 5682 #endif 5683 5684 template <class D> 5685 HWY_API MFromD<D> SetMask(D d, bool val) { 5686 const uint8_t u8_mask_val = static_cast<uint8_t>(-static_cast<int>(val)); 5687 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400 5688 const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8; 5689 return detail::RvvVmmv(detail::U8MaskBitsVecToMask( 5690 d, detail::ChangeLMUL(ScalableTag<uint8_t>(), Set(du8, u8_mask_val)))); 5691 #else 5692 const Rebind<uint8_t, DFromV<VFromD<decltype(d)>>> du8; 5693 return MaskFromVec(Set(du8, u8_mask_val)); 5694 #endif 5695 } 5696 5697 // ------------------------------ Abs (Max, Neg) 5698 5699 template <class V, HWY_IF_SIGNED_V(V)> 5700 HWY_API V Abs(const V v) { 5701 return Max(v, Neg(v)); 5702 } 5703 5704 HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL) 5705 5706 #undef HWY_RVV_RETV_ARGV2 5707 5708 // ------------------------------ AbsDiff (Abs, Sub) 5709 template <class V, HWY_IF_FLOAT_V(V)> 5710 HWY_API V AbsDiff(const V a, const V b) { 5711 return Abs(Sub(a, b)); 5712 } 5713 5714 // ------------------------------ Round (NearestInt, ConvertTo, CopySign) 5715 5716 // IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have 5717 // a dedicated instruction for that. Rounding to integer and converting back to 5718 // float is correct except when the input magnitude is large, in which case the 5719 // input was already an integer (because mantissa >> exponent is zero). 
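// For example, an f64 with magnitude >= MantissaEnd (2^52) has no fractional
// bits left, so Round can return such inputs unchanged.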
// ------------------------------ SetMask

#ifdef HWY_NATIVE_SET_MASK
#undef HWY_NATIVE_SET_MASK
#else
#define HWY_NATIVE_SET_MASK
#endif

template <class D>
HWY_API MFromD<D> SetMask(D d, bool val) {
  const uint8_t u8_mask_val = static_cast<uint8_t>(-static_cast<int>(val));
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  return detail::RvvVmmv(detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(ScalableTag<uint8_t>(), Set(du8, u8_mask_val))));
#else
  const Rebind<uint8_t, DFromV<VFromD<decltype(d)>>> du8;
  return MaskFromVec(Set(du8, u8_mask_val));
#endif
}

// ------------------------------ Abs (Max, Neg)

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V Abs(const V v) {
  return Max(v, Neg(v));
}

HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)

#undef HWY_RVV_RETV_ARGV2

// ------------------------------ AbsDiff (Abs, Sub)
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V AbsDiff(const V a, const V b) {
  return Abs(Sub(a, b));
}

// ------------------------------ Round (NearestInt, ConvertTo, CopySign)

// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not
// have a dedicated instruction for that. Rounding to integer and converting
// back to float is correct except when the input magnitude is large, in which
// case the input was already an integer (because mantissa >> exponent is
// zero).

namespace detail {
enum RoundingModes { kNear, kTrunc, kDown, kUp };

template <class V>
HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
  return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
}

}  // namespace detail

template <class V>
HWY_API V Round(const V v) {
  const DFromV<V> df;

  const auto integer = NearestInt(v);  // round using current mode
  const auto int_f = ConvertTo(df, integer);

  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
}
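// Worked example of the UseInt guard: for f64, MantissaEnd is 2^52. Any |v|
// >= 2^52 already has an integer value (its exponent exceeds the mantissa
// width), so v is returned unchanged, which also avoids overflow in the
// intermediate integer conversion. CopySign preserves the sign of inputs that
// round to zero, e.g. NearestInt(-0.25) is 0, but the result must be -0.0.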
  const auto pos1 =
      IfThenElseZero(Lt(int_f, v), Set(df, ConvertScalarTo<T>(1.0)));

  return IfThenElse(detail::UseInt(v), Add(int_f, pos1), v);
}

#endif  // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
        // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)

// ------------------------------ Floor
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) || \
    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)
namespace detail {
#define HWY_RVV_FLOOR_INT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
                          SHIFT, MLEN, NAME, OP)                             \
  HWY_API HWY_RVV_V(int, SEW, LMUL) FloorInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
    return __riscv_vfcvt_x_f_v_i##SEW##LMUL##_rm(v, __RISCV_FRM_RDN,         \
                                                 HWY_RVV_AVL(SEW, SHIFT));   \
  }
HWY_RVV_FOREACH_F(HWY_RVV_FLOOR_INT, _, _, _ALL)
#undef HWY_RVV_FLOOR_INT

}  // namespace detail

template <class V>
HWY_API V Floor(const V v) {
  const DFromV<V> df;

  const auto integer = detail::FloorInt(v);
  const auto int_f = ConvertTo(df, integer);

  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
}

#else  // GCC 13 or earlier or Clang 16 or earlier

template <class V>
HWY_API V Floor(const V v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  using T = TFromD<decltype(df)>;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  const auto neg1 =
      IfThenElseZero(Gt(int_f, v), Set(df, ConvertScalarTo<T>(-1.0)));

  return IfThenElse(detail::UseInt(v), Add(int_f, neg1), v);
}

#endif  // (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1400) ||
        // (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1700)

// ------------------------------ Floating-point classification (Ne)

// vfclass does not help because it would require 3 instructions (to AND and
// then compare the bits), whereas these are just 1-3 integer instructions.

template <class V>
HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
  return Ne(v, v);
}

// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
// We use a fused Set/comparison for IsFinite.
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif

template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsInf(const V v) {
  const D d;
  const RebindToSigned<decltype(d)> di;
  using T = TFromD<D>;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
}

// Returns whether normal/subnormal/zero.
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) {
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  using T = TFromD<D>;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it
  // is negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
}
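// Worked example for f32: +inf is 0x7F800000 and -inf is 0xFF800000; doubling
// via Add(vi, vi) discards the sign bit and yields 0xFF000000 in both cases,
// which is MaxExponentTimes2 for f32, so IsInf needs only one add and one
// compare. NaNs have a nonzero mantissa and thus a larger doubled value, and
// IsFinite's extracted exponent field equals the maximum only for inf/NaN.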
// ------------------------------ Iota (ConvertTo)

template <class D, typename T2, HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> Iota(const D d, T2 first) {
  return detail::AddS(detail::Iota0(d), static_cast<TFromD<D>>(first));
}

template <class D, typename T2, HWY_IF_SIGNED_D(D)>
HWY_API VFromD<D> Iota(const D d, T2 first) {
  const RebindToUnsigned<D> du;
  return detail::AddS(BitCast(d, detail::Iota0(du)),
                      static_cast<TFromD<D>>(first));
}

template <class D, typename T2, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> Iota(const D d, T2 first) {
  const RebindToUnsigned<D> du;
  const RebindToSigned<D> di;
  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
                      ConvertScalarTo<TFromD<D>>(first));
}

// ------------------------------ BitShuffle (PromoteTo, Rol, SumsOf8)

// Native implementation required to avoid 8-bit wraparound on long vectors.
#ifdef HWY_NATIVE_BITSHUFFLE
#undef HWY_NATIVE_BITSHUFFLE
#else
#define HWY_NATIVE_BITSHUFFLE
#endif

// Cannot handle LMUL=8 because we promote indices.
template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
          HWY_IF_UI64_D(D64), HWY_IF_POW2_LE_D(D64, 2)>
HWY_API V64 BitShuffle(V64 values, VI idx) {
  const RebindToUnsigned<D64> du64;
  const Repartition<uint8_t, D64> du8;
  const Rebind<uint16_t, decltype(du8)> du16;
  using VU8 = VFromD<decltype(du8)>;
  using VU16 = VFromD<decltype(du16)>;
  // For each 16-bit (to avoid wraparound for long vectors) index of an output
  // byte: offset of the u64 lane to which it belongs.
  const VU16 byte_offsets =
      detail::AndS(detail::Iota0(du16), static_cast<uint16_t>(~7u));
  // Each idx selects a bit; shifting right by 3 turns it into a byte index
  // within its u64 lane. Promote so we can add byte_offsets, which yields the
  // u8 lane index within the whole vector.
  const VU16 idx16 =
      Add(byte_offsets, PromoteTo(du16, ShiftRight<3>(BitCast(du8, idx))));
  const VU8 bytes = detail::TableLookupLanes16(BitCast(du8, values), idx16);

  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
  // and left by iota & 7 to put it in the correct output bit. To correctly
  // handle shift counts from -7 to 7, we rotate (unfortunately not natively
  // supported on RVV).
  const VU8 rotate_left_bits = Sub(detail::Iota0(du8), BitCast(du8, idx));
  const VU8 extracted_bits_mask =
      BitCast(du8, Set(du64, static_cast<uint64_t>(0x8040201008040201u)));
  const VU8 extracted_bits =
      And(Rol(bytes, rotate_left_bits), extracted_bits_mask);
  // Combine bit-sliced (one bit per byte) into one 64-bit sum.
  return BitCast(D64(), SumsOf8(extracted_bits));
}
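// Example: idx lane 0 = 13 selects bit 5 of byte 1 of the first u64. The
// gather fetches that byte, Rol by (0 - 13) mod 8 = 3 moves bit 5 to bit
// (iota & 7) = 0, the mask keeps only bit 0 of byte 0, and SumsOf8 then adds
// the eight disjoint single-bit bytes, placing the bit at output bit 0.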
template <class V64, class VI, HWY_IF_UI8(TFromV<VI>), class D64 = DFromV<V64>,
          HWY_IF_UI64_D(D64), HWY_IF_POW2_GT_D(D64, 2)>
HWY_API V64 BitShuffle(V64 values, VI idx) {
  const Half<D64> dh;
  const Half<DFromV<VI>> dih;
  using V64H = VFromD<decltype(dh)>;
  const V64H r0 = BitShuffle(LowerHalf(dh, values), LowerHalf(dih, idx));
  const V64H r1 = BitShuffle(UpperHalf(dh, values), UpperHalf(dih, idx));
  return Combine(D64(), r1, r0);
}

// ------------------------------ MulEven/Odd (Mul, OddEven)

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
          class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> MulEven(const V a, const V b) {
  constexpr int maskVal = sizeof(TFromD<D>) == 4   ? 5
                          : sizeof(TFromD<D>) == 2 ? 0x55
                                                   : 0x5555;
  const auto mask = Dup128MaskFromMaskBits(D(), maskVal);
  const auto hi = Slide1Up(D(), MulHigh(a, b));
  const auto res = MaskedMulOr(hi, mask, a, b);
  return BitCast(DW(), res);
}

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
          class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> MulOdd(const V a, const V b) {
  const auto lo = Mul(a, b);
  const auto hi = MulHigh(a, b);
  return BitCast(DW(), OddEven(hi, detail::Slide1Down(lo)));
}

// There is no 64x64 vwmul.
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V MulEven(const V a, const V b) {
  const auto mask = Dup128MaskFromMaskBits(DFromV<V>(), 1);
  const auto hi = Slide1Up(DFromV<V>(), MulHigh(a, b));
  return MaskedMulOr(hi, mask, a, b);
}

template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V MulOdd(const V a, const V b) {
  const auto lo = Mul(a, b);
  const auto hi = MulHigh(a, b);
  return OddEven(hi, detail::Slide1Down(lo));
}
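// The MulEven pattern above: the Dup128MaskFromMaskBits mask selects the even
// lanes, where MaskedMulOr computes the low half of the product; the odd
// lanes receive MulHigh of the even lanes via Slide1Up. Each even/odd pair
// therefore holds a full double-width product. For lane sizes below 64 bits
// the result is then BitCast to the RepartitionToWide type; the u64 versions
// keep the pair layout because there is no 128-bit lane type.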
// ------------------------------ ReorderDemote2To (OddEven, Combine)

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<RepartitionToWide<D>> a,
                                   VFromD<RepartitionToWide<D>> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Half<decltype(du16)> du16_half;
  const RebindToUnsigned<DFromV<decltype(a)>> du32;
  const VFromD<decltype(du32)> a_in_even = PromoteTo(
      du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, a)));
  const VFromD<decltype(du32)> b_in_even = PromoteTo(
      du32, detail::DemoteTo16NearestEven(du16_half, BitCast(du32, b)));
  // Equivalent to InterleaveEven, but because the upper 16 bits are zero, we
  // can OR instead of OddEven.
  const VFromD<decltype(du16)> a_in_odd =
      detail::Slide1Up(BitCast(du16, a_in_even));
  return BitCast(dbf16, Or(a_in_odd, BitCast(du16, b_in_even)));
}

// If LMUL is not the max, Combine first to avoid another DemoteTo.
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
          HWY_IF_POW2_LE_D(DN, 2), class V, HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Rebind<TFromV<V>, DN> dt;
  const VFromD<decltype(dt)> ab = Combine(dt, b, a);
  return DemoteTo(dn, ab);
}

template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_LE_D(DN, 2), class V,
          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Rebind<TFromV<V>, DN> dt;
  const VFromD<decltype(dt)> ab = Combine(dt, b, a);
  return DemoteTo(dn, ab);
}

// Max LMUL: must DemoteTo first, then Combine.
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
          HWY_IF_POW2_GT_D(DN, 2), class V, HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a);
  const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b);
  return Combine(dn, demoted_b, demoted_a);
}

template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_GT_D(DN, 2), class V,
          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a);
  const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b);
  return Combine(dn, demoted_b, demoted_a);
}

// If LMUL is not the max, Combine first to avoid another DemoteTo.
template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_LE_D(DN, 2),
          class V, HWY_IF_F32_D(DFromV<V>),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  const Rebind<TFromV<V>, DN> dt;
  const VFromD<decltype(dt)> ab = Combine(dt, b, a);
  return DemoteTo(dn, ab);
}
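// Note on ordering: ReorderDemote2To may return a's and b's demoted lanes in
// an implementation-defined order (the bf16 overload above interleaves them),
// whereas OrderedDemote2To guarantees that all of a's lanes precede b's.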
// Max LMUL: must DemoteTo first, then Combine.
template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_GT_D(DN, 2),
          class V, HWY_IF_F32_D(DFromV<V>),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  const RebindToUnsigned<decltype(dn)> dn_u;
  const RebindToUnsigned<decltype(dnh)> dnh_u;
  const auto demoted_a = BitCast(dnh_u, DemoteTo(dnh, a));
  const auto demoted_b = BitCast(dnh_u, DemoteTo(dnh, b));
  return BitCast(dn, Combine(dn_u, demoted_b, demoted_a));
}

template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  return ReorderDemote2To(dn, a, b);
}

// ------------------------------ WidenMulPairwiseAdd

template <class DF, HWY_IF_F32_D(DF),
          class VBF = VFromD<Repartition<hwy::bfloat16_t, DF>>>
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
  const VFromD<DF> ae = PromoteEvenTo(df, a);
  const VFromD<DF> be = PromoteEvenTo(df, b);
  const VFromD<DF> ao = PromoteOddTo(df, a);
  const VFromD<DF> bo = PromoteOddTo(df, b);
  return MulAdd(ae, be, Mul(ao, bo));
}

template <class D, HWY_IF_UI32_D(D), class V16 = VFromD<RepartitionToNarrow<D>>>
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
}
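// Worked example: for i16 inputs, lane i of the i32 result is
// a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1], a dot product of adjacent lane
// pairs, computed via PromoteEvenTo/PromoteOddTo and a single MulAdd.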
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

namespace detail {

#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
                           SHIFT, MLEN, NAME, OP)                              \
  template <size_t N>                                                          \
  HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME(                                   \
      HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
      HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {            \
    return __riscv_v##OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d));              \
  }

HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmacc_vv_, _EXT_VIRT)
HWY_RVV_FOREACH_U16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmaccu_vv_, _EXT_VIRT)
#undef HWY_RVV_WIDEN_MACC

// If LMUL is not the max, we can WidenMul first (3 instructions).
template <class D32, HWY_IF_POW2_LE_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  const Twice<decltype(d32)> d32t;
  using V32T = VFromD<decltype(d32t)>;
  V32T sum = Combine(d32t, sum1, sum0);
  sum = detail::WidenMulAcc(d32t, sum, a, b);
  sum1 = UpperHalf(d32, sum);
  return LowerHalf(d32, sum);
}

// Max LMUL: must LowerHalf first (4 instructions).
template <class D32, HWY_IF_POW2_GT_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  const Half<D16> d16h;
  using V16H = VFromD<decltype(d16h)>;
  const V16H a0 = LowerHalf(d16h, a);
  const V16H a1 = UpperHalf(d16h, a);
  const V16H b0 = LowerHalf(d16h, b);
  const V16H b1 = UpperHalf(d16h, b);
  sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
  return detail::WidenMulAcc(d32, sum0, a0, b0);
}

// If LMUL is not the max, we can WidenMul first (3 instructions).
template <class D32, HWY_IF_POW2_LE_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  const Twice<decltype(d32)> d32t;
  using V32T = VFromD<decltype(d32t)>;
  V32T sum = Combine(d32t, sum1, sum0);
  sum = detail::WidenMulAcc(d32t, sum, a, b);
  sum1 = UpperHalf(d32, sum);
  return LowerHalf(d32, sum);
}

// Max LMUL: must LowerHalf first (4 instructions).
template <class D32, HWY_IF_POW2_GT_D(D32, 2), class V32 = VFromD<D32>,
          class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,
                                                 VFromD<D16> b, const V32 sum0,
                                                 V32& sum1) {
  const Half<D16> d16h;
  using V16H = VFromD<decltype(d16h)>;
  const V16H a0 = LowerHalf(d16h, a);
  const V16H a1 = UpperHalf(d16h, a);
  const V16H b0 = LowerHalf(d16h, b);
  const V16H b1 = UpperHalf(d16h, b);
  sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
  return detail::WidenMulAcc(d32, sum0, a0, b0);
}

}  // namespace detail

template <class D, HWY_IF_I32_D(D), class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
                                     VW& sum1) {
  return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
}

template <class D, HWY_IF_U32_D(D), class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(D d32, VN a, VN b, const VW sum0,
                                     VW& sum1) {
  return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1);
}
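// Usage sketch (illustrative only): a widening i16 dot product accumulates
// into sum0/sum1, whose lane assignment is unspecified until the final
// RearrangeToOddPlusEven below:
//   VW sum0 = Zero(d32), sum1 = Zero(d32);
//   for (...) { sum0 = ReorderWidenMulAccumulate(d32, a, b, sum0, sum1); }
//   const VW totals = RearrangeToOddPlusEven(sum0, sum1);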
// ------------------------------ RearrangeToOddPlusEven

template <class VW, HWY_IF_SIGNED_V(VW)>  // vint32_t*
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  // vwmacc doubles LMUL, so we require a pairwise sum here. This op is
  // expected to be less frequent than ReorderWidenMulAccumulate, hence it's
  // preferable to do the extra work here rather than do manual odd/even
  // extraction there.
  const DFromV<VW> di32;
  const RebindToUnsigned<decltype(di32)> du32;
  const Twice<decltype(di32)> di32x2;
  const RepartitionToWide<decltype(di32x2)> di64x2;
  const RebindToUnsigned<decltype(di64x2)> du64x2;
  const auto combined = BitCast(di64x2, Combine(di32x2, sum1, sum0));
  // Isolate odd/even int32 in int64 lanes.
  const auto even = ShiftRight<32>(ShiftLeft<32>(combined));  // sign extend
  const auto odd = ShiftRight<32>(combined);
  return BitCast(di32, TruncateTo(du32, BitCast(du64x2, Add(even, odd))));
}

// For max LMUL, we cannot Combine again and instead manually unroll.
HWY_API vint32m8_t RearrangeToOddPlusEven(vint32m8_t sum0, vint32m8_t sum1) {
  const DFromV<vint32m8_t> d;
  const Half<decltype(d)> dh;
  const vint32m4_t lo =
      RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0));
  const vint32m4_t hi =
      RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1));
  return Combine(d, hi, lo);
}

template <class VW, HWY_IF_UNSIGNED_V(VW)>  // vuint32_t*
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  // vwmacc doubles LMUL, so we require a pairwise sum here. This op is
  // expected to be less frequent than ReorderWidenMulAccumulate, hence it's
  // preferable to do the extra work here rather than do manual odd/even
  // extraction there.
  const DFromV<VW> du32;
  const Twice<decltype(du32)> du32x2;
  const RepartitionToWide<decltype(du32x2)> du64x2;
  const auto combined = BitCast(du64x2, Combine(du32x2, sum1, sum0));
  // Isolate odd/even u32 in u64 lanes.
  const auto even = detail::AndS(combined, uint64_t{0xFFFFFFFFu});
  const auto odd = ShiftRight<32>(combined);
  return TruncateTo(du32, Add(even, odd));
}

// For max LMUL, we cannot Combine again and instead manually unroll.
HWY_API vuint32m8_t RearrangeToOddPlusEven(vuint32m8_t sum0,
                                           vuint32m8_t sum1) {
  const DFromV<vuint32m8_t> d;
  const Half<decltype(d)> dh;
  const vuint32m4_t lo =
      RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0));
  const vuint32m4_t hi =
      RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1));
  return Combine(d, hi, lo);
}

template <class VW, HWY_IF_FLOAT_V(VW)>  // vfloat*
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  return Add(sum0, sum1);  // invariant already holds
}
// ------------------------------ Lt128
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

template <class D>
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // The subsequent computations are performed using e8mf8 (8-bit elements
  // with a fractional LMUL of 1/8) for the following reasons:
  // 1. It is correct for the possible input vector types e64m<1,2,4,8>. This
  //    is because the resulting mask can occupy at most 1/8 of a full vector
  //    when using e64m8.
  // 2. It can be more efficient than using a full vector or a vector group.
  //
  // The algorithm computes the result as follows:
  // 1. Compute cH | (=H & cL) in the high bits, where cH and cL represent the
  //    comparison results for the high and low 64-bit elements, respectively.
  // 2. Shift the result right by 1 to duplicate the comparison results for
  //    the low bits.
  // 3. Obtain the final result by performing a bitwise OR on the high and low
  //    bits.
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t ltHL0 =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
  const vuint8mf8_t eqHL0 =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
  const vuint8mf8_t ltLx0 = Add(ltHL0, ltHL0);
  const vuint8mf8_t resultHx = detail::AndS(OrAnd(ltHL0, ltLx0, eqHL0), 0xaa);
  const vuint8mf8_t resultxL = ShiftRight<1>(resultHx);
  const vuint8mf8_t result = Or(resultHx, resultxL);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(d, detail::ChangeLMUL(du8m1, result));
}

#else

template <class D>
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  // Truth table of Eq and Compare for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL | out = cH | (=H & cL)
  //  0  0  0  0 |  0
  //  0  0  0  1 |  0
  //  0  0  1  0 |  1
  //  0  0  1  1 |  1
  //  0  1  0  0 |  0
  //  0  1  0  1 |  0
  //  0  1  1  0 |  1
  //  1  0  0  0 |  0
  //  1  0  0  1 |  1
  //  1  1  0  0 |  0
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  // Shift leftward so L can influence H.
  const VFromD<D> ltLx = detail::Slide1Up(ltHL);
  const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
  // Replicate H to its neighbor.
  return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
}

#endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
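// Example: viewing each pair of u64 lanes (lo, hi) as one 128-bit number,
// Lt128 sets both lanes of a pair iff hiA < hiB, or hiA == hiB and loA < loB.
// With a = {1, 0} and b = {0, 1} (lo lane first), the high lanes decide and
// both result lanes of the pair are true.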
// ------------------------------ Lt128Upper
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

template <class D>
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t ltHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
  const vuint8mf8_t ltHx = detail::AndS(ltHL, 0xaa);
  const vuint8mf8_t ltxL = ShiftRight<1>(ltHx);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(d,
                                     detail::ChangeLMUL(du8m1, Or(ltHx, ltxL)));
}

#else

template <class D>
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  const VFromD<D> down = detail::Slide1Down(ltHL);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(down)) : "memory");
  // Replicate H to its neighbor.
  return MaskFromVec(OddEven(ltHL, down));
}

#endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

// ------------------------------ Eq128
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

template <class D>
HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t eqHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
  const vuint8mf8_t eqxH = ShiftRight<1>(eqHL);
  const vuint8mf8_t result0L = detail::AndS(And(eqHL, eqxH), 0x55);
  const vuint8mf8_t resultH0 = Add(result0L, result0L);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
}

#else

template <class D>
HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  const VFromD<D> eqLH = Reverse2(d, eqHL);
  const VFromD<D> eq = And(eqHL, eqLH);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(eq)) : "memory");
  return MaskFromVec(eq);
}

#endif
// ------------------------------ Eq128Upper
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

template <class D>
HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t eqHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
  const vuint8mf8_t eqHx = detail::AndS(eqHL, 0xaa);
  const vuint8mf8_t eqxL = ShiftRight<1>(eqHx);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(d,
                                     detail::ChangeLMUL(du8m1, Or(eqHx, eqxL)));
}

#else

template <class D>
HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  // Replicate H to its neighbor.
  return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
}

#endif

// ------------------------------ Ne128
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

template <class D>
HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t neHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
  const vuint8mf8_t nexH = ShiftRight<1>(neHL);
  const vuint8mf8_t result0L = detail::AndS(Or(neHL, nexH), 0x55);
  const vuint8mf8_t resultH0 = Add(result0L, result0L);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(
      d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
}

#else

template <class D>
HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  const VFromD<D> neLH = Reverse2(d, neHL);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(neLH)) : "memory");
  return MaskFromVec(Or(neHL, neLH));
}

#endif

// ------------------------------ Ne128Upper
#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

template <class D>
HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  auto du8mf8 = ScalableTag<uint8_t, -3>{};
  const vuint8mf8_t neHL =
      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
  const vuint8mf8_t neHx = detail::AndS(neHL, 0xaa);
  const vuint8mf8_t nexL = ShiftRight<1>(neHx);
  auto du8m1 = ScalableTag<uint8_t>{};
  return detail::U8MaskBitsVecToMask(d,
                                     detail::ChangeLMUL(du8m1, Or(neHx, nexL)));
}

#else

template <class D>
HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  const VFromD<D> down = detail::Slide1Down(neHL);
  // b(267743505): Clang compiler bug, workaround is DoNotOptimize
  asm volatile("" : : "r,m"(GetLane(down)) : "memory");
  // Replicate H to its neighbor.
  return MaskFromVec(OddEven(neHL, down));
}

#endif
// ------------------------------ Min128, Max128 (Lt128)

template <class D>
HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
  const VFromD<D> aXH = detail::Slide1Down(a);
  const VFromD<D> bXH = detail::Slide1Down(b);
  const VFromD<D> minHL = Min(a, b);
  const MFromD<D> ltXH = Lt(aXH, bXH);
  const MFromD<D> eqXH = Eq(aXH, bXH);
  // If the upper lane is the decider, take lo from the same reg.
  const VFromD<D> lo = IfThenElse(ltXH, a, b);
  // The upper lane is just minHL; if they are equal, we also need to use the
  // actual min of the lower lanes.
  return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
}

template <class D>
HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
  const VFromD<D> aXH = detail::Slide1Down(a);
  const VFromD<D> bXH = detail::Slide1Down(b);
  const VFromD<D> maxHL = Max(a, b);
  const MFromD<D> ltXH = Lt(aXH, bXH);
  const MFromD<D> eqXH = Eq(aXH, bXH);
  // If the upper lane is the decider, take lo from the same reg.
  const VFromD<D> lo = IfThenElse(ltXH, b, a);
  // The upper lane is just maxHL; if they are equal, we also need to use the
  // actual max of the lower lanes.
  return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
}

template <class D>
HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}
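// Note: Min128/Max128 return the smaller/larger whole 128-bit key per lane
// pair. The final OddEven writes the unambiguous upper lane from minHL/maxHL
// and takes the lower lane from whichever operand won the upper-lane
// comparison, falling back to the lane-wise min/max when the upper lanes tie.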
// ================================================== END MACROS
#undef HWY_RVV_AVL
#undef HWY_RVV_D
#undef HWY_RVV_FOREACH
#undef HWY_RVV_FOREACH_08_ALL
#undef HWY_RVV_FOREACH_08_ALL_VIRT
#undef HWY_RVV_FOREACH_08_DEMOTE
#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
#undef HWY_RVV_FOREACH_08_EXT
#undef HWY_RVV_FOREACH_08_EXT_VIRT
#undef HWY_RVV_FOREACH_08_TRUNC
#undef HWY_RVV_FOREACH_08_VIRT
#undef HWY_RVV_FOREACH_16_ALL
#undef HWY_RVV_FOREACH_16_ALL_VIRT
#undef HWY_RVV_FOREACH_16_DEMOTE
#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
#undef HWY_RVV_FOREACH_16_EXT
#undef HWY_RVV_FOREACH_16_EXT_VIRT
#undef HWY_RVV_FOREACH_16_TRUNC
#undef HWY_RVV_FOREACH_16_VIRT
#undef HWY_RVV_FOREACH_32_ALL
#undef HWY_RVV_FOREACH_32_ALL_VIRT
#undef HWY_RVV_FOREACH_32_DEMOTE
#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
#undef HWY_RVV_FOREACH_32_EXT
#undef HWY_RVV_FOREACH_32_EXT_VIRT
#undef HWY_RVV_FOREACH_32_TRUNC
#undef HWY_RVV_FOREACH_32_VIRT
#undef HWY_RVV_FOREACH_64_ALL
#undef HWY_RVV_FOREACH_64_ALL_VIRT
#undef HWY_RVV_FOREACH_64_DEMOTE
#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
#undef HWY_RVV_FOREACH_64_EXT
#undef HWY_RVV_FOREACH_64_EXT_VIRT
#undef HWY_RVV_FOREACH_64_TRUNC
#undef HWY_RVV_FOREACH_64_VIRT
#undef HWY_RVV_FOREACH_B
#undef HWY_RVV_FOREACH_F
#undef HWY_RVV_FOREACH_F16
#undef HWY_RVV_FOREACH_F32
#undef HWY_RVV_FOREACH_F3264
#undef HWY_RVV_FOREACH_F64
#undef HWY_RVV_FOREACH_I
#undef HWY_RVV_FOREACH_I08
#undef HWY_RVV_FOREACH_I16
#undef HWY_RVV_FOREACH_I163264
#undef HWY_RVV_FOREACH_I32
#undef HWY_RVV_FOREACH_I64
#undef HWY_RVV_FOREACH_U
#undef HWY_RVV_FOREACH_U08
#undef HWY_RVV_FOREACH_U16
#undef HWY_RVV_FOREACH_U163264
#undef HWY_RVV_FOREACH_U32
#undef HWY_RVV_FOREACH_U64
#undef HWY_RVV_FOREACH_UI
#undef HWY_RVV_FOREACH_UI08
#undef HWY_RVV_FOREACH_UI16
#undef HWY_RVV_FOREACH_UI163264
#undef HWY_RVV_FOREACH_UI32
#undef HWY_RVV_FOREACH_UI3264
#undef HWY_RVV_FOREACH_UI64
#undef HWY_RVV_IF_EMULATED_D
#undef HWY_RVV_IF_CAN128_D
#undef HWY_RVV_IF_GE128_D
#undef HWY_RVV_IF_LT128_D
#undef HWY_RVV_INSERT_VXRM
#undef HWY_RVV_M
#undef HWY_RVV_RETM_ARGM
#undef HWY_RVV_RETV_ARGMVV
#undef HWY_RVV_RETV_ARGV
#undef HWY_RVV_RETV_ARGVS
#undef HWY_RVV_RETV_ARGVV
#undef HWY_RVV_T
#undef HWY_RVV_V
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();