// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 256-bit LASX vectors and operations.
// External include guard in highway.h - see comment there.

#include "hwy/ops/loongarch_lsx-inl.h"
#include "hwy/ops/shared-inl.h"

#ifndef __loongarch_asx
// If LASX is to be runtime dispatched (instead of in baseline), we need
// to enable it *and* define __loongarch_asx or the intrinsic header will
// fail to compile.
//
// For consistency, the same pattern as the lsxintrin.h handling in
// loongarch_lsx-inl.h is used (instead of moving lasxintrin.h after
// HWY_BEFORE_NAMESPACE).
HWY_PUSH_ATTRIBUTES("lsx,lasx")
#define __loongarch_asx
#include <lasxintrin.h>
#undef __loongarch_asx
// Prevent "unused push_attribute" warning from Clang.
HWY_MAYBE_UNUSED static void HWY_CONCAT(hwy_lasx_dummy, __COUNTER__)() {}
HWY_POP_ATTRIBUTES
#else
#include <lasxintrin.h>
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

template <typename T>
struct Raw256 {
  using type = __m256i;
};
template <>
struct Raw256<float> {
  using type = __m256;
};
template <>
struct Raw256<double> {
  using type = __m256d;
};

}  // namespace detail

template <typename T>
class Vec256 {
  using Raw = typename detail::Raw256<T>::type;

 public:
  using PrivateT = T;                                  // only for DFromV
  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
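  // Example (illustrative): given Vec256<float> v and w obtained from Set or
  // Load, `v *= w;` is shorthand for `v = v * w` via the operator* overload
  // defined later in this header.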
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator%=(const Vec256 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

namespace detail {

template <typename T>
using RawMask256 = typename Raw256<T>::type;

}  // namespace detail

template <typename T>
struct Mask256 {
  using Raw = typename detail::RawMask256<T>;

  using PrivateT = T;                                  // only for DFromM
  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromM

  Raw raw;
};

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;

// ------------------------------ Zero

// Cannot use VFromD here because it is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
HWY_API Vec256<TFromD<D>> Zero(D /* tag */) {
  return Vec256<TFromD<D>>{__lasx_xvreplgr2vr_d(0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
HWY_API Vec256<bfloat16_t> Zero(D /* tag */) {
  return Vec256<bfloat16_t>{__lasx_xvreplgr2vr_d(0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F16_D(D)>
HWY_API Vec256<float16_t> Zero(D /* tag */) {
  return Vec256<float16_t>{__lasx_xvreplgr2vr_d(0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API Vec256<float> Zero(D /* tag */) {
  return Vec256<float>{reinterpret_cast<__m256>(__lasx_xvreplgr2vr_d(0))};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API Vec256<double> Zero(D /* tag */) {
  return Vec256<double>{reinterpret_cast<__m256d>(__lasx_xvreplgr2vr_d(0))};
}

// ------------------------------ BitCast

namespace detail {

HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
HWY_INLINE __m256i BitCastToInteger(__m256 v) {
  return reinterpret_cast<__m256i>(v);
}
HWY_INLINE __m256i BitCastToInteger(__m256d v) {
  return reinterpret_cast<__m256i>(v);
}

template <typename T>
HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
  return Vec256<uint8_t>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
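// Instead, dispatch on the destination lane type via a struct template whose
// operator() returns the raw vector type matching T.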
template <typename T>
struct BitCastFromInteger256 {
  HWY_INLINE __m256i operator()(__m256i v) { return v; }
};
template <>
struct BitCastFromInteger256<float> {
  HWY_INLINE __m256 operator()(__m256i v) {
    return reinterpret_cast<__m256>(v);
  }
};
template <>
struct BitCastFromInteger256<double> {
  HWY_INLINE __m256d operator()(__m256i v) {
    return reinterpret_cast<__m256d>(v);
  }
};

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, Vec256<uint8_t> v) {
  return VFromD<D>{BitCastFromInteger256<TFromD<D>>()(v.raw)};
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 32), typename FromT>
HWY_API VFromD<D> BitCast(D d, Vec256<FromT> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Set

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lasx_xvreplgr2vr_b(static_cast<char>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lasx_xvreplgr2vr_h(static_cast<short>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lasx_xvreplgr2vr_w(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lasx_xvreplgr2vr_d(static_cast<long long>(t))};  // NOLINT
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API Vec256<float> Set(D /* tag */, float t) {
  return BitCast(D(), Vec256<int32_t>{__lasx_xvldrepl_w(&t, 0)});
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API Vec256<double> Set(D /* tag */, double t) {
  return BitCast(D(), Vec256<int64_t>{__lasx_xvldrepl_d(&t, 0)});
}

// ------------------------------ ResizeBitCast

// 32-byte vector to 32-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
          HWY_IF_V_SIZE_D(D, HWY_MAX_LANES_V(FromV) * sizeof(TFromV<FromV>))>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, v);
}

// 32-byte vector to 16-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const DFromV<decltype(v)> d_from;
  const Half<decltype(d_from)> dh_from;
  return BitCast(d, LowerHalf(dh_from, v));
}

// 32-byte vector to <= 8-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_GT_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
  return VFromD<D>{ResizeBitCast(Full128<TFromD<D>>(), v).raw};
}

// <= 16-byte vector to 32-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  typedef uint64_t GccRawU64M128Vec __attribute__((__vector_size__(16)));

  const GccRawU64M128Vec raw_v0 = reinterpret_cast<GccRawU64M128Vec>(v.raw);
#if HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
  const GccRawU64M128Vec raw_v1 = __builtin_nondeterministic_value(raw_v0);
#else
  const GccRawU64M128Vec raw_v1 = raw_v0;
#endif

  const Repartition<uint64_t, decltype(d)> du64;
  const Half<decltype(du64)> dh_u64;
  return BitCast(
      d,
      Combine(du64, VFromD<decltype(dh_u64)>{reinterpret_cast<__m128i>(raw_v1)},
              VFromD<decltype(dh_u64)>{reinterpret_cast<__m128i>(raw_v0)}));
}

// ------------------------------ Dup128VecFromValues

template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(32)));
  GccI8RawVectType raw_i8_vec = {
      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
      static_cast<char>(t15), static_cast<char>(t0), static_cast<char>(t1),
      static_cast<char>(t2), static_cast<char>(t3), static_cast<char>(t4),
      static_cast<char>(t5), static_cast<char>(t6), static_cast<char>(t7),
      static_cast<char>(t8), static_cast<char>(t9), static_cast<char>(t10),
      static_cast<char>(t11), static_cast<char>(t12), static_cast<char>(t13),
      static_cast<char>(t14), static_cast<char>(t15)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i8_vec)};
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(32)));
  GccI16RawVectType raw_i16_vec = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7),
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i16_vec)};
}

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(32)));
  GccI32RawVectType raw_i32_vec = {
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3),
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i32_vec)};
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
  GccF32RawVectType raw_f32_vec = {t0, t1, t2, t3, t0, t1, t2, t3};
  return Vec256<float>{reinterpret_cast<__m256>(raw_f32_vec)};
}

template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(32)));
  const GccI64RawVectType raw_i64_vec = {
      static_cast<int64_t>(t0), static_cast<int64_t>(t1),
      static_cast<int64_t>(t0), static_cast<int64_t>(t1)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i64_vec)};
}

template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
  const GccF64RawVectType raw_f64_vec = {t0, t1, t0, t1};
  return VFromD<D>{reinterpret_cast<__m256d>(raw_f64_vec)};
}

// ------------------------------ And

template <typename T>
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
  const DFromV<decltype(a)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvand_v(BitCast(du, a).raw,
                                                        BitCast(du, b).raw)});
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T>
HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  const DFromV<decltype(mask)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvandn_v(
                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
}

// ------------------------------ Or

template <typename T>
HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
  const DFromV<decltype(a)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lasx_xvor_v(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ Xor

template <typename T>
HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  const DFromV<decltype(a)> d;  // for float16_t
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvxor_v(BitCast(du, a).raw,
                                                        BitCast(du, b).raw)});
}

// ------------------------------ Not
template <typename T>
HWY_API Vec256<T> Not(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvnor_v(BitCast(du, v).raw,
                                                        BitCast(du, v).raw)});
}

// ------------------------------ Xor3
template <typename T>
HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>
HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
  return And(a, b);
}

template <typename T>
HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
  return Or(a, b);
}

template <typename T>
HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

namespace detail {

template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
  return Vec256<T>{__lasx_xvpcnt_b(v.raw)};
}
template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
  return Vec256<T>{__lasx_xvpcnt_h(v.raw)};
}
template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
  return Vec256<T>{__lasx_xvpcnt_w(v.raw)};
}
template <typename T>
HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
  return Vec256<T>{__lasx_xvpcnt_d(v.raw)};
}

}  // namespace detail

template <typename T>
HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T>
HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
  return Mask256<T>{v.raw};
}

template <typename T>
HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
  return Vec256<T>{v.raw};
}

// ------------------------------ IfThenElse

// mask ? yes : no
template <typename T>
HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
  const DFromV<decltype(yes)> d;
  RebindToSigned<decltype(d)> di;
  return BitCast(d, VFromD<decltype(di)>{__lasx_xvbitsel_v(
                        BitCast(di, no).raw, BitCast(di, yes).raw,
                        RebindMask(di, mask).raw)});
}

// mask ? yes : 0
template <typename T>
HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
  return yes & VecFromMask(mask);
}

// mask ? 0 : no
template <typename T>
HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
  return AndNot(VecFromMask(mask), no);
}

template <typename T>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
  static_assert(IsSigned<T>(), "Only for float");
  const DFromV<decltype(v)> d;
  const auto zero = Zero(d);
  return IfThenElse(v < zero, zero, v);
}

// ------------------------------ Mask logical

template <typename T>
HWY_API Mask256<T> Not(const Mask256<T> m) {
  const Full256<T> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

template <class DTo, HWY_IF_V_SIZE_D(DTo, 32), typename TFrom>
HWY_API MFromD<DTo> RebindMask(DTo d_to, Mask256<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  const Full256<TFrom> dfrom;
  return MaskFromVec(BitCast(d_to, VecFromMask(dfrom, m)));
}

template <typename T>
HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
  return Mask256<T>{__lasx_xvseq_b(a.raw, b.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
  return Mask256<T>{__lasx_xvseq_h(a.raw, b.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
  return Mask256<T>{__lasx_xvseq_w(a.raw, b.raw)};
}

template <typename T, HWY_IF_UI64(T)>
HWY_API Mask256<T> operator==(Vec256<T> a, Vec256<T> b) {
  return Mask256<T>{__lasx_xvseq_d(a.raw, b.raw)};
}

HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_ceq_s(a.raw, b.raw)});
}

HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_ceq_d(a.raw, b.raw)});
}

// ------------------------------ Inequality

template <typename T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API Mask256<T> operator!=(Vec256<T> a, Vec256<T> b) {
  return Not(a == b);
}
HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cne_s(a.raw, b.raw)});
}
HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cne_d(a.raw, b.raw)});
}

// ------------------------------ Strict inequality

namespace detail {

HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
                           Vec256<int8_t> b) {
  return Mask256<int8_t>{__lasx_xvslt_b(b.raw, a.raw)};
}
HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
                            Vec256<int16_t> b) {
  return Mask256<int16_t>{__lasx_xvslt_h(b.raw, a.raw)};
}
HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
                            Vec256<int32_t> b) {
  return Mask256<int32_t>{__lasx_xvslt_w(b.raw, a.raw)};
}
HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
                            Vec256<int64_t> b) {
  return Mask256<int64_t>{__lasx_xvslt_d(b.raw, a.raw)};
}

HWY_API Mask256<uint8_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint8_t> a,
                            Vec256<uint8_t> b) {
  return Mask256<uint8_t>{__lasx_xvslt_bu(b.raw, a.raw)};
}
HWY_API Mask256<uint16_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> a,
                             Vec256<uint16_t> b) {
  return Mask256<uint16_t>{__lasx_xvslt_hu(b.raw, a.raw)};
}
HWY_API Mask256<uint32_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> a,
                             Vec256<uint32_t> b) {
  return Mask256<uint32_t>{__lasx_xvslt_wu(b.raw, a.raw)};
}
HWY_API Mask256<uint64_t> Gt(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> a,
                             Vec256<uint64_t> b) {
  return Mask256<uint64_t>{__lasx_xvslt_du(b.raw, a.raw)};
}

HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
                          Vec256<float> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_clt_s(b.raw, a.raw)});
}
HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
                           Vec256<double> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_clt_d(b.raw, a.raw)});
}

}  // namespace detail

template <typename T>
HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
  return detail::Gt(hwy::TypeTag<T>(), a, b);
}

// ------------------------------ Weak inequality

namespace detail {

template <typename T>
HWY_INLINE Mask256<T> Ge(hwy::SignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
  return Not(b > a);
}

template <typename T>
HWY_INLINE Mask256<T> Ge(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
  return Not(b > a);
}

HWY_INLINE Mask256<float> Ge(hwy::FloatTag /*tag*/, Vec256<float> a,
                             Vec256<float> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cle_s(b.raw, a.raw)});
}
HWY_INLINE Mask256<double> Ge(hwy::FloatTag /*tag*/, Vec256<double> a,
                              Vec256<double> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cle_d(b.raw, a.raw)});
}

}  // namespace detail

template <typename T>
HWY_API Mask256<T> operator>=(Vec256<T> a, Vec256<T> b) {
  return detail::Ge(hwy::TypeTag<T>(), a, b);
}

// ------------------------------ Reversed comparisons

template <typename T>
HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
  return b > a;
}

template <typename T>
HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
  return b >= a;
}

// ------------------------------ Min (Gt, IfThenElse)

// Unsigned
HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvmin_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvmin_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvmin_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvmin_du(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvmin_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvmin_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvmin_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvmin_d(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmin_s(a.raw, b.raw)};
}
HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmin_d(a.raw, b.raw)};
}

// ------------------------------ Max (Gt, IfThenElse)

// Unsigned
HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvmax_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
                             const Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvmax_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
                             const Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvmax_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
                             const Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvmax_du(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvmax_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvmax_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvmax_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvmax_d(a.raw, b.raw)};
}

// Float
HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmax_s(a.raw, b.raw)};
}
HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmax_d(a.raw, b.raw)};
}

// ------------------------------ MinMagnitude and MaxMagnitude

HWY_API Vec256<float> MinMagnitude(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmina_s(a.raw, b.raw)};
}
HWY_API Vec256<double> MinMagnitude(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmina_d(a.raw, b.raw)};
}

HWY_API Vec256<float> MaxMagnitude(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmaxa_s(a.raw, b.raw)};
}
HWY_API Vec256<double> MaxMagnitude(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmaxa_d(a.raw, b.raw)};
}

// ------------------------------ Iota

namespace detail {

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(32)));
  const GccI8RawVectType raw_i8_vec = {
      static_cast<char>(0), static_cast<char>(1), static_cast<char>(2),
      static_cast<char>(3), static_cast<char>(4), static_cast<char>(5),
      static_cast<char>(6), static_cast<char>(7), static_cast<char>(8),
      static_cast<char>(9), static_cast<char>(10), static_cast<char>(11),
      static_cast<char>(12), static_cast<char>(13), static_cast<char>(14),
      static_cast<char>(15), static_cast<char>(16), static_cast<char>(17),
      static_cast<char>(18), static_cast<char>(19), static_cast<char>(20),
      static_cast<char>(21), static_cast<char>(22), static_cast<char>(23),
      static_cast<char>(24), static_cast<char>(25), static_cast<char>(26),
      static_cast<char>(27), static_cast<char>(28), static_cast<char>(29),
      static_cast<char>(30), static_cast<char>(31)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i8_vec)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI16_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(32)));
  const GccI16RawVectType raw_i16_vec = {
      static_cast<int16_t>(0), static_cast<int16_t>(1),
      static_cast<int16_t>(2), static_cast<int16_t>(3),
      static_cast<int16_t>(4), static_cast<int16_t>(5),
      static_cast<int16_t>(6), static_cast<int16_t>(7),
      static_cast<int16_t>(8), static_cast<int16_t>(9),
      static_cast<int16_t>(10), static_cast<int16_t>(11),
      static_cast<int16_t>(12), static_cast<int16_t>(13),
      static_cast<int16_t>(14), static_cast<int16_t>(15)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i16_vec)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(32)));
  const GccI32RawVectType raw_i32_vec = {
      static_cast<int32_t>(0), static_cast<int32_t>(1), static_cast<int32_t>(2),
      static_cast<int32_t>(3), static_cast<int32_t>(4), static_cast<int32_t>(5),
      static_cast<int32_t>(6), static_cast<int32_t>(7)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i32_vec)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(32)));
  const GccI64RawVectType raw_i64_vec = {
      static_cast<int64_t>(0), static_cast<int64_t>(1),
      static_cast<int64_t>(2), static_cast<int64_t>(3)};
  return VFromD<D>{reinterpret_cast<__m256i>(raw_i64_vec)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef float GccF32RawVectType __attribute__((__vector_size__(32)));
  const GccF32RawVectType raw_f32_vec = {0.0f, 1.0f, 2.0f, 3.0f,
                                         4.0f, 5.0f, 6.0f, 7.0f};
  return VFromD<D>{reinterpret_cast<__m256>(raw_f32_vec)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  typedef double GccF64RawVectType __attribute__((__vector_size__(32)));
  const GccF64RawVectType raw_f64_vec = {0.0, 1.0, 2.0, 3.0};
  return VFromD<D>{reinterpret_cast<__m256d>(raw_f64_vec)};
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  return detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
}

// ------------------------------ FirstN (Iota, Lt)

template <class D, HWY_IF_V_SIZE_D(D, 32), class M = MFromD<D>>
HWY_API M FirstN(const D d, size_t n) {
  constexpr size_t kN = MaxLanes(d);
  n = HWY_MIN(n, kN);
  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(n)));
}

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
HWY_API Vec256<uint8_t> operator+(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvadd_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator+(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvadd_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator+(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvadd_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator+(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvadd_d(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator+(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvadd_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator+(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvadd_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator+(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvadd_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator+(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvadd_d(a.raw, b.raw)};
}

HWY_API Vec256<float> operator+(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfadd_s(a.raw, b.raw)};
}
HWY_API Vec256<double> operator+(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfadd_d(a.raw, b.raw)};
}

template <typename T>
HWY_API Vec256<T> Add(Vec256<T> a, Vec256<T> b) {
  return a + b;
}

// ------------------------------ Subtraction

// Unsigned
HWY_API Vec256<uint8_t> operator-(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvsub_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvsub_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator-(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvsub_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator-(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvsub_d(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator-(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvsub_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator-(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvsub_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator-(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvsub_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator-(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvsub_d(a.raw, b.raw)};
}

HWY_API Vec256<float> operator-(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfsub_s(a.raw, b.raw)};
}
HWY_API Vec256<double> operator-(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfsub_d(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec256<uint64_t> SumsOf8(Vec256<uint8_t> v) {
  v.raw = __lasx_xvhaddw_hu_bu(v.raw, v.raw);
  v.raw = __lasx_xvhaddw_wu_hu(v.raw, v.raw);
  return Vec256<uint64_t>{__lasx_xvhaddw_du_wu(v.raw, v.raw)};
}
HWY_API Vec256<int64_t> SumsOf8(Vec256<int8_t> v) {
  v.raw = __lasx_xvhaddw_h_b(v.raw, v.raw);
  v.raw = __lasx_xvhaddw_w_h(v.raw, v.raw);
  return Vec256<int64_t>{__lasx_xvhaddw_d_w(v.raw, v.raw)};
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
HWY_API Vec256<uint8_t> SaturatedAdd(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvsadd_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedAdd(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvsadd_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> SaturatedAdd(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvsadd_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> SaturatedAdd(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvsadd_du(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedAdd(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvsadd_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedAdd(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvsadd_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> SaturatedAdd(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvsadd_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> SaturatedAdd(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvsadd_d(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.
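// For example (illustrative, u8 lanes): SaturatedSub(Set(d, 5), Set(d, 9))
// yields 0 in every lane instead of wrapping around to 252.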

// Unsigned
HWY_API Vec256<uint8_t> SaturatedSub(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvssub_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> SaturatedSub(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvssub_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> SaturatedSub(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvssub_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> SaturatedSub(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvssub_du(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> SaturatedSub(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvssub_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> SaturatedSub(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvssub_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> SaturatedSub(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvssub_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> SaturatedSub(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvssub_d(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Signed and unsigned
HWY_API Vec256<int8_t> AverageRound(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvavgr_b(a.raw, b.raw)};
}
HWY_API Vec256<uint8_t> AverageRound(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvavgr_bu(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> AverageRound(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvavgr_h(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> AverageRound(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvavgr_hu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> AverageRound(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvavgr_w(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> AverageRound(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvavgr_wu(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> AverageRound(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvavgr_d(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> AverageRound(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvavgr_du(a.raw, b.raw)};
}

// ------------------------------ Abs (Sub)

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
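// Implementation note: Abs is computed below as the absolute difference with
// a zero vector (xvabsd), so for int8_t the exact result of Abs(-128), 128,
// wraps back to -128 in two's complement, as described above.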
HWY_API Vec256<int8_t> Abs(Vec256<int8_t> v) {
  return Vec256<int8_t>{__lasx_xvabsd_b(v.raw, __lasx_xvreplgr2vr_b(0))};
}
HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
  return Vec256<int16_t>{__lasx_xvabsd_h(v.raw, __lasx_xvreplgr2vr_h(0))};
}
HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvabsd_w(v.raw, __lasx_xvreplgr2vr_w(0))};
}
HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvabsd_d(v.raw, __lasx_xvreplgr2vr_d(0))};
}

// ------------------------------ Integer AbsDiff
HWY_API Vec256<int8_t> AbsDiff(const Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvabsd_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> AbsDiff(const Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvabsd_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> AbsDiff(const Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvabsd_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> AbsDiff(const Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvabsd_d(a.raw, b.raw)};
}

HWY_API Vec256<uint8_t> AbsDiff(const Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvabsd_bu(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> AbsDiff(const Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvabsd_hu(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> AbsDiff(const Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvabsd_wu(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> AbsDiff(const Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvabsd_du(a.raw, b.raw)};
}

// ------------------------------ Integer multiplication

// Unsigned
HWY_API Vec256<uint8_t> operator*(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvmul_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvmul_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvmul_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvmul_d(a.raw, b.raw)};
}

// Signed
HWY_API Vec256<int8_t> operator*(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvmul_b(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvmul_h(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvmul_w(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvmul_d(a.raw, b.raw)};
}

HWY_API Vec256<uint8_t> MulHigh(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint8_t>{__lasx_xvmuh_bu(a.raw, b.raw)};
}
HWY_API Vec256<int8_t> MulHigh(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int8_t>{__lasx_xvmuh_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint16_t>{__lasx_xvmuh_hu(a.raw, b.raw)};
}
HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int16_t>{__lasx_xvmuh_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> MulHigh(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint32_t>{__lasx_xvmuh_wu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> MulHigh(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int32_t>{__lasx_xvmuh_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> MulHigh(Vec256<uint64_t> a, Vec256<uint64_t> b) {
  return Vec256<uint64_t>{__lasx_xvmuh_du(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> MulHigh(Vec256<int64_t> a, Vec256<int64_t> b) {
  return Vec256<int64_t>{__lasx_xvmuh_d(a.raw, b.raw)};
}

// Multiplies even lanes (0, 2 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
HWY_API Vec256<int16_t> MulEven(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int16_t>{__lasx_xvmulwev_h_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> MulEven(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint16_t>{__lasx_xvmulwev_h_bu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> MulEven(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int32_t>{__lasx_xvmulwev_w_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> MulEven(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint32_t>{__lasx_xvmulwev_w_hu(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int64_t>{__lasx_xvmulwev_d_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint64_t>{__lasx_xvmulwev_d_wu(a.raw, b.raw)};
}
template <typename T, HWY_IF_I64(T)>
HWY_API Vec256<T> MulEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwev_q_d(a.raw, b.raw)};
}
template <typename T, HWY_IF_U64(T)>
HWY_API Vec256<T> MulEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwev_q_du(a.raw, b.raw)};
}

HWY_API Vec256<int16_t> MulOdd(Vec256<int8_t> a, Vec256<int8_t> b) {
  return Vec256<int16_t>{__lasx_xvmulwod_h_b(a.raw, b.raw)};
}
HWY_API Vec256<uint16_t> MulOdd(Vec256<uint8_t> a, Vec256<uint8_t> b) {
  return Vec256<uint16_t>{__lasx_xvmulwod_h_bu(a.raw, b.raw)};
}
HWY_API Vec256<int32_t> MulOdd(Vec256<int16_t> a, Vec256<int16_t> b) {
  return Vec256<int32_t>{__lasx_xvmulwod_w_h(a.raw, b.raw)};
}
HWY_API Vec256<uint32_t> MulOdd(Vec256<uint16_t> a, Vec256<uint16_t> b) {
  return Vec256<uint32_t>{__lasx_xvmulwod_w_hu(a.raw, b.raw)};
}
HWY_API Vec256<int64_t> MulOdd(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int64_t>{__lasx_xvmulwod_d_w(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> MulOdd(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint64_t>{__lasx_xvmulwod_d_wu(a.raw, b.raw)};
}
template <typename T, HWY_IF_I64(T)>
HWY_API Vec256<T> MulOdd(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwod_q_d(a.raw, b.raw)};
}
template <typename T, HWY_IF_U64(T)>
HWY_API Vec256<T> MulOdd(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvmulwod_q_du(a.raw, b.raw)};
}

template <typename T, HWY_IF_I16(T)>
HWY_API Vec256<T> MulFixedPoint15(Vec256<T> a, Vec256<T> b) {
  const auto i32_ev = MulEven(a, b);
  const auto i32_od = MulOdd(a, b);
  const auto i64_lo = InterleaveLower(i32_ev, i32_od);
  const auto i64_hi = InterleaveUpper(Full256<int32_t>(), i32_ev, i32_od);
  return Vec256<T>{__lasx_xvssrarni_h_w(i64_hi.raw, i64_lo.raw, 15)};
}

// ------------------------------ Integer division

HWY_API Vec256<int8_t> operator/(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int8_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.b %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int8_t>{raw_result};
}

HWY_API Vec256<uint8_t> operator/(const Vec256<uint8_t> a,
                                  const Vec256<uint8_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.bu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint8_t>{raw_result};
}

HWY_API Vec256<int16_t> operator/(const Vec256<int16_t> a,
                                  const Vec256<int16_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int16_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.h %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int16_t>{raw_result};
}

HWY_API Vec256<uint16_t> operator/(const Vec256<uint16_t> a,
                                   const Vec256<uint16_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.hu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint16_t>{raw_result};
}

HWY_API Vec256<int32_t> operator/(const Vec256<int32_t> a,
                                  const Vec256<int32_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int32_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.w %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int32_t>{raw_result};
}

HWY_API Vec256<uint32_t> operator/(const Vec256<uint32_t> a,
                                   const Vec256<uint32_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.wu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint32_t>{raw_result};
}

HWY_API Vec256<int64_t> operator/(const Vec256<int64_t> a,
                                  const Vec256<int64_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int64_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvdiv.d %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<int64_t>{raw_result};
}

HWY_API Vec256<uint64_t> operator/(const Vec256<uint64_t> a,
                                   const Vec256<uint64_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  __m256i raw_result;
  __asm__("xvdiv.du %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
  return Vec256<uint64_t>{raw_result};
}

// ------------------------------ Integer modulo

HWY_API Vec256<int8_t> operator%(const Vec256<int8_t> a,
                                 const Vec256<int8_t> b) {
  // Use inline assembly to avoid undefined behavior if any lanes of b are zero
  // or a[i] == LimitsMin<int8_t>() && b[i] == -1
  __m256i raw_result;
  __asm__("xvmod.b %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :);
"=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1331 return Vec256<int8_t>{raw_result}; 1332 } 1333 1334 HWY_API Vec256<uint8_t> operator%(const Vec256<uint8_t> a, 1335 const Vec256<uint8_t> b) { 1336 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 1337 __m256i raw_result; 1338 __asm__("xvmod.bu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1339 return Vec256<uint8_t>{raw_result}; 1340 } 1341 1342 HWY_API Vec256<int16_t> operator%(const Vec256<int16_t> a, 1343 const Vec256<int16_t> b) { 1344 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 1345 // or a[i] == LimitsMin<int16_t>() && b[i] == -1 1346 __m256i raw_result; 1347 __asm__("xvmod.h %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1348 return Vec256<int16_t>{raw_result}; 1349 } 1350 1351 HWY_API Vec256<uint16_t> operator%(const Vec256<uint16_t> a, 1352 const Vec256<uint16_t> b) { 1353 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 1354 __m256i raw_result; 1355 __asm__("xvmod.hu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1356 return Vec256<uint16_t>{raw_result}; 1357 } 1358 1359 HWY_API Vec256<int32_t> operator%(const Vec256<int32_t> a, 1360 const Vec256<int32_t> b) { 1361 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 1362 // or a[i] == LimitsMin<int32_t>() && b[i] == -1 1363 __m256i raw_result; 1364 __asm__("xvmod.w %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1365 return Vec256<int32_t>{raw_result}; 1366 } 1367 1368 HWY_API Vec256<uint32_t> operator%(const Vec256<uint32_t> a, 1369 const Vec256<uint32_t> b) { 1370 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 1371 __m256i raw_result; 1372 __asm__("xvmod.wu %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1373 return Vec256<uint32_t>{raw_result}; 1374 } 1375 1376 HWY_API Vec256<int64_t> operator%(const Vec256<int64_t> a, 1377 const Vec256<int64_t> b) { 1378 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 1379 // or a[i] == LimitsMin<int64_t>() && b[i] == -1 1380 __m256i raw_result; 1381 __asm__("xvmod.d %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1382 return Vec256<int64_t>{raw_result}; 1383 } 1384 1385 HWY_API Vec256<uint64_t> operator%(const Vec256<uint64_t> a, 1386 const Vec256<uint64_t> b) { 1387 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 1388 __m256i raw_result; 1389 __asm__("xvmod.du %u0,%u1,%u2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 1390 return Vec256<uint64_t>{raw_result}; 1391 } 1392 1393 // ------------------------------ ShiftLeft (Compile-time constant shifts) 1394 1395 template <int kBits, typename T, HWY_IF_UI8(T)> 1396 HWY_API Vec256<T> ShiftLeft(Vec256<T> v) { 1397 return Vec256<T>{__lasx_xvslli_b(v.raw, kBits)}; 1398 } 1399 1400 template <int kBits, typename T, HWY_IF_UI16(T)> 1401 HWY_API Vec256<T> ShiftLeft(Vec256<T> v) { 1402 return Vec256<T>{__lasx_xvslli_h(v.raw, kBits)}; 1403 } 1404 1405 template <int kBits, typename T, HWY_IF_UI32(T)> 1406 HWY_API Vec256<T> ShiftLeft(Vec256<T> v) { 1407 return Vec256<T>{__lasx_xvslli_w(v.raw, kBits)}; 1408 } 1409 1410 template <int kBits, typename T, HWY_IF_UI64(T)> 1411 HWY_API Vec256<T> ShiftLeft(Vec256<T> v) { 1412 return Vec256<T>{__lasx_xvslli_d(v.raw, kBits)}; 1413 } 1414 1415 // ------------------------------ ShiftRight (Compile-time constant shifts) 1416 1417 template <int kBits> 1418 HWY_API Vec256<uint8_t> 
HWY_API Vec256<uint8_t> ShiftRight(Vec256<uint8_t> v) {
  return Vec256<uint8_t>{__lasx_xvsrli_b(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<uint16_t> ShiftRight(Vec256<uint16_t> v) {
  return Vec256<uint16_t>{__lasx_xvsrli_h(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<uint32_t> ShiftRight(Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvsrli_w(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<uint64_t> ShiftRight(Vec256<uint64_t> v) {
  return Vec256<uint64_t>{__lasx_xvsrli_d(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int8_t> ShiftRight(Vec256<int8_t> v) {
  return Vec256<int8_t>{__lasx_xvsrai_b(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int16_t> ShiftRight(Vec256<int16_t> v) {
  return Vec256<int16_t>{__lasx_xvsrai_h(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int32_t> ShiftRight(Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvsrai_w(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<int64_t> ShiftRight(Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvsrai_d(v.raw, kBits)};
}

// ------------------------------ RoundingShiftRight

template <int kBits>
HWY_API Vec256<int8_t> RoundingShiftRight(Vec256<int8_t> v) {
  return Vec256<int8_t>{__lasx_xvsrari_b(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int16_t> RoundingShiftRight(Vec256<int16_t> v) {
  return Vec256<int16_t>{__lasx_xvsrari_h(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int32_t> RoundingShiftRight(Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvsrari_w(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<int64_t> RoundingShiftRight(Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvsrari_d(v.raw, kBits)};
}

template <int kBits>
HWY_API Vec256<uint8_t> RoundingShiftRight(Vec256<uint8_t> v) {
  return Vec256<uint8_t>{__lasx_xvsrlri_b(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint16_t> RoundingShiftRight(Vec256<uint16_t> v) {
  return Vec256<uint16_t>{__lasx_xvsrlri_h(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint32_t> RoundingShiftRight(Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvsrlri_w(v.raw, kBits)};
}
template <int kBits>
HWY_API Vec256<uint64_t> RoundingShiftRight(Vec256<uint64_t> v) {
  return Vec256<uint64_t>{__lasx_xvsrlri_d(v.raw, kBits)};
}

// ------------------------------ RoundingShr

HWY_API Vec256<uint8_t> RoundingShr(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
  return Vec256<uint8_t>{__lasx_xvsrlr_b(v.raw, bits.raw)};
}
HWY_API Vec256<uint16_t> RoundingShr(Vec256<uint16_t> v,
                                     Vec256<uint16_t> bits) {
  return Vec256<uint16_t>{__lasx_xvsrlr_h(v.raw, bits.raw)};
}
HWY_API Vec256<uint32_t> RoundingShr(Vec256<uint32_t> v,
                                     Vec256<uint32_t> bits) {
  return Vec256<uint32_t>{__lasx_xvsrlr_w(v.raw, bits.raw)};
}
HWY_API Vec256<uint64_t> RoundingShr(Vec256<uint64_t> v,
                                     Vec256<uint64_t> bits) {
  return Vec256<uint64_t>{__lasx_xvsrlr_d(v.raw, bits.raw)};
}

HWY_API Vec256<int8_t> RoundingShr(Vec256<int8_t> v, Vec256<int8_t> bits) {
  return Vec256<int8_t>{__lasx_xvsrar_b(v.raw, bits.raw)};
}
HWY_API Vec256<int16_t> RoundingShr(Vec256<int16_t> v, Vec256<int16_t> bits) {
  return Vec256<int16_t>{__lasx_xvsrar_h(v.raw, bits.raw)};
}
// ------------------------------ RoundingShr

HWY_API Vec256<uint8_t> RoundingShr(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
  return Vec256<uint8_t>{__lasx_xvsrlr_b(v.raw, bits.raw)};
}
HWY_API Vec256<uint16_t> RoundingShr(Vec256<uint16_t> v,
                                     Vec256<uint16_t> bits) {
  return Vec256<uint16_t>{__lasx_xvsrlr_h(v.raw, bits.raw)};
}
HWY_API Vec256<uint32_t> RoundingShr(Vec256<uint32_t> v,
                                     Vec256<uint32_t> bits) {
  return Vec256<uint32_t>{__lasx_xvsrlr_w(v.raw, bits.raw)};
}
HWY_API Vec256<uint64_t> RoundingShr(Vec256<uint64_t> v,
                                     Vec256<uint64_t> bits) {
  return Vec256<uint64_t>{__lasx_xvsrlr_d(v.raw, bits.raw)};
}

HWY_API Vec256<int8_t> RoundingShr(Vec256<int8_t> v, Vec256<int8_t> bits) {
  return Vec256<int8_t>{__lasx_xvsrar_b(v.raw, bits.raw)};
}
HWY_API Vec256<int16_t> RoundingShr(Vec256<int16_t> v, Vec256<int16_t> bits) {
  return Vec256<int16_t>{__lasx_xvsrar_h(v.raw, bits.raw)};
}
HWY_API Vec256<int32_t> RoundingShr(Vec256<int32_t> v, Vec256<int32_t> bits) {
  return Vec256<int32_t>{__lasx_xvsrar_w(v.raw, bits.raw)};
}
HWY_API Vec256<int64_t> RoundingShr(Vec256<int64_t> v, Vec256<int64_t> bits) {
  return Vec256<int64_t>{__lasx_xvsrar_d(v.raw, bits.raw)};
}

// ------------------------------ RoundingShiftRightSame (RoundingShr)

template <typename T>
HWY_API Vec256<T> RoundingShiftRightSame(const Vec256<T> v, int bits) {
  return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
}

// ------------------------------ RotateRight (Compile-time constant shifts)

template <int kBits, typename T, HWY_IF_UI8(T)>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
  if (kBits == 0) return v;
  return Vec256<T>{__lasx_xvrotri_b(v.raw, kBits)};
}

template <int kBits, typename T, HWY_IF_UI16(T)>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
  if (kBits == 0) return v;
  return Vec256<T>{__lasx_xvrotri_h(v.raw, kBits)};
}

template <int kBits, typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
  if (kBits == 0) return v;
  return Vec256<T>{__lasx_xvrotri_w(v.raw, kBits)};
}

template <int kBits, typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
  if (kBits == 0) return v;
  return Vec256<T>{__lasx_xvrotri_d(v.raw, kBits)};
}

// ------------------------------ Rol/Ror
template <class T, HWY_IF_UI8(T)>
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvrotr_b(a.raw, b.raw)};
}

template <class T, HWY_IF_UI16(T)>
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvrotr_h(a.raw, b.raw)};
}

template <class T, HWY_IF_UI32(T)>
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvrotr_w(a.raw, b.raw)};
}

template <class T, HWY_IF_UI64(T)>
HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvrotr_d(a.raw, b.raw)};
}

// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)

HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
  return Vec256<int8_t>{__lasx_xvsrai_b(v.raw, 7)};
}

HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
  return Vec256<int16_t>{__lasx_xvsrai_h(v.raw, 15)};
}

HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvsrai_w(v.raw, 31)};
}

HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvsrai_d(v.raw, 63)};
}
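// Sketch (illustrative comment): BroadcastSignBit replicates the sign bit
// into all bits of the lane, yielding 0 for non-negative lanes and -1 (all
// bits set) for negative lanes:
//   const Full256<int32_t> d;
//   const auto s = BroadcastSignBit(Set(d, -5));  // every lane = -1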
// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
template <typename T>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
  return IfThenElse(mask, yes, no);
}

// ------------------------------ IfNegativeThenNegOrUndefIfZero

HWY_API Vec256<int8_t> IfNegativeThenNegOrUndefIfZero(Vec256<int8_t> mask,
                                                      Vec256<int8_t> v) {
  return Vec256<int8_t>{__lasx_xvsigncov_b(mask.raw, v.raw)};
}

HWY_API Vec256<int16_t> IfNegativeThenNegOrUndefIfZero(Vec256<int16_t> mask,
                                                       Vec256<int16_t> v) {
  return Vec256<int16_t>{__lasx_xvsigncov_h(mask.raw, v.raw)};
}

HWY_API Vec256<int32_t> IfNegativeThenNegOrUndefIfZero(Vec256<int32_t> mask,
                                                       Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvsigncov_w(mask.raw, v.raw)};
}

HWY_API Vec256<int64_t> IfNegativeThenNegOrUndefIfZero(Vec256<int64_t> mask,
                                                       Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvsigncov_d(mask.raw, v.raw)};
}

// ------------------------------ ShiftLeftSame

template <typename T>
HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
  return Shl(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
}

// ------------------------------ ShiftRightSame (BroadcastSignBit)

HWY_API Vec256<uint8_t> ShiftRightSame(const Vec256<uint8_t> v,
                                       const int bits) {
  return Vec256<uint8_t>{__lasx_xvsrl_b(v.raw, __lasx_xvreplgr2vr_b(bits))};
}

HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
                                        const int bits) {
  return Vec256<uint16_t>{__lasx_xvsrl_h(v.raw, __lasx_xvreplgr2vr_h(bits))};
}

HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
                                        const int bits) {
  return Vec256<uint32_t>{__lasx_xvsrl_w(v.raw, __lasx_xvreplgr2vr_w(bits))};
}

HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
                                        const int bits) {
  return Vec256<uint64_t>{__lasx_xvsrl_d(v.raw, __lasx_xvreplgr2vr_d(bits))};
}

HWY_API Vec256<int8_t> ShiftRightSame(const Vec256<int8_t> v, const int bits) {
  return Vec256<int8_t>{__lasx_xvsra_b(v.raw, __lasx_xvreplgr2vr_b(bits))};
}

HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
                                       const int bits) {
  return Vec256<int16_t>{__lasx_xvsra_h(v.raw, __lasx_xvreplgr2vr_h(bits))};
}

HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
                                       const int bits) {
  return Vec256<int32_t>{__lasx_xvsra_w(v.raw, __lasx_xvreplgr2vr_w(bits))};
}

HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
                                       const int bits) {
  return Vec256<int64_t>{__lasx_xvsra_d(v.raw, __lasx_xvreplgr2vr_d(bits))};
}
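// Usage sketch (illustrative comment): the *Same variants take a runtime
// shift count that applies to all lanes; prefer ShiftRight<kBits> when the
// count is a compile-time constant:
//   const Full256<uint32_t> d;
//   const int n = 3;  // must be in [0, 31]
//   const auto v = ShiftRightSame(Set(d, 0x80u), n);  // every lane = 0x10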
// ------------------------------ Neg (Xor, Sub)

namespace detail {

template <typename T>
HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

template <typename T>
HWY_INLINE Vec256<T> Neg(hwy::SpecialTag /*tag*/, const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

// Not floating-point
template <typename T, HWY_IF_UI8(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
  return Vec256<T>{__lasx_xvneg_b(v.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
  return Vec256<T>{__lasx_xvneg_h(v.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
  return Vec256<T>{__lasx_xvneg_w(v.raw)};
}

template <typename T, HWY_IF_UI64(T)>
HWY_INLINE Vec256<T> Neg(hwy::SignedTag /*tag*/, const Vec256<T> v) {
  return Vec256<T>{__lasx_xvneg_d(v.raw)};
}

}  // namespace detail

template <typename T>
HWY_API Vec256<T> Neg(const Vec256<T> v) {
  return detail::Neg(hwy::TypeTag<T>(), v);
}

// ------------------------------ Floating-point mul / div

HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfmul_s(a.raw, b.raw)};
}
HWY_API Vec256<double> operator*(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfmul_d(a.raw, b.raw)};
}

HWY_API Vec256<float> operator/(Vec256<float> a, Vec256<float> b) {
  return Vec256<float>{__lasx_xvfdiv_s(a.raw, b.raw)};
}
HWY_API Vec256<double> operator/(Vec256<double> a, Vec256<double> b) {
  return Vec256<double>{__lasx_xvfdiv_d(a.raw, b.raw)};
}

// Approximate reciprocal

HWY_API Vec256<float> ApproximateReciprocal(Vec256<float> v) {
  return Vec256<float>{__lasx_xvfrecip_s(v.raw)};
}

HWY_API Vec256<double> ApproximateReciprocal(Vec256<double> v) {
  return Vec256<double>{__lasx_xvfrecip_d(v.raw)};
}

// Integer multiply-add variants

// signed
HWY_API Vec256<int8_t> MulAdd(Vec256<int8_t> mul, Vec256<int8_t> x,
                              Vec256<int8_t> add) {
  return Vec256<int8_t>{__lasx_xvmadd_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int16_t> MulAdd(Vec256<int16_t> mul, Vec256<int16_t> x,
                               Vec256<int16_t> add) {
  return Vec256<int16_t>{__lasx_xvmadd_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int32_t> MulAdd(Vec256<int32_t> mul, Vec256<int32_t> x,
                               Vec256<int32_t> add) {
  return Vec256<int32_t>{__lasx_xvmadd_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int64_t> MulAdd(Vec256<int64_t> mul, Vec256<int64_t> x,
                               Vec256<int64_t> add) {
  return Vec256<int64_t>{__lasx_xvmadd_d(add.raw, mul.raw, x.raw)};
}

// unsigned
HWY_API Vec256<uint8_t> MulAdd(Vec256<uint8_t> mul, Vec256<uint8_t> x,
                               Vec256<uint8_t> add) {
  return Vec256<uint8_t>{__lasx_xvmadd_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint16_t> MulAdd(Vec256<uint16_t> mul, Vec256<uint16_t> x,
                                Vec256<uint16_t> add) {
  return Vec256<uint16_t>{__lasx_xvmadd_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint32_t> MulAdd(Vec256<uint32_t> mul, Vec256<uint32_t> x,
                                Vec256<uint32_t> add) {
  return Vec256<uint32_t>{__lasx_xvmadd_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint64_t> MulAdd(Vec256<uint64_t> mul, Vec256<uint64_t> x,
                                Vec256<uint64_t> add) {
  return Vec256<uint64_t>{__lasx_xvmadd_d(add.raw, mul.raw, x.raw)};
}
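// Sketch (illustrative comment): integer MulAdd computes mul * x + add per
// lane; the LASX madd intrinsics take the accumulator as their first operand,
// hence the (add.raw, mul.raw, x.raw) order above.
//   const Full256<int32_t> d;
//   const auto r = MulAdd(Set(d, 3), Set(d, 4), Set(d, 5));  // every lane 17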
// signed
HWY_API Vec256<int8_t> NegMulAdd(Vec256<int8_t> mul, Vec256<int8_t> x,
                                 Vec256<int8_t> add) {
  return Vec256<int8_t>{__lasx_xvmsub_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int16_t> NegMulAdd(Vec256<int16_t> mul, Vec256<int16_t> x,
                                  Vec256<int16_t> add) {
  return Vec256<int16_t>{__lasx_xvmsub_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int32_t> NegMulAdd(Vec256<int32_t> mul, Vec256<int32_t> x,
                                  Vec256<int32_t> add) {
  return Vec256<int32_t>{__lasx_xvmsub_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<int64_t> NegMulAdd(Vec256<int64_t> mul, Vec256<int64_t> x,
                                  Vec256<int64_t> add) {
  return Vec256<int64_t>{__lasx_xvmsub_d(add.raw, mul.raw, x.raw)};
}

// unsigned
HWY_API Vec256<uint8_t> NegMulAdd(Vec256<uint8_t> mul, Vec256<uint8_t> x,
                                  Vec256<uint8_t> add) {
  return Vec256<uint8_t>{__lasx_xvmsub_b(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint16_t> NegMulAdd(Vec256<uint16_t> mul, Vec256<uint16_t> x,
                                   Vec256<uint16_t> add) {
  return Vec256<uint16_t>{__lasx_xvmsub_h(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint32_t> NegMulAdd(Vec256<uint32_t> mul, Vec256<uint32_t> x,
                                   Vec256<uint32_t> add) {
  return Vec256<uint32_t>{__lasx_xvmsub_w(add.raw, mul.raw, x.raw)};
}
HWY_API Vec256<uint64_t> NegMulAdd(Vec256<uint64_t> mul, Vec256<uint64_t> x,
                                   Vec256<uint64_t> add) {
  return Vec256<uint64_t>{__lasx_xvmsub_d(add.raw, mul.raw, x.raw)};
}

// ------------------------------ Floating-point multiply-add variants

HWY_API Vec256<float> MulAdd(Vec256<float> mul, Vec256<float> x,
                             Vec256<float> add) {
  return Vec256<float>{__lasx_xvfmadd_s(mul.raw, x.raw, add.raw)};
}
HWY_API Vec256<double> MulAdd(Vec256<double> mul, Vec256<double> x,
                              Vec256<double> add) {
  return Vec256<double>{__lasx_xvfmadd_d(mul.raw, x.raw, add.raw)};
}

HWY_API Vec256<float> NegMulAdd(Vec256<float> mul, Vec256<float> x,
                                Vec256<float> add) {
  return add - mul * x;
}
HWY_API Vec256<double> NegMulAdd(Vec256<double> mul, Vec256<double> x,
                                 Vec256<double> add) {
  return add - mul * x;
}

HWY_API Vec256<float> MulSub(Vec256<float> mul, Vec256<float> x,
                             Vec256<float> sub) {
  return Vec256<float>{__lasx_xvfmsub_s(mul.raw, x.raw, sub.raw)};
}
HWY_API Vec256<double> MulSub(Vec256<double> mul, Vec256<double> x,
                              Vec256<double> sub) {
  return Vec256<double>{__lasx_xvfmsub_d(mul.raw, x.raw, sub.raw)};
}

HWY_API Vec256<float> NegMulSub(Vec256<float> mul, Vec256<float> x,
                                Vec256<float> sub) {
  return Vec256<float>{__lasx_xvfnmadd_s(mul.raw, x.raw, sub.raw)};
}
HWY_API Vec256<double> NegMulSub(Vec256<double> mul, Vec256<double> x,
                                 Vec256<double> sub) {
  return Vec256<double>{__lasx_xvfnmadd_d(mul.raw, x.raw, sub.raw)};
}

// ------------------------------ MulAddSub(Float)

template <typename T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> MulAddSub(Vec256<T> mul, Vec256<T> x, Vec256<T> sub_or_add) {
  return OddEven(MulAdd(mul, x, sub_or_add), MulSub(mul, x, sub_or_add));
}
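// Sketch (illustrative comment): MulAddSub subtracts sub_or_add in
// even-indexed lanes and adds it in odd-indexed lanes (OddEven takes odd
// lanes from its first argument), the usual "addsub" pattern for complex
// arithmetic:
//   const Full256<float> d;
//   const auto r = MulAddSub(Set(d, 3.0f), Set(d, 4.0f), Set(d, 5.0f));
//   // lanes from index 0: 7, 17, 7, 17, ...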
// ------------------------------ Floating-point square root

// Full precision square root
HWY_API Vec256<float> Sqrt(Vec256<float> v) {
  return Vec256<float>{__lasx_xvfsqrt_s(v.raw)};
}

HWY_API Vec256<double> Sqrt(Vec256<double> v) {
  return Vec256<double>{__lasx_xvfsqrt_d(v.raw)};
}

// Approximate reciprocal square root
HWY_API Vec256<float> ApproximateReciprocalSqrt(Vec256<float> v) {
  return Vec256<float>{__lasx_xvfrsqrt_s(v.raw)};
}

HWY_API Vec256<double> ApproximateReciprocalSqrt(Vec256<double> v) {
  return Vec256<double>{__lasx_xvfrsqrt_d(v.raw)};
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, tie to even
HWY_API Vec256<float> Round(Vec256<float> v) {
  return Vec256<float>{__lasx_xvfrintrne_s(v.raw)};
}

HWY_API Vec256<double> Round(Vec256<double> v) {
  return Vec256<double>{__lasx_xvfrintrne_d(v.raw)};
}

// Toward zero, aka truncate
HWY_API Vec256<float> Trunc(Vec256<float> v) {
  return Vec256<float>{__lasx_xvfrintrz_s(v.raw)};
}

HWY_API Vec256<double> Trunc(Vec256<double> v) {
  return Vec256<double>{__lasx_xvfrintrz_d(v.raw)};
}

// Toward +infinity, aka ceiling
HWY_API Vec256<float> Ceil(Vec256<float> v) {
  return Vec256<float>{__lasx_xvfrintrp_s(v.raw)};
}

HWY_API Vec256<double> Ceil(Vec256<double> v) {
  return Vec256<double>{__lasx_xvfrintrp_d(v.raw)};
}

// Toward -infinity, aka floor
HWY_API Vec256<float> Floor(Vec256<float> v) {
  return Vec256<float>{__lasx_xvfrintrm_s(v.raw)};
}

HWY_API Vec256<double> Floor(Vec256<double> v) {
  return Vec256<double>{__lasx_xvfrintrm_d(v.raw)};
}
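// Worked example (illustrative comment): the four modes differ only for
// non-integral inputs, and Round breaks ties to even:
//   const Full256<float> d;
//   const auto v = Set(d, 2.5f);
//   // Round(v) -> 2 (tie to even), Trunc(v) -> 2, Ceil(v) -> 3, Floor(v) -> 2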
// ------------------------------ Floating-point classification

// FIXME: disable gcc-14 tree-based loop optimizations to prevent
// 'HighwayTestGroup/HighwayTest.TestAllIsNaN/LASX' failures
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#pragma GCC push_options
#pragma GCC optimize("-fno-tree-loop-optimize")
#endif

HWY_API Mask256<float> IsNaN(Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d,
                    MFromD<decltype(di)>{__lasx_xvfcmp_cune_s(v.raw, v.raw)});
}

HWY_API Mask256<double> IsNaN(Vec256<double> v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d,
                    MFromD<decltype(di)>{__lasx_xvfcmp_cune_d(v.raw, v.raw)});
}

#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#pragma GCC pop_options
#endif

HWY_API Mask256<float> IsEitherNaN(Vec256<float> a, Vec256<float> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cun_s(a.raw, b.raw)});
}

HWY_API Mask256<double> IsEitherNaN(Vec256<double> a, Vec256<double> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MFromD<decltype(di)>{__lasx_xvfcmp_cun_d(a.raw, b.raw)});
}

// ================================================== MEMORY

// ------------------------------ Load

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
  const RebindToSigned<D> di;
  return BitCast(D(), VFromD<decltype(di)>{__lasx_xvld(aligned, 0)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
  const RebindToSigned<D> di;
  return BitCast(D(), VFromD<decltype(di)>{__lasx_xvld(p, 0)});
}

// ------------------------------ MaskedLoad

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

// ------------------------------ LoadDup128

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  const VFromD<Half<D>> vec_tmp = Load(Half<D>(), p);
  return Combine(d, vec_tmp, vec_tmp);
}

// ------------------------------ Store

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
  __lasx_xvst(v.raw, aligned, 0);
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
  __lasx_xvst(v.raw, p, 0);
}

// ------------------------------ BlendedStore
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  const auto blended =
      IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
  StoreU(BitCast(d, blended), d, p);
}

// ================================================== SWIZZLE
// ------------------------------ LowerHalf

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
#if HWY_HAS_BUILTIN(__builtin_shufflevector)
  typedef uint32_t U32RawVectType __attribute__((__vector_size__(32)));
  return VFromD<D>{reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(
      __builtin_shufflevector(reinterpret_cast<U32RawVectType>(v.raw),
                              reinterpret_cast<U32RawVectType>(v.raw), 0, 1, 2,
                              3))};
#else
  const RebindToUnsigned<D> du;
  const Twice<decltype(du)> dut;
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_result = BitCast(dut, v).raw;
  CopyBytes<32>(&vec_result, vec_tmp);
  return BitCast(D(), VFromD<decltype(du)>{vec_tmp[0]});
#endif
}

template <typename T>
HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
  const Full128<T> dh;
  return LowerHalf(dh, v);
}

// ------------------------------ UpperHalf

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
#if HWY_HAS_BUILTIN(__builtin_shufflevector)
  (void)d;
  typedef uint32_t U32RawVectType __attribute__((__vector_size__(32)));
  return VFromD<D>{reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(
      __builtin_shufflevector(reinterpret_cast<U32RawVectType>(v.raw),
                              reinterpret_cast<U32RawVectType>(v.raw), 4, 5, 6,
                              7))};
#else
  const RebindToUnsigned<decltype(d)> du;
  const Twice<decltype(du)> dut;
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_result = BitCast(dut, v).raw;
  CopyBytes<32>(&vec_result, vec_tmp);
  return BitCast(d, VFromD<decltype(du)>{vec_tmp[1]});
#endif
}

// ------------------------------ ExtractLane (Store)
template <typename T>
HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
  const DFromV<decltype(v)> d;
  HWY_DASSERT(i < Lanes(d));

#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  constexpr size_t kLanesPerBlock = 16 / sizeof(T);
  if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) {
    return ExtractLane(LowerHalf(Half<decltype(d)>(), v), i);
  }
#endif

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);
  return lanes[i];
}

// ------------------------------ InsertLane (Store)
template <typename T>
HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
}
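// Usage sketch (illustrative comment): ExtractLane with a non-constant index
// goes through a stack buffer here, so batch reads should prefer Store:
//   const Full256<int32_t> d;
//   const auto v = Iota(d, 0);             // lanes 0,1,...,7
//   const int32_t x = ExtractLane(v, 3);   // 3
//   const auto v2 = InsertLane(v, 0, 42);  // lanes 42,1,2,...,7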
// ------------------------------ GetLane (LowerHalf)
template <typename T>
HWY_API T GetLane(const Vec256<T> v) {
  return GetLane(LowerHalf(v));
}

// ------------------------------ ExtractBlock (LowerHalf, UpperHalf)

template <int kBlockIdx, class T>
HWY_API Vec128<T> ExtractBlock(Vec256<T> v) {
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
  const Half<DFromV<decltype(v)>> dh;
  return (kBlockIdx == 0) ? LowerHalf(dh, v) : UpperHalf(dh, v);
}

// ------------------------------ ZeroExtendVector

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ZeroExtendVector(D /* tag */, VFromD<Half<D>> lo) {
#if HWY_HAS_BUILTIN(__builtin_shufflevector)
  typedef uint32_t U32RawVectType __attribute__((__vector_size__(16)));
  U32RawVectType zero = {0, 0, 0, 0};
  return VFromD<D>{reinterpret_cast<typename detail::Raw256<TFromD<D>>::type>(
      __builtin_shufflevector(reinterpret_cast<U32RawVectType>(lo.raw), zero, 0,
                              1, 2, 3, 4, 5, 6, 7))};
#else
  return Combine(D(), Zero(Half<D>()), lo);
#endif
}

// ------------------------------ ZeroExtendResizeBitCast

namespace detail {

template <class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<32> /* to_size_tag */,
    DTo d_to, DFrom d_from, VFromD<DFrom> v) {
  const Twice<decltype(d_from)> dt_from;
  const Twice<decltype(dt_from)> dq_from;
  return BitCast(d_to, ZeroExtendVector(dq_from, ZeroExtendVector(dt_from, v)));
}

}  // namespace detail

// ------------------------------ Combine

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
#if HWY_HAS_BUILTIN(__builtin_shufflevector)
  (void)d;
  typedef uint32_t U32RawVectType __attribute__((__vector_size__(16)));
  return VFromD<D>{reinterpret_cast<typename detail::Raw256<TFromD<D>>::type>(
      __builtin_shufflevector(reinterpret_cast<U32RawVectType>(lo.raw),
                              reinterpret_cast<U32RawVectType>(hi.raw), 0, 1, 2,
                              3, 4, 5, 6, 7))};
#else
  const RebindToUnsigned<decltype(d)> du;
  const Half<decltype(du)> du128;
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_result;
  vec_tmp[0] = BitCast(du128, lo).raw;
  vec_tmp[1] = BitCast(du128, hi).raw;
  CopyBytes<32>(vec_tmp, &vec_result);
  return BitCast(d, VFromD<decltype(du)>{vec_result});
#endif
}

// ------------------------------ ShiftLeftBytes
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvbsll_v(BitCast(du, v).raw, kBytes)});
}

// ------------------------------ ShiftRightBytes
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvbsrl_v(BitCast(du, v).raw, kBytes)});
}
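// Sketch (illustrative comment): as on other 256-bit targets, these byte
// shifts operate independently within each 128-bit block, hence kBytes <= 16:
//   const Full256<uint8_t> d;
//   const auto v = ShiftLeftBytes<1>(d, Iota(d, 0));
//   // byte 0 of each block is now zero; bytes only move within their block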
// ------------------------------ CombineShiftRightBytes
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
}

// ------------------------------ Broadcast

template <int kLane, class T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec256<T>{__lasx_xvreplve_b(v.raw, kLane)};
}

template <int kLane, typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvreplve_h(BitCast(du, v).raw, kLane)});
}

template <int kLane, typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec256<T>{__lasx_xvreplve_w(v.raw, kLane)};
}

template <int kLane, typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec256<T>{__lasx_xvreplve_d(v.raw, kLane)};
}

template <int kLane>
HWY_API Vec256<float> Broadcast(Vec256<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvreplve_w(BitCast(du, v).raw, kLane)});
}

template <int kLane>
HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvreplve_d(BitCast(du, v).raw, kLane)});
}
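// Worked example (illustrative comment): Broadcast replicates a lane within
// each 128-bit block separately, so kLane is a block-relative index:
//   const Full256<int32_t> d;
//   const auto v = Broadcast<1>(Iota(d, 0));  // lanes 1,1,1,1, 5,5,5,5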
// ------------------------------ BroadcastBlock

template <int kBlockIdx, class T>
HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
  const DFromV<decltype(v)> d;
  return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v)
                          : ConcatUpperUpper(d, v, v);
}

// ------------------------------ BroadcastLane

namespace detail {

template <class T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  return Vec256<T>{__lasx_xvreplve0_b(v.raw)};
}

template <class T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d,
                 VFromD<decltype(du)>{__lasx_xvreplve0_h(BitCast(du, v).raw)});
}

template <class T, HWY_IF_UI32(T)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  return Vec256<T>{__lasx_xvreplve0_w(v.raw)};
}

template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                   Vec256<T> v) {
  return Vec256<T>{__lasx_xvreplve0_d(v.raw)};
}

HWY_INLINE Vec256<float> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                       Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d,
                 VFromD<decltype(du)>{__lasx_xvreplve0_w(BitCast(du, v).raw)});
}

HWY_INLINE Vec256<double> BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */,
                                        Vec256<double> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d,
                 VFromD<decltype(du)>{__lasx_xvreplve0_d(BitCast(du, v).raw)});
}

template <size_t kLaneIdx, class T, hwy::EnableIf<kLaneIdx != 0>* = nullptr>
HWY_INLINE Vec256<T> BroadcastLane(hwy::SizeTag<kLaneIdx> /* lane_idx_tag */,
                                   Vec256<T> v) {
  constexpr size_t kLanesPerBlock = 16 / sizeof(T);
  constexpr int kBlockIdx = static_cast<int>(kLaneIdx / kLanesPerBlock);
  constexpr int kLaneInBlkIdx =
      static_cast<int>(kLaneIdx) & (kLanesPerBlock - 1);
  return Broadcast<kLaneInBlkIdx>(BroadcastBlock<kBlockIdx>(v));
}
}  // namespace detail

template <int kLaneIdx, class T>
HWY_API Vec256<T> BroadcastLane(Vec256<T> v) {
  static_assert(kLaneIdx >= 0, "Invalid lane");
  return detail::BroadcastLane(hwy::SizeTag<static_cast<size_t>(kLaneIdx)>(),
                               v);
}

// ------------------------------ Hard-coded shuffles

// Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
// right (the previous least-significant lane is now most-significant =>
// 47650321). These could also be implemented via CombineShiftRightBytes but
// the shuffle_abcd notation is more convenient.
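// Worked example (illustrative comment): in this notation, Shuffle2301 maps
// lanes 7,6,5,4,3,2,1,0 to 6,7,4,5,2,3,0,1 (adjacent pairs swapped within
// each four-lane block):
//   const Full256<uint32_t> d;
//   const auto v = Shuffle2301(Iota(d, 0));  // lanes 1,0,3,2, 5,4,7,6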
// Swap 32-bit halves in 64-bit halves.
template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) {
  return Vec256<T>{__lasx_xvshuf4i_w(v.raw, 0xb1)};
}
HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0xb1)});
}

// Used by generic_ops-inl.h
namespace detail {

template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> ShuffleTwo2301(const Vec256<T> a, const Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0xb1)});
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> ShuffleTwo1230(const Vec256<T> a, const Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0x6c)});
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> ShuffleTwo3012(const Vec256<T> a, const Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_w(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0xc6)});
}

}  // namespace detail

// Swap 64-bit halves
HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x4e)});
}
HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
  return Vec256<uint64_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
  return Vec256<int64_t>{__lasx_xvshuf4i_w(v.raw, 0x4e)};
}
HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x4e)});
}

// Rotate right 32 bits
HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x39)};
}
HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x39)};
}
HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x39)});
}
// Rotate left 32 bits
HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x93)};
}
HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x93)};
}
HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x93)});
}
// Reverse
HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
  return Vec256<uint32_t>{__lasx_xvshuf4i_w(v.raw, 0x1B)};
}
HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
  return Vec256<int32_t>{__lasx_xvshuf4i_w(v.raw, 0x1B)};
}
HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_w(BitCast(du, v).raw, 0x1b)});
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
template <typename T>
struct Indices256 {
  __m256i raw;
};

template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
HWY_API Indices256<TFromD<D>> IndicesFromVec(D /* tag */, Vec256<TI> vec) {
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const Full256<TI> di;
  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(2 * Lanes(di))))));
#endif
  return Indices256<TFromD<D>>{vec.raw};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI>
HWY_API Indices256<TFromD<D>> SetTableIndices(D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  const DFromV<decltype(v)> d;
  const auto a = ConcatLowerLower(d, v, v);
  const auto b = ConcatUpperUpper(d, v, v);
  return Vec256<T>{__lasx_xvshuf_b(b.raw, a.raw, idx.raw)};
}

template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto a = ConcatLowerLower(d, v, v);
  const auto b = ConcatUpperUpper(d, v, v);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvshuf_h(
                        idx.raw, BitCast(du, b).raw, BitCast(du, a).raw)});
}

template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Vec256<int32_t>{__lasx_xvperm_w(BitCast(di, v).raw, idx.raw)});
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
  using TI = MakeSigned<T>;
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di64;
  const Repartition<int32_t, decltype(d)> di32;
  // Replicate 64-bit index into upper 32 bits
  const Vec256<TI> dup{__lasx_xvpackev_w(idx.raw, idx.raw)};
  // For each idx64 i, idx32 are 2*i and 2*i+1.
  const Vec256<TI> idx32 = dup + dup + Set(di64, int64_t(1) << 32);
  return BitCast(
      d, TableLookupLanes(BitCast(di32, v), Indices256<int32_t>{idx32.raw}));
}
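// Usage sketch (illustrative comment): unlike the per-block shuffles above,
// TableLookupLanes indexes lanes across the entire 256-bit vector:
//   const Full256<int32_t> d;
//   alignas(32) static constexpr int32_t kIdx[8] = {7, 0, 7, 0, 3, 3, 3, 3};
//   const auto r = TableLookupLanes(Iota(d, 0), SetTableIndices(d, kIdx));
//   // lanes 7,0,7,0,3,3,3,3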
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
                                       Indices256<T> idx) {
  const auto idx2 = Indices256<T>{__lasx_xvandi_b(idx.raw, 31)};
  const Vec256<T> idx_vec{idx.raw};
  const auto sel_hi_mask = ShiftLeft<2>(idx_vec);
  const auto mask0or1 = __lasx_xvslti_b(sel_hi_mask.raw, 0);
  const auto lo_lookup_result = TableLookupLanes(a, idx);
  const auto hi_lookup_result = TableLookupLanes(b, idx2);
  return IfThenElse(Mask256<T>{mask0or1}, hi_lookup_result, lo_lookup_result);
}

template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec256<T> TwoTablesLookupLanes(Vec256<T> a, Vec256<T> b,
                                       Indices256<T> idx) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const Vec256<TFromD<decltype(di)>> idx_vec{idx.raw};
  constexpr int shift_count = 8 * sizeof(T) - 6 + CeilLog2(sizeof(T));
  const auto sel_hi_mask = BitCast(di, ShiftLeft<shift_count>(idx_vec));
  const auto lo_lookup_result = BitCast(di, TableLookupLanes(a, idx));
  const auto hi_lookup_result = BitCast(di, TableLookupLanes(b, idx));
  return BitCast(
      d, IfNegativeThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result));
}

// ------------------------------ SwapAdjacentBlocks

template <typename T>
HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec256<uint8_t>{__lasx_xvpermi_q(
                        BitCast(du, v).raw, BitCast(du, v).raw, 0x01)});
}

// ------------------------------ InterleaveEvenBlocks (ConcatLowerLower)
template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
  return ConcatLowerLower(d, b, a);
}

// ------------------------------ InterleaveOddBlocks (ConcatUpperUpper)
template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)>
HWY_API V InterleaveOddBlocks(D d, V a, V b) {
  return ConcatUpperUpper(d, b, a);
}

// ------------------------------ Reverse (RotateRight)

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvpermi_d(BitCast(du, v).raw, 0x1b)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  alignas(32) static constexpr int16_t kReverse[16] = {
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  alignas(32) static constexpr TFromD<D> kReverse[32] = {
      31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
      15, 14, 13, 12, 11, 10, 9,
      8, 7, 6, 5, 4, 3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

// ------------------------------ Reverse4 (SwapAdjacentBlocks)

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvshuf4i_h(BitCast(du, v).raw, 0x1b)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
  const RebindToUnsigned<D> du;
  return BitCast(
      D(), VFromD<decltype(du)>{__lasx_xvpermi_d(BitCast(du, v).raw, 0x1b)});
}

// ------------------------------ Reverse8

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
  return BitCast(d, TableLookupBytes(v, shuffle));
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  return Reverse(d, v);
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D> /* v */) {
  HWY_ASSERT(0);
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
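// Worked example for the description above (illustrative comment):
//   const Full256<int32_t> d;
//   const auto a = Iota(d, 0);              // lanes 0..7
//   const auto b = Iota(d, 8);              // lanes 8..15
//   const auto lo = InterleaveLower(a, b);  // lanes 0,8,1,9, 4,12,5,13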
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvilvl_b(b.raw, a.raw)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;  // for float16_t
  return BitCast(d,
                 VU{__lasx_xvilvl_h(BitCast(du, b).raw, BitCast(du, a).raw)});
}
template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvilvl_w(b.raw, a.raw)};
}
template <typename T, HWY_IF_UI64(T)>
HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvilvl_d(b.raw, a.raw)};
}

HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
  const Full256<uint32_t> du;
  const Full256<float> df;
  return BitCast(df, Vec256<uint32_t>{__lasx_xvilvl_w(BitCast(du, b).raw,
                                                      BitCast(du, a).raw)});
}
HWY_API Vec256<double> InterleaveLower(Vec256<double> a, Vec256<double> b) {
  const Full256<uint64_t> du;
  const Full256<double> df;
  return BitCast(df, Vec256<uint64_t>{__lasx_xvilvl_d(BitCast(du, b).raw,
                                                      BitCast(du, a).raw)});
}

// ------------------------------ InterleaveUpper

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvilvh_b(b.raw, a.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;  // for float16_t
  return BitCast(d,
                 VU{__lasx_xvilvh_h(BitCast(du, b).raw, BitCast(du, a).raw)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvilvh_w(b.raw, a.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvilvh_d(b.raw, a.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<D> du;
  return BitCast(D(), VFromD<decltype(du)>{__lasx_xvilvh_w(
                          BitCast(du, b).raw, BitCast(du, a).raw)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<D> du;
  return BitCast(D(), VFromD<decltype(du)>{__lasx_xvilvh_d(
                          BitCast(du, b).raw, BitCast(du, a).raw)});
}

// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x20)});
}

// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x21)});
}

// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x30)});
}

// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, hi).raw, BitCast(du, lo).raw, 0x31)});
}

// ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower)
template <int kBlockIdx, class T>
HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");

  const DFromV<decltype(v)> d;
  const auto vec_to_insert = ResizeBitCast(d, blk_to_insert);
  return (kBlockIdx == 0) ? ConcatUpperLower(d, v, vec_to_insert)
                          : ConcatLowerLower(d, vec_to_insert, v);
}

// ------------------------------ ConcatOdd

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i od = __lasx_xvpickod_b(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;
  __m256i od = __lasx_xvpickod_h(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i od = __lasx_xvpickod_w(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;
  __m256i od = __lasx_xvpickod_w(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i od = __lasx_xvpickod_d(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(od, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API Vec256<double> ConcatOdd(D d, Vec256<double> hi, Vec256<double> lo) {
  const RebindToUnsigned<decltype(d)> du;
  __m256i od = __lasx_xvpickod_d(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(od, 0xd8)});
}
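// Worked example (illustrative comment): ConcatOdd packs the odd-indexed
// lanes of lo into the lower half of the result and those of hi into the
// upper half:
//   const Full256<int32_t> d;
//   const auto r = ConcatOdd(d, Iota(d, 8), Iota(d, 0));
//   // lanes 1,3,5,7, 9,11,13,15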
// ------------------------------ ConcatEven

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i ev = __lasx_xvpickev_b(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;
  __m256i ev = __lasx_xvpickev_h(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i ev = __lasx_xvpickev_w(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  const RebindToUnsigned<decltype(d)> du;
  __m256i ev = __lasx_xvpickev_w(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  __m256i ev = __lasx_xvpickev_d(hi.raw, lo.raw);
  return VFromD<D>{__lasx_xvpermi_d(ev, 0xd8)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API Vec256<double> ConcatEven(D d, Vec256<double> hi, Vec256<double> lo) {
  const RebindToUnsigned<decltype(d)> du;
  __m256i ev = __lasx_xvpickev_d(BitCast(du, hi).raw, BitCast(du, lo).raw);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_d(ev, 0xd8)});
}

// ------------------------------ InterleaveWholeLower

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
}

// ------------------------------ InterleaveWholeUpper

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T, HWY_IF_UI8(T)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackev_b(v.raw, v.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackev_h(v.raw, v.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackev_w(v.raw, v.raw)};
}

HWY_API Vec256<float> DupEven(Vec256<float> v) {
  const Full256<uint32_t> du;
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec256<uint32_t>{__lasx_xvpackev_w(BitCast(du, v).raw,
                                                       BitCast(du, v).raw)});
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> DupEven(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, HWY_IF_UI8(T)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackod_b(v.raw, v.raw)};
}

template <typename T,
          HWY_IF_UI16(T)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackod_h(v.raw, v.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{__lasx_xvpackod_w(v.raw, v.raw)};
}

HWY_API Vec256<float> DupOdd(Vec256<float> v) {
  const Full256<uint32_t> du;
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec256<uint32_t>{__lasx_xvpackod_w(BitCast(du, v).raw,
                                                       BitCast(du, v).raw)});
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  return InterleaveUpper(d, v, v);
}

// ------------------------------ OddEven

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  __m256i c = __lasx_xvpackod_b(a.raw, a.raw);
  return Vec256<T>{__lasx_xvpackev_b(c, b.raw)};
}

template <typename T, HWY_IF_UI16(T)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  __m256i c = __lasx_xvpackod_h(a.raw, a.raw);
  return Vec256<T>{__lasx_xvpackev_h(c, b.raw)};
}

template <typename T, HWY_IF_UI32(T)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  __m256i c = __lasx_xvpackod_w(a.raw, a.raw);
  return Vec256<T>{__lasx_xvpackev_w(c, b.raw)};
}

template <typename T, HWY_IF_UI64(T)>
HWY_INLINE Vec256<T> OddEven(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{__lasx_xvextrins_d(b.raw, a.raw, 0x11)};
}

HWY_API Vec256<float> OddEven(Vec256<float> a, Vec256<float> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  __m256i c = __lasx_xvpackod_w(BitCast(du, a).raw, BitCast(du, a).raw);
  return BitCast(
      d, VFromD<decltype(du)>{__lasx_xvpackev_w(c, BitCast(du, b).raw)});
}

HWY_API Vec256<double> OddEven(Vec256<double> a, Vec256<double> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvextrins_d(
                        BitCast(du, b).raw, BitCast(du, a).raw, 0x11)});
}

// -------------------------- InterleaveEven
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackev_b(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackev_h(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpackev_w(
                        BitCast(du, b).raw, BitCast(du, a).raw)});
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}

// -------------------------- InterleaveOdd
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackod_b(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveOdd(D /*d*/,
                                VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lasx_xvpackod_h(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpackod_w(
                        BitCast(du, b).raw, BitCast(du, a).raw)});
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  return InterleaveUpper(d, a, b);
}

// ------------------------------ OddEvenBlocks

template <typename T>
Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
  const DFromV<decltype(odd)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvpermi_q(
                        BitCast(du, odd).raw, BitCast(du, even).raw, 0x30)});
}

// ------------------------------ ReverseBlocks (SwapAdjacentBlocks)

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> ReverseBlocks(D /*d*/, VFromD<D> v) {
  return SwapAdjacentBlocks(v);
}

// ------------------------------ TableLookupBytes (ZeroExtendVector)

// Both full
template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytes(Vec256<T> bytes, Vec256<TI> from) {
  const DFromV<decltype(from)> d;
  return BitCast(d, Vec256<uint8_t>{__lasx_xvshuf_b(
                        BitCast(Full256<uint8_t>(), bytes).raw,
                        BitCast(Full256<uint8_t>(), bytes).raw,
                        BitCast(Full256<uint8_t>(), from).raw)});
}

// Partial index vector
template <typename T, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes, Vec128<TI, NI> from) {
  const Full256<TI> di;
  const Half<decltype(di)> dih;
  // First expand to full 128, then 256.
  const auto from_256 = ZeroExtendVector(di, Vec128<TI>{from.raw});
  const auto tbl_full = TableLookupBytes(bytes, from_256);
  // Shrink to 128, then partial.
  return Vec128<TI, NI>{LowerHalf(dih, tbl_full).raw};
}

// Partial table vector
template <typename T, size_t N, typename TI>
HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, Vec256<TI> from) {
  const Full256<T> d;
  // First expand to full 128, then 256.
// ------------------------------ Per4LaneBlockShuffle

namespace detail {

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  alignas(32) uint32_t rawU32[8] = {x0, x1, x2, x3, x0, x1, x2, x3};
  return BitCast(d, Vec256<uint32_t>{__lasx_xvld(rawU32, 0)});
}

template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<4> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  V idx =
      Per4LaneBlkShufDupSet4xU32(d, (kIdx3210 >> 6) & 3, (kIdx3210 >> 4) & 3,
                                 (kIdx3210 >> 2) & 3, kIdx3210 & 3);
  return V{__lasx_xvshuf_w(idx.raw, v.raw, v.raw)};
}

template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<4> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto idx =
      Per4LaneBlkShufDupSet4xU32(du, (kIdx3210 >> 6) & 3, (kIdx3210 >> 4) & 3,
                                 (kIdx3210 >> 2) & 3, kIdx3210 & 3);
  return BitCast(d, VFromD<decltype(du)>{__lasx_xvshuf_w(
                        idx.raw, BitCast(du, v).raw, BitCast(du, v).raw)});
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/,
                                  hwy::SizeTag<8> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatLowerLower(d, v, v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/,
                                  hwy::SizeTag<8> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatUpperUpper(d, v, v);
}

template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<8> /*lane_size_tag*/,
                                  hwy::SizeTag<32> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;

  const VU vu = BitCast(du, v);
  return BitCast(
      d, VU{__lasx_xvpermi_d(vu.raw, static_cast<int>(kIdx3210 & 0xFF))});
}

}  // namespace detail

// ------------------------------ SlideUpLanes

namespace detail {

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> TableLookupSlideUpLanes(D d, VFromD<D> v, size_t amt) {
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;
  const auto idx = Iota(du, static_cast<TU>(size_t{0} - amt));
  const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
  return BitCast(
      d, IfThenElseZero(
             idx == masked_idx,
             TableLookupLanes(BitCast(du, v), IndicesFromVec(du, masked_idx))));
}

}  // namespace detail

template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && kBlocks <= 1,
                "kBlocks must be between 0 and 1");
  return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v;
}
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  if (__builtin_constant_p(amt)) {
    const auto v_lo = ConcatLowerLower(d, v, Zero(d));
    switch (amt * sizeof(TFromD<D>)) {
      case 0:
        return v;
      case 1:
        return CombineShiftRightBytes<15>(d, v, v_lo);
      case 2:
        return CombineShiftRightBytes<14>(d, v, v_lo);
      case 3:
        return CombineShiftRightBytes<13>(d, v, v_lo);
      case 4:
        return CombineShiftRightBytes<12>(d, v, v_lo);
      case 5:
        return CombineShiftRightBytes<11>(d, v, v_lo);
      case 6:
        return CombineShiftRightBytes<10>(d, v, v_lo);
      case 7:
        return CombineShiftRightBytes<9>(d, v, v_lo);
      case 8:
        return CombineShiftRightBytes<8>(d, v, v_lo);
      case 9:
        return CombineShiftRightBytes<7>(d, v, v_lo);
      case 10:
        return CombineShiftRightBytes<6>(d, v, v_lo);
      case 11:
        return CombineShiftRightBytes<5>(d, v, v_lo);
      case 12:
        return CombineShiftRightBytes<4>(d, v, v_lo);
      case 13:
        return CombineShiftRightBytes<3>(d, v, v_lo);
      case 14:
        return CombineShiftRightBytes<2>(d, v, v_lo);
      case 15:
        return CombineShiftRightBytes<1>(d, v, v_lo);
    }
  }

  if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
    const Half<decltype(d)> dh;
    return Combine(d, SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock),
                   Zero(dh));
  }
#endif

  return detail::TableLookupSlideUpLanes(d, v, amt);
}

// ------------------------------ Slide1Up

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
  return CombineShiftRightBytes<15>(d, v, v_lo);
}

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
  return CombineShiftRightBytes<14>(d, v, v_lo);
}

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
  return CombineShiftRightBytes<12>(d, v, v_lo);
}

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
  return CombineShiftRightBytes<8>(d, v, v_lo);
}
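// Usage sketch (illustrative only): Slide1Up shifts all lanes up by one and
// fills lane 0 with zero; the ConcatLowerLower feeds the byte that crosses
// the 128-bit block boundary:
//   const Full256<int32_t> d;
//   Slide1Up(d, Iota(d, 1));  // {0, 1, 2, 3, 4, 5, 6, 7}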
// ------------------------------ SlideDownLanes

namespace detail {

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> TableLookupSlideDownLanes(D d, VFromD<D> v, size_t amt) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  const auto idx = Iota(du, static_cast<TU>(amt));
  const auto masked_idx = And(idx, Set(du, static_cast<TU>(MaxLanes(d) - 1)));
  return IfThenElseZero(RebindMask(d, idx == masked_idx),
                        TableLookupLanes(v, IndicesFromVec(d, masked_idx)));
}

}  // namespace detail

template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && kBlocks <= 1,
                "kBlocks must be between 0 and 1");
  const Half<decltype(d)> dh;
  return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v;
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  const Half<decltype(d)> dh;
  if (__builtin_constant_p(amt)) {
    const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
    switch (amt * sizeof(TFromD<D>)) {
      case 0:
        return v;
      case 1:
        return CombineShiftRightBytes<1>(d, v_hi, v);
      case 2:
        return CombineShiftRightBytes<2>(d, v_hi, v);
      case 3:
        return CombineShiftRightBytes<3>(d, v_hi, v);
      case 4:
        return CombineShiftRightBytes<4>(d, v_hi, v);
      case 5:
        return CombineShiftRightBytes<5>(d, v_hi, v);
      case 6:
        return CombineShiftRightBytes<6>(d, v_hi, v);
      case 7:
        return CombineShiftRightBytes<7>(d, v_hi, v);
      case 8:
        return CombineShiftRightBytes<8>(d, v_hi, v);
      case 9:
        return CombineShiftRightBytes<9>(d, v_hi, v);
      case 10:
        return CombineShiftRightBytes<10>(d, v_hi, v);
      case 11:
        return CombineShiftRightBytes<11>(d, v_hi, v);
      case 12:
        return CombineShiftRightBytes<12>(d, v_hi, v);
      case 13:
        return CombineShiftRightBytes<13>(d, v_hi, v);
      case 14:
        return CombineShiftRightBytes<14>(d, v_hi, v);
      case 15:
        return CombineShiftRightBytes<15>(d, v_hi, v);
    }
  }

  if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
    return ZeroExtendVector(
        d, SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock));
  }
#endif

  return detail::TableLookupSlideDownLanes(d, v, amt);
}

// ------------------------------ Slide1Down

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
  return CombineShiftRightBytes<1>(d, v_hi, v);
}

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
  return CombineShiftRightBytes<2>(d, v_hi, v);
}

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
  return CombineShiftRightBytes<4>(d, v_hi, v);
}

template <typename D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
  return CombineShiftRightBytes<8>(d, v_hi, v);
}
// ------------------------------ Shl (Mul, ZipLower)
namespace detail {

HWY_INLINE Vec256<uint8_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint8_t> v,
                               Vec256<uint8_t> bits) {
  return Vec256<uint8_t>{__lasx_xvsll_b(v.raw, bits.raw)};
}

HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
                                Vec256<uint16_t> bits) {
  return Vec256<uint16_t>{__lasx_xvsll_h(v.raw, bits.raw)};
}

HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v,
                                Vec256<uint32_t> bits) {
  return Vec256<uint32_t>{__lasx_xvsll_w(v.raw, bits.raw)};
}

HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v,
                                Vec256<uint64_t> bits) {
  return Vec256<uint64_t>{__lasx_xvsll_d(v.raw, bits.raw)};
}

template <typename T>
HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) {
  // Signed left shifts are the same as unsigned.
  const Full256<T> di;
  const Full256<MakeUnsigned<T>> du;
  return BitCast(di,
                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
}

}  // namespace detail

template <typename T>
HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
}

// ------------------------------ Shr (MulHigh, IfThenElse, Not)

HWY_API Vec256<uint8_t> operator>>(Vec256<uint8_t> v, Vec256<uint8_t> bits) {
  return Vec256<uint8_t>{__lasx_xvsrl_b(v.raw, bits.raw)};
}

HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
  return Vec256<uint16_t>{__lasx_xvsrl_h(v.raw, bits.raw)};
}

HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) {
  return Vec256<uint32_t>{__lasx_xvsrl_w(v.raw, bits.raw)};
}

HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) {
  return Vec256<uint64_t>{__lasx_xvsrl_d(v.raw, bits.raw)};
}

HWY_API Vec256<int8_t> operator>>(Vec256<int8_t> v, Vec256<int8_t> bits) {
  return Vec256<int8_t>{__lasx_xvsra_b(v.raw, bits.raw)};
}

HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
  return Vec256<int16_t>{__lasx_xvsra_h(v.raw, bits.raw)};
}

HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) {
  return Vec256<int32_t>{__lasx_xvsra_w(v.raw, bits.raw)};
}

HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
  return Vec256<int64_t>{__lasx_xvsra_d(v.raw, bits.raw)};
}
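// Usage sketch (illustrative only): per-lane variable shifts. Unsigned >>
// shifts in zeros, signed >> replicates the sign bit:
//   const Full256<uint32_t> du;
//   Set(du, 1) << Iota(du, 0);  // {1, 2, 4, 8, 16, 32, 64, 128}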
// ------------------------------ WidenMulPairwiseAdd

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<int16_t> a,
                                      Vec256<int16_t> b) {
  __m256i ev = __lasx_xvmulwev_w_h(b.raw, a.raw);
  return VFromD<D>{__lasx_xvmaddwod_w_h(ev, b.raw, a.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> WidenMulPairwiseAdd(D /*d32*/, Vec256<uint16_t> a,
                                      Vec256<uint16_t> b) {
  __m256i ev = __lasx_xvmulwev_w_hu(b.raw, a.raw);
  return VFromD<D>{__lasx_xvmaddwod_w_hu(ev, b.raw, a.raw)};
}

// ------------------------------ ReorderWidenMulAccumulate

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderWidenMulAccumulate(D /*tag*/, Vec256<int16_t> a,
                                            Vec256<int16_t> b,
                                            const VFromD<D> sum0,
                                            VFromD<D>& /*sum1*/) {
  return VFromD<D>{__lasx_xvmaddwev_w_h(
      __lasx_xvmaddwod_w_h(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderWidenMulAccumulate(D /*tag*/, Vec256<uint16_t> a,
                                            Vec256<uint16_t> b,
                                            const VFromD<D> sum0,
                                            VFromD<D>& /*sum1*/) {
  return VFromD<D>{__lasx_xvmaddwev_w_hu(
      __lasx_xvmaddwod_w_hu(sum0.raw, a.raw, b.raw), a.raw, b.raw)};
}

// ------------------------------ RearrangeToOddPlusEven
HWY_API Vec256<int32_t> RearrangeToOddPlusEven(const Vec256<int32_t> sum0,
                                               Vec256<int32_t> /*sum1*/) {
  return sum0;  // invariant already holds
}

HWY_API Vec256<uint32_t> RearrangeToOddPlusEven(const Vec256<uint32_t> sum0,
                                                Vec256<uint32_t> /*sum1*/) {
  return sum0;  // invariant already holds
}
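// Worked example (illustrative only): for i32 output lane i,
// WidenMulPairwiseAdd returns a[2i] * b[2i] + a[2i+1] * b[2i+1], and
// ReorderWidenMulAccumulate additionally adds sum0[i]:
//   const Full256<int32_t> d32;
//   const Repartition<int16_t, decltype(d32)> d16;
//   auto sum1 = Zero(d32);  // unused by this target
//   const auto dot = ReorderWidenMulAccumulate(
//       d32, Set(d16, 2), Set(d16, 3), Zero(d32), sum1);  // lanes: 2*3 + 2*3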
// ================================================== CONVERT

// ------------------------------ Promotions (part w/ narrow lanes -> full)

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<hwy::float16_t> v) {
  const Repartition<hwy::float16_t, D> df16;
  const auto from_128 = ZeroExtendVector(df16, v);
  const VFromD<decltype(df16)> f16_concat{__lasx_xvpermi_d(from_128.raw, 0xd8)};
  return VFromD<D>{__lasx_xvfcvtl_s_h(f16_concat.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<float> v) {
  const Repartition<float, D> df;
  const RebindToSigned<decltype(df)> di;
  const auto from_128 = ZeroExtendVector(df, v);
  const auto f32_concat = BitCast(
      df, Vec256<uint32_t>{__lasx_xvpermi_d(BitCast(di, from_128).raw, 0xd8)});
  return VFromD<D>{__lasx_xvfcvtl_d_s(f32_concat.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /*di64*/, Vec128<float> v) {
  const Repartition<float, D> df;
  const RebindToSigned<decltype(df)> di;
  const auto from_128 = ZeroExtendVector(df, v);
  const auto f32_concat = BitCast(
      df, Vec256<uint32_t>{__lasx_xvpermi_d(BitCast(di, from_128).raw, 0xd8)});
  return VFromD<D>{__lasx_xvftintrzl_l_s(f32_concat.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int32_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  vec_temp = __lasx_xvsllwil_d_w(vec_temp, 0);
  return VFromD<D>{__lasx_xvffint_d_l(vec_temp)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API Vec256<double> PromoteTo(D /* tag */, Vec128<uint32_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  vec_temp = __lasx_xvsllwil_du_wu(vec_temp, 0);
  return VFromD<D>{__lasx_xvffint_d_lu(vec_temp)};
}

// Unsigned: zero-extend.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint8_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_hu_bu(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint8_t, 8> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvsllwil_hu_bu(vec_temp, 0);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_wu_hu(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint16_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_wu_hu(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<uint32_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<uint16_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvsllwil_wu_hu(vec_temp, 0);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<uint8_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvsllwil_hu_bu(vec_temp, 0);
  vec_temp = __lasx_xvsllwil_wu_hu(vec_temp, 0);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_du_wu(vec_temp, 0)};
}
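// Usage sketch (illustrative only): PromoteTo widens each input lane; the
// xvpermi_d above first distributes the input half-blocks so that xvsllwil
// can widen within each 128-bit block:
//   const Full256<uint16_t> d16;
//   const Rebind<uint8_t, decltype(d16)> d8;  // 16 x u8 (128-bit)
//   PromoteTo(d16, Iota(d8, 0));              // {0, 1, ..., 15} as u16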
// Signed: replicate sign bit.
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int8_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_h_b(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int8_t, 8> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvsllwil_h_b(vec_temp, 0);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_w_h(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int16_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_w_h(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<int32_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec64<int16_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvsllwil_w_h(vec_temp, 0);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
}
template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec32<int8_t> v) {
  alignas(32) __m128i vec_tmp[2];
  __m256i vec_temp;
  vec_tmp[0] = v.raw;
  CopyBytes<32>(vec_tmp, &vec_temp);
  vec_temp = __lasx_xvsllwil_h_b(vec_temp, 0);
  vec_temp = __lasx_xvsllwil_w_h(vec_temp, 0);
  vec_temp = __lasx_xvpermi_d(vec_temp, 0xd8);
  return VFromD<D>{__lasx_xvsllwil_d_w(vec_temp, 0)};
}

// ------------------------------ PromoteEvenTo/PromoteOddTo
namespace detail {

// I32->I64 PromoteEvenTo/PromoteOddTo

template <class D, HWY_IF_LANES_D(D, 4)>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::SignedTag /*from_type_tag*/, D d_to,
                                   Vec256<int32_t> v) {
  return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v));
}

template <class D, HWY_IF_LANES_D(D, 4)>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::SignedTag /*from_type_tag*/, D d_to,
                                  Vec256<int32_t> v) {
  return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v)));
}

}  // namespace detail
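// Worked example (illustrative only): PromoteEvenTo pairs each even i32 lane
// with a lane of its sign bits; on this little-endian target the i32 value
// lands in the low half of the resulting i64 and the sign fills the high
// half, so {0, 1, -2, 3, 4, 5, -6, 7} promotes to {0, -2, 4, -6} as i64.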
// ------------------------------ Demotions (full -> part w/ narrow lanes)

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int16_t> a,
                                   Vec256<int16_t> b) {
  return VFromD<D>{__lasx_xvssrani_b_h(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int16_t> a,
                                   Vec256<int16_t> b) {
  return VFromD<D>{__lasx_xvssrani_bu_h(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint16_t> a,
                                   Vec256<uint16_t> b) {
  return VFromD<D>{__lasx_xvssrlni_b_h(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint16_t> a,
                                   Vec256<uint16_t> b) {
  return VFromD<D>{__lasx_xvssrlni_bu_h(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int32_t> a,
                                   Vec256<int32_t> b) {
  return VFromD<D>{__lasx_xvssrani_h_w(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int32_t> a,
                                   Vec256<int32_t> b) {
  return VFromD<D>{__lasx_xvssrani_hu_w(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint32_t> a,
                                   Vec256<uint32_t> b) {
  return VFromD<D>{__lasx_xvssrlni_h_w(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint32_t> a,
                                   Vec256<uint32_t> b) {
  return VFromD<D>{__lasx_xvssrlni_hu_w(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int64_t> a,
                                   Vec256<int64_t> b) {
  return VFromD<D>{__lasx_xvssrani_w_d(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<int64_t> a,
                                   Vec256<int64_t> b) {
  return VFromD<D>{__lasx_xvssrani_wu_d(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint64_t> a,
                                   Vec256<uint64_t> b) {
  return VFromD<D>{__lasx_xvssrlni_w_d(b.raw, a.raw, 0)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec256<uint64_t> a,
                                   Vec256<uint64_t> b) {
  return VFromD<D>{__lasx_xvssrlni_wu_d(b.raw, a.raw, 0)};
}

template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
          HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return VFromD<D>{__lasx_xvpermi_d(ReorderDemote2To(d, a, b).raw, 0xd8)};
}

template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D),
          HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>))>
HWY_API VFromD<D> DemoteTo(D d, V v) {
  return LowerHalf(OrderedDemote2To(Twice<decltype(d)>(), v, v));
}
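// Layout sketch (derived from the intrinsics above; illustrative only):
// xvssrani/xvssrlni pack per 128-bit block, so ReorderDemote2To yields blocks
// [a0 b0 | a1 b1], where ai/bi are the demoted halves of block i.
// OrderedDemote2To applies xvpermi_d(0xd8) to restore the in-order layout
// [a0 a1 | b0 b1], i.e. all of `a` followed by all of `b`.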
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<float> v) {
  const Full256<int16_t> di;
  const Vec256<hwy::float16_t> f16_blocks{__lasx_xvfcvt_h_s(v.raw, v.raw)};
  const auto f16_concat =
      BitCast(Twice<D>(), VFromD<decltype(di)>{__lasx_xvpermi_d(
                              BitCast(di, f16_blocks).raw, 0xd8)});
  return LowerHalf(f16_concat);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec256<double> v) {
  const Full256<int32_t> di;
  const Vec256<float> f32_blocks{__lasx_xvfcvt_s_d(v.raw, v.raw)};
  const auto f32_concat =
      BitCast(Twice<D>(), VFromD<decltype(di)>{__lasx_xvpermi_d(
                              BitCast(di, f32_blocks).raw, 0xd8)});
  return LowerHalf(f32_concat);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> DemoteTo(D dn, Vec256<double> v) {
  const __m256i i32_blocks = __lasx_xvftintrz_w_d(v.raw, v.raw);
  return LowerHalf(dn, VFromD<Twice<D>>{__lasx_xvpermi_d(i32_blocks, 0xd8)});
}

// For already range-limited input [0, 255].
HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
  const Full256<uint32_t> d32;
  const Full64<uint8_t> d8;
  alignas(32) static constexpr uint32_t k8From32[8] = {
      0x0C080400u, 0x13121110u, 0, 0, 0x13121110u, 0x0C080400u, 0, 0};
  // Place first four bytes in lo[0], remaining 4 in hi[1].
  const auto quad = VFromD<decltype(d32)>{
      __lasx_xvshuf_b(Zero(d32).raw, v.raw, Load(d32, k8From32).raw)};
  // Interleave both quadruplets - OR instead of unpack avoids a shuffle.
  const auto lo = LowerHalf(quad);
  const auto hi = UpperHalf(Half<decltype(d32)>(), quad);
  return BitCast(d8, LowerHalf(lo | hi));
}
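// Usage sketch (illustrative only): U8FromU32 assumes every u32 lane is
// already in [0, 255] and keeps only the low byte of each lane:
//   const Full256<uint32_t> d32;
//   U8FromU32(Iota(d32, 0));  // 8 x u8: {0, 1, 2, 3, 4, 5, 6, 7}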
// ------------------------------ Truncations

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  const Full256<uint8_t> d8;
  alignas(32) static constexpr uint8_t kMap[32] = {0, 8, 16, 24};
  const auto i8 = TableLookupLanes(BitCast(d8, v), SetTableIndices(d8, kMap));
  return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{i8.raw})));
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  const __m256i i32_blocks = __lasx_xvpickev_w(v.raw, v.raw);
  const __m256i i32_concat = __lasx_xvpermi_d(i32_blocks, 0xd8);
  const __m256i i16 = __lasx_xvpickev_h(i32_concat, i32_concat);
  return LowerHalf(LowerHalf(Vec256<uint16_t>{i16}));
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  const Full256<uint32_t> d32;
  alignas(32) static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
  const auto v32 =
      TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
  return LowerHalf(Vec256<uint32_t>{v32.raw});
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  const Full256<uint8_t> d8;
  alignas(32) static constexpr uint8_t kEven[32] = {0,  4,  8,  12,
                                                    16, 20, 24, 28};
  const auto i8 = TableLookupLanes(BitCast(d8, v), SetTableIndices(d8, kEven));
  return LowerHalf(LowerHalf(Vec256<uint8_t>{i8.raw}));
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  const __m256i i16_blocks = __lasx_xvpickev_h(v.raw, v.raw);
  const __m256i i16_concat = __lasx_xvpermi_d(i16_blocks, 0xd8);
  return LowerHalf(Vec256<uint16_t>{i16_concat});
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec256<uint16_t> v) {
  const __m256i i8_blocks = __lasx_xvpickev_b(v.raw, v.raw);
  const __m256i i8_concat = __lasx_xvpermi_d(i8_blocks, 0xd8);
  return LowerHalf(Vec256<uint8_t>{i8_concat});
}

// ------------------------------ Integer <=> fp (ShiftRight, OddEven)

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, Vec256<int32_t> v) {
  return VFromD<D>{__lasx_xvffint_s_w(v.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /*df*/, Vec256<uint32_t> v) {
  return VFromD<D>{__lasx_xvffint_s_wu(v.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<int64_t> v) {
  return VFromD<D>{__lasx_xvffint_d_l(v.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D /*dd*/, Vec256<uint64_t> v) {
  return VFromD<D>{__lasx_xvffint_d_lu(v.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D /*d*/, Vec256<float> v) {
  return VFromD<D>{__lasx_xvftintrz_w_s(v.raw)};
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_I64_D(D)>
HWY_API VFromD<D> ConvertTo(D /*di*/, Vec256<double> v) {
  return VFromD<D>{__lasx_xvftintrz_l_d(v.raw)};
}

template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U32_D(DU)>
HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  return VFromD<DU>{__lasx_xvftintrz_wu_s(v.raw)};
}

template <class DU, HWY_IF_V_SIZE_D(DU, 32), HWY_IF_U64_D(DU)>
HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
  return VFromD<DU>{__lasx_xvftintrz_lu_d(v.raw)};
}

template <typename T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<MakeSigned<T>> NearestInt(const Vec256<T> v) {
  return ConvertTo(Full256<MakeSigned<T>>(), Round(v));
}
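// Behavior sketch (illustrative only): float->int ConvertTo uses the
// truncating (round toward zero) instructions, whereas NearestInt rounds:
//   const Full256<float> df;
//   const Full256<int32_t> di;
//   ConvertTo(di, Set(df, -1.7f));  // all lanes -1
//   NearestInt(Set(df, -1.7f));     // all lanes -2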
// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint32_t, decltype(d)> du32;
  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  const Repartition<uint64_t, decltype(d)> du64;
  alignas(32) static constexpr uint64_t kRep8[4] = {
      0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
      0x0303030303030303ull};
  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));

  const VFromD<decltype(du)> bit = Dup128VecFromValues(
      du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
  return RebindMask(d, TestBit(rep8, bit));
}

template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) static constexpr uint16_t kBit[16] = {
      1,     2,     4,     8,     16,     32,     64,     128,
      0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Mask256<T> LoadMaskBits256(uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) static constexpr uint64_t kBit[8] = {1, 2, 4, 8};
  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
}

}  // namespace detail
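// Usage sketch (illustrative only): bit i of the byte array maps to lane i
// (LSB first). For 8 lanes of u32, bits = 0xA5 selects lanes 0, 2, 5, 7:
//   const Full256<uint32_t> d;
//   const uint8_t bits[8] = {0xA5};
//   const auto m = LoadMaskBits(d, bits);  // CountTrue(d, m) == 4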
// `p` points to at least 8 readable bytes, not all of which need be valid.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t kN = MaxLanes(d);
  constexpr size_t kNumBytes = (kN + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (kN < 8) {
    mask_bits &= (1ull << kN) - 1;
  }

  return detail::LoadMaskBits256<TFromD<D>>(mask_bits);
}

// ------------------------------ BitsFromMask

template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D /*tag*/, MFromD<D> mask) {
  const auto sign_bits = __lasx_xvmskltz_b(mask.raw);
  return static_cast<uint32_t>(__lasx_xvpickve2gr_w(sign_bits, 0) |
                               (__lasx_xvpickve2gr_w(sign_bits, 4) << 16));
}

template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToSigned<decltype(d)> di;
  const auto vec_mask = VecFromMask(mask);
  const auto sign_bits =
      __lasx_xvpickod_b(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
  const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
  const auto sign_last = __lasx_xvmskltz_b(sign_shuf);
  return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
}

template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToSigned<decltype(d)> di;
  const auto vec_mask = VecFromMask(mask);
  const auto sign_bits =
      __lasx_xvpickod_h(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
  const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
  const auto sign_last = __lasx_xvmskltz_h(sign_shuf);
  return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
}

template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToSigned<decltype(d)> di;
  const auto vec_mask = VecFromMask(mask);
  const auto sign_bits =
      __lasx_xvpickod_w(BitCast(di, vec_mask).raw, BitCast(di, vec_mask).raw);
  const auto sign_shuf = __lasx_xvpermi_d(sign_bits, 0xd8);
  const auto sign_last = __lasx_xvmskltz_w(sign_shuf);
  return static_cast<unsigned>(__lasx_xvpickve2gr_w(sign_last, 0));
}
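// Usage sketch (illustrative only): BitsFromMask is the inverse of
// LoadMaskBits; lane i of the mask becomes bit i of the result:
//   const Full256<int32_t> d;
//   const auto m = Lt(Iota(d, 0), Set(d, 3));  // lanes 0..2 true
//   BitsFromMask(d, m);                        // 0b00000111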
// ------------------------------ StoreMaskBits
// `p` points to at least 8 writable bytes.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  constexpr size_t N = MaxLanes(d);
  constexpr size_t kNumBytes = (N + 7) / 8;

  const uint64_t mask_bits = BitsFromMask(d, mask);
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}

// ------------------------------ Mask testing

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API bool AllFalse(D d, MFromD<D> mask) {
  return BitsFromMask(d, mask) == 0;
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API bool AllTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  constexpr uint64_t kAllBits = (1ull << kN) - 1;
  return BitsFromMask(d, mask) == kAllBits;
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  return PopCount(BitsFromMask(d, mask));
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
}

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
                   : -1;
}

// ------------------------------ Compress, CompressBits

namespace detail {

template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec256<uint32_t> IndicesFromBits256(uint64_t mask_bits) {
  const Full256<uint32_t> d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a
  // LUT of SetTableIndices would require 8 KiB, a large part of L1D. We
  // instead compress each index into 4 bits, for a total of 1 KiB.
  alignas(16) static constexpr uint32_t packed_array[256] = {
      // PrintCompress32x8Tables
      0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
      0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
      0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
      0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
      0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
      0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
      0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
      0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
      0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
      0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
      0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
      0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
      0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
      0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
      0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
      0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
      0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
      0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
      0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
      0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
      0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
      0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
      0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
      0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
      0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
      0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
      0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
      0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
      0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
      0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
      0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
      0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
      0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
      0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
      0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
      0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
      0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
      0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
      0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
      0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
      0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
      0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
      0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};

  // No need to mask because __lasx_xvperm_w ignores bits 3..31.
  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) static constexpr uint32_t shifts[8] = {0,  4,  8,  12,
                                                     16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec256<uint64_t> IndicesFromBits256(uint64_t mask_bits) {
  const Full256<uint64_t> d64;

  // For 64-bit, there are only 4 lanes, so we can afford to load the
  // entire index vector directly.
  alignas(32) static constexpr uint64_t u64_indices[64] = {
      // PrintCompress64x4PairTables
      0,  1,  2, 3, 8, 1,  2,  3, 9, 0,  2,  3, 8,  9,  2,  3,
      10, 0,  1, 3, 8, 10, 1,  3, 9, 10, 0,  3, 8,  9,  10, 3,
      11, 0,  1, 2, 8, 11, 1,  2, 9, 11, 0,  2, 8,  9,  11, 2,
      10, 11, 0, 1, 8, 10, 11, 1, 9, 10, 11, 0, 8,  9,  10, 11};
  return Load(d64, u64_indices + 4 * mask_bits);
}

template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits256(uint64_t mask_bits) {
  const Full256<uint32_t> d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a
  // LUT of SetTableIndices would require 8 KiB, a large part of L1D. We
  // instead compress each index into 4 bits, for a total of 1 KiB.
  alignas(16) static constexpr uint32_t packed_array[256] = {
      // PrintCompressNot32x8Tables
      0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
      0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
      0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
      0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
      0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
      0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
      0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
      0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
      0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
      0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
      0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
      0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
      0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
      0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
      0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
      0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
      0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
      0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
      0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
      0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
      0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
      0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
      0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
      0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
      0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
      0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
      0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
      0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
      0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8,
      0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
      0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
      0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
      0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
      0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
      0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
      0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
      0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
      0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
      0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
      0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
      0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
      0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
      0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
      0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};

  // No need to mask because __lasx_xvperm_w ignores bits 3..31.
  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
  const Vec256<uint32_t> packed = Set(d32, packed_array[mask_bits]);
  alignas(32) static constexpr uint32_t shifts[8] = {0,  4,  8,  12,
                                                     16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec256<uint64_t> IndicesFromNotBits256(uint64_t mask_bits) {
  const Full256<uint64_t> d64;

  // For 64-bit, there are only 4 lanes, so we can afford to load
  // the entire index vector directly.
  alignas(32) static constexpr uint64_t u64_indices[64] = {
      // PrintCompressNot64x4PairTables
      8, 9, 10, 11, 9, 10, 11, 0, 8, 10, 11, 1, 10, 11, 0, 1,
      8, 9, 11, 2,  9, 11, 0,  2, 8, 11, 1,  2, 11, 0,  1, 2,
      8, 9, 10, 3,  9, 10, 0,  3, 8, 10, 1,  3, 10, 0,  1, 3,
      8, 9, 2,  3,  9, 0,  2,  3, 8, 1,  2,  3, 0,  1,  2, 3};
  return Load(d64, u64_indices + 4 * mask_bits);
}

template <typename T, HWY_IF_NOT_T_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  const Indices256<TFromD<decltype(di)>> indices{
      IndicesFromBits256<T>(mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(di, v), indices));
}

// LUTs are infeasible for 2^16 possible masks, so splice together two
// half-vector Compress.
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
  const Half<decltype(du)> duh;
  const auto half0 = LowerHalf(duh, vu16);
  const auto half1 = UpperHalf(duh, vu16);

  const uint64_t mask_bits0 = mask_bits & 0xFF;
  const uint64_t mask_bits1 = mask_bits >> 8;
  const auto compressed0 = detail::CompressBits(half0, mask_bits0);
  const auto compressed1 = detail::CompressBits(half1, mask_bits1);

  alignas(32) uint16_t all_true[16] = {};
  // Store mask=true lanes, left to right.
  const size_t num_true0 = PopCount(mask_bits0);
  Store(compressed0, duh, all_true);
  StoreU(compressed1, duh, all_true + num_true0);

  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
    // Store mask=false lanes, right to left. The second vector fills the upper
    // half with right-aligned false lanes. The first vector is shifted
    // rightwards to overwrite the true lanes of the second.
    alignas(32) uint16_t all_false[16] = {};
    const size_t num_true1 = PopCount(mask_bits1);
    Store(compressed1, duh, all_false + 8);
    StoreU(compressed0, duh, all_false + num_true1);

    const auto mask = FirstN(du, num_true0 + num_true1);
    return BitCast(d,
                   IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
  } else {
    // Only care about the mask=true lanes.
    return BitCast(d, Load(du, all_true));
  }
}

template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  const Indices256<TFromD<decltype(di)>> indices{
      IndicesFromNotBits256<T>(mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(di, v), indices));
}

// LUTs are infeasible for 2^16 possible masks, so splice together two
// half-vector Compress.
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
  // Compress ensures only the lower 16 bits are set, so flip those.
  return Compress(v, mask_bits ^ 0xFFFF);
}

}  // namespace detail

template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
  const DFromV<decltype(v)> d;
  return detail::Compress(v, BitsFromMask(d, m));
}

template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
  const DFromV<decltype(v)> d;
  return detail::CompressNot(v, BitsFromMask(d, m));
}

HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  return CompressNot(v, mask);
}

template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::Compress(v, mask_bits);
}

// ------------------------------ CompressStore, CompressBitsStore

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = BitsFromMask(d, m);
  const size_t count = PopCount(mask_bits);
  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}
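// Usage sketch (illustrative only): given a Vec256<float> v, CompressStore
// writes only the lanes whose mask bit is set, contiguously, and returns how
// many were written:
//   const Full256<float> d;
//   float out[8];
//   const size_t n = CompressStore(v, Lt(v, Zero(d)), d, out);
//   // out[0, n) now holds the negative lanes of v, in lane order.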
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = BitsFromMask(d, m);
  const size_t count = PopCount(mask_bits);
  using TU = MakeUnsigned<TFromD<D>>;

  const RebindToUnsigned<decltype(d)> du;
  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  const Vec256<TU> idx_mask = detail::IndicesFromBits256<TFromD<D>>(mask_bits);
  // Shift nibble MSB into MSB
  const auto shiftVal = sizeof(TU) == 4 ? 28 : 60;
  const Mask256<TU> mask32or64 = MaskFromVec(ShiftLeft<shiftVal>(idx_mask));
  const Mask256<TU> masku{sizeof(TU) == 4 ? __lasx_xvslti_w(mask32or64.raw, 0)
                                          : __lasx_xvslti_d(mask32or64.raw, 0)};
  const MFromD<D> mask = RebindMask(d, masku);
  const VFromD<D> compressed = BitCast(
      d, TableLookupLanes(BitCast(du, v), Indices256<TU>{idx_mask.raw}));

  BlendedStore(compressed, mask, d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_T_SIZE_D(D, 2)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = BitsFromMask(d, m);
  const size_t count = PopCount(mask_bits);
  const VFromD<D> compressed = detail::Compress(v, mask_bits);
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
  return count;
}

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t N = MaxLanes(d);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }
  const size_t count = PopCount(mask_bits);

  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// ------------------------------ Dup128MaskFromMaskBits

// Generic for all vector lengths >= 32 bytes
template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  const Half<decltype(d)> dh;
  const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
  return CombineMasks(d, mh, mh);
}

// ------------------------------ Expand

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const DFromV<decltype(v)> d;
  // LUTs are infeasible for so many mask combinations, so Combine two
  // half-vector Expand.
  const Half<decltype(d)> dh;
  const uint64_t mask_bits = BitsFromMask(d, mask);
  constexpr size_t N = 32 / sizeof(T);
  const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
  const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
  const Vec128<T> expandL = Expand(LowerHalf(v), maskL);

  alignas(32) T lanes[N];
  Store(v, d, lanes);
  const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
  const Vec128<T> expandH = Expand(LoadU(dh, lanes + countL), maskH);
  return Combine(d, expandH, expandL);
}
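// Usage sketch (illustrative only): Expand is the inverse of Compress: it
// places the first CountTrue lanes of v into the true-mask positions and
// zeros the rest. With v = {10, 20, ...} and mask true in lanes 1 and 3:
//   Expand(v, mask);  // {0, 10, 0, 20, 0, 0, 0, 0}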

// ------------------------------ Dup128MaskFromMaskBits

// Generic for all vector lengths >= 32 bytes
template <class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  const Half<decltype(d)> dh;
  const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
  return CombineMasks(d, mh, mh);
}

// ------------------------------ Expand

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const DFromV<decltype(v)> d;
  // LUTs are infeasible for so many mask combinations, so Combine two
  // half-vector Expand.
  const Half<decltype(d)> dh;
  const uint64_t mask_bits = BitsFromMask(d, mask);
  constexpr size_t N = 32 / sizeof(T);
  const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
  const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
  const Vec128<T> expandL = Expand(LowerHalf(v), maskL);

  alignas(32) T lanes[N];
  Store(v, d, lanes);
  const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
  const Vec128<T> expandH = Expand(LoadU(dh, lanes + countL), maskH);
  return Combine(d, expandH, expandL);
}

template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const Full256<T> d;
  // LUTs are infeasible for 2^16 possible masks, so splice together two
  // half-vector Expand.
  const Half<decltype(d)> dh;
  const Mask128<T> maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
  const Vec128<T> expandL = Expand(LowerHalf(v), maskL);

  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);
  const Vec128<T> vH = LoadU(dh, lanes + CountTrue(dh, maskL));
  const Mask128<T> maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
  const Vec128<T> expandH = Expand(vH, maskH);
  return Combine(d, expandH, expandL);
}

template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const uint64_t mask_bits = BitsFromMask(d, mask);
  alignas(16) constexpr uint32_t packed_array[256] = {
      // PrintExpand32x8Nibble.
      0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0,
      0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10,
      0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0,
      0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
      0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0,
      0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10,
      0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0,
      0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
      0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0,
      0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10,
      0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0,
      0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
      0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0,
      0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10,
      0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0,
      0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
      0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0,
      0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10,
      0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0,
      0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
      0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0,
      0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10,
      0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0,
      0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
      0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0,
      0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10,
      0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0,
      0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
      0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0,
      0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10,
      0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0,
      0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
      0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0,
      0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10,
      0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0,
      0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
      0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0,
      0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10,
      0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0,
      0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
      0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0,
      0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10,
      0x543210ff, 0x654321f0, 0x6543210f, 0x76543210,
  };

  // For lane i, shift the i-th 4-bit index down to bits [0, 3).
  const Vec256<uint32_t> packed = Set(du, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec.
  const Indices256<uint32_t> indices{(packed >> Load(du, shifts)).raw};
  const Vec256<uint32_t> expand = TableLookupLanes(BitCast(du, v), indices);
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
}
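
// Example (illustrative sketch, not part of the API) of the Expand semantics
// implemented above: consecutive source lanes are scattered to the mask=true
// positions and mask=false positions are zeroed -- the inverse of Compress.
//
//   const Full256<uint32_t> d;
//   const Vec256<uint32_t> v = Iota(d, 1);  // {1, 2, 3, 4, 5, 6, 7, 8}
//   // Bits 0x6 select lanes 1 and 2 of each 128-bit block:
//   // mask = {0, 1, 1, 0, 0, 1, 1, 0}.
//   const Mask256<uint32_t> m = Dup128MaskFromMaskBits(d, 0x6);
//   const Vec256<uint32_t> e = Expand(v, m);  // {0, 1, 2, 0, 0, 3, 4, 0}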

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec256<T> Expand(Vec256<T> v, Mask256<T> mask) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const uint64_t mask_bits = BitsFromMask(d, mask);

  alignas(16) constexpr uint64_t packed_array[16] = {
      // PrintExpand64x4Nibble.
      0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
      0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
      0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2).
  const Vec256<uint64_t> packed = Set(du, packed_array[mask_bits]);
  alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  // 64-bit TableLookupLanes on LASX requires IndicesFromVec, which checks
  // bounds, so clear the upper bits.
  const Vec256<uint64_t> masked = And(packed >> Load(du, shifts), Set(du, 3));
  const Indices256<uint64_t> indices = IndicesFromVec(du, masked);
  const Vec256<uint64_t> expand = TableLookupLanes(BitCast(du, v), indices);
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
}

// ------------------------------ LoadExpand

template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}

template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}
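
// Example (illustrative sketch, not part of the API): expanding densely
// packed values into the true lanes of a mask, e.g. when decoding sparse
// data. `packed` and `m` are hypothetical caller-side variables; note that
// on this target the load reads a full vector regardless of the mask, so
// the buffer requires padding.
//
//   const Full256<uint16_t> d;
//   // Scatters the first CountTrue(d, m) values of `packed` to the true
//   // lanes of m; false lanes become zero.
//   const Vec256<uint16_t> v = LoadExpand(m, d, packed);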

// ------------------------------ LoadInterleaved3/4

// Implemented in generic_ops; we only overload LoadTransposedBlocks3/4.

namespace detail {
// Input (128-bit blocks):
// 1 0 (<- first block of unaligned)
// 3 2
// 5 4
// Output:
// 3 0
// 4 1
// 5 2
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void LoadTransposedBlocks3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                   VFromD<D>& A, VFromD<D>& B, VFromD<D>& C) {
  constexpr size_t N = MaxLanes(d);
  const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);  // 1 0
  const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
  const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);

  A = ConcatUpperLower(d, v32, v10);
  B = ConcatLowerUpper(d, v54, v10);
  C = ConcatUpperLower(d, v54, v32);
}

// Input (128-bit blocks):
// 1 0 (first block of unaligned)
// 3 2
// 5 4
// 7 6
// Output:
// 4 0 (LSB of vA)
// 5 1
// 6 2
// 7 3
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void LoadTransposedBlocks4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                                   VFromD<D>& vA, VFromD<D>& vB, VFromD<D>& vC,
                                   VFromD<D>& vD) {
  constexpr size_t N = MaxLanes(d);
  const VFromD<D> v10 = LoadU(d, unaligned + 0 * N);
  const VFromD<D> v32 = LoadU(d, unaligned + 1 * N);
  const VFromD<D> v54 = LoadU(d, unaligned + 2 * N);
  const VFromD<D> v76 = LoadU(d, unaligned + 3 * N);

  vA = ConcatLowerLower(d, v54, v10);
  vB = ConcatUpperUpper(d, v54, v10);
  vC = ConcatLowerLower(d, v76, v32);
  vD = ConcatUpperUpper(d, v76, v32);
}
}  // namespace detail

// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)

// Implemented in generic_ops; we only overload StoreTransposedBlocks2/3/4.

namespace detail {
// Input (128-bit blocks):
// 2 0 (LSB of i)
// 3 1
// Output:
// 1 0
// 3 2
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void StoreTransposedBlocks2(VFromD<D> i, VFromD<D> j, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t N = MaxLanes(d);
  const auto out0 = ConcatLowerLower(d, j, i);
  const auto out1 = ConcatUpperUpper(d, j, i);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
}

// Input (128-bit blocks):
// 3 0 (LSB of i)
// 4 1
// 5 2
// Output:
// 1 0
// 3 2
// 5 4
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void StoreTransposedBlocks3(VFromD<D> i, VFromD<D> j, VFromD<D> k, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t N = MaxLanes(d);
  const auto out0 = ConcatLowerLower(d, j, i);
  const auto out1 = ConcatUpperLower(d, i, k);
  const auto out2 = ConcatUpperUpper(d, k, j);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);
}

// Input (128-bit blocks):
// 4 0 (LSB of i)
// 5 1
// 6 2
// 7 3
// Output:
// 1 0
// 3 2
// 5 4
// 7 6
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API void StoreTransposedBlocks4(VFromD<D> i, VFromD<D> j, VFromD<D> k,
                                    VFromD<D> l, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t N = MaxLanes(d);
  // Write lower halves, then upper.
  const auto out0 = ConcatLowerLower(d, j, i);
  const auto out1 = ConcatLowerLower(d, l, k);
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  const auto out2 = ConcatUpperUpper(d, j, i);
  const auto out3 = ConcatUpperUpper(d, l, k);
  StoreU(out2, d, unaligned + 2 * N);
  StoreU(out3, d, unaligned + 3 * N);
}
}  // namespace detail
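
// Example (illustrative sketch, not part of this header): the overloads above
// are the building blocks of the generic StoreInterleaved3, e.g. interleaving
// planar R, G, B into packed RGB. `r_plane`, `g_plane`, `b_plane` and `rgb`
// are hypothetical caller-side pointers.
//
//   const Full256<uint8_t> d;
//   const Vec256<uint8_t> r = LoadU(d, r_plane);
//   const Vec256<uint8_t> g = LoadU(d, g_plane);
//   const Vec256<uint8_t> b = LoadU(d, b_plane);
//   StoreInterleaved3(r, g, b, d, rgb);  // writes r0 g0 b0 r1 g1 b1 ...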

// ------------------------------ Additional mask logical operations

namespace detail {

template <class T>
static HWY_INLINE HWY_MAYBE_UNUSED Vec256<T> LasxI256Neg(Vec256<T> v) {
  const Full256<T> d;
  const Repartition<uint64_t, decltype(d)> du64;

  const auto vu64 = BitCast(du64, v);
  const auto vu64_zero = Zero(du64);
  const auto i128_ne_zero = VecFromMask(du64, Ne128(du64, vu64, vu64_zero));
  const VFromD<decltype(du64)> i128_neg_result{
      __lasx_xvsub_q(vu64_zero.raw, vu64.raw)};
  const VFromD<decltype(du64)> i256_neg_result_as_u64{
      __lasx_xvadd_q(i128_neg_result.raw,
                     ConcatLowerLower(du64, i128_ne_zero, vu64_zero).raw)};

  return BitCast(d, i256_neg_result_as_u64);
}

}  // namespace detail

template <class T>
HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
  const Full256<T> d;
  return Or(mask, MaskFromVec(detail::LasxI256Neg(VecFromMask(d, mask))));
}

template <class T>
HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
  return Not(SetAtOrAfterFirst(mask));
}

template <class T>
HWY_API Mask256<T> SetOnlyFirst(Mask256<T> mask) {
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = BitCast(di, VecFromMask(d, mask));
  const auto neg_vmask = detail::LasxI256Neg(vmask);

  return MaskFromVec(BitCast(d, Neg(And(vmask, neg_vmask))));
}

template <class T>
HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
  const Full256<T> d;
  constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;

  const auto vmask = VecFromMask(d, mask);
  const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d));
  return SetBeforeFirst(
      MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>(
          d, vmask, vmask_lo)));
}

// ------------------------------ LeadingZeroCount

template <class V, HWY_IF_UI8(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_b(v.raw)};
}
template <class V, HWY_IF_UI16(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_h(v.raw)};
}
template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_w(v.raw)};
}
template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lasx_xvclz_d(v.raw)};
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_V_SIZE_V(V, 32)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();