loongarch_lsx-inl.h (220759B)
1 // Copyright 2024 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <stdio.h> 17 18 #ifndef __loongarch_sx 19 // If LSX is to be runtime dispatched (instead of in baseline), we need 20 // to enable it *and* define __loongarch_sx or the intrinsic header will 21 // fail to compile. 22 // 23 // We cannot simply move lsxintrin.h after HWY_BEFORE_NAMESPACE because 24 // doing so may cause the first (the only effective) inclusion of 25 // lsxintrin.h to be compiled with both LSX and LASX enabled. Then when 26 // we call the inline functions in the header with only LSX enabled, 27 // we'll get an "always_inline function requires lasx but would be inlined 28 // into a function that is compiled without suport for lasx" error. 29 HWY_PUSH_ATTRIBUTES("lsx") 30 #define __loongarch_sx 31 #include <lsxintrin.h> 32 #undef __loongarch_sx 33 // Prevent "unused push_attribute" warning from Clang. 34 HWY_MAYBE_UNUSED static void HWY_CONCAT(hwy_lsx_dummy, __COUNTER__) () {} 35 HWY_POP_ATTRIBUTES 36 #else 37 #include <lsxintrin.h> 38 #endif 39 40 #include "hwy/base.h" 41 #include "hwy/ops/shared-inl.h" 42 43 HWY_BEFORE_NAMESPACE(); 44 namespace hwy { 45 namespace HWY_NAMESPACE { 46 namespace detail { 47 48 // Enable generic functions for whichever of (f16, bf16) are not supported. 
// LSX has no native f16/bf16 lanes, so those special float types take the
// generic (emulated) code paths selected by this predicate.
#define HWY_LSX_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)

// Raw128<T>::type is the LSX builtin register type that stores lanes of T.
// All integer widths share __m128i; only f32/f64 have distinct raw types.
template <typename T>
struct Raw128 {
  using type = __m128i;
};
template <>
struct Raw128<float> {
  using type = __m128;
};
template <>
struct Raw128<double> {
  using type = __m128d;
};

}  // namespace detail

// Wrapper around a raw LSX vector. T is the lane type; N is the number of
// lanes in use (defaults to a full 16-byte vector). Partial vectors (N less
// than the maximum) still occupy a full register.
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

// Aliases for vectors whose total payload is 8/4/2 bytes.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

namespace detail {

// Masks are stored in the same raw register type as vectors.
template <typename T>
using RawMask128 = typename Raw128<T>::type;

}  // namespace detail

// Full-width mask: each lane is all-ones (true) or all-zeros (false), same
// layout as the corresponding Vec128 (see MaskFromVec/VecFromMask below).
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = typename detail::RawMask128<T>;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  Raw raw;
};

// Recover the Simd<> descriptor tag from a vector type.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

// Recover the Simd<> descriptor tag from a mask type.
template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

// Lane type of a vector type.
template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ BitCast

namespace detail {

// Reinterpret any raw vector type as __m128i (no-op for integer vectors).
HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
HWY_INLINE __m128i BitCastToInteger(__m128 v) {
  return reinterpret_cast<__m128i>(v);
}
HWY_INLINE __m128i BitCastToInteger(__m128d v) {
  return reinterpret_cast<__m128i>(v);
}

// View any vector as bytes; the lane count scales by sizeof(T).
template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}

// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __m128i operator()(__m128i v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __m128 operator()(__m128i v) {
    return reinterpret_cast<__m128>(v);
  }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __m128d operator()(__m128i v) {
    return reinterpret_cast<__m128d>(v);
  }
};

}  // namespace detail

// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
// Integer (and special-float) zero: broadcast the scalar 0.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{(__lsx_vreplgr2vr_w(0))};
}

// f32/f64 zero: the same all-zero bit pattern, reinterpreted to the
// floating-point raw register type.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_FLOAT3264_D(D)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{
      detail::BitCastFromInteger128<TFromD<D>>()(__lsx_vreplgr2vr_w(0))};
}

// Vector type corresponding to descriptor D.
template <class D>
using VFromD = decltype(Zero(D()));

namespace detail {

// Inverse of BitCastToByte: reinterpret a byte vector as lanes of TFromD<D>.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     Vec128<uint8_t, D().MaxBytes()> v) {
  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
}

}  // namespace detail

// Reinterprets the bits of a vector of FromT as a vector of TFromD<D>.
template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ Set

// Broadcasts t to all lanes; one overload per lane size, each mapping to the
// matching vreplgr2vr (replicate general register to vector) instruction.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_b(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_h(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_w(static_cast<int>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{__lsx_vreplgr2vr_d(static_cast<long int>(t))};
}

// f32: load-and-replicate from the address of t (vldrepl), then bitcast to
// the float vector type. There is no float-register broadcast in LSX.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Set(D d, float t) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, VFromD<decltype(di)>{__lsx_vldrepl_w(&t, 0)});
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Set(D d, double t) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, VFromD<decltype(di)>{__lsx_vldrepl_d(&t, 0)});
}

// f16/bf16: copy the 16-bit pattern into a u16 broadcast, then bitcast back.
// Generic for all vector lengths.
template <class D, HWY_LSX_IF_EMULATED_D(D)>
HWY_API VFromD<D> Set(D df, TFromD<D> t) {
  const RebindToUnsigned<decltype(df)> du;
  static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
  uint16_t bits;
  CopyBytes<2>(&t, &bits);
  return BitCast(df, Set(du, bits));
}

// ------------------------------ Undefined

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
template <class D>
HWY_API VFromD<D> Undefined(D /* tag */) {
  VFromD<D> v;
  return v;
}

HWY_DIAGNOSTICS(pop)

// ------------------------------ GetLane

// Extracts lane 0; one overload per lane size (vpickve2gr).
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_b(v.raw, 0));
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_h(v.raw, 0));
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_w(v.raw, 0));
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API T GetLane(const Vec128<T, N> v) {
  return static_cast<T>(__lsx_vpickve2gr_d(v.raw, 0));
}
// float: extract the bit pattern via the integer pick, then bit-copy into a
// float (CopyBytes avoids a value-changing int->float conversion).
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  float f32;
  int32_t i32 = __lsx_vpickve2gr_w(reinterpret_cast<__m128i>(v.raw), 0);
  CopyBytes<4>(&i32, &f32);
  return f32;
}
// double: same bit-copy technique as the float overload above.
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  double f64;
  int64_t i64 = __lsx_vpickve2gr_d(reinterpret_cast<__m128i>(v.raw), 0);
  CopyBytes<8>(&i64, &f64);
  return f64;
}

// ------------------------------ ResizeBitCast

// Reinterprets v as VFromD<D>, allowing the byte size to differ; the shared
// register representation makes this a pure bitcast.
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
}

// ------------------------------ Dup128VecFromValues

// Builds a vector from individual lane values (lane 0 first) via a GCC
// vector-extension literal, which the compiler materializes directly.
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}

// f16/bf16: forward the 16-bit patterns to the i16 overload, then bitcast.
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
  const GccI32RawVectType raw = {
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}
template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
                                 static_cast<int64_t>(t1)};
  return VFromD<D>{reinterpret_cast<__m128i>(raw)};
}
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  const GccF32RawVectType raw = {t0, t1, t2, t3};
  return VFromD<D>{reinterpret_cast<__m128>(raw)};
}
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  const GccF64RawVectType raw = {t0, t1};
  return VFromD<D>{reinterpret_cast<__m128d>(raw)};
}

// ================================================== LOGICAL

// Bitwise ops are lane-type agnostic: operate on the unsigned bit pattern
// and bitcast back, so float vectors work too.

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vand_v(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lsx_vandn_v(
                        BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vor_v(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vxor_v(BitCast(du, a).raw, BitCast(du, b).raw)});
}

// ------------------------------ Not
// NOR of v with itself yields the bitwise complement.
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{
                        __lsx_vnor_v(BitCast(du, v).raw, BitCast(du, v).raw)});
}

// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

// Generic for all vector lengths.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VecFromMask(v);
}

// Per-bit select: vbitsel picks bits of `yes` where the mask bit is set,
// else bits of `no`.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  RebindToSigned<decltype(d)> di;
  return BitCast(d, VFromD<decltype(di)>{__lsx_vbitsel_v(
                        BitCast(di, no).raw, BitCast(di, yes).raw,
                        RebindMask(di, mask).raw)});
}

// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

// Per-lane popcount, dispatched on lane size via SizeTag.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_b(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_h(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_w(v.raw)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vpcnt_d(v.raw)};
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}

// ================================================== SIGN

// ------------------------------ Neg

// Floating-point negation: flip only the sign bit.
template <typename T, size_t N, HWY_IF_FLOAT_OR_SPECIAL(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

// Integer negation (two's complement), per lane size.
template <typename T, size_t N, HWY_IF_UI8(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_b(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI16(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_h(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_w(v.raw)};
}

template <typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Vec128<T, N>{__lsx_vneg_d(v.raw)};
}

// ------------------------------ Floating-point Abs
// Clears the sign bit by AND-ing with ~SignMask.
// Generic for all vector lengths
template <class V, HWY_IF_FLOAT(TFromV<V>)>
HWY_API V Abs(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>())));
}

// ------------------------------ CopySign
// Generic for all vector lengths.
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);
  // Select the sign bit from `sign`, all other bits from `magn`.
  return BitwiseIfThenElse(msb, sign, magn);
}

// ------------------------------ CopySignToAbs
// Generic for all vector lengths.
template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}

// ------------------------------ IfThenElseZero

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// ------------------------------ Mask logical

// Implemented via the corresponding vector ops on the mask's bit pattern.
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// ------------------------------ ExclusiveNeither

// Returns ~a & ~b, i.e. true only where neither mask is true.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ------------------------------ ShiftLeft

// Compile-time shift by kBits; the same logical-shift instruction serves both
// signed and unsigned lane types.
template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftLeft(const Vec128<uint8_t, N> v) {
  return Vec128<uint8_t, N>{__lsx_vslli_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{__lsx_vslli_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{__lsx_vslli_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{__lsx_vslli_d(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftLeft(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{__lsx_vslli_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{__lsx_vslli_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{__lsx_vslli_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{__lsx_vslli_d(v.raw, kBits)};
}

// ------------------------------ ShiftRight

// Unsigned lanes shift in zeros (vsrli); signed lanes shift in the sign bit
// (vsrai, arithmetic shift).
template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(Vec128<uint8_t, N> v) {
  return Vec128<uint8_t, N>{__lsx_vsrli_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{__lsx_vsrli_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{__lsx_vsrli_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{__lsx_vsrli_d(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{__lsx_vsrai_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{__lsx_vsrai_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{__lsx_vsrai_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{__lsx_vsrai_d(v.raw, kBits)};
}

// ------------------------------ RoundingShiftRight

#ifdef HWY_NATIVE_ROUNDING_SHR
#undef HWY_NATIVE_ROUNDING_SHR
#else
#define HWY_NATIVE_ROUNDING_SHR
#endif

// Shift right with rounding (vsrari/vsrlri: round-to-nearest on the bits
// shifted out), compile-time shift amount.
template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> RoundingShiftRight(Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{__lsx_vsrari_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> RoundingShiftRight(Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{__lsx_vsrari_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> RoundingShiftRight(Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{__lsx_vsrari_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> RoundingShiftRight(Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{__lsx_vsrari_d(v.raw, kBits)};
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> RoundingShiftRight(Vec128<uint8_t, N> v) {
  return Vec128<uint8_t, N>{__lsx_vsrlri_b(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> RoundingShiftRight(Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{__lsx_vsrlri_h(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> RoundingShiftRight(Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{__lsx_vsrlri_w(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> RoundingShiftRight(Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{__lsx_vsrlri_d(v.raw, kBits)};
}

// ------------------------------ RoundingShr

// Variable (per-lane) shift amounts, rounding variants (vsrar/vsrlr).
template <size_t N>
HWY_API Vec128<int8_t, N> RoundingShr(Vec128<int8_t, N> v,
                                      Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>{__lsx_vsrar_b(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> RoundingShr(Vec128<int16_t, N> v,
                                       Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>{__lsx_vsrar_h(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> RoundingShr(Vec128<int32_t, N> v,
                                       Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>{__lsx_vsrar_w(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> RoundingShr(Vec128<int64_t, N> v,
                                       Vec128<int64_t, N> bits) {
  return Vec128<int64_t, N>{__lsx_vsrar_d(v.raw, bits.raw)};
}

template <size_t N>
HWY_API Vec128<uint8_t, N> RoundingShr(Vec128<uint8_t, N> v,
                                       Vec128<uint8_t, N> bits) {
  return Vec128<uint8_t, N>{__lsx_vsrlr_b(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> RoundingShr(Vec128<uint16_t, N> v,
                                        Vec128<uint16_t, N> bits) {
  return Vec128<uint16_t, N>{__lsx_vsrlr_h(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> RoundingShr(Vec128<uint32_t, N> v,
                                        Vec128<uint32_t, N> bits) {
  return Vec128<uint32_t, N>{__lsx_vsrlr_w(v.raw, bits.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> RoundingShr(Vec128<uint64_t, N> v,
                                        Vec128<uint64_t, N> bits) {
  return Vec128<uint64_t, N>{__lsx_vsrlr_d(v.raw, bits.raw)};
}

// ------------------------------ RoundingShiftRightSame (RoundingShr)

// Runtime shift amount broadcast to all lanes.
template <typename T, size_t N>
HWY_API Vec128<T, N> RoundingShiftRightSame(const Vec128<T, N> v, int bits) {
  return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
}

// ================================================== MEMORY (1)

// ------------------------------ Load 128

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lsx_vld(aligned, 0)});
}

// Partial load: copy only MaxBytes so we never read past the caller's
// buffer; remaining bytes of the register are unspecified.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  VFromD<D> v;
  CopyBytes<d.MaxBytes()>(p, &v);
  return v;
}

// LoadU == Load (vld has no alignment requirement)
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ MaskedLoad

// Loads all lanes, then zeros lanes where the mask is false.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

// ------------------------------ MaskedLoadOr

// Loads all lanes, then substitutes `v` where the mask is false.
template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT p) {
  return IfThenElse(m, LoadU(d, p), v);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// ------------------------------ Store 128

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vst(v.raw, aligned, 0);
}

// ------------------------------ Store 64

// Partial stores use vstelm (store one element) so only the vector's
// payload bytes are written.
template <class D, HWY_IF_V_SIZE_D(D, 8)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_d(v.raw, aligned, 0, 0);
}

// ------------------------------ Store 32

template <class D, HWY_IF_V_SIZE_D(D, 4)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_w(v.raw, aligned, 0, 0);
}

// ------------------------------ Store 16

template <class D, HWY_IF_V_SIZE_D(D, 2)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_h(v.raw, aligned, 0, 0);
}

// ------------------------------ Store 8

template <class D, HWY_IF_V_SIZE_D(D, 1)>
HWY_API void Store(VFromD<D> v, D /* tag */, void* HWY_RESTRICT aligned) {
  __lsx_vstelm_b(v.raw, aligned, 0, 0);
}

// StoreU == Store (no alignment requirement).
template <class D>
HWY_API void StoreU(VFromD<D> v, D d, void* HWY_RESTRICT p) {
  Store(v, d, p);
}

// ================================================== SWIZZLE (1)

// ------------------------------ TableLookupBytes
// Byte-granularity table lookup: each byte of `from` selects a byte of
// `bytes` (vshuf_b with both table operands equal to `bytes`).
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
                                        const Vec128<TI, NI> from) {
  const DFromV<decltype(from)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const DFromV<decltype(bytes)> d_bytes;
  const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
  return BitCast(
      d, VFromD<decltype(du8)>{__lsx_vshuf_b(BitCast(du8_bytes, bytes).raw,
                                             BitCast(du8_bytes, bytes).raw,
                                             (BitCast(du8, from).raw))});
}

// ------------------------------ TableLookupBytesOr0
// As TableLookupBytes, but index bytes with the MSB set yield zero.
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  const DFromV<VI> d;
  const Repartition<int8_t, decltype(d)> di8;
  return BitCast(d,
                 IfThenZeroElse(Lt(BitCast(di8, from), Zero(di8)),
                                BitCast(di8, TableLookupBytes(bytes, from))));
}

// ------------------------------ Shuffles (ShiftRight, TableLookupBytes)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
959 template <typename T, size_t N> 960 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) { 961 static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); 962 static_assert(N == 2 || N == 4, "Does not make sense for N=1"); 963 const DFromV<decltype(v)> d; 964 const RebindToUnsigned<decltype(d)> du; 965 return BitCast(d, VFromD<decltype(du)>{__lsx_vshuf4i_w( 966 detail::BitCastToInteger(v.raw), 0xB1)}); 967 } 968 969 namespace detail { 970 971 template <typename T, HWY_IF_T_SIZE(T, 1)> 972 HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) { 973 const int8_t _data_idx[] = {1, 0, 19, 18}; 974 __m128i shuffle_idx = __lsx_vld(_data_idx, 0); 975 return Vec32<T>{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)}; 976 } 977 template <typename T, HWY_IF_T_SIZE(T, 2)> 978 HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) { 979 const int16_t _data_idx[] = {9, 8, 3, 2}; 980 __m128i shuffle_idx = __lsx_vld(_data_idx, 0); 981 return Vec64<T>{__lsx_vshuf_h(shuffle_idx, a.raw, b.raw)}; 982 } 983 template <typename T, HWY_IF_T_SIZE(T, 4)> 984 HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) { 985 const DFromV<decltype(a)> d; 986 const RebindToSigned<decltype(d)> di; 987 return BitCast(d, Vec128<int32_t>{__lsx_vpermi_w(BitCast(di, b).raw, 988 BitCast(di, a).raw, 0xB1)}); 989 } 990 991 template <typename T, HWY_IF_T_SIZE(T, 1)> 992 HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) { 993 const int8_t _data_idx[] = {0, 3, 18, 17}; 994 __m128i shuffle_idx = __lsx_vld(_data_idx, 0); 995 return Vec32<T>{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)}; 996 } 997 template <typename T, HWY_IF_T_SIZE(T, 2)> 998 HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) { 999 const int16_t _data_idx[] = {10, 11, 2, 1}; 1000 __m128i shuffle_idx = __lsx_vld(_data_idx, 0); 1001 auto t0 = __lsx_vshuf_h(shuffle_idx, a.raw, b.raw); 1002 return Vec64<T>{t0}; 1003 } 1004 template <typename T, HWY_IF_T_SIZE(T, 4)> 1005 HWY_API 
Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) { 1006 const DFromV<decltype(a)> d; 1007 const RebindToSigned<decltype(d)> di; 1008 return BitCast(d, Vec128<int32_t>{__lsx_vpermi_w(BitCast(di, b).raw, 1009 BitCast(di, a).raw, 0x6C)}); 1010 } 1011 1012 template <typename T, HWY_IF_T_SIZE(T, 1)> 1013 HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) { 1014 const int8_t _data_idx[] = {2, 1, 16, 19}; 1015 __m128i shuffle_idx = __lsx_vld(_data_idx, 0); 1016 return Vec32<T>{__lsx_vshuf_b(b.raw, a.raw, shuffle_idx)}; 1017 } 1018 template <typename T, HWY_IF_T_SIZE(T, 2)> 1019 HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) { 1020 const int16_t _data_idx[] = {8, 9, 0, 3}; 1021 __m128i shuffle_idx = __lsx_vld(_data_idx, 0); 1022 return Vec64<T>{__lsx_vshuf_h(shuffle_idx, a.raw, b.raw)}; 1023 } 1024 template <typename T, HWY_IF_T_SIZE(T, 4)> 1025 HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) { 1026 const DFromV<decltype(a)> d; 1027 const RebindToSigned<decltype(d)> di; 1028 return BitCast(d, Vec128<int32_t>{__lsx_vpermi_w(BitCast(di, b).raw, 1029 BitCast(di, a).raw, 0xC6)}); 1030 } 1031 1032 } // namespace detail 1033 1034 // Swap 64-bit halves 1035 template <typename T, HWY_IF_T_SIZE(T, 4)> 1036 HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) { 1037 const DFromV<decltype(v)> d; 1038 return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w( 1039 reinterpret_cast<__m128i>(v.raw), 0x4E)}); 1040 } 1041 HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) { 1042 return Vec128<uint64_t>{__lsx_vshuf4i_w(v.raw, 0x4E)}; 1043 } 1044 HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) { 1045 return Vec128<int64_t>{__lsx_vshuf4i_w(v.raw, 0x4E)}; 1046 } 1047 HWY_API Vec128<double> Shuffle01(const Vec128<double> v) { 1048 const DFromV<decltype(v)> d; 1049 return BitCast(d, Vec128<uint64_t>{__lsx_vshuf4i_d( 1050 reinterpret_cast<__m128i>(v.raw), 1051 reinterpret_cast<__m128i>(v.raw), 0x1)}); 1052 } 
// Rotate right 32 bits
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), 0x39)});
}
// Rotate left 32 bits
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), 0x93)});
}
// Reverse
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  return BitCast(d, Vec128<uint32_t>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), 0x1B)});
}

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// Reinterprets a mask of equally-sized lanes as a mask for another lane type
// (no lane-count or bit-pattern change).
template <class DTo, typename TFrom, size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)>
HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  const Simd<TFrom, NFrom, 0> d;
  return MaskFromVec(BitCast(dto, VecFromMask(d, m)));
}

// ================================================== COMPARE

// True in lanes where all bits of `bit` are also set in `v`.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Integer equality is sign-agnostic, so signed and unsigned share vseq.*.

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(Vec128<uint8_t, N> a,
                                       Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{__lsx_vseq_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(Vec128<uint16_t, N> a,
                                        Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{__lsx_vseq_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(Vec128<uint32_t, N> a,
                                        Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{__lsx_vseq_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{__lsx_vseq_d(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(Vec128<int8_t, N> a,
                                      Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{__lsx_vseq_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{__lsx_vseq_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(Vec128<int32_t, N> a,
                                       Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{__lsx_vseq_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{__lsx_vseq_d(a.raw, b.raw)};
}

// Float
// CEQ is the ordered compare: any NaN operand yields false.
template <size_t N>
HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{
      reinterpret_cast<__m128>(__lsx_vfcmp_ceq_s(a.raw, b.raw))};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Mask128<double, N>{
      reinterpret_cast<__m128d>(__lsx_vfcmp_ceq_d(a.raw, b.raw))};
}

// ------------------------------ Inequality

// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
1157 template <size_t N> 1158 HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a, 1159 Vec128<uint8_t, N> b) { 1160 return Not(a == b); 1161 } 1162 template <size_t N> 1163 HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a, 1164 Vec128<uint16_t, N> b) { 1165 return Not(a == b); 1166 } 1167 template <size_t N> 1168 HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a, 1169 Vec128<uint32_t, N> b) { 1170 return Not(a == b); 1171 } 1172 template <size_t N> 1173 HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a, 1174 Vec128<uint64_t, N> b) { 1175 return Not(a == b); 1176 } 1177 template <size_t N> 1178 HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a, 1179 Vec128<int8_t, N> b) { 1180 return Not(a == b); 1181 } 1182 template <size_t N> 1183 HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a, 1184 Vec128<int16_t, N> b) { 1185 return Not(a == b); 1186 } 1187 template <size_t N> 1188 HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a, 1189 Vec128<int32_t, N> b) { 1190 return Not(a == b); 1191 } 1192 template <size_t N> 1193 HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a, 1194 Vec128<int64_t, N> b) { 1195 return Not(a == b); 1196 } 1197 1198 template <size_t N> 1199 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) { 1200 return Mask128<float, N>{ 1201 reinterpret_cast<__m128>(__lsx_vfcmp_cune_s(a.raw, b.raw))}; 1202 } 1203 template <size_t N> 1204 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a, 1205 Vec128<double, N> b) { 1206 return Mask128<double, N>{ 1207 reinterpret_cast<__m128d>(__lsx_vfcmp_cune_d(a.raw, b.raw))}; 1208 } 1209 1210 // ------------------------------ Strict inequality 1211 1212 namespace detail { 1213 1214 template <size_t N> 1215 HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a, 1216 Vec128<int8_t, N> b) { 1217 return Mask128<int8_t, N>{__lsx_vslt_b(b.raw, a.raw)}; 1218 } 1219 template <size_t N> 1220 
HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a, 1221 Vec128<int16_t, N> b) { 1222 return Mask128<int16_t, N>{__lsx_vslt_h(b.raw, a.raw)}; 1223 } 1224 template <size_t N> 1225 HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a, 1226 Vec128<int32_t, N> b) { 1227 return Mask128<int32_t, N>{__lsx_vslt_w(b.raw, a.raw)}; 1228 } 1229 template <size_t N> 1230 HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/, 1231 const Vec128<int64_t, N> a, 1232 const Vec128<int64_t, N> b) { 1233 return Mask128<int64_t, N>{__lsx_vslt_d(b.raw, a.raw)}; 1234 } 1235 1236 template <size_t N> 1237 HWY_INLINE Mask128<uint8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<uint8_t, N> a, 1238 Vec128<uint8_t, N> b) { 1239 return Mask128<uint8_t, N>{__lsx_vslt_b(b.raw, a.raw)}; 1240 } 1241 template <size_t N> 1242 HWY_INLINE Mask128<uint16_t, N> Gt(hwy::SignedTag /*tag*/, 1243 Vec128<uint16_t, N> a, 1244 Vec128<uint16_t, N> b) { 1245 return Mask128<uint16_t, N>{__lsx_vslt_h(b.raw, a.raw)}; 1246 } 1247 template <size_t N> 1248 HWY_INLINE Mask128<uint32_t, N> Gt(hwy::SignedTag /*tag*/, 1249 Vec128<uint32_t, N> a, 1250 Vec128<uint32_t, N> b) { 1251 return Mask128<uint32_t, N>{__lsx_vslt_w(b.raw, a.raw)}; 1252 } 1253 template <size_t N> 1254 HWY_INLINE Mask128<uint64_t, N> Gt(hwy::SignedTag /*tag*/, 1255 const Vec128<uint64_t, N> a, 1256 const Vec128<uint64_t, N> b) { 1257 return Mask128<uint64_t, N>{__lsx_vslt_d(b.raw, a.raw)}; 1258 } 1259 1260 template <typename T, size_t N> 1261 HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a, 1262 Vec128<T, N> b) { 1263 const DFromV<decltype(a)> du; 1264 const RebindToSigned<decltype(du)> di; 1265 const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1); 1266 const auto sa = BitCast(di, Xor(a, msb)); 1267 const auto sb = BitCast(di, Xor(b, msb)); 1268 return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); 1269 } 1270 1271 template <size_t N> 1272 HWY_INLINE Mask128<float, N> 
Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a, 1273 Vec128<float, N> b) { 1274 return Mask128<float, N>{ 1275 reinterpret_cast<__m128>(__lsx_vfcmp_clt_s(b.raw, a.raw))}; 1276 } 1277 template <size_t N> 1278 HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a, 1279 Vec128<double, N> b) { 1280 return Mask128<double, N>{ 1281 reinterpret_cast<__m128d>(__lsx_vfcmp_clt_d(b.raw, a.raw))}; 1282 } 1283 1284 } // namespace detail 1285 1286 template <typename T, size_t N> 1287 HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { 1288 return detail::Gt(hwy::TypeTag<T>(), a, b); 1289 } 1290 1291 // ------------------------------ Weak inequality 1292 1293 namespace detail { 1294 template <typename T, size_t N> 1295 HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a, 1296 Vec128<T, N> b) { 1297 return Not(Gt(tag, b, a)); 1298 } 1299 1300 template <typename T, size_t N> 1301 HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a, 1302 Vec128<T, N> b) { 1303 return Not(Gt(tag, b, a)); 1304 } 1305 1306 template <size_t N> 1307 HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a, 1308 Vec128<float, N> b) { 1309 return Mask128<float, N>{ 1310 reinterpret_cast<__m128>(__lsx_vfcmp_cle_s(b.raw, a.raw))}; 1311 } 1312 template <size_t N> 1313 HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a, 1314 Vec128<double, N> b) { 1315 return Mask128<double, N>{ 1316 reinterpret_cast<__m128d>(__lsx_vfcmp_cle_d(b.raw, a.raw))}; 1317 } 1318 1319 } // namespace detail 1320 1321 template <typename T, size_t N> 1322 HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { 1323 return detail::Ge(hwy::TypeTag<T>(), a, b); 1324 } 1325 1326 // ------------------------------ Reversed comparisons 1327 1328 template <typename T, size_t N> 1329 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) { 1330 return b > a; 1331 } 1332 1333 template <typename T, size_t N> 1334 HWY_API 
Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) { 1335 return b >= a; 1336 } 1337 1338 // ------------------------------ Iota (Load) 1339 1340 namespace detail { 1341 1342 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 1343 HWY_INLINE VFromD<D> Iota0(D d) { 1344 return Dup128VecFromValues( 1345 d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4}, 1346 TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9}, 1347 TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14}, 1348 TFromD<D>{15}); 1349 } 1350 1351 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)> 1352 HWY_INLINE VFromD<D> Iota0(D d) { 1353 return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, 1354 TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5}, 1355 TFromD<D>{6}, TFromD<D>{7}); 1356 } 1357 1358 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 1359 HWY_INLINE VFromD<D> Iota0(D d) { 1360 return Dup128VecFromValues( 1361 d, static_cast<TFromD<D>>(0), static_cast<TFromD<D>>(1), 1362 static_cast<TFromD<D>>(2), static_cast<TFromD<D>>(3)); 1363 } 1364 1365 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> 1366 HWY_INLINE VFromD<D> Iota0(D d) { 1367 return Dup128VecFromValues(d, static_cast<TFromD<D>>(0), 1368 static_cast<TFromD<D>>(1)); 1369 } 1370 1371 } // namespace detail 1372 1373 template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)> 1374 HWY_API VFromD<D> Iota(D d, const T2 first) { 1375 const auto result_iota = 1376 detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first)); 1377 return result_iota; 1378 } 1379 1380 // ------------------------------ FirstN (Iota, Lt) 1381 1382 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 1383 HWY_API MFromD<D> FirstN(D d, size_t num) { 1384 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper. 
1385 using TI = TFromD<decltype(di)>; 1386 return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num))); 1387 } 1388 1389 // ------------------------------ InterleaveLower 1390 1391 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides 1392 // the least-significant lane) and "b". To concatenate two half-width integers 1393 // into one, use ZipLower/Upper instead (also works with scalar). 1394 1395 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 1396 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 1397 return Vec128<T, N>{__lsx_vilvl_b(b.raw, a.raw)}; 1398 } 1399 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 1400 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 1401 return Vec128<T, N>{__lsx_vilvl_h(b.raw, a.raw)}; 1402 } 1403 template <typename T, size_t N, HWY_IF_UI32(T)> 1404 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 1405 return Vec128<T, N>{__lsx_vilvl_w(b.raw, a.raw)}; 1406 } 1407 template <typename T, size_t N, HWY_IF_UI64(T)> 1408 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 1409 return Vec128<T, N>{__lsx_vilvl_d(b.raw, a.raw)}; 1410 } 1411 1412 template <size_t N> 1413 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a, 1414 Vec128<float, N> b) { 1415 return Vec128<float, N>{reinterpret_cast<__m128>(__lsx_vilvl_w( 1416 reinterpret_cast<__m128i>(b.raw), reinterpret_cast<__m128i>(a.raw)))}; 1417 } 1418 template <size_t N> 1419 HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a, 1420 Vec128<double, N> b) { 1421 return Vec128<double, N>{reinterpret_cast<__m128d>(__lsx_vilvl_d( 1422 reinterpret_cast<__m128i>(b.raw), reinterpret_cast<__m128i>(a.raw)))}; 1423 } 1424 1425 // Generic for all vector lengths. 
// Tag-dispatched form; forwards to the two-argument InterleaveLower.
template <class D>
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}

// ------------------------------ BlendedStore

// Read-modify-write blend: lanes whose mask is false keep the value already
// at p. The whole vector at p is both read and written.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Integer addition wraps modulo 2^bits (vadd.*); signed and unsigned share
// the same instruction.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{__lsx_vadd_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{__lsx_vadd_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{__lsx_vadd_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{__lsx_vadd_d(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{__lsx_vadd_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{__lsx_vadd_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{__lsx_vadd_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{__lsx_vadd_d(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{__lsx_vfadd_s(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{__lsx_vfadd_d(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Integer subtraction also wraps modulo 2^bits (vsub.*).

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{__lsx_vsub_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{__lsx_vsub_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{__lsx_vsub_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{__lsx_vsub_d(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{__lsx_vsub_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{__lsx_vsub_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{__lsx_vsub_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{__lsx_vsub_d(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{__lsx_vfsub_s(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{__lsx_vfsub_d(a.raw, b.raw)};
}

// ------------------------------ SumsOf2
namespace detail {

// Pairwise widening horizontal add: each result lane is the sum of two
// adjacent input lanes, at twice the lane width (vhaddw.*).
template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_h_b(v.raw, v.raw)};
}
template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_hu_bu(v.raw, v.raw)};
}
template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_w_h(v.raw, v.raw)};
}
template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_wu_hu(v.raw, v.raw)};
}
template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_d_w(v.raw, v.raw)};
}
template <class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>{__lsx_vhaddw_du_wu(v.raw, v.raw)};
}

}  // namespace detail

// ------------------------------ SumsOf8
// Sums each group of 8 consecutive 8-bit lanes into one 64-bit lane via
// three successive pairwise widening horizontal adds.
template <size_t N>
HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
  __m128i temp = __lsx_vhaddw_hu_bu(v.raw, v.raw);
  temp = __lsx_vhaddw_wu_hu(temp, temp);
  return Vec128<uint64_t, N / 8>{__lsx_vhaddw_du_wu(temp, temp)};
}
template <size_t N>
HWY_API Vec128<int64_t, N / 8> SumsOf8(const Vec128<int8_t, N> v) {
  __m128i temp = __lsx_vhaddw_h_b(v.raw, v.raw);
  temp = __lsx_vhaddw_w_h(temp, temp);
  return Vec128<int64_t, N / 8>{__lsx_vhaddw_d_w(temp, temp)};
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.
// Per-target toggles informing generic_ops-inl.h that 32/64-bit saturated
// add/sub are natively supported here.

#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{__lsx_vsadd_bu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{__lsx_vsadd_hu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> SaturatedAdd(const Vec128<uint32_t, N> a,
                                         const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{__lsx_vsadd_wu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> SaturatedAdd(const Vec128<uint64_t, N> a,
                                         const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{__lsx_vsadd_du(a.raw, b.raw)};
}

// signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{__lsx_vsadd_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{__lsx_vsadd_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> SaturatedAdd(const Vec128<int32_t, N> a,
                                        const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{__lsx_vsadd_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> SaturatedAdd(const Vec128<int64_t, N> a,
                                        const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{__lsx_vsadd_d(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{__lsx_vssub_bu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{__lsx_vssub_hu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> SaturatedSub(const Vec128<uint32_t, N> a,
                                         const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{__lsx_vssub_wu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> SaturatedSub(const Vec128<uint64_t, N> a,
                                         const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{__lsx_vssub_du(a.raw, b.raw)};
}

// signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{__lsx_vssub_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{__lsx_vssub_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> SaturatedSub(const Vec128<int32_t, N> a,
                                        const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{__lsx_vssub_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> SaturatedSub(const Vec128<int64_t, N> a,
                                        const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{__lsx_vssub_d(a.raw, b.raw)};
}

// ------------------------------ AverageRound

// Returns (a + b + 1) / 2

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI64
#endif

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{__lsx_vavgr_bu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{__lsx_vavgr_hu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> AverageRound(const Vec128<uint32_t, N> a,
                                         const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{__lsx_vavgr_wu(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> AverageRound(const Vec128<uint64_t, N> a,
                                         const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{__lsx_vavgr_du(a.raw, b.raw)};
}

// signed
template <size_t N>
HWY_API Vec128<int8_t, N> AverageRound(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{__lsx_vavgr_b(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> AverageRound(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{__lsx_vavgr_h(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> AverageRound(const Vec128<int32_t, N> a,
                                        const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{__lsx_vavgr_w(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> AverageRound(const Vec128<int64_t, N> a,
                                        const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{__lsx_vavgr_d(a.raw, b.raw)};
}

// ------------------------------ Integer/Float multiplication

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

// Lane-wise product, truncated to the lane width (vmul.* is sign-agnostic).
template <typename T, size_t N, HWY_IF_UI8(T)>
HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Vec128<T, N>{__lsx_vmul_b(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_UI16(T)>
HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Vec128<T, N>{__lsx_vmul_h(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Vec128<T, N>{__lsx_vmul_w(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N> operator*(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Vec128<T, N>{__lsx_vmul_d(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator*(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{__lsx_vfmul_s(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{__lsx_vfmul_d(a.raw, b.raw)};
}

// ------------------------------ MulHigh

// Upper half of the double-width product of each lane pair (vmuh.*).

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
                                   const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{__lsx_vmuh_bu(a.raw, b.raw)};
}
template <size_t N> 1836 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a, 1837 const Vec128<uint16_t, N> b) { 1838 return Vec128<uint16_t, N>{__lsx_vmuh_hu(a.raw, b.raw)}; 1839 } 1840 template <size_t N> 1841 HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a, 1842 const Vec128<uint32_t, N> b) { 1843 return Vec128<uint32_t, N>{__lsx_vmuh_wu(a.raw, b.raw)}; 1844 } 1845 template <size_t N> 1846 HWY_API Vec128<uint64_t, N> MulHigh(const Vec128<uint64_t, N> a, 1847 const Vec128<uint64_t, N> b) { 1848 return Vec128<uint64_t, N>{__lsx_vmuh_du(a.raw, b.raw)}; 1849 } 1850 1851 // signed 1852 template <size_t N> 1853 HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a, 1854 const Vec128<int8_t, N> b) { 1855 return Vec128<int8_t, N>{__lsx_vmuh_b(a.raw, b.raw)}; 1856 } 1857 template <size_t N> 1858 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a, 1859 const Vec128<int16_t, N> b) { 1860 return Vec128<int16_t, N>{__lsx_vmuh_h(a.raw, b.raw)}; 1861 } 1862 template <size_t N> 1863 HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a, 1864 const Vec128<int32_t, N> b) { 1865 return Vec128<int32_t, N>{__lsx_vmuh_w(a.raw, b.raw)}; 1866 } 1867 template <size_t N> 1868 HWY_API Vec128<int64_t, N> MulHigh(const Vec128<int64_t, N> a, 1869 const Vec128<int64_t, N> b) { 1870 return Vec128<int64_t, N>{__lsx_vmuh_d(a.raw, b.raw)}; 1871 } 1872 1873 // ------------------------------ MulEven 1874 1875 template <size_t N> 1876 HWY_API Vec128<int16_t, (N + 1) / 2> MulEven(Vec128<int8_t, N> a, 1877 Vec128<int8_t, N> b) { 1878 return Vec128<int16_t, (N + 1) / 2>{__lsx_vmulwev_h_b(a.raw, b.raw)}; 1879 } 1880 1881 template <size_t N> 1882 HWY_API Vec128<uint16_t, (N + 1) / 2> MulEven(Vec128<uint8_t, N> a, 1883 Vec128<uint8_t, N> b) { 1884 return Vec128<uint16_t, (N + 1) / 2>{__lsx_vmulwev_h_bu(a.raw, b.raw)}; 1885 } 1886 1887 template <size_t N> 1888 HWY_API Vec128<int32_t, (N + 1) / 2> MulEven(Vec128<int16_t, N> a, 1889 Vec128<int16_t, N> b) 
{ 1890 return Vec128<int32_t, (N + 1) / 2>{__lsx_vmulwev_w_h(a.raw, b.raw)}; 1891 } 1892 1893 template <size_t N> 1894 HWY_API Vec128<uint32_t, (N + 1) / 2> MulEven(Vec128<uint16_t, N> a, 1895 Vec128<uint16_t, N> b) { 1896 return Vec128<uint32_t, (N + 1) / 2>{__lsx_vmulwev_w_hu(a.raw, b.raw)}; 1897 } 1898 1899 template <size_t N> 1900 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(Vec128<int32_t, N> a, 1901 Vec128<int32_t, N> b) { 1902 return Vec128<int64_t, (N + 1) / 2>{__lsx_vmulwev_d_w(a.raw, b.raw)}; 1903 } 1904 1905 template <size_t N> 1906 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a, 1907 Vec128<uint32_t, N> b) { 1908 return Vec128<uint64_t, (N + 1) / 2>{__lsx_vmulwev_d_wu(a.raw, b.raw)}; 1909 } 1910 1911 template <typename T, HWY_IF_I64(T)> 1912 HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) { 1913 return Vec128<T>{__lsx_vmulwev_q_d(a.raw, b.raw)}; 1914 } 1915 1916 template <typename T, HWY_IF_U64(T)> 1917 HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) { 1918 return Vec128<T>{__lsx_vmulwev_q_du(a.raw, b.raw)}; 1919 } 1920 1921 // ------------------------------ MulOdd 1922 1923 template <size_t N> 1924 HWY_API Vec128<int16_t, (N + 1) / 2> MulOdd(Vec128<int8_t, N> a, 1925 Vec128<int8_t, N> b) { 1926 return Vec128<int16_t, (N + 1) / 2>{__lsx_vmulwod_h_b(a.raw, b.raw)}; 1927 } 1928 1929 template <size_t N> 1930 HWY_API Vec128<uint16_t, (N + 1) / 2> MulOdd(Vec128<uint8_t, N> a, 1931 Vec128<uint8_t, N> b) { 1932 return Vec128<uint16_t, (N + 1) / 2>{__lsx_vmulwod_h_bu(a.raw, b.raw)}; 1933 } 1934 1935 template <size_t N> 1936 HWY_API Vec128<int32_t, (N + 1) / 2> MulOdd(Vec128<int16_t, N> a, 1937 Vec128<int16_t, N> b) { 1938 return Vec128<int32_t, (N + 1) / 2>{__lsx_vmulwod_w_h(a.raw, b.raw)}; 1939 } 1940 1941 template <size_t N> 1942 HWY_API Vec128<uint32_t, (N + 1) / 2> MulOdd(Vec128<uint16_t, N> a, 1943 Vec128<uint16_t, N> b) { 1944 return Vec128<uint32_t, (N + 1) / 2>{__lsx_vmulwod_w_hu(a.raw, b.raw)}; 1945 } 1946 1947 template 
<size_t N> 1948 HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(Vec128<int32_t, N> a, 1949 Vec128<int32_t, N> b) { 1950 return Vec128<int64_t, (N + 1) / 2>{__lsx_vmulwod_d_w(a.raw, b.raw)}; 1951 } 1952 1953 template <size_t N> 1954 HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a, 1955 Vec128<uint32_t, N> b) { 1956 return Vec128<uint64_t, (N + 1) / 2>{__lsx_vmulwod_d_wu(a.raw, b.raw)}; 1957 } 1958 1959 template <typename T, HWY_IF_I64(T)> 1960 HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) { 1961 return Vec128<T>{__lsx_vmulwod_q_d(a.raw, b.raw)}; 1962 } 1963 1964 template <typename T, HWY_IF_U64(T)> 1965 HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) { 1966 return Vec128<T>{__lsx_vmulwod_q_du(a.raw, b.raw)}; 1967 } 1968 1969 // ------------------------------ RotateRight (ShiftRight, Or) 1970 1971 template <int kBits, typename T, size_t N, HWY_IF_UI8(T)> 1972 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { 1973 return Vec128<T, N>{__lsx_vrotri_b(v.raw, kBits)}; 1974 } 1975 template <int kBits, typename T, size_t N, HWY_IF_UI16(T)> 1976 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { 1977 return Vec128<T, N>{__lsx_vrotri_h(v.raw, kBits)}; 1978 } 1979 template <int kBits, typename T, size_t N, HWY_IF_UI32(T)> 1980 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { 1981 return Vec128<T, N>{__lsx_vrotri_w(v.raw, kBits)}; 1982 } 1983 template <int kBits, typename T, size_t N, HWY_IF_UI64(T)> 1984 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { 1985 return Vec128<T, N>{__lsx_vrotri_d(v.raw, kBits)}; 1986 } 1987 1988 // ------------------------------ Ror 1989 #ifdef HWY_NATIVE_ROL_ROR_8 1990 #undef HWY_NATIVE_ROL_ROR_8 1991 #else 1992 #define HWY_NATIVE_ROL_ROR_8 1993 #endif 1994 1995 #ifdef HWY_NATIVE_ROL_ROR_16 1996 #undef HWY_NATIVE_ROL_ROR_16 1997 #else 1998 #define HWY_NATIVE_ROL_ROR_16 1999 #endif 2000 2001 #ifdef HWY_NATIVE_ROL_ROR_32_64 2002 #undef HWY_NATIVE_ROL_ROR_32_64 2003 #else 2004 #define 
HWY_NATIVE_ROL_ROR_32_64 2005 #endif 2006 2007 template <class T, size_t N, HWY_IF_UI8(T)> 2008 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 2009 return Vec128<T, N>{__lsx_vrotr_b(a.raw, b.raw)}; 2010 } 2011 2012 template <class T, size_t N, HWY_IF_UI16(T)> 2013 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 2014 return Vec128<T, N>{__lsx_vrotr_h(a.raw, b.raw)}; 2015 } 2016 2017 template <class T, size_t N, HWY_IF_UI32(T)> 2018 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 2019 return Vec128<T, N>{__lsx_vrotr_w(a.raw, b.raw)}; 2020 } 2021 2022 template <class T, size_t N, HWY_IF_UI64(T)> 2023 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 2024 return Vec128<T, N>{__lsx_vrotr_d(a.raw, b.raw)}; 2025 } 2026 2027 // Rol is generic for all vector lengths 2028 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 2029 HWY_API V Rol(V a, V b) { 2030 const DFromV<decltype(a)> d; 2031 const RebindToSigned<decltype(d)> di; 2032 2033 return Ror(a, BitCast(d, Neg(BitCast(di, b)))); 2034 } 2035 2036 // ------------------------------ RotateLeftSame/RotateRightSame 2037 2038 #ifdef HWY_NATIVE_ROL_ROR_SAME_8 2039 #undef HWY_NATIVE_ROL_ROR_SAME_8 2040 #else 2041 #define HWY_NATIVE_ROL_ROR_SAME_8 2042 #endif 2043 2044 #ifdef HWY_NATIVE_ROL_ROR_SAME_16 2045 #undef HWY_NATIVE_ROL_ROR_SAME_16 2046 #else 2047 #define HWY_NATIVE_ROL_ROR_SAME_16 2048 #endif 2049 2050 #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64 2051 #undef HWY_NATIVE_ROL_ROR_SAME_32_64 2052 #else 2053 #define HWY_NATIVE_ROL_ROR_SAME_32_64 2054 #endif 2055 2056 // RotateLeftSame/RotateRightSame are generic for all vector lengths 2057 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 2058 HWY_API V RotateLeftSame(V v, int bits) { 2059 using T = TFromV<V>; 2060 const DFromV<decltype(v)> d; 2061 return Rol(v, Set(d, static_cast<T>(bits))); 2062 } 2063 2064 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 2065 HWY_API V RotateRightSame(V v, int bits) { 2066 using T = 
TFromV<V>; 2067 const DFromV<decltype(v)> d; 2068 return Ror(v, Set(d, static_cast<T>(bits))); 2069 } 2070 2071 // ------------------------------ BroadcastSignBit 2072 2073 template <typename T, size_t N, HWY_IF_SIGNED(T)> 2074 HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) { 2075 return ShiftRight<sizeof(T) * 8 - 1>(v); 2076 } 2077 2078 // ------------------------------ Integer Abs 2079 2080 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. 2081 template <size_t N> 2082 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { 2083 return Vec128<int8_t, N>{__lsx_vabsd_b(v.raw, __lsx_vreplgr2vr_b(0))}; 2084 } 2085 template <size_t N> 2086 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { 2087 return Vec128<int16_t, N>{__lsx_vabsd_h(v.raw, __lsx_vreplgr2vr_b(0))}; 2088 } 2089 template <size_t N> 2090 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { 2091 return Vec128<int32_t, N>{__lsx_vabsd_w(v.raw, __lsx_vreplgr2vr_b(0))}; 2092 } 2093 template <size_t N> 2094 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) { 2095 return Vec128<int64_t, N>{__lsx_vabsd_d(v.raw, __lsx_vreplgr2vr_b(0))}; 2096 } 2097 2098 // ------------------------------ SaturatedAbs 2099 2100 #ifdef HWY_NATIVE_SATURATED_ABS 2101 #undef HWY_NATIVE_SATURATED_ABS 2102 #else 2103 #define HWY_NATIVE_SATURATED_ABS 2104 #endif 2105 2106 template <class V, HWY_IF_I8(TFromV<V>)> 2107 HWY_API V SaturatedAbs(V v) { 2108 const DFromV<decltype(v)> d; 2109 const RebindToUnsigned<decltype(d)> du; 2110 return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v)))); 2111 } 2112 template <class V, HWY_IF_I16(TFromV<V>)> 2113 HWY_API V SaturatedAbs(V v) { 2114 return Max(v, SaturatedSub(Zero(DFromV<V>()), v)); 2115 } 2116 template <class V, HWY_IF_I32(TFromV<V>)> 2117 HWY_API V SaturatedAbs(V v) { 2118 const auto abs_v = Abs(v); 2119 const DFromV<decltype(v)> d; 2120 const RebindToUnsigned<decltype(d)> du; 2121 return BitCast(d, 
Min(BitCast(du, abs_v), 2122 Set(du, static_cast<uint32_t>(LimitsMax<int32_t>())))); 2123 } 2124 template <class V, HWY_IF_I64(TFromV<V>)> 2125 HWY_API V SaturatedAbs(V v) { 2126 const auto abs_v = Abs(v); 2127 return Add(abs_v, BroadcastSignBit(abs_v)); 2128 } 2129 2130 // ------------------------------ IfNegativeThenElse 2131 template <typename T, size_t N> 2132 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, 2133 Vec128<T, N> no) { 2134 static_assert(IsSigned<T>(), "Only works for signed/float"); 2135 const DFromV<decltype(no)> d; 2136 const RebindToSigned<decltype(d)> di; 2137 2138 Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); 2139 return IfThenElse(m, yes, no); 2140 } 2141 2142 // ------------------------------ IfNegativeThenNegOrUndefIfZero 2143 2144 #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG 2145 #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG 2146 #else 2147 #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG 2148 #endif 2149 2150 template <size_t N> 2151 HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask, 2152 Vec128<int8_t, N> v) { 2153 return Vec128<int8_t, N>{__lsx_vsigncov_b(mask.raw, v.raw)}; 2154 } 2155 2156 template <size_t N> 2157 HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero( 2158 Vec128<int16_t, N> mask, Vec128<int16_t, N> v) { 2159 return Vec128<int16_t, N>{__lsx_vsigncov_h(mask.raw, v.raw)}; 2160 } 2161 2162 template <size_t N> 2163 HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero( 2164 Vec128<int32_t, N> mask, Vec128<int32_t, N> v) { 2165 return Vec128<int32_t, N>{__lsx_vsigncov_w(mask.raw, v.raw)}; 2166 } 2167 2168 template <size_t N> 2169 HWY_API Vec128<int64_t, N> IfNegativeThenNegOrUndefIfZero( 2170 Vec128<int64_t, N> mask, Vec128<int64_t, N> v) { 2171 return Vec128<int64_t, N>{__lsx_vsigncov_d(mask.raw, v.raw)}; 2172 } 2173 2174 // ------------------------------ ShiftLeftSame/ShiftRightSame 2175 2176 template <typename T, size_t N> 
2177 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) { 2178 return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits)); 2179 } 2180 template <typename T, size_t N> 2181 HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) { 2182 return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits)); 2183 } 2184 2185 // ------------------------------ Integer/Float Div 2186 2187 #ifdef HWY_NATIVE_INT_DIV 2188 #undef HWY_NATIVE_INT_DIV 2189 #else 2190 #define HWY_NATIVE_INT_DIV 2191 #endif 2192 2193 template <size_t N> 2194 HWY_API Vec128<int8_t, N> operator/(const Vec128<int8_t, N> a, 2195 const Vec128<int8_t, N> b) { 2196 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2197 // or a[i] == LimitsMin<int8_t>() && b[i] == -1 2198 __m128i raw_result; 2199 __asm__("vdiv.b %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2200 return Vec128<int8_t, N>{raw_result}; 2201 } 2202 2203 template <size_t N> 2204 HWY_API Vec128<uint8_t, N> operator/(const Vec128<uint8_t, N> a, 2205 const Vec128<uint8_t, N> b) { 2206 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2207 __m128i raw_result; 2208 __asm__("vdiv.bu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2209 return Vec128<uint8_t, N>{raw_result}; 2210 } 2211 2212 template <size_t N> 2213 HWY_API Vec128<int16_t, N> operator/(const Vec128<int16_t, N> a, 2214 const Vec128<int16_t, N> b) { 2215 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2216 // or a[i] == LimitsMin<int16_t>() && b[i] == -1 2217 __m128i raw_result; 2218 __asm__("vdiv.h %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2219 return Vec128<int16_t, N>{raw_result}; 2220 } 2221 2222 template <size_t N> 2223 HWY_API Vec128<uint16_t, N> operator/(const Vec128<uint16_t, N> a, 2224 const Vec128<uint16_t, N> b) { 2225 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2226 __m128i raw_result; 
2227 __asm__("vdiv.hu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2228 return Vec128<uint16_t, N>{raw_result}; 2229 } 2230 2231 template <size_t N> 2232 HWY_API Vec128<int32_t, N> operator/(const Vec128<int32_t, N> a, 2233 const Vec128<int32_t, N> b) { 2234 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2235 // or a[i] == LimitsMin<int32_t>() && b[i] == -1 2236 __m128i raw_result; 2237 __asm__("vdiv.w %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2238 return Vec128<int32_t, N>{raw_result}; 2239 } 2240 2241 template <size_t N> 2242 HWY_API Vec128<uint32_t, N> operator/(const Vec128<uint32_t, N> a, 2243 const Vec128<uint32_t, N> b) { 2244 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2245 __m128i raw_result; 2246 __asm__("vdiv.wu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2247 return Vec128<uint32_t, N>{raw_result}; 2248 } 2249 2250 template <size_t N> 2251 HWY_API Vec128<int64_t, N> operator/(const Vec128<int64_t, N> a, 2252 const Vec128<int64_t, N> b) { 2253 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2254 // or a[i] == LimitsMin<int64_t>() && b[i] == -1 2255 __m128i raw_result; 2256 __asm__("vdiv.d %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2257 return Vec128<int64_t, N>{raw_result}; 2258 } 2259 2260 template <size_t N> 2261 HWY_API Vec128<uint64_t, N> operator/(const Vec128<uint64_t, N> a, 2262 const Vec128<uint64_t, N> b) { 2263 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2264 __m128i raw_result; 2265 __asm__("vdiv.du %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2266 return Vec128<uint64_t, N>{raw_result}; 2267 } 2268 2269 template <size_t N> 2270 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, 2271 const Vec128<float, N> b) { 2272 return Vec128<float, N>{__lsx_vfdiv_s(a.raw, b.raw)}; 2273 } 2274 template <size_t N> 2275 HWY_API 
Vec128<double, N> operator/(const Vec128<double, N> a, 2276 const Vec128<double, N> b) { 2277 return Vec128<double, N>{__lsx_vfdiv_d(a.raw, b.raw)}; 2278 } 2279 2280 // ------------------------------ Integer Mod 2281 2282 template <size_t N> 2283 HWY_API Vec128<int8_t, N> operator%(const Vec128<int8_t, N> a, 2284 const Vec128<int8_t, N> b) { 2285 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2286 // or a[i] == LimitsMin<int8_t>() && b[i] == -1 2287 __m128i raw_result; 2288 __asm__("vmod.b %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2289 return Vec128<int8_t, N>{raw_result}; 2290 } 2291 2292 template <size_t N> 2293 HWY_API Vec128<uint8_t, N> operator%(const Vec128<uint8_t, N> a, 2294 const Vec128<uint8_t, N> b) { 2295 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2296 __m128i raw_result; 2297 __asm__("vmod.bu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2298 return Vec128<uint8_t, N>{raw_result}; 2299 } 2300 2301 template <size_t N> 2302 HWY_API Vec128<int16_t, N> operator%(const Vec128<int16_t, N> a, 2303 const Vec128<int16_t, N> b) { 2304 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2305 // or a[i] == LimitsMin<int16_t>() && b[i] == -1 2306 __m128i raw_result; 2307 __asm__("vmod.h %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2308 return Vec128<int16_t, N>{raw_result}; 2309 } 2310 2311 template <size_t N> 2312 HWY_API Vec128<uint16_t, N> operator%(const Vec128<uint16_t, N> a, 2313 const Vec128<uint16_t, N> b) { 2314 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2315 __m128i raw_result; 2316 __asm__("vmod.hu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2317 return Vec128<uint16_t, N>{raw_result}; 2318 } 2319 2320 template <size_t N> 2321 HWY_API Vec128<int32_t, N> operator%(const Vec128<int32_t, N> a, 2322 const Vec128<int32_t, N> b) { 2323 // Use inline assembly to avoid 
undefined behavior if any lanes of b are zero 2324 // or a[i] == LimitsMin<int32_t>() && b[i] == -1 2325 __m128i raw_result; 2326 __asm__("vmod.w %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2327 return Vec128<int32_t, N>{raw_result}; 2328 } 2329 2330 template <size_t N> 2331 HWY_API Vec128<uint32_t, N> operator%(const Vec128<uint32_t, N> a, 2332 const Vec128<uint32_t, N> b) { 2333 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2334 __m128i raw_result; 2335 __asm__("vmod.wu %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2336 return Vec128<uint32_t, N>{raw_result}; 2337 } 2338 2339 template <size_t N> 2340 HWY_API Vec128<int64_t, N> operator%(const Vec128<int64_t, N> a, 2341 const Vec128<int64_t, N> b) { 2342 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2343 // or a[i] == LimitsMin<int64_t>() && b[i] == -1 2344 __m128i raw_result; 2345 __asm__("vmod.d %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2346 return Vec128<int64_t, N>{raw_result}; 2347 } 2348 2349 template <size_t N> 2350 HWY_API Vec128<uint64_t, N> operator%(const Vec128<uint64_t, N> a, 2351 const Vec128<uint64_t, N> b) { 2352 // Use inline assembly to avoid undefined behavior if any lanes of b are zero 2353 __m128i raw_result; 2354 __asm__("vmod.du %w0,%w1,%w2" : "=f"(raw_result) : "f"(a.raw), "f"(b.raw) :); 2355 return Vec128<uint64_t, N>{raw_result}; 2356 } 2357 2358 // ------------------------------ ApproximateReciprocal 2359 2360 #ifdef HWY_NATIVE_F64_APPROX_RECIP 2361 #undef HWY_NATIVE_F64_APPROX_RECIP 2362 #else 2363 #define HWY_NATIVE_F64_APPROX_RECIP 2364 #endif 2365 2366 template <size_t N> 2367 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { 2368 return Vec128<float, N>{__lsx_vfrecip_s(v.raw)}; 2369 } 2370 template <size_t N> 2371 HWY_API Vec128<double, N> ApproximateReciprocal(const Vec128<double, N> v) { 2372 return Vec128<double, N>{__lsx_vfrecip_d(v.raw)}; 
2373 } 2374 2375 // ------------------------------ Absolute value of difference 2376 2377 #ifdef HWY_NATIVE_INTEGER_ABS_DIFF 2378 #undef HWY_NATIVE_INTEGER_ABS_DIFF 2379 #else 2380 #define HWY_NATIVE_INTEGER_ABS_DIFF 2381 #endif 2382 2383 template <size_t N> 2384 HWY_API Vec128<int8_t, N> AbsDiff(const Vec128<int8_t, N> a, 2385 Vec128<int8_t, N> b) { 2386 return Vec128<int8_t, N>{__lsx_vabsd_b(a.raw, b.raw)}; 2387 } 2388 template <size_t N> 2389 HWY_API Vec128<int16_t, N> AbsDiff(const Vec128<int16_t, N> a, 2390 Vec128<int16_t, N> b) { 2391 return Vec128<int16_t, N>{__lsx_vabsd_h(a.raw, b.raw)}; 2392 } 2393 template <size_t N> 2394 HWY_API Vec128<int32_t, N> AbsDiff(const Vec128<int32_t, N> a, 2395 Vec128<int32_t, N> b) { 2396 return Vec128<int32_t, N>{__lsx_vabsd_w(a.raw, b.raw)}; 2397 } 2398 template <size_t N> 2399 HWY_API Vec128<int64_t, N> AbsDiff(const Vec128<int64_t, N> a, 2400 Vec128<int64_t, N> b) { 2401 return Vec128<int64_t, N>{__lsx_vabsd_d(a.raw, b.raw)}; 2402 } 2403 2404 template <size_t N> 2405 HWY_API Vec128<uint8_t, N> AbsDiff(const Vec128<uint8_t, N> a, 2406 Vec128<uint8_t, N> b) { 2407 return Vec128<uint8_t, N>{__lsx_vabsd_bu(a.raw, b.raw)}; 2408 } 2409 template <size_t N> 2410 HWY_API Vec128<uint16_t, N> AbsDiff(const Vec128<uint16_t, N> a, 2411 Vec128<uint16_t, N> b) { 2412 return Vec128<uint16_t, N>{__lsx_vabsd_hu(a.raw, b.raw)}; 2413 } 2414 template <size_t N> 2415 HWY_API Vec128<uint32_t, N> AbsDiff(const Vec128<uint32_t, N> a, 2416 Vec128<uint32_t, N> b) { 2417 return Vec128<uint32_t, N>{__lsx_vabsd_wu(a.raw, b.raw)}; 2418 } 2419 template <size_t N> 2420 HWY_API Vec128<uint64_t, N> AbsDiff(const Vec128<uint64_t, N> a, 2421 Vec128<uint64_t, N> b) { 2422 return Vec128<uint64_t, N>{__lsx_vabsd_du(a.raw, b.raw)}; 2423 } 2424 2425 // Generic for all vector lengths. 
2426 template <class V, HWY_IF_FLOAT_V(V)> 2427 HWY_API V AbsDiff(V a, V b) { 2428 return Abs(a - b); 2429 } 2430 2431 // ------------------------------ Integer/Float multiply-add 2432 2433 #ifdef HWY_NATIVE_INT_FMA 2434 #undef HWY_NATIVE_INT_FMA 2435 #else 2436 #define HWY_NATIVE_INT_FMA 2437 #endif 2438 2439 template <size_t N> 2440 HWY_API Vec128<int8_t, N> MulAdd(Vec128<int8_t, N> mul, Vec128<int8_t, N> x, 2441 Vec128<int8_t, N> add) { 2442 return Vec128<int8_t, N>{__lsx_vmadd_b(add.raw, mul.raw, x.raw)}; 2443 } 2444 template <size_t N> 2445 HWY_API Vec128<int16_t, N> MulAdd(Vec128<int16_t, N> mul, Vec128<int16_t, N> x, 2446 Vec128<int16_t, N> add) { 2447 return Vec128<int16_t, N>{__lsx_vmadd_h(add.raw, mul.raw, x.raw)}; 2448 } 2449 template <size_t N> 2450 HWY_API Vec128<int32_t, N> MulAdd(Vec128<int32_t, N> mul, Vec128<int32_t, N> x, 2451 Vec128<int32_t, N> add) { 2452 return Vec128<int32_t, N>{__lsx_vmadd_w(add.raw, mul.raw, x.raw)}; 2453 } 2454 template <size_t N> 2455 HWY_API Vec128<int64_t, N> MulAdd(Vec128<int64_t, N> mul, Vec128<int64_t, N> x, 2456 Vec128<int64_t, N> add) { 2457 return Vec128<int64_t, N>{__lsx_vmadd_d(add.raw, mul.raw, x.raw)}; 2458 } 2459 2460 template <size_t N> 2461 HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x, 2462 Vec128<float, N> add) { 2463 return Vec128<float, N>{__lsx_vfmadd_s(mul.raw, x.raw, add.raw)}; 2464 } 2465 template <size_t N> 2466 HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x, 2467 Vec128<double, N> add) { 2468 return Vec128<double, N>{__lsx_vfmadd_d(mul.raw, x.raw, add.raw)}; 2469 } 2470 2471 // Unsinged 2472 template <typename T, size_t N, HWY_IF_UNSIGNED(T)> 2473 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2474 Vec128<T, N> add) { 2475 return mul * x + add; 2476 } 2477 2478 // ------------------------------ Integer/Float NegMulAdd 2479 2480 template <size_t N> 2481 HWY_API Vec128<int8_t, N> NegMulAdd(Vec128<int8_t, N> mul, Vec128<int8_t, N> 
x, 2482 Vec128<int8_t, N> add) { 2483 return Vec128<int8_t, N>{__lsx_vmsub_b(add.raw, mul.raw, x.raw)}; 2484 } 2485 template <size_t N> 2486 HWY_API Vec128<int16_t, N> NegMulAdd(Vec128<int16_t, N> mul, 2487 Vec128<int16_t, N> x, 2488 Vec128<int16_t, N> add) { 2489 return Vec128<int16_t, N>{__lsx_vmsub_h(add.raw, mul.raw, x.raw)}; 2490 } 2491 template <size_t N> 2492 HWY_API Vec128<int32_t, N> NegMulAdd(Vec128<int32_t, N> mul, 2493 Vec128<int32_t, N> x, 2494 Vec128<int32_t, N> sub) { 2495 return Vec128<int32_t, N>{__lsx_vmsub_w(sub.raw, mul.raw, x.raw)}; 2496 } 2497 template <size_t N> 2498 HWY_API Vec128<int64_t, N> NegMulAdd(Vec128<int64_t, N> mul, 2499 Vec128<int64_t, N> x, 2500 Vec128<int64_t, N> sub) { 2501 return Vec128<int64_t, N>{__lsx_vmsub_d(sub.raw, mul.raw, x.raw)}; 2502 } 2503 2504 // Float/unsigned 2505 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> 2506 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2507 Vec128<T, N> add) { 2508 return add - mul * x; 2509 } 2510 2511 // ------------------------------ Float MulSub 2512 2513 // float 2514 template <size_t N> 2515 HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x, 2516 Vec128<float, N> sub) { 2517 return Vec128<float, N>{__lsx_vfmsub_s(x.raw, mul.raw, sub.raw)}; 2518 } 2519 template <size_t N> 2520 HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x, 2521 Vec128<double, N> sub) { 2522 return Vec128<double, N>{__lsx_vfmsub_d(x.raw, mul.raw, sub.raw)}; 2523 } 2524 2525 // unsigned 2526 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> 2527 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x, 2528 Vec128<T, N> sub) { 2529 return mul * x - sub; 2530 } 2531 2532 // ------------------------------ Float NegMulSub 2533 2534 // float/unsigned 2535 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> 2536 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x, 2537 Vec128<T, N> sub) { 2538 return 
Neg(mul) * x - sub; 2539 } 2540 2541 // ------------------------------ Floating-point square root 2542 2543 template <size_t N> 2544 HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) { 2545 return Vec128<float, N>{__lsx_vfsqrt_s(v.raw)}; 2546 } 2547 template <size_t N> 2548 HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) { 2549 return Vec128<double, N>{__lsx_vfsqrt_d(v.raw)}; 2550 } 2551 2552 // ------------------------------ ApproximateReciprocalSqrt 2553 #ifdef HWY_NATIVE_F64_APPROX_RSQRT 2554 #undef HWY_NATIVE_F64_APPROX_RSQRT 2555 #else 2556 #define HWY_NATIVE_F64_APPROX_RSQRT 2557 #endif 2558 2559 template <size_t N> 2560 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) { 2561 return Vec128<float, N>{__lsx_vfrsqrt_s(v.raw)}; 2562 } 2563 template <size_t N> 2564 HWY_API Vec128<double, N> ApproximateReciprocalSqrt(Vec128<double, N> v) { 2565 return Vec128<double, N>{__lsx_vfrsqrt_d(v.raw)}; 2566 } 2567 2568 // ------------------------------ Min 2569 2570 template <size_t N> 2571 HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 2572 return Vec128<uint8_t, N>{__lsx_vmin_bu(a.raw, b.raw)}; 2573 } 2574 template <size_t N> 2575 HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { 2576 return Vec128<uint16_t, N>{__lsx_vmin_hu(a.raw, b.raw)}; 2577 } 2578 template <size_t N> 2579 HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { 2580 return Vec128<uint32_t, N>{__lsx_vmin_wu(a.raw, b.raw)}; 2581 } 2582 template <size_t N> 2583 HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { 2584 return Vec128<uint64_t, N>{__lsx_vmin_du(a.raw, b.raw)}; 2585 } 2586 2587 template <size_t N> 2588 HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 2589 return Vec128<int8_t, N>{__lsx_vmin_b(a.raw, b.raw)}; 2590 } 2591 template <size_t N> 2592 HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { 2593 return 
Vec128<int16_t, N>{__lsx_vmin_h(a.raw, b.raw)}; 2594 } 2595 template <size_t N> 2596 HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { 2597 return Vec128<int32_t, N>{__lsx_vmin_w(a.raw, b.raw)}; 2598 } 2599 template <size_t N> 2600 HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { 2601 return Vec128<int64_t, N>{__lsx_vmin_d(a.raw, b.raw)}; 2602 } 2603 2604 template <size_t N> 2605 HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) { 2606 return Vec128<float, N>{__lsx_vfmin_s(a.raw, b.raw)}; 2607 } 2608 template <size_t N> 2609 HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) { 2610 return Vec128<double, N>{__lsx_vfmin_d(a.raw, b.raw)}; 2611 } 2612 2613 // ------------------------------ Max 2614 2615 template <size_t N> 2616 HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 2617 return Vec128<uint8_t, N>{__lsx_vmax_bu(a.raw, b.raw)}; 2618 } 2619 template <size_t N> 2620 HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { 2621 return Vec128<uint16_t, N>{__lsx_vmax_hu(a.raw, b.raw)}; 2622 } 2623 template <size_t N> 2624 HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { 2625 return Vec128<uint32_t, N>{__lsx_vmax_wu(a.raw, b.raw)}; 2626 } 2627 template <size_t N> 2628 HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { 2629 return Vec128<uint64_t, N>{__lsx_vmax_du(a.raw, b.raw)}; 2630 } 2631 2632 template <size_t N> 2633 HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 2634 return Vec128<int8_t, N>{__lsx_vmax_b(a.raw, b.raw)}; 2635 } 2636 template <size_t N> 2637 HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { 2638 return Vec128<int16_t, N>{__lsx_vmax_h(a.raw, b.raw)}; 2639 } 2640 template <size_t N> 2641 HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { 2642 return Vec128<int32_t, 
N>{__lsx_vmax_w(a.raw, b.raw)}; 2643 } 2644 template <size_t N> 2645 HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { 2646 return Vec128<int64_t, N>{__lsx_vmax_d(a.raw, b.raw)}; 2647 } 2648 2649 template <size_t N> 2650 HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) { 2651 return Vec128<float, N>{__lsx_vfmax_s(a.raw, b.raw)}; 2652 } 2653 template <size_t N> 2654 HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) { 2655 return Vec128<double, N>{__lsx_vfmax_d(a.raw, b.raw)}; 2656 } 2657 2658 // ------------------------------ MinMagnitude and MaxMagnitude 2659 2660 #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE 2661 #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE 2662 #else 2663 #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE 2664 #endif 2665 2666 template <size_t N> 2667 HWY_API Vec128<float, N> MinMagnitude(Vec128<float, N> a, Vec128<float, N> b) { 2668 return Vec128<float, N>{__lsx_vfmina_s(a.raw, b.raw)}; 2669 } 2670 template <size_t N> 2671 HWY_API Vec128<double, N> MinMagnitude(Vec128<double, N> a, 2672 Vec128<double, N> b) { 2673 return Vec128<double, N>{__lsx_vfmina_d(a.raw, b.raw)}; 2674 } 2675 2676 template <size_t N> 2677 HWY_API Vec128<float, N> MaxMagnitude(Vec128<float, N> a, Vec128<float, N> b) { 2678 return Vec128<float, N>{__lsx_vfmaxa_s(a.raw, b.raw)}; 2679 } 2680 template <size_t N> 2681 HWY_API Vec128<double, N> MaxMagnitude(Vec128<double, N> a, 2682 Vec128<double, N> b) { 2683 return Vec128<double, N>{__lsx_vfmaxa_d(a.raw, b.raw)}; 2684 } 2685 2686 // ------------------------------ Non-temporal stores 2687 2688 // Same as aligned stores on non-x86. 
2689 2690 template <class D> 2691 HWY_API void Stream(const VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { 2692 __builtin_prefetch(aligned, 1, 0); 2693 Store(v, d, aligned); 2694 } 2695 2696 // ------------------------------ Scatter in generic_ops-inl.h 2697 // ------------------------------ Gather in generic_ops-inl.h 2698 2699 // ================================================== SWIZZLE (2) 2700 2701 // ------------------------------ LowerHalf 2702 2703 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 2704 HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { 2705 return VFromD<D>{v.raw}; 2706 } 2707 template <typename T, size_t N> 2708 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { 2709 return Vec128<T, N / 2>{v.raw}; 2710 } 2711 2712 // ------------------------------ ShiftLeftBytes 2713 2714 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> 2715 HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) { 2716 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); 2717 if (kBytes == 0) return v; 2718 const RebindToUnsigned<decltype(d)> du; 2719 return BitCast( 2720 d, VFromD<decltype(du)>{__lsx_vbsll_v(BitCast(du, v).raw, kBytes)}); 2721 } 2722 2723 // Generic for all vector lengths. 2724 template <int kBytes, class V> 2725 HWY_API V ShiftLeftBytes(const V v) { 2726 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); 2727 } 2728 2729 // ------------------------------ ShiftLeftLanes 2730 2731 // Generic for all vector lengths. 2732 template <int kLanes, class D> 2733 HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) { 2734 const Repartition<uint8_t, decltype(d)> d8; 2735 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v))); 2736 } 2737 2738 // Generic for all vector lengths. 
template <int kLanes, class V>
HWY_API V ShiftLeftLanes(const V v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  const RebindToUnsigned<decltype(d)> du;
  // For partial vectors, clear upper lanes so we shift in zeros.
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    const VFromD<decltype(dfull)> vfull{v.raw};
    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
  }
  return BitCast(
      d, VFromD<decltype(du)>{__lsx_vbsrl_v(BitCast(du, v).raw, kBytes)});
}

// ------------------------------ ShiftRightLanes
// Generic for all vector lengths.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// 64-bit half vector: duplicate the upper 64 bits into the lower position.
template <class D, HWY_IF_V_SIZE_D(D, 8)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  const Twice<RebindToUnsigned<decltype(d)>> dut;
  using VUT = VFromD<decltype(dut)>;  // for float16_t
  const VUT vut = BitCast(dut, v);
  return BitCast(d, LowerHalf(VUT{__lsx_vilvh_d(vut.raw, vut.raw)}));
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
}

// ------------------------------ ExtractLane (UpperHalf)

namespace detail {

template <size_t kLane, typename T,
size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
  // Mask off any sign extension performed by the pick intrinsic.
  return static_cast<T>(__lsx_vpickve2gr_b(v.raw, kLane) & 0xFF);
}

template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  // Extract the bit pattern so this also works for float16_t/bfloat16_t.
  const uint16_t lane = static_cast<uint16_t>(
      __lsx_vpickve2gr_hu(BitCast(du, v).raw, kLane) & 0xFFFF);
  return BitCastScalar<T>(lane);
}

template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
  return static_cast<T>(__lsx_vpickve2gr_w(v.raw, kLane));
}

template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  static_assert(kLane < N, "Lane index out of bounds");
  return static_cast<T>(__lsx_vpickve2gr_d(v.raw, kLane));
}

// f32/f64: extract the bits via the integer domain, then bit-copy them into
// a float of the same width.
template <size_t kLane, size_t N>
HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
  float f32;
  int32_t i32 = __lsx_vpickve2gr_w(reinterpret_cast<__m128i>(v.raw), kLane);
  CopyBytes<4>(&i32, &f32);
  return f32;
}
template <size_t kLane, size_t N>
HWY_INLINE double ExtractLane(const Vec128<double, N> v) {
  double f64;
  int64_t i64 = __lsx_vpickve2gr_d(reinterpret_cast<__m128i>(v.raw), kLane);
  CopyBytes<8>(&i64, &f64);
  return f64;
}

}  // namespace detail

// Returns lane i of v. The pick intrinsics require a compile-time index, so
// when the compiler can prove i constant we dispatch through a switch;
// otherwise we fall back to store + scalar load.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return GetLane(v);
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return
detail::ExtractLane<2>(v); 2919 case 3: 2920 return detail::ExtractLane<3>(v); 2921 case 4: 2922 return detail::ExtractLane<4>(v); 2923 case 5: 2924 return detail::ExtractLane<5>(v); 2925 case 6: 2926 return detail::ExtractLane<6>(v); 2927 case 7: 2928 return detail::ExtractLane<7>(v); 2929 case 8: 2930 return detail::ExtractLane<8>(v); 2931 case 9: 2932 return detail::ExtractLane<9>(v); 2933 case 10: 2934 return detail::ExtractLane<10>(v); 2935 case 11: 2936 return detail::ExtractLane<11>(v); 2937 case 12: 2938 return detail::ExtractLane<12>(v); 2939 case 13: 2940 return detail::ExtractLane<13>(v); 2941 case 14: 2942 return detail::ExtractLane<14>(v); 2943 case 15: 2944 return detail::ExtractLane<15>(v); 2945 } 2946 } 2947 #endif 2948 alignas(16) T lanes[16]; 2949 Store(v, DFromV<decltype(v)>(), lanes); 2950 return lanes[i]; 2951 } 2952 2953 // ------------------------------ InsertLane (UpperHalf) 2954 2955 namespace detail { 2956 2957 template <class V> 2958 HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) { 2959 const DFromV<decltype(v)> d; 2960 2961 #if HWY_TARGET <= HWY_AVX3 2962 using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw); 2963 const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)}; 2964 #else 2965 const RebindToUnsigned<decltype(d)> du; 2966 using TU = TFromD<decltype(du)>; 2967 const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i))); 2968 #endif 2969 2970 return IfThenElse(mask, Set(d, t), v); 2971 } 2972 2973 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 2974 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 2975 static_assert(kLane < N, "Lane index out of bounds"); 2976 return Vec128<T, N>{__lsx_vinsgr2vr_b(v.raw, t, kLane)}; 2977 } 2978 2979 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 2980 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 2981 static_assert(kLane < N, "Lane index out of bounds"); 
2982 const DFromV<decltype(v)> d; 2983 const RebindToUnsigned<decltype(d)> du; 2984 const uint16_t bits = BitCastScalar<uint16_t>(t); 2985 return BitCast(d, VFromD<decltype(du)>{ 2986 __lsx_vinsgr2vr_h(BitCast(du, v).raw, bits, kLane)}); 2987 } 2988 template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)> 2989 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 2990 static_assert(kLane < N, "Lane index out of bounds"); 2991 return Vec128<T, N>{__lsx_vinsgr2vr_w(v.raw, t, kLane)}; 2992 } 2993 template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)> 2994 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 2995 static_assert(kLane < N, "Lane index out of bounds"); 2996 return Vec128<T, N>{__lsx_vinsgr2vr_d(v.raw, t, kLane)}; 2997 } 2998 2999 template <size_t kLane, size_t N> 3000 HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) { 3001 static_assert(kLane < N, "Lane index out of bounds"); 3002 const DFromV<decltype(v)> d; 3003 int ti = BitCastScalar<int>(t); 3004 RebindToUnsigned<decltype(d)> du; 3005 return BitCast(d, VFromD<decltype(du)>{__lsx_vinsgr2vr_w( 3006 reinterpret_cast<__m128i>(v.raw), ti, kLane)}); 3007 } 3008 3009 template <size_t kLane> 3010 HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) { 3011 static_assert(kLane < 2, "Lane index out of bounds"); 3012 const DFromV<decltype(v)> d; 3013 long int ti = BitCastScalar<long int>(t); 3014 RebindToUnsigned<decltype(d)> du; 3015 return BitCast(d, VFromD<decltype(du)>{__lsx_vinsgr2vr_d( 3016 reinterpret_cast<__m128i>(v.raw), ti, kLane)}); 3017 } 3018 3019 } // namespace detail 3020 3021 template <typename T> 3022 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { 3023 HWY_DASSERT(i == 0); 3024 (void)i; 3025 return Set(DFromV<decltype(v)>(), t); 3026 } 3027 3028 template <typename T> 3029 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { 3030 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // 
includes clang 3031 if (__builtin_constant_p(i)) { 3032 switch (i) { 3033 case 0: 3034 return detail::InsertLane<0>(v, t); 3035 case 1: 3036 return detail::InsertLane<1>(v, t); 3037 } 3038 } 3039 #endif 3040 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); 3041 } 3042 3043 template <typename T> 3044 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { 3045 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3046 if (__builtin_constant_p(i)) { 3047 switch (i) { 3048 case 0: 3049 return detail::InsertLane<0>(v, t); 3050 case 1: 3051 return detail::InsertLane<1>(v, t); 3052 case 2: 3053 return detail::InsertLane<2>(v, t); 3054 case 3: 3055 return detail::InsertLane<3>(v, t); 3056 } 3057 } 3058 #endif 3059 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); 3060 } 3061 3062 template <typename T> 3063 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) { 3064 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3065 if (__builtin_constant_p(i)) { 3066 switch (i) { 3067 case 0: 3068 return detail::InsertLane<0>(v, t); 3069 case 1: 3070 return detail::InsertLane<1>(v, t); 3071 case 2: 3072 return detail::InsertLane<2>(v, t); 3073 case 3: 3074 return detail::InsertLane<3>(v, t); 3075 case 4: 3076 return detail::InsertLane<4>(v, t); 3077 case 5: 3078 return detail::InsertLane<5>(v, t); 3079 case 6: 3080 return detail::InsertLane<6>(v, t); 3081 case 7: 3082 return detail::InsertLane<7>(v, t); 3083 } 3084 } 3085 #endif 3086 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); 3087 } 3088 3089 template <typename T> 3090 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { 3091 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3092 if (__builtin_constant_p(i)) { 3093 switch (i) { 3094 case 0: 3095 return detail::InsertLane<0>(v, t); 3096 case 1: 3097 return detail::InsertLane<1>(v, t); 3098 case 2: 3099 return detail::InsertLane<2>(v, t); 3100 case 3: 3101 return 
detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  return detail::InsertLaneUsingBroadcastAndBlend(v, i, t);
}

// ------------------------------ CombineShiftRightBytes
// Extracts 16 bytes from the concatenation hi:lo, starting kBytes into lo.
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  static_assert(0 < kBytes && kBytes < 16, "kBytes invalid");
  return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi));
}
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");

  // Combine into a double-width vector, then shift within it.
  const Twice<decltype(d)> dt;
  return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw};
}

// ------------------------------ Broadcast/splat any lane

// Replicates lane kLane into all lanes via vreplvei.
template <int kLane, typename T, size_t N, HWY_IF_UI8(T)>
HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{__lsx_vreplvei_b(v.raw, kLane)};
}
template <int kLane, typename T, size_t N, HWY_IF_UI16(T)>
HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
  static_assert(0 <= kLane &&
kLane < N, "Invalid lane"); 3157 return Vec128<T, N>{__lsx_vreplvei_h(v.raw, kLane)}; 3158 } 3159 template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 3160 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) { 3161 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 3162 const DFromV<decltype(v)> d; 3163 return BitCast(d, Vec128<int32_t, N>{__lsx_vreplvei_w( 3164 reinterpret_cast<__m128i>(v.raw), kLane)}); 3165 } 3166 template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 3167 HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) { 3168 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 3169 const DFromV<decltype(v)> d; 3170 return BitCast(d, Vec128<int64_t, N>{__lsx_vreplvei_d( 3171 reinterpret_cast<__m128i>(v.raw), kLane)}); 3172 } 3173 3174 // ------------------------------ TableLookupLanes (Shuffle01) 3175 3176 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 3177 template <typename T, size_t N = 16 / sizeof(T)> 3178 struct Indices128 { 3179 __m128i raw; 3180 }; 3181 3182 namespace detail { 3183 3184 template <class D, HWY_IF_T_SIZE_D(D, 1)> 3185 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( 3186 D d) { 3187 const Repartition<uint8_t, decltype(d)> d8; 3188 return Iota(d8, 0); 3189 } 3190 3191 template <class D, HWY_IF_T_SIZE_D(D, 2)> 3192 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( 3193 D d) { 3194 const Repartition<uint8_t, decltype(d)> d8; 3195 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 3196 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; 3197 return Load(d8, kBroadcastLaneBytes); 3198 } 3199 3200 template <class D, HWY_IF_T_SIZE_D(D, 4)> 3201 HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes( 3202 D d) { 3203 const Repartition<uint8_t, decltype(d)> d8; 3204 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 3205 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; 3206 return Load(d8, 
kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
}

// ... then add per-byte offsets within each lane.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Zero(d8);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
}

}  // namespace detail

// 8-bit lanes: lane indices are already byte indices; validate (debug only)
// and store the raw vector directly.
template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const
RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  // Indices up to 2*MaxLanes are allowed (two-vector table lookups).
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  (void)d;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, vec).raw};
}

// Wider lanes: convert lane indices into per-byte shuffle indices.
template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(D d,
                                                             const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

// Permutes lanes of v according to idx (byte-level shuffle under the hood).
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  using TI = MakeSigned<T>;
  const DFromV<decltype(v)> d;
  const Rebind<TI, decltype(d)> di;
  auto t1 = TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw});
  return
BitCast(d, t1);
}

// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
                                      Indices128<T, 1> /* idx */) {
  return v;
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  return v;
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301)

// Single lane: no change
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
  return v;
}
// 32-bit x2: shuffle
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
  return VFromD<D>{Shuffle2301(Vec128<TFromD<D>>{v.raw}).raw};
}
// 64-bit x2: shuffle
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
  return Shuffle01(v);
}
// 32-bit x4: shuffle
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) {
  return Shuffle0123(v);
}

// 16-bit
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2),
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const VU vu = BitCast(du, v);
  constexpr size_t kN = MaxLanes(d);
  if (kN == 1) return v;
  if (kN == 2) {
    // imm 0x11: lane order 1,0 (only the low two lanes are in use).
    return BitCast(d, VU{__lsx_vshuf4i_h(vu.raw, 0x11)});
  }
  if (kN == 4) {
    // imm 0x1B: lane order 3,2,1,0.
    return BitCast(d, VU{__lsx_vshuf4i_h(vu.raw, 0x1B)});
  }
  // kN == 8: full byte shuffle with a lane-reversing index table.
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
      di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706,
0x0504, 0x0302, 0x0100);
  return BitCast(d, TableLookupBytes(v, shuffle));
}

// 8-bit: byte shuffle with a reversed-iota index table.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1),
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  static constexpr int kN = static_cast<int>(MaxLanes(d));
  if (kN == 1) return v;
  alignas(16) static constexpr int8_t _tmp_data[] = {
      kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8,
      kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16};
  return VFromD<D>{__lsx_vshuf_b(v.raw, v.raw, __lsx_vld(_tmp_data, 0))};
}

// ------------------------------ Reverse2

// Single lane: no change
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
  return v;
}

// 16-bit: swapping adjacent lanes is a 32-bit rotate.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> dw;
  return BitCast(d, RotateRight<16>(BitCast(dw, v)));
}

// Generic for all vector lengths.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
  return Shuffle2301(v);
}

// Generic for all vector lengths.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
  return Shuffle01(v);
}

// ------------------------------ Reverse4

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> v) {
  // imm 0x1B reverses each group of four 16-bit lanes.
  return VFromD<D>{__lsx_vshuf4i_h(v.raw, 0x1B)};
}

// Generic for all vector lengths.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
  return Shuffle0123(v);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes
}

// ------------------------------ Reverse8

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  // Reverse the four 32-bit pairs, then swap within each pair.
  const RepartitionToWide<decltype(d)> dw;
  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
  HWY_ASSERT(0);  // don't have 8 lanes if larger than 16-bit
}

// ------------------------------ InterleaveUpper (UpperHalf)

// Full vectors: vilvh interleaves the upper halves. NOTE(review): operand
// order is (b, a) — presumably so lanes of a land in the even positions;
// matches the other vilvh call sites in this file.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lsx_vilvh_b(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return VFromD<D>{__lsx_vilvh_h(b.raw, a.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToSigned<decltype(d)> df;
  return BitCast(d, VFromD<decltype(df)>{
                        __lsx_vilvh_w(BitCast(df, b).raw, BitCast(df, a).raw)});
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const RebindToSigned<decltype(d)> dd;
  return BitCast(d, VFromD<decltype(dd)>{
                        __lsx_vilvh_d(BitCast(dd, b).raw, BitCast(dd,
a).raw)});
}

// Partial: extract the upper halves, then interleave them as lower halves.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
                         VFromD<D>{UpperHalf(d2, b).raw});
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}

// ================================================== CONVERT (1)

// ------------------------------ PromoteTo unsigned

// vsllwil with shift amount 0 performs a pure zero-extending widen of the
// low lanes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  return VFromD<D>{__lsx_vsllwil_hu_bu(v.raw, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>{__lsx_vsllwil_wu_hu(v.raw, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>{__lsx_vsllwil_du_wu(v.raw, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */,
VFromD<Rebind<uint8_t, D>> v) {
  // u8 -> u32: two widening steps.
  const __m128i u16 = __lsx_vsllwil_hu_bu(v.raw, 0);
  return VFromD<D>{__lsx_vsllwil_wu_hu(u16, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
  // u8 -> u64: via u32.
  const Rebind<uint32_t, decltype(d)> du32;
  return PromoteTo(d, PromoteTo(du32, v));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<uint16_t, D>> v) {
  // u16 -> u64: two widening steps.
  const __m128i u32 = __lsx_vsllwil_wu_hu(v.raw, 0);
  return VFromD<D>{__lsx_vsllwil_du_wu(u32, 0)};
}

// Unsigned to signed: same plus cast.
template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V),
          HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))>
HWY_API VFromD<D> PromoteTo(D di, V v) {
  const RebindToUnsigned<decltype(di)> du;
  return BitCast(di, PromoteTo(du, v));
}

// signed (sign-extending widen via the signed vsllwil variants)
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  return VFromD<D>{__lsx_vsllwil_h_b(v.raw, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>{__lsx_vsllwil_w_h(v.raw, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>{__lsx_vsllwil_d_w(v.raw, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  // i8 -> i32: two widening steps.
  const __m128i i16 = __lsx_vsllwil_h_b(v.raw, 0);
  return VFromD<D>{__lsx_vsllwil_w_h(i16, 0)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D>
PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) {
  // i8 -> i64: via i32.
  const Rebind<int32_t, decltype(d)> di32;
  return PromoteTo(d, PromoteTo(di32, v));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<int16_t, D>> v) {
  // i16 -> i64: two widening steps.
  const __m128i i32 = __lsx_vsllwil_w_h(v.raw, 0);
  return VFromD<D>{__lsx_vsllwil_d_w(i32, 0)};
}

// -------------------- PromoteTo float

// LSX has native f16 <-> f32 conversion, so advertise native F16C support.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<hwy::float16_t, D>> v) {
  return VFromD<D>{__lsx_vfcvtl_s_h(v.raw)};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>{__lsx_vfcvtl_d_s(v.raw)};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>{__lsx_vffintl_d_w(v.raw)};
}

// u32 -> f64: convert as signed i32, then add 2^32 to lanes that were
// misinterpreted as negative (f64 represents all u32 values exactly).
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
  const Rebind<int32_t, decltype(df64)> di32;
  const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v));
  return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result,
                                                Set(df64, 4294967296.0),
                                                Zero(df64));
}

// bf16 -> f32: a bf16 is the upper 16 bits of an f32, so widen and shift left.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<hwy::bfloat16_t, D>> v) {
  const RebindToSigned<decltype(d)> di32;
  const Rebind<uint16_t, decltype(d)> du16;
  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}

// ------------------------------ Per4LaneBlockShuffle

namespace detail {

#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif

// Builds a vector holding the four u32 constants {x0, x1, x2, x3} with x0 in
// lane 0, using a GCC native vector to avoid a memory round-trip.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
  const GccU32RawVectType raw = {x0, x1, x2, x3};
  return ResizeBitCast(d, Vec128<uint32_t>{reinterpret_cast<__m128i>(raw)});
}

// vshuf4i_* directly implements a per-4-lane shuffle from an 8-bit immediate.
template <size_t kIdx3210, size_t kVectSize, class V,
          HWY_IF_LANES_LE(kVectSize, 16)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<1> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
  return V{__lsx_vshuf4i_b(v.raw, kShuffle)};
}

template <size_t kIdx3210, size_t kVectSize, class V,
          HWY_IF_LANES_LE(kVectSize, 16)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<2> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;  // for float16_t
  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
  return BitCast(
      d, VFromD<decltype(du)>{__lsx_vshuf4i_h(BitCast(du, v).raw, kShuffle)});
}

template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<4> /*lane_size_tag*/,
                                  hwy::SizeTag<16> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{__lsx_vshuf4i_w(
                        reinterpret_cast<__m128i>(v.raw), kShuffle)});
}

}  // namespace detail

// ------------------------------ SlideUpLanes

namespace detail {

// <= 8-byte vectors: a slide by `amt` lanes is a single 64-bit variable
// bit-shift (lanes * lane-bits).
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Full64<uint64_t> du64;
  const auto vu64 = ResizeBitCast(du64, v);
  return ResizeBitCast(
      d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
}

// 16-byte vectors: byte-wise table lookup. Indices below 0 (from the
// wrapped subtraction) are out of range, which TableLookupBytesOr0 zeros.
template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const auto idx =
      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
  return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
}

}  // namespace detail

// Single lane: sliding is a no-op.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
  return v;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
// When amt is a compile-time constant, dispatch to the cheaper
// immediate-shift form.
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
      case 2:
        return ShiftLeftLanes<2>(d, v);
      case 3:
        return ShiftLeftLanes<3>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}
3717 3718 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> 3719 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 3720 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3721 if (__builtin_constant_p(amt)) { 3722 switch (amt) { 3723 case 0: 3724 return v; 3725 case 1: 3726 return ShiftLeftLanes<1>(d, v); 3727 case 2: 3728 return ShiftLeftLanes<2>(d, v); 3729 case 3: 3730 return ShiftLeftLanes<3>(d, v); 3731 case 4: 3732 return ShiftLeftLanes<4>(d, v); 3733 case 5: 3734 return ShiftLeftLanes<5>(d, v); 3735 case 6: 3736 return ShiftLeftLanes<6>(d, v); 3737 case 7: 3738 return ShiftLeftLanes<7>(d, v); 3739 } 3740 } 3741 #else 3742 (void)d; 3743 #endif 3744 3745 return detail::SlideUpLanes(v, amt); 3746 } 3747 3748 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> 3749 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 3750 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3751 if (__builtin_constant_p(amt)) { 3752 switch (amt) { 3753 case 0: 3754 return v; 3755 case 1: 3756 return ShiftLeftLanes<1>(d, v); 3757 case 2: 3758 return ShiftLeftLanes<2>(d, v); 3759 case 3: 3760 return ShiftLeftLanes<3>(d, v); 3761 case 4: 3762 return ShiftLeftLanes<4>(d, v); 3763 case 5: 3764 return ShiftLeftLanes<5>(d, v); 3765 case 6: 3766 return ShiftLeftLanes<6>(d, v); 3767 case 7: 3768 return ShiftLeftLanes<7>(d, v); 3769 case 8: 3770 return ShiftLeftLanes<8>(d, v); 3771 case 9: 3772 return ShiftLeftLanes<9>(d, v); 3773 case 10: 3774 return ShiftLeftLanes<10>(d, v); 3775 case 11: 3776 return ShiftLeftLanes<11>(d, v); 3777 case 12: 3778 return ShiftLeftLanes<12>(d, v); 3779 case 13: 3780 return ShiftLeftLanes<13>(d, v); 3781 case 14: 3782 return ShiftLeftLanes<14>(d, v); 3783 case 15: 3784 return ShiftLeftLanes<15>(d, v); 3785 } 3786 } 3787 #else 3788 (void)d; 3789 #endif 3790 3791 return detail::SlideUpLanes(v, amt); 3792 } 3793 3794 // ------------------------------ SlideDownLanes 3795 3796 namespace 
detail { 3797 3798 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 3799 HWY_INLINE V SlideDownLanes(V v, size_t amt) { 3800 const DFromV<decltype(v)> d; 3801 const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv; 3802 return BitCast(d, 3803 ShiftRightSame(BitCast(dv, v), 3804 static_cast<int>(amt * sizeof(TFromV<V>) * 8))); 3805 } 3806 3807 template <class V, HWY_IF_V_SIZE_V(V, 16)> 3808 HWY_INLINE V SlideDownLanes(V v, size_t amt) { 3809 const DFromV<decltype(v)> d; 3810 const Repartition<int8_t, decltype(d)> di8; 3811 auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>))); 3812 idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); 3813 return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); 3814 } 3815 3816 } // namespace detail 3817 3818 template <class D, HWY_IF_LANES_D(D, 1)> 3819 HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { 3820 return v; 3821 } 3822 3823 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> 3824 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 3825 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3826 if (__builtin_constant_p(amt)) { 3827 switch (amt) { 3828 case 0: 3829 return v; 3830 case 1: 3831 return ShiftRightLanes<1>(d, v); 3832 } 3833 } 3834 #else 3835 (void)d; 3836 #endif 3837 3838 return detail::SlideDownLanes(v, amt); 3839 } 3840 3841 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> 3842 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 3843 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3844 if (__builtin_constant_p(amt)) { 3845 switch (amt) { 3846 case 0: 3847 return v; 3848 case 1: 3849 return ShiftRightLanes<1>(d, v); 3850 case 2: 3851 return ShiftRightLanes<2>(d, v); 3852 case 3: 3853 return ShiftRightLanes<3>(d, v); 3854 } 3855 } 3856 #else 3857 (void)d; 3858 #endif 3859 3860 return detail::SlideDownLanes(v, amt); 3861 } 3862 3863 template <class D, HWY_IF_V_SIZE_LE_D(D, 
16), HWY_IF_LANES_D(D, 8)> 3864 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 3865 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3866 if (__builtin_constant_p(amt)) { 3867 switch (amt) { 3868 case 0: 3869 return v; 3870 case 1: 3871 return ShiftRightLanes<1>(d, v); 3872 case 2: 3873 return ShiftRightLanes<2>(d, v); 3874 case 3: 3875 return ShiftRightLanes<3>(d, v); 3876 case 4: 3877 return ShiftRightLanes<4>(d, v); 3878 case 5: 3879 return ShiftRightLanes<5>(d, v); 3880 case 6: 3881 return ShiftRightLanes<6>(d, v); 3882 case 7: 3883 return ShiftRightLanes<7>(d, v); 3884 } 3885 } 3886 #else 3887 (void)d; 3888 #endif 3889 3890 return detail::SlideDownLanes(v, amt); 3891 } 3892 3893 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> 3894 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 3895 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 3896 if (__builtin_constant_p(amt)) { 3897 switch (amt) { 3898 case 0: 3899 return v; 3900 case 1: 3901 return ShiftRightLanes<1>(d, v); 3902 case 2: 3903 return ShiftRightLanes<2>(d, v); 3904 case 3: 3905 return ShiftRightLanes<3>(d, v); 3906 case 4: 3907 return ShiftRightLanes<4>(d, v); 3908 case 5: 3909 return ShiftRightLanes<5>(d, v); 3910 case 6: 3911 return ShiftRightLanes<6>(d, v); 3912 case 7: 3913 return ShiftRightLanes<7>(d, v); 3914 case 8: 3915 return ShiftRightLanes<8>(d, v); 3916 case 9: 3917 return ShiftRightLanes<9>(d, v); 3918 case 10: 3919 return ShiftRightLanes<10>(d, v); 3920 case 11: 3921 return ShiftRightLanes<11>(d, v); 3922 case 12: 3923 return ShiftRightLanes<12>(d, v); 3924 case 13: 3925 return ShiftRightLanes<13>(d, v); 3926 case 14: 3927 return ShiftRightLanes<14>(d, v); 3928 case 15: 3929 return ShiftRightLanes<15>(d, v); 3930 } 3931 } 3932 #else 3933 (void)d; 3934 #endif 3935 3936 return detail::SlideDownLanes(v, amt); 3937 } 3938 3939 // ================================================== COMBINE 3940 3941 // 
------------------------------ Combine (InterleaveLower) 3942 3943 // N = N/2 + N/2 (upper half undefined) 3944 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>> 3945 HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { 3946 const Half<decltype(d)> dh; 3947 const RebindToUnsigned<decltype(dh)> duh; 3948 // Treat half-width input as one lane, and expand to two lanes. 3949 using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>; 3950 const VU lo{BitCast(duh, lo_half).raw}; 3951 const VU hi{BitCast(duh, hi_half).raw}; 3952 return BitCast(d, InterleaveLower(lo, hi)); 3953 } 3954 3955 // ------------------------------ ZeroExtendVector (Combine) 3956 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 3957 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { 3958 return Combine(d, Zero(Half<decltype(d)>()), lo); 3959 } 3960 3961 // ------------------------------ Concat full (InterleaveLower) 3962 3963 // hiH,hiL loH,loL |-> hiL,loL (= lower halves) 3964 template <class D, HWY_IF_V_SIZE_D(D, 16)> 3965 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { 3966 const Repartition<uint64_t, decltype(d)> d64; 3967 return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); 3968 } 3969 3970 // hiH,hiL loH,loL |-> hiH,loH (= upper halves) 3971 template <class D, HWY_IF_V_SIZE_D(D, 16)> 3972 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { 3973 const Repartition<uint64_t, decltype(d)> d64; 3974 return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); 3975 } 3976 3977 // hiH,hiL loH,loL |-> hiL,loH (= inner halves) 3978 template <class D, HWY_IF_V_SIZE_D(D, 16)> 3979 HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { 3980 return CombineShiftRightBytes<8>(d, hi, lo); 3981 } 3982 3983 // hiH,hiL loH,loL |-> hiH,loL (= outer halves) 3984 template <class D, HWY_IF_V_SIZE_D(D, 16)> 3985 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { 3986 return 
BitCast(d, Vec128<uint8_t>{__lsx_vshuf4i_d( 3987 reinterpret_cast<__m128i>(lo.raw), 3988 reinterpret_cast<__m128i>(hi.raw), 0xC)}); 3989 } 3990 3991 // ------------------------------ Concat partial (Combine, LowerHalf) 3992 3993 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 3994 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { 3995 const Half<decltype(d)> d2; 3996 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); 3997 } 3998 3999 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 4000 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { 4001 const Half<decltype(d)> d2; 4002 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); 4003 } 4004 4005 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 4006 HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi, 4007 const VFromD<D> lo) { 4008 const Half<decltype(d)> d2; 4009 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); 4010 } 4011 4012 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 4013 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { 4014 const Half<decltype(d)> d2; 4015 return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); 4016 } 4017 4018 // ------------------------------ ConcatOdd 4019 4020 // 8-bit full 4021 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 4022 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4023 return VFromD<D>{__lsx_vpickod_b(hi.raw, lo.raw)}; 4024 } 4025 // 8-bit x8 4026 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> 4027 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4028 __m128i _tmp = __lsx_vpickod_b(hi.raw, lo.raw); 4029 return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; 4030 } 4031 // 8-bit x4 4032 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)> 4033 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4034 __m128i _tmp = __lsx_vpickod_b(hi.raw, lo.raw); 4035 return VFromD<D>{__lsx_vextrins_h(_tmp, 
_tmp, 0x14)}; 4036 } 4037 4038 // 16-bit full 4039 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 4040 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4041 return VFromD<D>{__lsx_vpickod_h(hi.raw, lo.raw)}; 4042 } 4043 // 16-bit x4 4044 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> 4045 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4046 __m128i _tmp = __lsx_vpickod_h(hi.raw, lo.raw); 4047 return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; 4048 } 4049 4050 // 32-bit full 4051 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 4052 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 4053 return BitCast( 4054 d, Vec128<uint8_t>{__lsx_vpickod_w(reinterpret_cast<__m128i>(hi.raw), 4055 reinterpret_cast<__m128i>(lo.raw))}); 4056 } 4057 4058 // Any T x2 4059 template <class D, HWY_IF_LANES_D(D, 2)> 4060 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 4061 return InterleaveUpper(d, lo, hi); 4062 } 4063 4064 // ------------------------------ ConcatEven 4065 4066 // 8-bit full 4067 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 4068 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4069 return VFromD<D>{__lsx_vpickev_b(hi.raw, lo.raw)}; 4070 } 4071 // 8-bit x8 4072 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> 4073 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4074 __m128i _tmp = __lsx_vpickev_b(hi.raw, lo.raw); 4075 return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; 4076 } 4077 // 8-bit x4 4078 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)> 4079 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4080 __m128i _tmp = __lsx_vpickev_b(hi.raw, lo.raw); 4081 return VFromD<D>{__lsx_vextrins_h(_tmp, _tmp, 0x14)}; 4082 } 4083 4084 // 16-bit full 4085 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 4086 HWY_API 
VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4087 return VFromD<D>{__lsx_vpickev_h(hi.raw, lo.raw)}; 4088 } 4089 // 16-bit x4 4090 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> 4091 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 4092 __m128i _tmp = __lsx_vpickev_h(hi.raw, lo.raw); 4093 return VFromD<D>{__lsx_vextrins_w(_tmp, _tmp, 0x12)}; 4094 } 4095 4096 // 32-bit full 4097 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 4098 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 4099 return BitCast( 4100 d, Vec128<uint8_t>{__lsx_vpickev_w(reinterpret_cast<__m128i>(hi.raw), 4101 reinterpret_cast<__m128i>(lo.raw))}); 4102 } 4103 4104 // Any T x2 4105 template <class D, HWY_IF_LANES_D(D, 2)> 4106 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 4107 return InterleaveLower(d, lo, hi); 4108 } 4109 4110 template <size_t N> 4111 HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi, 4112 Vec128<float16_t, N> lo) { 4113 const DFromV<decltype(hi)> d; 4114 const RebindToUnsigned<decltype(d)> du; 4115 return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo))); 4116 } 4117 // ------------------------------ DupEven (InterleaveLower) 4118 4119 template <typename T> 4120 HWY_API Vec128<T, 1> DupEven(const Vec128<T, 1> v) { 4121 return v; 4122 } 4123 4124 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 4125 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) { 4126 __m128i _tmp = __lsx_vpickev_b(v.raw, v.raw); 4127 return Vec128<T, N>{__lsx_vilvl_b(_tmp, _tmp)}; 4128 } 4129 4130 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 4131 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) { 4132 const DFromV<decltype(v)> d; 4133 const RebindToUnsigned<decltype(d)> du; // for float16_t 4134 __m128i _tmp = __lsx_vpickev_h(BitCast(du, v).raw, BitCast(du, v).raw); 4135 return BitCast(d, VFromD<decltype(du)>{__lsx_vilvl_h(_tmp, _tmp)}); 4136 } 4137 4138 template 
<typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 4139 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) { 4140 const DFromV<decltype(v)> d; 4141 __m128i _tmp = detail::BitCastToInteger(v.raw); 4142 __m128i _tmp1 = __lsx_vpickev_w(_tmp, _tmp); 4143 return BitCast(d, Vec128<uint32_t, N>{__lsx_vilvl_w(_tmp1, _tmp1)}); 4144 } 4145 4146 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 4147 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { 4148 return InterleaveLower(DFromV<decltype(v)>(), v, v); 4149 } 4150 4151 // ------------------------------ DupOdd (InterleaveUpper) 4152 4153 template <typename T, HWY_IF_T_SIZE(T, 1)> 4154 HWY_API Vec128<T, 1> DupOdd(Vec128<T, 1> v) { 4155 return v; 4156 } 4157 4158 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 4159 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { 4160 __m128i _tmp = __lsx_vpickod_b(v.raw, v.raw); 4161 return Vec128<T, N>{__lsx_vilvl_b(_tmp, _tmp)}; 4162 } 4163 4164 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 4165 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { 4166 __m128i _tmp = __lsx_vpickod_h(v.raw, v.raw); 4167 return Vec128<T, N>{__lsx_vilvl_h(_tmp, _tmp)}; 4168 } 4169 4170 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 4171 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { 4172 const DFromV<decltype(v)> d; 4173 __m128i _tmp = detail::BitCastToInteger(v.raw); 4174 __m128i _tmp1 = __lsx_vpickod_w(_tmp, _tmp); 4175 return BitCast(d, Vec128<uint32_t, N>{__lsx_vilvl_w(_tmp1, _tmp1)}); 4176 } 4177 4178 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 4179 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 4180 return InterleaveUpper(DFromV<decltype(v)>(), v, v); 4181 } 4182 4183 // ------------------------------ TwoTablesLookupLanes (DupEven) 4184 4185 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> 4186 HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, 4187 Indices128<T, N> idx) { 4188 const DFromV<decltype(a)> d; 4189 const Twice<decltype(d)> dt; 4190 
const Repartition<uint8_t, decltype(dt)> dt_u8; 4191 // TableLookupLanes currently requires table and index vectors to be the same 4192 // size, though a half-length index vector would be sufficient here. 4193 #if HWY_IS_MSAN 4194 const Vec128<T, N> idx_vec{idx.raw}; 4195 const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; 4196 #else 4197 // We only keep LowerHalf of the result, which is valid in idx. 4198 const Indices128<T, N * 2> idx2{idx.raw}; 4199 #endif 4200 return LowerHalf( 4201 d, TableLookupBytes(Combine(dt, b, a), 4202 BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw}))); 4203 } 4204 4205 template <typename T, HWY_IF_UI8(T)> 4206 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, 4207 Indices128<T> idx) { 4208 return Vec128<T>{__lsx_vshuf_b(b.raw, a.raw, idx.raw)}; 4209 } 4210 4211 template <typename T, HWY_IF_T_SIZE_ONE_OF(T, ((1 << 2) | (1 << 4) | (1 << 8)))> 4212 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, 4213 Indices128<T> idx) { 4214 const DFromV<decltype(a)> d; 4215 const Repartition<uint8_t, decltype(d)> du8; 4216 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), 4217 Indices128<uint8_t>{idx.raw})); 4218 } 4219 4220 // ------------------------------ OddEven 4221 4222 template <typename T, size_t N, HWY_IF_UI8(T)> 4223 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 4224 __m128i t0 = __lsx_vpackod_b(a.raw, a.raw); 4225 return Vec128<T, N>{__lsx_vpackev_b(t0, b.raw)}; 4226 } 4227 template <typename T, size_t N, HWY_IF_UI16(T)> 4228 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 4229 __m128i t0 = __lsx_vpackod_h(a.raw, a.raw); 4230 return Vec128<T, N>{__lsx_vpackev_h(t0, b.raw)}; 4231 } 4232 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 4233 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 4234 const DFromV<decltype(a)> d; 4235 const RebindToUnsigned<decltype(d)> du; 4236 __m128i t0 = 
__lsx_vpackod_w(BitCast(du, a).raw, BitCast(du, a).raw); 4237 return BitCast(d, 4238 VFromD<decltype(du)>{__lsx_vpackev_w(t0, BitCast(du, b).raw)}); 4239 } 4240 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 4241 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 4242 const DFromV<decltype(a)> d; 4243 const RebindToUnsigned<decltype(d)> du; 4244 return BitCast(d, VFromD<decltype(du)>{__lsx_vextrins_d( 4245 BitCast(du, b).raw, BitCast(du, a).raw, 0x11)}); 4246 } 4247 4248 // -------------------------- InterleaveEven 4249 4250 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 4251 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 4252 return VFromD<D>{__lsx_vpackev_b(b.raw, a.raw)}; 4253 } 4254 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 4255 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 4256 return VFromD<D>{__lsx_vpackev_h(b.raw, a.raw)}; 4257 } 4258 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 4259 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { 4260 const RebindToSigned<D> di; 4261 return BitCast(d, VFromD<decltype(di)>{__lsx_vpackev_w(BitCast(di, b).raw, 4262 BitCast(di, a).raw)}); 4263 } 4264 4265 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> 4266 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { 4267 const RebindToSigned<D> di; 4268 return BitCast(d, VFromD<decltype(di)>{__lsx_vpackev_d(BitCast(di, b).raw, 4269 BitCast(di, a).raw)}); 4270 } 4271 4272 // -------------------------- InterleaveOdd 4273 4274 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 4275 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 4276 return VFromD<D>{__lsx_vpackod_b(b.raw, a.raw)}; 4277 } 4278 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 4279 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 4280 return 
VFromD<D>{__lsx_vpackod_h(b.raw, a.raw)}; 4281 } 4282 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 4283 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 4284 const RebindToSigned<D> di; 4285 return BitCast(d, VFromD<decltype(di)>{__lsx_vpackod_w(BitCast(di, b).raw, 4286 BitCast(di, a).raw)}); 4287 } 4288 4289 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> 4290 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 4291 const RebindToSigned<D> di; 4292 return BitCast(d, VFromD<decltype(di)>{__lsx_vpackod_d(BitCast(di, b).raw, 4293 BitCast(di, a).raw)}); 4294 } 4295 4296 // ------------------------------ OddEvenBlocks 4297 4298 template <typename T, size_t N> 4299 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { 4300 return even; 4301 } 4302 4303 // ------------------------------ SwapAdjacentBlocks 4304 4305 template <typename T, size_t N> 4306 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { 4307 return v; 4308 } 4309 4310 // ------------------------------ InterleaveEvenBlocks 4311 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> 4312 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { 4313 return a; 4314 } 4315 // ------------------------------ InterleaveOddBlocks 4316 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> 4317 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { 4318 return a; 4319 } 4320 4321 // ------------------------------ Shl 4322 4323 template <typename T, size_t N, HWY_IF_UI8(T)> 4324 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { 4325 return Vec128<T, N>{__lsx_vsll_b(v.raw, bits.raw)}; 4326 } 4327 4328 template <typename T, size_t N, HWY_IF_UI16(T)> 4329 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { 4330 return Vec128<T, N>{__lsx_vsll_h(v.raw, bits.raw)}; 4331 } 4332 4333 template <typename T, size_t N, HWY_IF_UI32(T)> 4334 HWY_API Vec128<T, N> 
operator<<(Vec128<T, N> v, Vec128<T, N> bits) { 4335 return Vec128<T, N>{__lsx_vsll_w(v.raw, bits.raw)}; 4336 } 4337 4338 template <typename T, size_t N, HWY_IF_UI64(T)> 4339 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { 4340 return Vec128<T, N>{__lsx_vsll_d(v.raw, bits.raw)}; 4341 } 4342 4343 // ------------------------------ Shr 4344 4345 namespace detail { 4346 4347 template <size_t N> 4348 HWY_API Vec128<uint8_t, N> Shr(Vec128<uint8_t, N> v, Vec128<uint8_t, N> bits) { 4349 return Vec128<uint8_t, N>{__lsx_vsrl_b(v.raw, bits.raw)}; 4350 } 4351 template <size_t N> 4352 HWY_API Vec128<uint16_t, N> Shr(Vec128<uint16_t, N> v, 4353 Vec128<uint16_t, N> bits) { 4354 return Vec128<uint16_t, N>{__lsx_vsrl_h(v.raw, bits.raw)}; 4355 } 4356 template <size_t N> 4357 HWY_API Vec128<uint32_t, N> Shr(Vec128<uint32_t, N> v, 4358 Vec128<uint32_t, N> bits) { 4359 return Vec128<uint32_t, N>{__lsx_vsrl_w(v.raw, bits.raw)}; 4360 } 4361 template <size_t N> 4362 HWY_API Vec128<uint64_t, N> Shr(Vec128<uint64_t, N> v, 4363 Vec128<uint64_t, N> bits) { 4364 return Vec128<uint64_t, N>{__lsx_vsrl_d(v.raw, bits.raw)}; 4365 } 4366 4367 template <size_t N> 4368 HWY_API Vec128<int8_t, N> Shr(Vec128<int8_t, N> v, Vec128<int8_t, N> bits) { 4369 return Vec128<int8_t, N>{__lsx_vsra_b(v.raw, bits.raw)}; 4370 } 4371 template <size_t N> 4372 HWY_API Vec128<int16_t, N> Shr(Vec128<int16_t, N> v, Vec128<int16_t, N> bits) { 4373 return Vec128<int16_t, N>{__lsx_vsra_h(v.raw, bits.raw)}; 4374 } 4375 template <size_t N> 4376 HWY_API Vec128<int32_t, N> Shr(Vec128<int32_t, N> v, Vec128<int32_t, N> bits) { 4377 return Vec128<int32_t, N>{__lsx_vsra_w(v.raw, bits.raw)}; 4378 } 4379 template <size_t N> 4380 HWY_API Vec128<int64_t, N> Shr(Vec128<int64_t, N> v, Vec128<int64_t, N> bits) { 4381 return Vec128<int64_t, N>{__lsx_vsra_d(v.raw, bits.raw)}; 4382 } 4383 4384 } // namespace detail 4385 4386 template <typename T, size_t N> 4387 HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> 
bits) { 4388 return detail::Shr(v, bits); 4389 } 4390 4391 // ================================================== CONVERT (2) 4392 4393 // ------------------------------ PromoteEvenTo/PromoteOddTo 4394 #include "hwy/ops/inside-inl.h" 4395 4396 // Generic for all vector lengths. 4397 template <class DF, HWY_IF_F32_D(DF), 4398 class VBF = VFromD<Repartition<bfloat16_t, DF>>> 4399 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) { 4400 return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), 4401 Mul(PromoteOddTo(df, a), PromoteOddTo(df, b))); 4402 } 4403 4404 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), 4405 class V16 = VFromD<RepartitionToNarrow<D32>>> 4406 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { 4407 __m128i _tmp = __lsx_vmulwev_w_h(a.raw, b.raw); 4408 return VFromD<D32>{__lsx_vmaddwod_w_h(_tmp, a.raw, b.raw)}; 4409 } 4410 4411 template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16), 4412 class VU16 = VFromD<RepartitionToNarrow<DU32>>> 4413 HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 /* tag */, VU16 a, VU16 b) { 4414 __m128i _tmp = __lsx_vmulwev_w_hu(a.raw, b.raw); 4415 return VFromD<DU32>{__lsx_vmaddwod_w_hu(_tmp, a.raw, b.raw)}; 4416 } 4417 4418 // ------------------------------ ReorderWidenMulAccumulate 4419 4420 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), 4421 class V16 = VFromD<RepartitionToNarrow<D32>>> 4422 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b, 4423 const VFromD<D32> sum0, 4424 VFromD<D32>& /* sum1 */) { 4425 return VFromD<D32>{__lsx_vmaddwev_w_h( 4426 __lsx_vmaddwod_w_h(sum0.raw, a.raw, b.raw), a.raw, b.raw)}; 4427 } 4428 4429 template <class DU32, HWY_IF_U32_D(DU32), 4430 class VU16 = VFromD<RepartitionToNarrow<DU32>>> 4431 HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 /* tag */, VU16 a, VU16 b, 4432 const VFromD<DU32> sum0, 4433 VFromD<DU32>& /* sum1 */) { 4434 return VFromD<DU32>{__lsx_vmaddwev_w_hu( 4435 
__lsx_vmaddwod_w_hu(sum0.raw, a.raw, b.raw), a.raw, b.raw)}; 4436 } 4437 4438 // ------------------------------ RearrangeToOddPlusEven 4439 template <size_t N> 4440 HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0, 4441 Vec128<int32_t, N> /*sum1*/) { 4442 return sum0; // invariant already holds 4443 } 4444 4445 template <size_t N> 4446 HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven( 4447 const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) { 4448 return sum0; // invariant already holds 4449 } 4450 4451 template <class VW> 4452 HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { 4453 return Add(sum0, sum1); 4454 } 4455 4456 // ------------------------------ Demotions 4457 4458 // NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of 4459 // hwy::EnableIf<false>* = nullptr to avoid compiler errors since 4460 // !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause 4461 // SFINAE to occur instead of a hard error due to a dependency on the V template 4462 // argument 4463 #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V 4464 #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \ 4465 hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr 4466 4467 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> 4468 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 4469 return VFromD<D>{__lsx_vssrani_b_h(v.raw, v.raw, 0)}; 4470 } 4471 4472 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 4473 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 4474 return VFromD<D>{__lsx_vssrani_bu_h(v.raw, v.raw, 0)}; 4475 } 4476 4477 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> 4478 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { 4479 return VFromD<D>{__lsx_vssrlni_b_h(v.raw, v.raw, 0)}; 4480 } 4481 4482 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 4483 HWY_API VFromD<D> DemoteTo(D /* tag */, 
VFromD<Rebind<uint16_t, D>> v) { 4484 return VFromD<D>{__lsx_vssrlni_bu_h(v.raw, v.raw, 0)}; 4485 } 4486 4487 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> 4488 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4489 return VFromD<D>{__lsx_vssrani_h_w(v.raw, v.raw, 0)}; 4490 } 4491 4492 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 4493 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4494 return VFromD<D>{__lsx_vssrani_hu_w(v.raw, v.raw, 0)}; 4495 } 4496 4497 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> 4498 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 4499 return VFromD<D>{__lsx_vssrlni_h_w(v.raw, v.raw, 0)}; 4500 } 4501 4502 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 4503 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 4504 return VFromD<D>{__lsx_vssrlni_hu_w(v.raw, v.raw, 0)}; 4505 } 4506 4507 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> 4508 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 4509 return VFromD<D>{__lsx_vssrani_w_d(v.raw, v.raw, 0)}; 4510 } 4511 4512 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> 4513 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 4514 return VFromD<D>{__lsx_vssrani_wu_d(v.raw, v.raw, 0)}; 4515 } 4516 4517 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> 4518 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { 4519 return VFromD<D>{__lsx_vssrlni_w_d(v.raw, v.raw, 0)}; 4520 } 4521 4522 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> 4523 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { 4524 return VFromD<D>{__lsx_vssrlni_wu_d(v.raw, v.raw, 0)}; 4525 } 4526 4527 // UI->UI DemoteTo for the case where 4528 // sizeof(TFromD<D>) <= sizeof(TFromV<V>) / 4 is generic for all vector lengths 4529 template <class DN, class V, 
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_LE_D(DN, sizeof(TFromV<V>) / 4)>
HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
  using T = TFromV<V>;
  using TN = TFromD<DN>;

  // Narrow in multiple half-width hops. The intermediate type stays signed
  // only when both source and destination are signed, so each hop saturates
  // the same way a direct demotion would.
  using TDemoteTo =
      MakeNarrow<If<IsSigned<T>() && IsSigned<TN>(), T, MakeUnsigned<T>>>;
  return DemoteTo(dn, DemoteTo(Rebind<TDemoteTo, DN>(), v));
}

// f32 -> f16
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>{__lsx_vfcvt_h_s(v.raw, v.raw)};
}

// f64 -> f32
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
  return VFromD<D>{__lsx_vfcvt_s_d(v.raw, v.raw)};
}

// f64 -> i32, truncating toward zero; the second (upper) source half is zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) {
  return VFromD<D>{__lsx_vftintrz_w_d(
      reinterpret_cast<__m128d>(__lsx_vreplgr2vr_w(0)), v.raw)};
}

// f64 -> u32: go through u64 first, then narrow.
template <class D, HWY_IF_U32_D(D)>
HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
  const Rebind<uint64_t, decltype(du32)> du64;
  return DemoteTo(du32, ConvertTo(du64, v));
}

// i64 -> f32: convert the upper 52 and lower 12 bits separately to f64, then
// combine with a sticky-bit adjustment so that the final f64 -> f32 rounding
// matches a single correctly-rounded conversion.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
  const Rebind<double, decltype(df32)> df64;
  const RebindToUnsigned<decltype(df64)> du64;
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;

  // 27670116110564327424.0 == 1.5 * 2^64. XOR-ing the shifted bits into this
  // exponent and subtracting it back recovers the (sign-extended) upper 52
  // bits of v as an exact f64.
  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  const auto f64_hi52 =
      Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
  const auto f64_lo12 =
      PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
                                        Set(du32, uint32_t{0x00000FFF}))));

  const auto f64_sum = f64_hi52 + f64_lo12;
  // TwoSum-style error term: nonzero iff hi + lo was rounded.
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;

  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
  const auto f64_bits_decrement =
      And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
          f64_sum_is_inexact);

  // Force the low result bit to 1 (and step back one ulp if the rounding went
  // up) when inexact, so the subsequent DemoteTo cannot double-round.
  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
}

// u64 -> f32: same split/adjust scheme as i64 -> f32, without sign handling.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
  const Rebind<double, decltype(df32)> df64;
  const RebindToUnsigned<decltype(df64)> du64;
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;

  // 18446744073709551616.0 == 2^64; OR/subtract recovers the upper 52 bits.
  const auto k2p64 = Set(df64, 18446744073709551616.0);
  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  const auto f64_lo12 =
      PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)),
                                        Set(du32, uint32_t{0x00000FFF}))));

  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));

  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
         f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
}

// ------------------------------ ReorderDemote2To

// ReorderDemote2To for 8-byte UI64->UI32, <= 4-byte UI32->UI16,
// and <= 4-byte UI16->UI8
template <class DN, class V,
          HWY_IF_V_SIZE_LE_D(DN, ((sizeof(TFromD<DN>) <= 2 ?
                                                             4 : 8))),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  // Small-vector case: a and b together fit in one vector, so concatenate
  // them and demote once.
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}

// Full-vector ReorderDemote2To: the saturating narrowing shifts (amount 0)
// pack the saturated lanes of both operands into a single vector.

// 2x i16 -> i8
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
                                   Vec128<int16_t> b) {
  return VFromD<D>{__lsx_vssrani_b_h(b.raw, a.raw, 0)};
}

// 2x i16 -> u8
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a,
                                   Vec128<int16_t> b) {
  return VFromD<D>{__lsx_vssrani_bu_h(b.raw, a.raw, 0)};
}

// 2x u16 -> i8
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint16_t> a,
                                   Vec128<uint16_t> b) {
  return VFromD<D>{__lsx_vssrlni_b_h(b.raw, a.raw, 0)};
}

// 2x u16 -> u8
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint16_t> a,
                                   Vec128<uint16_t> b) {
  return VFromD<D>{__lsx_vssrlni_bu_h(b.raw, a.raw, 0)};
}

// 2x i32 -> i16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
                                   Vec128<int32_t> b) {
  return VFromD<D>{__lsx_vssrani_h_w(b.raw, a.raw, 0)};
}

// 2x i32 -> u16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a,
                                   Vec128<int32_t> b) {
  return VFromD<D>{__lsx_vssrani_hu_w(b.raw, a.raw, 0)};
}

// 2x u32 -> i16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint32_t> a,
                                   Vec128<uint32_t> b) {
  return VFromD<D>{__lsx_vssrlni_h_w(b.raw, a.raw, 0)};
}

// 2x u32 -> u16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint32_t> a,
                                   Vec128<uint32_t> b) {
  return VFromD<D>{__lsx_vssrlni_hu_w(b.raw, a.raw, 0)};
}

// 2x i64 -> i32
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int64_t> a,
                                   Vec128<int64_t> b) {
  return VFromD<D>{__lsx_vssrani_w_d(b.raw, a.raw, 0)};
}

// 2x i64 -> u32
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int64_t> a,
                                   Vec128<int64_t> b) {
  return VFromD<D>{__lsx_vssrani_wu_d(b.raw, a.raw, 0)};
}

// 2x u64 -> i32
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint64_t> a,
                                   Vec128<uint64_t> b) {
  return VFromD<D>{__lsx_vssrlni_w_d(b.raw, a.raw, 0)};
}

// 2x u64 -> u32
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<uint64_t> a,
                                   Vec128<uint64_t> b) {
  return VFromD<D>{__lsx_vssrlni_wu_d(b.raw, a.raw, 0)};
}

// 8-byte UI32->UI16 and UI16->UI8 ReorderDemote2To
template <class DN, class V, HWY_IF_V_SIZE_D(DN, 8),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_LE_D(DN, 2),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<DFromV<V>> dt;
  const Twice<decltype(dn)> dt_n;

  // Demote as full vectors (inputs in the low halves), then gather the two
  // valid 32-bit words: 0x88 selects lanes {0, 2, 0, 2}.
  const auto demote2_result =
      ReorderDemote2To(dt_n, ResizeBitCast(dt, a), ResizeBitCast(dt, b));
  return VFromD<DN>{__lsx_vshuf4i_w(demote2_result.raw, 0x88)};
}

template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  // On this target ReorderDemote2To already produces lanes in order, so it
  // directly implements OrderedDemote2To.
  return ReorderDemote2To(d, a, b);
}

template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  const DFromV<decltype(v)> du32;
  const Rebind<uint8_t, decltype(du32)> du8;
  return DemoteTo(du8, BitCast(du32, v));
}

// ------------------------------ F32->UI64 PromoteTo

// f32 ->i64
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /*di64*/, VFromD<Rebind<float, D>> v) {
  // Truncating (round-toward-zero) conversion of the low floats to i64.
  return VFromD<D>{__lsx_vftintrzl_l_s(v.raw)};
}

// F32->U64 PromoteTo generic for all vector lengths
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
  const RebindToFloat<decltype(du64)> df64;
  return ConvertTo(du64, PromoteTo(df64, v));
}

// ------------------------------ MulFixedPoint15

template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
                                           const Vec128<int16_t, N> b) {
  // Widening multiplies of even/odd lane pairs, interleaved back into input
  // lane order, then a rounding saturating shift by 15 narrows to i16.
  __m128i temp_ev = __lsx_vmulwev_w_h(a.raw, b.raw);
  __m128i temp_od = __lsx_vmulwod_w_h(a.raw, b.raw);
  __m128i temp1 = __lsx_vilvl_w(temp_od, temp_ev);
  __m128i temp2 = __lsx_vilvh_w(temp_od, temp_ev);
  return Vec128<int16_t, N>{__lsx_vssrarni_h_w(temp2, temp1, 15)};
}

// ------------------------------ Truncations

template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) {
  // Single lane: truncation keeps the low bytes, i.e. is just a bitcast view.
  const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto;
  return VFromD<DTo>{BitCast(dto, v).raw};
}

// u64x2 -> u8x2: move the low byte of lane 1 (byte 8) next to byte 0.
// vextrins imm 0x18: insert source byte 8 into destination byte 1.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec16<uint8_t>{__lsx_vextrins_b(v.raw, v.raw,
                                         0x18)};
}

// u64x2 -> u16x2: 0x14 inserts source half-word 4 into destination slot 1.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec32<uint16_t>{__lsx_vextrins_h(v.raw, v.raw, 0x14)};
}

// u64 -> u32: keep the even (low) 32-bit word of each u64.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec64<uint32_t>{__lsx_vpickev_w(v.raw, v.raw)};
}

// u32 -> u8: two rounds of even-byte picks keep only the lowest byte of each
// u32.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  __m128i v_ev = __lsx_vpickev_b(v.raw, v.raw);
  return VFromD<D>{__lsx_vpickev_b(v_ev, v_ev)};
}

// u32 -> u16
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>{__lsx_vpickev_h(v.raw, v.raw)};
}

// u16 -> u8
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>{__lsx_vpickev_b(v.raw, v.raw)};
}

// ------------------------------ int -> float ConvertTo

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>{__lsx_vffint_s_w(v.raw)};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>{__lsx_vffint_s_wu(v.raw)};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
  return VFromD<D>{__lsx_vffint_d_l(v.raw)};
}

// ------------------------------ float -> int ConvertTo

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) {
  return VFromD<D>{__lsx_vffint_d_lu(v.raw)};
}

// All float -> int conversions below truncate (round toward zero), per the
// *rz* intrinsic suffix.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>{__lsx_vftintrz_w_s(v.raw)};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>{__lsx_vftintrz_wu_s(v.raw)};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<double, D>> v) {
  return VFromD<D>{__lsx_vftintrz_l_d(v.raw)};
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<double, D>> v) {
  return VFromD<D>{__lsx_vftintrz_lu_d(v.raw)};
}

// ------------------------------ NearestInt (Round)

// Round-to-nearest-even conversion (*rne* suffix).
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>{__lsx_vftintrne_w_s(v.raw)};
}

template <size_t N>
HWY_API Vec128<int64_t, N> NearestInt(const Vec128<double, N> v) {
  return Vec128<int64_t, N>{__lsx_vftintrne_l_d(v.raw)};
}

template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
                                        VFromD<Rebind<double, DI32>> v) {
  // Round to i64 first, then narrow with saturation.
  return DemoteTo(di32, NearestInt(v));
}

// ------------------------------ Floating-point rounding

// To nearest, ties to even.
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{__lsx_vfrintrne_s(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
  return Vec128<double, N>{__lsx_vfrintrne_d(v.raw)};
}
// Toward zero.
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{__lsx_vfrintrz_s(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
  return Vec128<double, N>{__lsx_vfrintrz_d(v.raw)};
}
// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{__lsx_vfrintrp_s(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
  return Vec128<double, N>{__lsx_vfrintrp_d(v.raw)};
}
// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{__lsx_vfrintrm_s(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
  return Vec128<double, N>{__lsx_vfrintrm_d(v.raw)};
}

// ------------------------------ Floating-point classification

// FIXME: disable gcc-14 tree-based loop optimizations to prevent
// 'HighwayTestGroup/HighwayTest.TestAllIsNaN/LSX' failures
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#pragma GCC push_options
#pragma GCC optimize("-fno-tree-loop-optimize")
#endif

// v != v (unordered compare against itself) is true exactly for NaN lanes.
template <size_t N>
HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
  return Mask128<float, N>{
      reinterpret_cast<__m128>(__lsx_vfcmp_cune_s(v.raw, v.raw))};
}

template <size_t N>
HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
  return Mask128<double, N>{
      reinterpret_cast<__m128d>(__lsx_vfcmp_cune_d(v.raw, v.raw))};
}

#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#pragma GCC pop_options
#endif

#ifdef HWY_NATIVE_IS_EITHER_NAN
#undef HWY_NATIVE_IS_EITHER_NAN
#else
#define HWY_NATIVE_IS_EITHER_NAN
#endif

// cun = "compare unordered": true if a or b is NaN.
template <size_t N>
HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) {
  return Mask128<float, N>{
      reinterpret_cast<__m128>(__lsx_vfcmp_cun_s(a.raw, b.raw))};
}

// f64 variant: OR of per-operand IsNaN results.
template <size_t N>
HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a,
                                       Vec128<double, N> b) {
  __m128i _tmp = __lsx_vor_v(__lsx_vfcmp_cune_d(a.raw, a.raw),
                             __lsx_vfcmp_cune_d(b.raw, b.raw));
  return Mask128<double, N>{reinterpret_cast<__m128d>(_tmp)};
}

#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif

template <class V>
HWY_API MFromD<DFromV<V>> IsInf(V v) {
  using T = TFromV<V>;

  static_assert(IsFloat<T>(), "Only for float");
  using TU = MakeUnsigned<T>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and
  // mantissa=0.
  return RebindMask(
      d,
      Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
}

// Returns whether normal/subnormal/zero.
template <class V>
HWY_API MFromD<DFromV<V>> IsFinite(V v) {
  using T = TFromV<V>;

  static_assert(IsFloat<T>(), "Only for float");
  using TU = MakeUnsigned<T>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent<max.
  return RebindMask(
      d,
      Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
}

// ================================================== MISC

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

// 1-byte lanes: broadcast the mask bytes so each lane holds its governing
// byte, then test that lane's bit.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require an >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const VFromD<D> vbits{__lsx_vreplgr2vr_w(static_cast<int32_t>(bits))};

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                                    1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));

  alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                                   1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(
      d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // At most 4 lanes; remaining (zero) entries are unused.
  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  return RebindMask(
      d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // At most 2 lanes; remaining (zero) entries are unused.
  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
}

}  // namespace detail

// `bits` must provide at least (MaxLanes + 7) / 8 readable bytes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits);
  return detail::LoadMaskBits(d, mask_bits);
}

// ------------------------------ Dup128MaskFromMaskBits

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  // Clear bits beyond the lane count so nonexistent lanes read as false.
  if (kN < 8) mask_bits &= (1u << kN) - 1;
  return detail::LoadMaskBits(d, mask_bits);
}

// Compress is a partition for all lane sizes except 1 byte (see the
// table-based 16x8 indices below, which only cover 2-byte lanes).
template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};

// ------------------------------ BitsFromMask

namespace detail {

// Clears bits corresponding to lanes that do not exist in a partial vector.
template <class D>
constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
  return (d.MaxBytes() >= 16) ?
             mask_bits
             : mask_bits & ((1ull << d.MaxLanes()) - 1);
}

// The extract intrinsic returns int; widen without sign extension.
constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
}

}  // namespace detail

// vmskltz_* gathers each lane's sign (MSB) into the low bits of lane 0, which
// we then extract as a scalar.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  return detail::OnlyActive(
      d, detail::U64FromInt(__lsx_vpickve2gr_w(__lsx_vmskltz_b(mask.raw), 0)));
}

template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  return detail::OnlyActive(
      d, detail::U64FromInt(__lsx_vpickve2gr_w(__lsx_vmskltz_h(mask.raw), 0)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  // The mask raw type may be a float vector (__m128); reinterpret for the
  // integer mask instruction.
  return detail::OnlyActive(
      d, detail::U64FromInt(__lsx_vpickve2gr_w(
             __lsx_vmskltz_w(reinterpret_cast<__m128i>(mask.raw)), 0)));
}

template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  return detail::OnlyActive(
      d, detail::U64FromInt(__lsx_vpickve2gr_w(
             __lsx_vmskltz_d(reinterpret_cast<__m128i>(mask.raw)), 0)));
}

// ------------------------------ StoreMaskBits
// `p` points to at least 8 writable bytes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8;
  const uint64_t mask_bits = BitsFromMask(d, mask);
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API bool AllFalse(D d, MFromD<D> mask) {
  return BitsFromMask(d, mask) == 0;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API bool AllTrue(D d, MFromD<D> mask) {
  constexpr size_t kN = MaxLanes(d);
  constexpr uint64_t kAllBits = (1ull << kN) - 1;
  return BitsFromMask(d, mask) == kAllBits;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  return PopCount(BitsFromMask(d, mask));
}

// Precondition ("Known"): mask has at least one true lane.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  const uint64_t mask_bits = BitsFromMask(d, mask);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
}

// At most 16 lanes, so the mask bits fit in 32; precondition: mask not empty.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  return 31 - Num0BitsAboveMS1Bit_Nonzero32(
                  static_cast<uint32_t>(BitsFromMask(d, mask)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return mask_bits ?
                   intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
                   : -1;
}

// ------------------------------ Compress, CompressBits

namespace detail {

// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Rebind<uint8_t, decltype(d)> d8;
  const Twice<decltype(d8)> d8t;
  const RebindToUnsigned<decltype(d)> du;

  // For each of the 256 possible masks: the byte offsets (within a 16-byte
  // vector) of the u16 lanes, selected lanes first (in order), then the
  // remaining lanes.
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintCompress16x8Tables
      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14,  //
      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14,  //
      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0,  4,  2,  6,  8,  10, 12, 14,  //
      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14,  //
      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0,  6,  2,  4,  8,  10, 12, 14,  //
      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0,  2,  6,  4,  8,  10, 12, 14,  //
      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0,  4,  6,  2,  8,  10, 12, 14,  //
      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14,  //
      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0,  8,  2,  4,  6,  10, 12, 14,  //
      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0,  2,  8,  4,  6,  10, 12, 14,  //
      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0,  4,  8,  2,  6,  10, 12, 14,  //
      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0,  2,  4,  8,  6,  10, 12, 14,  //
      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0,  6,  8,  2,  4,  10, 12, 14,  //
      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0,  2,  6,  8,  4,  10, 12, 14,  //
      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0,  4,  6,  8,  2,  10, 12, 14,  //
      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14,  //
      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0,  10, 2,  4,  6,  8,  12, 14,  //
      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0,  2,  10, 4,  6,  8,  12, 14,  //
      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0,  4,  10, 2,  6,  8,  12, 14,  //
      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0,  2,  4,  10, 6,  8,  12, 14,  //
      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0,  6,  10, 2,  4,  8,  12, 14,  //
      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0,  2,  6,  10, 4,  8,  12, 14,  //
      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0,  4,  6,  10, 2,  8,  12, 14,  //
      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0,  2,  4,  6,  10, 8,  12, 14,  //
      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0,  8,  10, 2,  4,  6,  12, 14,  //
      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0,  2,  8,  10, 4,  6,  12, 14,  //
      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0,  4,  8,  10, 2,  6,  12, 14,  //
      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0,  2,  4,  8,  10, 6,  12, 14,  //
      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0,  6,  8,  10, 2,  4,  12, 14,  //
      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0,  2,  6,  8,  10, 4,  12, 14,  //
      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0,  4,  6,  8,  10, 2,  12, 14,  //
      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14,  //
      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0,  12, 2,  4,  6,  8,  10, 14,  //
      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0,  2,  12, 4,  6,  8,  10, 14,  //
      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0,  4,  12, 2,  6,  8,  10, 14,  //
      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0,  2,  4,  12, 6,  8,  10, 14,  //
      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0,  6,  12, 2,  4,  8,  10, 14,  //
      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0,  2,  6,  12, 4,  8,  10, 14,  //
      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0,  4,  6,  12, 2,  8,  10, 14,  //
      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0,  2,  4,  6,  12, 8,  10, 14,  //
      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0,  8,  12, 2,  4,  6,  10, 14,  //
      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0,  2,  8,  12, 4,  6,  10, 14,  //
      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0,  4,  8,  12, 2,  6,  10, 14,  //
      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0,  2,  4,  8,  12, 6,  10, 14,  //
      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0,  6,  8,  12, 2,  4,  10, 14,  //
      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0,  2,  6,  8,  12, 4,  10, 14,  //
      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0,  4,  6,  8,  12, 2,  10, 14,  //
      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0,  2,  4,  6,  8,  12, 10, 14,  //
      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0,  10, 12, 2,  4,  6,  8,  14,  //
      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0,  2,  10, 12, 4,  6,  8,  14,  //
      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0,  4,  10, 12, 2,  6,  8,  14,  //
      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0,  2,  4,  10, 12, 6,  8,  14,  //
      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0,  6,  10, 12, 2,  4,  8,  14,  //
      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0,  2,  6,  10, 12, 4,  8,  14,  //
      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0,  4,  6,  10, 12, 2,  8,  14,  //
      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0,  2,  4,  6,  10, 12, 8,  14,  //
      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0,  8,  10, 12, 2,  4,  6,  14,  //
      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0,  2,  8,  10, 12, 4,  6,  14,  //
      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0,  4,  8,  10, 12, 2,  6,  14,  //
      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0,  2,  4,  8,  10, 12, 6,  14,  //
      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0,  6,  8,  10, 12, 2,  4,  14,  //
      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0,  2,  6,  8,  10, 12, 4,  14,  //
      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0,  4,  6,  8,  10, 12, 2,  14,  //
      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0,  2,  4,  6,  8,  10, 12, 14,  //
      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0,  14, 2,  4,  6,  8,  10, 12,  //
      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0,  2,  14, 4,  6,  8,  10, 12,  //
      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0,  4,  14, 2,  6,  8,  10, 12,  //
      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0,  2,  4,  14, 6,  8,  10, 12,  //
      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0,  6,  14, 2,  4,  8,  10, 12,  //
      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0,  2,  6,  14, 4,  8,  10, 12,  //
      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0,  4,  6,  14, 2,  8,  10, 12,  //
      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0,  2,  4,  6,  14, 8,  10, 12,  //
      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0,  8,  14, 2,  4,  6,  10, 12,  //
      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0,  2,  8,  14, 4,  6,  10, 12,  //
      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0,  4,  8,  14, 2,  6,  10, 12,  //
      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0,  2,  4,  8,  14, 6,  10, 12,  //
      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0,  6,  8,  14, 2,  4,  10, 12,  //
      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0,  2,  6,  8,  14, 4,  10, 12,  //
      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0,  4,  6,  8,  14, 2,  10, 12,  //
      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0,  2,  4,  6,  8,  14, 10, 12,  //
      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0,  10, 14, 2,  4,  6,  8,  12,  //
      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0,  2,  10, 14, 4,  6,  8,  12,  //
      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0,  4,  10, 14, 2,  6,  8,  12,  //
      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0,  2,  4,  10, 14, 6,  8,  12,  //
      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0,  6,  10, 14, 2,  4,  8,  12,  //
      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0,  2,  6,  10, 14, 4,  8,  12,  //
      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0,  4,  6,  10, 14, 2,  8,  12,  //
      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0,  2,  4,  6,  10, 14, 8,  12,  //
      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0,  8,  10, 14, 2,  4,  6,  12,  //
      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0,  2,  8,  10, 14, 4,  6,  12,  //
      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0,  4,  8,  10, 14, 2,  6,  12,  //
      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0,  2,  4,  8,  10, 14, 6,  12,  //
      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0,  6,  8,  10, 14, 2,  4,  12,  //
      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0,  2,  6,  8,  10, 14, 4,  12,  //
      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0,  4,  6,  8,  10, 14, 2,  12,  //
      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0,  2,  4,  6,  8,  10, 14, 12,  //
      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0,  12, 14, 2,  4,  6,  8,  10,  //
      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0,  2,  12, 14, 4,  6,  8,  10,  //
      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0,  4,  12, 14, 2,  6,  8,  10,  //
      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0,  2,  4,  12, 14, 6,  8,  10,  //
      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0,  6,  12, 14, 2,  4,  8,  10,  //
      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0,  2,  6,  12, 14, 4,  8,  10,  //
      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0,  4,  6,  12, 14, 2,  8,  10,  //
      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0,  2,  4,  6,  12, 14, 8,  10,  //
      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0,  8,  12, 14, 2,  4,  6,  10,  //
      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0,  2,  8,  12, 14, 4,  6,  10,  //
      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0,  4,  8,  12, 14, 2,  6,  10,  //
      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0,  2,  4,  8,  12, 14, 6,  10,  //
      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0,  6,  8,  12, 14, 2,  4,  10,  //
      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0,  2,  6,  8,  12, 14, 4,  10,  //
      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0,  4,  6,  8,  12, 14, 2,  10,  //
      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0,  2,  4,  6,  8,  12, 14, 10,  //
      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0,  10, 12, 14, 2,  4,  6,  8,   //
      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0,  2,  10, 12, 14, 4,  6,  8,   //
      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0,  4,  10, 12, 14, 2,  6,  8,   //
      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0,  2,  4,  10, 12, 14, 6,  8,   //
      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0,  6,  10, 12, 14, 2,  4,  8,   //
      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0,  2,  6,  10, 12, 14, 4,  8,   //
      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0,  4,  6,  10, 12, 14, 2,  8,   //
      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0,  2,  4,  6,  10, 12, 14, 8,   //
      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0,  8,  10, 12, 14, 2,  4,  6,   //
      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0,  2,  8,  10, 12, 14, 4,  6,   //
      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0,  4,  8,  10, 12, 14, 2,  6,   //
      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0,  2,  4,  8,  10, 12, 14, 6,   //
      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0,  6,  8,  10, 12, 14, 2,  4,   //
      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0,  2,  6,  8,  10, 12, 14, 4,   //
      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0,  4,  6,  8,  10, 12, 14, 2,   //
      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0,  2,  4,  6,  8,  10, 12, 14};

  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  // Expand each byte offset i into the byte-pair index (i, i + 1) that
  // addresses a full u16 lane.
  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Rebind<uint8_t, decltype(d)> d8;
  const Twice<decltype(d8)> d8t;
  const RebindToUnsigned<decltype(d)> du;

  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintCompressNot16x8Tables
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0,  //
      0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2,  //
      0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12,
14, 0, 4, // 5305 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 5306 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 5307 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 5308 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 5309 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 5310 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 5311 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 5312 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 5313 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 5314 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 5315 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 5316 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 5317 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 5318 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 5319 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 5320 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 5321 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 5322 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 5323 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 5324 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 5325 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 5326 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 5327 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 5328 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 5329 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 5330 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 5331 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 5332 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 5333 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 5334 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 
6, 8, 10, 14, 0, 12, // 5335 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 5336 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 5337 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 5338 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 5339 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 5340 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 5341 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 5342 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 5343 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 5344 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 5345 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 5346 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 5347 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 5348 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 5349 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 5350 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 5351 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 5352 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 5353 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 5354 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 5355 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 5356 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 5357 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 5358 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 5359 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 5360 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 5361 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 5362 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 5363 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 5364 0, 2, 14, 4, 6, 8, 10, 12, 
/**/ 2, 14, 0, 4, 6, 8, 10, 12, // 5365 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 5366 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 5367 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 5368 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 5369 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 5370 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 5371 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 5372 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 5373 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 5374 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 5375 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 5376 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 5377 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 5378 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 5379 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 5380 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 5381 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 5382 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 5383 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 5384 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 5385 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 5386 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 5387 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 5388 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 5389 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 5390 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 5391 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 5392 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 5393 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 5394 0, 2, 4, 12, 6, 
8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 5395 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 5396 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 5397 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 5398 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 5399 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 5400 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 5401 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 5402 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 5403 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 5404 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 5405 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 5406 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 5407 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 5408 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 5409 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 5410 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 5411 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 5412 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 5413 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 5414 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 5415 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 5416 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 5417 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 5418 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 5419 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 5420 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 5421 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 5422 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 5423 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 5424 0, 2, 
6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 5425 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 5426 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 5427 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 5428 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 5429 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 5430 5431 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; 5432 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); 5433 return BitCast(d, pairs + Set(du, 0x0100)); 5434 } 5435 5436 template <class D, HWY_IF_T_SIZE_D(D, 4)> 5437 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { 5438 HWY_DASSERT(mask_bits < 16); 5439 5440 // There are only 4 lanes, so we can afford to load the index vector directly. 5441 alignas(16) static constexpr uint8_t u8_indices[256] = { 5442 // PrintCompress32x4Tables 5443 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5444 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5445 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 5446 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5447 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 5448 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 5449 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 5450 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5451 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 5452 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 5453 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 5454 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 5455 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 5456 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 5457 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 5458 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 5459 5460 const Repartition<uint8_t, decltype(d)> d8; 5461 return 
BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5462 } 5463 5464 template <class D, HWY_IF_T_SIZE_D(D, 4)> 5465 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { 5466 HWY_DASSERT(mask_bits < 16); 5467 5468 // There are only 4 lanes, so we can afford to load the index vector directly. 5469 alignas(16) static constexpr uint8_t u8_indices[256] = { 5470 // PrintCompressNot32x4Tables 5471 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 5472 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 5473 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 5474 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 5475 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 5476 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 5477 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5478 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5479 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 5480 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 5481 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 5482 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 5483 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5484 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 5485 12, 13, 14, 15}; 5486 5487 const Repartition<uint8_t, decltype(d)> d8; 5488 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5489 } 5490 5491 template <class D, HWY_IF_T_SIZE_D(D, 8)> 5492 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { 5493 HWY_DASSERT(mask_bits < 4); 5494 5495 // There are only 2 lanes, so we can afford to load the index vector directly. 
5496 alignas(16) static constexpr uint8_t u8_indices[64] = { 5497 // PrintCompress64x2Tables 5498 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5499 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5500 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 5501 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 5502 5503 const Repartition<uint8_t, decltype(d)> d8; 5504 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5505 } 5506 5507 template <class D, HWY_IF_T_SIZE_D(D, 8)> 5508 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { 5509 HWY_DASSERT(mask_bits < 4); 5510 5511 // There are only 2 lanes, so we can afford to load the index vector directly. 5512 alignas(16) static constexpr uint8_t u8_indices[64] = { 5513 // PrintCompressNot64x2Tables 5514 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5515 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 5516 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5517 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 5518 5519 const Repartition<uint8_t, decltype(d)> d8; 5520 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5521 } 5522 5523 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 5524 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) { 5525 const DFromV<decltype(v)> d; 5526 const RebindToUnsigned<decltype(d)> du; 5527 5528 HWY_DASSERT(mask_bits < (1ull << N)); 5529 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 5530 return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 5531 } 5532 5533 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 5534 HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) { 5535 const DFromV<decltype(v)> d; 5536 const RebindToUnsigned<decltype(d)> du; 5537 5538 HWY_DASSERT(mask_bits < (1ull << N)); 5539 const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); 5540 return BitCast(d, TableLookupBytes(BitCast(du, v), 
indices)); 5541 } 5542 5543 } // namespace detail 5544 5545 // Single lane: no-op 5546 template <typename T> 5547 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 5548 return v; 5549 } 5550 5551 // Two lanes: conditional swap 5552 template <typename T, HWY_IF_T_SIZE(T, 8)> 5553 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { 5554 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 5555 const DFromV<decltype(v)> d; 5556 const Vec128<T> m = VecFromMask(d, mask); 5557 const Vec128<T> maskL = DupEven(m); 5558 const Vec128<T> maskH = DupOdd(m); 5559 const Vec128<T> swap = AndNot(maskL, maskH); 5560 return IfVecThenElse(swap, Shuffle01(v), v); 5561 } 5562 5563 // General case, 2 or 4 bytes 5564 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> 5565 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { 5566 const DFromV<decltype(v)> d; 5567 return detail::CompressBits(v, BitsFromMask(d, mask)); 5568 } 5569 5570 // ------------------------------ CompressNot 5571 5572 // Single lane: no-op 5573 template <typename T> 5574 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 5575 return v; 5576 } 5577 5578 // Two lanes: conditional swap 5579 template <typename T, HWY_IF_T_SIZE(T, 8)> 5580 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { 5581 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. 
5582 const DFromV<decltype(v)> d; 5583 const Vec128<T> m = VecFromMask(d, mask); 5584 const Vec128<T> maskL = DupEven(m); 5585 const Vec128<T> maskH = DupOdd(m); 5586 const Vec128<T> swap = AndNot(maskH, maskL); 5587 return IfVecThenElse(swap, Shuffle01(v), v); 5588 } 5589 5590 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> 5591 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { 5592 const DFromV<decltype(v)> d; 5593 // For partial vectors, we cannot pull the Not() into the table because 5594 // BitsFromMask clears the upper bits. 5595 if (N < 16 / sizeof(T)) { 5596 return detail::CompressBits(v, BitsFromMask(d, Not(mask))); 5597 } 5598 return detail::CompressNotBits(v, BitsFromMask(d, mask)); 5599 } 5600 5601 // ------------------------------ CompressBlocksNot 5602 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, 5603 Mask128<uint64_t> /* m */) { 5604 return v; 5605 } 5606 5607 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 5608 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, 5609 const uint8_t* HWY_RESTRICT bits) { 5610 uint64_t mask_bits = 0; 5611 constexpr size_t kNumBytes = (N + 7) / 8; 5612 CopyBytes<kNumBytes>(bits, &mask_bits); 5613 if (N < 8) { 5614 mask_bits &= (1ull << N) - 1; 5615 } 5616 5617 return detail::CompressBits(v, mask_bits); 5618 } 5619 5620 // ------------------------------ CompressStore, CompressBitsStore 5621 5622 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> 5623 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d, 5624 TFromD<D>* HWY_RESTRICT unaligned) { 5625 const RebindToUnsigned<decltype(d)> du; 5626 5627 const uint64_t mask_bits = BitsFromMask(d, m); 5628 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); 5629 const size_t count = PopCount(mask_bits); 5630 5631 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 5632 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 5633 
StoreU(compressed, d, unaligned); 5634 detail::MaybeUnpoison(unaligned, count); 5635 return count; 5636 } 5637 5638 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> 5639 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, 5640 TFromD<D>* HWY_RESTRICT unaligned) { 5641 const RebindToUnsigned<decltype(d)> du; 5642 5643 const uint64_t mask_bits = BitsFromMask(d, m); 5644 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); 5645 const size_t count = PopCount(mask_bits); 5646 5647 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 5648 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 5649 BlendedStore(compressed, FirstN(d, count), d, unaligned); 5650 detail::MaybeUnpoison(unaligned, count); 5651 return count; 5652 } 5653 5654 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> 5655 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, 5656 D d, TFromD<D>* HWY_RESTRICT unaligned) { 5657 const RebindToUnsigned<decltype(d)> du; 5658 5659 uint64_t mask_bits = 0; 5660 constexpr size_t kN = MaxLanes(d); 5661 constexpr size_t kNumBytes = (kN + 7) / 8; 5662 CopyBytes<kNumBytes>(bits, &mask_bits); 5663 if (kN < 8) { 5664 mask_bits &= (1ull << kN) - 1; 5665 } 5666 const size_t count = PopCount(mask_bits); 5667 5668 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 5669 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 5670 StoreU(compressed, d, unaligned); 5671 5672 detail::MaybeUnpoison(unaligned, count); 5673 return count; 5674 } 5675 5676 // ------------------------------ StoreInterleaved2/3/4 5677 5678 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in 5679 // generic_ops-inl.h. 
// ------------------------------ Additional mask logical operations

// SetAtOrAfterFirst: sets every lane at or after the first set lane.

// Single lane: the mask is its own result.
template <class T>
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  return mask;
}
// Two lanes: InterleaveLower(vmask, vmask) copies lane 0 into lane 1, so
// OR-ing sets lane 1 whenever lane 0 was set.
template <class T>
HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const auto vmask = VecFromMask(d, mask);
  return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
}
// Partial vectors (<= 8 bytes): reinterpret as a single i64 and negate.
// Two's-complement negation sets all bits from the lowest set bit upward,
// i.e. all lanes at or after the first set lane; OR with the original
// mask keeps the first lane itself.
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const auto vmask = VecFromMask(d, mask);
  const auto neg_vmask =
      ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
  return MaskFromVec(Or(vmask, neg_vmask));
}
// Full 128-bit vectors: __lsx_vsub_q subtracts across the entire 128-bit
// register (carry propagates between the 64-bit halves), so 0 - mask is
// the 128-bit two's-complement negation used in the same trick as above.
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
  const Full128<T> d;
  const Repartition<int64_t, decltype(d)> di64;

  auto vmask = BitCast(di64, VecFromMask(d, mask));
  VFromD<decltype(di64)> neg_vmask{__lsx_vsub_q(Zero(di64).raw, vmask.raw)};

  return MaskFromVec(BitCast(d, Or(vmask, neg_vmask)));
}

// SetBeforeFirst: complement of SetAtOrAfterFirst.
template <class T, size_t N>
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  return Not(SetAtOrAfterFirst(mask));
}

// SetOnlyFirst: keeps only the first set lane of the mask.

// Single lane: no-op.
template <class T>
HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
  return mask;
}
// Two lanes: InterleaveLower(zero, vmask) places lane 0 of vmask into
// lane 1, so vmask2 is {true, lane0-was-false}. And() therefore clears
// lane 1 exactly when lane 0 was already set.
template <class T>
HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = BitCast(di, VecFromMask(d, mask));
  const auto zero = Zero(di);
  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
}
// Partial vectors (<= 8 bytes): x & -x on the 64-bit reinterpretation
// isolates the lowest set bit (bit 0 of the first true lane, since mask
// lanes are all-ones); the per-lane Neg after casting back expands that
// single bit into a full lane of ones.
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
  const auto only_first_vmask =
      BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
  return MaskFromVec(only_first_vmask);
}
// Full 128-bit vectors: same x & -x trick, with the negation done across
// the whole register via the 128-bit subtract __lsx_vsub_q.
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
  const Full128<T> d;
  const RebindToSigned<decltype(d)> di;

  auto vmask = BitCast(di, VecFromMask(d, mask));
  VFromD<decltype(di)> neg_vmask{__lsx_vsub_q(Zero(di).raw, vmask.raw)};

  return MaskFromVec(BitCast(d, Neg(And(vmask, neg_vmask))));
}

// SetAtOrBeforeFirst: sets every lane at or before the first set lane.

// Single lane: always all-true, regardless of the input mask.
template <class T>
HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  const FixedTag<T, 1> d;
  const RebindToSigned<decltype(d)> di;
  using TI = MakeSigned<T>;

  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
}
// Shifting the mask up one lane turns "at or before first" into
// "before first" of the shifted mask.
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
}

// ------------------------------ Reductions

// Narrow the generic SumOfLanes overload set (selected via this macro) so
// it does not match u8 vectors of 8 or 16 lanes; those are specialized
// below using SumsOf8.
#undef HWY_IF_SUM_OF_LANES_D
#define HWY_IF_SUM_OF_LANES_D(D)                                        \
  HWY_IF_LANES_GT_D(D, 1),                                              \
      hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() ||               \
                    (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
          nullptr
// ------------------------------ SumOfLanes

// 8 u8 lanes: SumsOf8 yields one u64 sum; keep its low byte (the u8
// reduction result wraps modulo 256) and broadcast it to all lanes.
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
}
// 16 u8 lanes: SumsOf8 yields two u64 partial sums; reduce those with the
// u64 SumOfLanes, then broadcast byte 0 (total modulo 256) to all lanes.
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> d64;
  VFromD<decltype(d64)> sums = SumsOf8(v);
  sums = SumOfLanes(d64, sums);
  return Broadcast<0>(BitCast(d, sums));
}

// ------------------------------ Lt128

namespace detail {

// Returns vector-mask for Lt128. Generic for all vector lengths.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Lt128Vec(const D d, VFromD<D> a, VFromD<D> b) {
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL | out = cH | (=H & cL)
  //  0  0  0  0 |  0
  //  0  0  0  1 |  0
  //  0  0  1  0 |  1
  //  0  0  1  1 |  1
  //  0  1  0  0 |  0
  //  0  1  0  1 |  0
  //  0  1  1  0 |  1
  //  1  0  0  0 |  0
  //  1  0  0  1 |  1
  //  1  1  0  0 |  0
  const auto eqHL = Eq(a, b);
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  // Shift the Lo-lane comparison result up into the Hi lane position.
  const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL);
  // Hi lane: if the Hi halves are equal, defer to the Lo comparison,
  // else use the Hi comparison — implements cH | (=H & cL) above.
  const VFromD<D> vecHx = IfThenElse(eqHL, ltLX, ltHL);
  // Broadcast the Hi-lane verdict to both lanes of each 128-bit block.
  return InterleaveUpper(d, vecHx, vecHx);
}

// Returns vector-mask for Eq128. Generic for all vector lengths.
// Both u64 halves must be equal; Reverse2 swaps Hi/Lo so each lane also
// sees the other half's result.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Eq128Vec(D d, VFromD<D> a, VFromD<D> b) {
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
}

// Returns vector-mask for Ne128: true if either u64 half differs.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Ne128Vec(D d, VFromD<D> a, VFromD<D> b) {
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
}

// The *UpperVec helpers compare only the upper (Hi) u64 of each 128-bit
// pair and broadcast that verdict to both lanes via InterleaveUpper.
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Lt128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Eq128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  return InterleaveUpper(d, eqHL, eqHL);
}

template <class D, HWY_IF_U64_D(D)>
HWY_INLINE VFromD<D> Ne128UpperVec(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail

// Public wrappers: convert the vector-masks above into Mask types.
template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128Vec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}

template <class D, HWY_IF_U64_D(D)>
HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}

// ------------------------------ Min128, Max128 (Lt128)

// Avoids the extra MaskFromVec in Lt128.
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

// Max via b < a; on ties (Lt128Vec false), b is returned.
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount,
// HighestSetBitIndex

// Signal that this target provides LeadingZeroCount natively.
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

// Per-lane leading-zero count via the LSX vclz instructions, one overload
// per lane width.
template <class V, HWY_IF_UI8_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lsx_vclz_b(v.raw)};
}

template <class V, HWY_IF_UI16_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lsx_vclz_h(v.raw)};
}

template <class V, HWY_IF_UI32_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lsx_vclz_w(v.raw)};
}

template <class V, HWY_IF_UI64_D(DFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{__lsx_vclz_d(v.raw)};
}

// Index of the highest set bit: (bits-per-lane - 1) - clz.
// NOTE(review): assumes __lsx_vclz_* returns the lane width for a zero
// lane (standard clz semantics), in which case the subtraction wraps;
// TrailingZeroCount below special-cases zero input accordingly.
template <class V, HWY_IF_V_SIZE_LE_V(V, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}

// Trailing zeros: isolate the lowest set bit (v & -v); its highest-set
// index equals the trailing-zero count. Zero lanes return the lane width.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using T = TFromD<decltype(d)>;

  const auto lsb = And(v, BitCast(d, Neg(BitCast(di, v))));
  return IfThenElse(Eq(v, Zero(d)), Set(d, T{sizeof(T) * 8}),
                    HighestSetBitIndex(lsb));
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy

HWY_AFTER_NAMESPACE();

#undef HWY_LSX_IF_EMULATED_D