ppc_vsx-inl.h (276418B)
1 // Copyright 2023 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // 128-bit vectors for VSX/Z14 17 // External include guard in highway.h - see comment there. 18 19 #if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15 20 #define HWY_S390X_HAVE_Z14 1 21 #else 22 #define HWY_S390X_HAVE_Z14 0 23 #endif 24 25 #pragma push_macro("vector") 26 #pragma push_macro("pixel") 27 #pragma push_macro("bool") 28 29 #undef vector 30 #undef pixel 31 #undef bool 32 33 #if HWY_S390X_HAVE_Z14 34 #include <vecintrin.h> 35 #else 36 #include <altivec.h> 37 #endif 38 39 #pragma pop_macro("vector") 40 #pragma pop_macro("pixel") 41 #pragma pop_macro("bool") 42 43 #include "hwy/ops/shared-inl.h" 44 45 // clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__, and 46 // some GCC do the same for _ARCH_PWR10. 47 // This means we can only use POWER10-specific intrinsics in static dispatch 48 // mode (where the -mpower10-vector compiler flag is passed). Same for PPC9. 49 // On other compilers, the usual target check is sufficient. 
// POWER9-specific intrinsics require the compiler itself to enable them
// (static dispatch); both GCC's _ARCH_PWR9 and Clang's __POWER9_VECTOR__
// spellings are accepted. Never true on s390x builds.
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
    (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif

// Same gating for POWER10-specific intrinsics.
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif

// z15 additions: only on s390x when the target and the compiler's
// architecture level (arch13 == z15) both allow them.
#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
#define HWY_S390X_HAVE_Z15 1
#else
#define HWY_S390X_HAVE_Z15 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Primary template; only the specializations generated below are defined.
template <typename T>
struct Raw128;

// Each Raw128 specialization defines the following typedefs:
// - type:
//     the backing Altivec/VSX raw vector type of the Vec128<T, N> type
// - RawBoolVec:
//     the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
// - RawT:
//     the lane type for intrinsics, in particular vec_splat
// - AlignedRawVec:
//     the 128-bit GCC/Clang vector type for aligned loads/stores
// - UnalignedRawVec:
//     the 128-bit GCC/Clang vector type for unaligned loads/stores
//     (lane-aligned only; __may_alias__ makes the type punning in Load/Store
//     well-defined)
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
  template <>                                                                  \
  struct Raw128<LANE_TYPE> {                                                   \
    using type = __vector RAW_VECT_LANE_TYPE;                                  \
    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;                \
    using RawT = RAW_VECT_LANE_TYPE;                                           \
    typedef LANE_TYPE AlignedRawVec                                            \
        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));  \
    typedef LANE_TYPE UnalignedRawVec __attribute__((                          \
        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
  };

HWY_VSX_RAW128(int8_t, signed char, char)
HWY_VSX_RAW128(uint8_t, unsigned char, char)
HWY_VSX_RAW128(int16_t, signed short, short)     // NOLINT(runtime/int)
HWY_VSX_RAW128(uint16_t, unsigned short, short)  // NOLINT(runtime/int)
HWY_VSX_RAW128(int32_t, signed int, int)
HWY_VSX_RAW128(uint32_t, unsigned int, int)
HWY_VSX_RAW128(int64_t, signed long long, long long)    // NOLINT(runtime/int)
HWY_VSX_RAW128(uint64_t, unsigned long long, long long) // NOLINT(runtime/int)
HWY_VSX_RAW128(float, float, int)
HWY_VSX_RAW128(double, double, long long)  // NOLINT(runtime/int)

// Special floats are stored as their uint16_t bit patterns.
template <>
struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};

template <>
struct Raw128<float16_t> : public Raw128<uint16_t> {};

#undef HWY_VSX_RAW128

}  // namespace detail

// Wrapper over the raw Altivec/VSX vector; N is the number of in-use lanes
// (partial vectors still occupy a full 128-bit register).
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

// Aliases for vectors whose in-use portion is 64/32/16 bits.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
// FF..FF or 0. Backed by the Altivec __bool vector type of matching lane size.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::RawBoolVec raw;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  // There is no vec_splats for 64-bit, so we cannot rely on casting the 0
  // argument in order to select the correct overload. We instead cast the
  // return vector type; see also the comment in BitCast.
  return Vec128<T, HWY_MAX_LANES_D(D)>{
      reinterpret_cast<typename detail::Raw128<T>::type>(vec_splats(0))};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ BitCast

template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D /*d*/,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}

// ------------------------------ ResizeBitCast

template <class D, typename FromV>
HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
  return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
}

// bf16/f16: splat the 16-bit bit pattern in the unsigned domain, then cast.
template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D d, TFromD<D> t) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
}

// Returns a vector with uninitialized elements.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
#if HWY_COMPILER_GCC_ACTUAL
  // Suppressing maybe-uninitialized both here and at the caller does not work,
  // so initialize.
  return Zero(d);
#elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
  return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
#else
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
  typename detail::Raw128<TFromD<D>>::type raw;
  return VFromD<decltype(d)>{raw};
  HWY_DIAGNOSTICS(pop)
#endif
}

// ------------------------------ GetLane

// Gets the single value stored in a vector/part.
// Returns lane 0, converted back to the logical lane type.
template <typename T, size_t N>
HWY_API T GetLane(Vec128<T, N> v) {
  return static_cast<T>(v.raw[0]);
}

// ------------------------------ Dup128VecFromValues

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  const typename detail::Raw128<TFromD<D>>::type raw = {
      t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
  return VFromD<D>{raw};
}

template <class D, HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
                                                        t4, t5, t6, t7};
  return VFromD<D>{raw};
}

// bf16/f16: build from the 16-bit bit patterns in the unsigned domain.
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Dup128VecFromValues(
             du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
             BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
             BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
             BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
  return VFromD<D>{raw};
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
  return VFromD<D>{raw};
}

// ================================================== LOGICAL

// All bitwise ops are performed in the unsigned domain and cast back, so a
// single definition covers signed/float lane types.

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  // On Z14, use the compiler's vector operator instead of vec_and.
  return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  // Note vec_andc computes first_arg & ~second_arg, hence the swapped order.
  return BitCast(
      d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  // NOR of v with itself yields the bitwise complement.
  return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
}

// ------------------------------ IsConstantRawAltivecVect
namespace detail {

// True iff every lane of the raw vector is a compile-time constant; used to
// decide whether a plain logical-op expansion beats an xxeval instruction.
// One overload per lane size, checking each lane with __builtin_constant_p.

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<1> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<2> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<4> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<8> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
}
// Dispatches to the per-lane-size overload above based on the raw lane size.
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) {
  return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(v[0]))>(), v);
}

}  // namespace detail

// ------------------------------ TernaryLogic
#if HWY_PPC_HAVE_10
namespace detail {

// NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse
// order of the kTernLogOp bits of AVX3
// _mm_ternarylogic_epi64(a, b, c, kTernLogOp)
template <uint8_t kTernLogOp, class V>
HWY_INLINE V TernaryLogic(V a, V b, V c) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const auto a_raw = BitCast(du, a).raw;
  const auto b_raw = BitCast(du, b).raw;
  const auto c_raw = BitCast(du, c).raw;

#if HWY_COMPILER_GCC_ACTUAL
  // Use inline assembly on GCC to work around GCC compiler bug
  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
          : "=wa"(raw_ternlog_result)
          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw),
            "n"(static_cast<unsigned>(kTernLogOp))
          :);
#else
  const auto raw_ternlog_result =
      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
#endif

  return BitCast(d, VU{raw_ternlog_result});
}

}  // namespace detail
#endif  // HWY_PPC_HAVE_10

// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If at least two operands are compile-time constants, the two-Xor form can
  // be folded by the compiler; otherwise a single xxeval is preferable.
  if (static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw)) >=
      2) {
    return Xor(x1, Xor(x2, x3));
  } else  // NOLINT
#endif
  {
    // 0x69 encodes x1 ^ x2 ^ x3 (see TernaryLogic note on bit order).
    return detail::TernaryLogic<0x69>(x1, x2, x3);
  }
#else
  return Xor(x1, Xor(x2, x3));
#endif
}
// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If at least two operands are compile-time constants, prefer the plain
  // two-Or form so the compiler can fold them; otherwise use xxeval.
  if (static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw)) >=
      2) {
    return Or(o1, Or(o2, o3));
  } else  // NOLINT
#endif
  {
    // 0x7F encodes o1 | o2 | o3 (see TernaryLogic note on bit order).
    return detail::TernaryLogic<0x7F>(o1, o2, o3);
  }
#else
  return Or(o1, Or(o2, o3));
#endif
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  if (detail::IsConstantRawAltivecVect(a1.raw) &&
      detail::IsConstantRawAltivecVect(a2.raw)) {
    return Or(o, And(a1, a2));
  } else  // NOLINT
#endif
  {
    // 0x1F encodes o | (a1 & a2) (see TernaryLogic note on bit order).
    return detail::TernaryLogic<0x1F>(o, a1, a2);
  }
#else
  return Or(o, And(a1, a2));
#endif
}

// ------------------------------ IfVecThenElse
// Bitwise select: for each bit, mask ? yes : no (vec_sel selects its second
// argument where the mask bit is set).
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
                                      BitCast(du, mask).raw)});
}

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// Per-lane count of set bits.
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return Vec128<T, N>{vec_popcnt(v.raw)};
}

// ================================================== SIGN

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  // If T is an signed integer type, use Zero(d) - v instead of vec_neg to
  // avoid undefined behavior in the case where v[i] == LimitsMin<T>()
  const DFromV<decltype(v)> d;
  return Zero(d) - v;
}

template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  // No vec_neg path on Z14 here; flipping the sign bit is equivalent.
  return Xor(v, SignBit(DFromV<decltype(v)>()));
#else
  return Vec128<T, N>{vec_neg(v.raw)};
#endif
}

// bf16/f16: negate by flipping the sign bit of the stored bit pattern.
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

// ------------------------------ Abs

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <class T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
  // avoid undefined behavior in the case where v[i] == LimitsMin<T>().
  return Max(v, Neg(v));
}

template <class T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  return Vec128<T, N>{vec_abs(v.raw)};
}

// ------------------------------ CopySign

#if HWY_S390X_HAVE_Z14
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);

  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
  //                  0    0     0  |  0
  //                  0    0     1  |  0
  //                  0    1     0  |  1
  //                  0    1     1  |  1
  //                  1    0     0  |  0
  //                  1    0     1  |  1
  //                  1    1     0  |  0
  //                  1    1     1  |  1
  return BitwiseIfThenElse(msb, sign, magn);
}
#else  // VSX
template <size_t N>
HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
                                  Vec128<float, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
  return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
#else
  // Note the swapped arguments: vec_cpsgn takes (sign_source, magnitude).
  return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}

template <size_t N>
HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
                                   Vec128<double, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
  return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
#else
  return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}
#endif  // HWY_S390X_HAVE_Z14

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  // PPC8 can also handle abs < 0, so no extra action needed.
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return CopySign(abs, sign);
}

// ================================================== MEMORY (1)

// Note: type punning is safe because the types are tagged with may_alias.
// (https://godbolt.org/z/fqrWjfjsP)

// ------------------------------ Load

// Full 128-bit aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
  const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
  using ResultRaw = typename detail::Raw128<T>::type;
  return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};

#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

// Any <= 64 bit: copy the bytes into an integer, then splat it.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
  using BitsT = UnsignedFromSize<d.MaxBytes()>;

  BitsT bits;
  const Repartition<BitsT, decltype(d)> d_bits;
  CopyBytes<d.MaxBytes()>(p, &bits);
  return BitCast(d, Set(d_bits, bits));
}

// ================================================== MASK

// ------------------------------ Mask

// Mask and Vec are both backed by vector types (true = FF..FF).
// Reinterprets a vector as a mask (no lane-wise conversion; assumes the input
// is already all-ones or all-zeros per lane, as the section comment states).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
  using Raw = typename detail::Raw128<T>::RawBoolVec;
  return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> v) {
  return Vec128<T, N>{
      reinterpret_cast<typename detail::Raw128<T>::type>(v.raw)};
}

template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>{vec_sel(
                        BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)});
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
  return Mask128<T, N>{vec_nor(m.raw, m.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw & b.raw};
#else
  return Mask128<T, N>{vec_and(a.raw, b.raw)};
#endif
}

// Returns ~a & b (note vec_andc computes first_arg & ~second_arg).
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_andc(b.raw, a.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw | b.raw};
#else
  return Mask128<T, N>{vec_or(a.raw, b.raw)};
#endif
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw ^ b.raw};
#else
  return Mask128<T, N>{vec_xor(a.raw, b.raw)};
#endif
}

// ~(a | b)
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_nor(a.raw, b.raw)};
}

// ------------------------------ ShiftLeftSame

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;

#if HWY_S390X_HAVE_Z14
  return BitCast(d,
                 VFromD<decltype(du)>{BitCast(du, v).raw
                                      << Set(du, static_cast<TU>(bits)).raw});
#else
  // Do an unsigned vec_sl operation to avoid undefined behavior
  return BitCast(
      d, VFromD<decltype(du)>{
             vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
#endif
}

// ------------------------------ ShiftRightSame

// Unsigned: logical (zero-filling) shift right.
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
#else
  return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}

// Signed: arithmetic (sign-extending) shift right.
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
#if HWY_S390X_HAVE_Z14
  using TI = typename detail::Raw128<T>::RawT;
  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
#else
  // vec_sra requires an unsigned shift-count vector.
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
  return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}

// ------------------------------ ShiftLeft

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return ShiftLeftSame(v, kBits);
}

// ------------------------------ ShiftRight

template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return ShiftRightSame(v, kBits);
}

// ------------------------------ BroadcastSignBit

// Arithmetic right shift by (lane bits - 1) replicates the sign bit into
// every bit of the lane.
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
}

// ================================================== SWIZZLE (1)
// ------------------------------ TableLookupBytes
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
                                        Vec128<TI, NI> from) {
  const Repartition<uint8_t, DFromV<decltype(from)>> du8_from;
  // vec_perm with the same table in both slots; only the low index bits
  // matter, so indices >= 16 wrap onto the same 16 bytes.
  return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
      vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};
}

// ------------------------------ TableLookupBytesOr0
// For all vector widths; Altivec/VSX needs zero out
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  const DFromV<VI> di;
  Repartition<int8_t, decltype(di)> di8;
  // Indices with the MSB set select zero; build that mask from the sign bit.
  const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from)));
  return AndNot(zeroOutMask, TableLookupBytes(bytes, from));
}

// ------------------------------ Reverse
#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
    HWY_COMPILER_GCC_ACTUAL < 900
// Workaround for missing vec_reve on Z14 with GCC 8 or earlier: emulate the
// lane reversal with a byte-permute table per lane size.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
          HWY_IF_T_SIZE_D(D, 1)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                                        5, 4, 3, 2, 1, 0)));
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
          HWY_IF_T_SIZE_D(D, 2)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
                                        4, 5, 2, 3, 0, 1)));
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
          HWY_IF_T_SIZE_D(D, 4)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5,
                                        6, 7, 0, 1, 2, 3)));
}

// 64-bit lanes: swapping the two halves via vec_sld is sufficient.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
          HWY_IF_T_SIZE_D(D, 8)>
HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
  return Vec128<T>{vec_sld(v.raw, v.raw, 8)};
}
#else
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
  return Vec128<T>{vec_reve(v.raw)};
}
#endif

// ------------------------------ Shuffles (Reverse)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0, 1, 2,  3,
                                           12, 13, 14, 15, 8, 9, 10, 11};
  return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};
}

// These are used by generic_ops-inl to implement LoadInterleaved3. As with
// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
// comes from the first argument.
namespace detail {

// Two-input variants of the shuffle_abcd ops: indices 0-15 select bytes of
// `a`, 16-31 select bytes of `b` (vec_perm two-operand semantics). For the
// sub-128-bit overloads, only the leading bytes of the index table matter.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle16 = {1, 0, 19, 18};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0,  1,  2,  3,
                                           28, 29, 30, 31, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {0, 3, 18, 17};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  12, 13, 14, 15,
                                           24, 25, 26, 27, 20, 21, 22, 23};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {2, 1, 16, 19};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo3012(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {8,  9,  10, 11, 4,  5,  6,  7,
                                           16, 17, 18, 19, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

}  // namespace detail

// Swap 64-bit halves
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
  const Full128<T> d;
  const Full128<uint64_t> du64;
  // Reversing the two u64 lanes swaps the 64-bit halves without touching the
  // 32-bit lane order within each half.
  return BitCast(d, Reverse(du64, BitCast(du64, v)));
}
template <class T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
  return Reverse(Full128<T>(), v);
}

// Rotate right 32 bits
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
  // vec_sld shifts left in big-endian byte order, so the byte count differs
  // per endianness to realize the same lane rotation.
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#else
  return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#endif
}
// Rotate left 32 bits
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#else
  return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#endif
}

// Full lane reversal of four 32-bit lanes.
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
  return Reverse(Full128<T>(), v);
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.
// Reinterprets a mask for same-sized lanes of a different type (no-op at the
// raw level; only the static type changes).
template <class DTo, typename TFrom, size_t NFrom>
HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}

// True in lanes where all bits of `bit` are set in `v`.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
  return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
                                       Vec128<uint8_t, N> b) {
  // POWER9 has a native compare-not-equal; otherwise negate the equality.
#if HWY_PPC_HAVE_9
  return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
                                        Vec128<uint16_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
                                        Vec128<uint32_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
                                        Vec128<uint64_t, N> b) {
  // No 64-bit vec_cmpne even on POWER9.
  return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
                                      Vec128<int8_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
                                       Vec128<int32_t, N> b) {
#if HWY_PPC_HAVE_9
  return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
  return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
                                       Vec128<int64_t, N> b) {
  return Not(a == b);
}

template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  return Not(a == b);
}

template <size_t N>
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
                                      Vec128<double, N> b) {
  return Not(a == b);
}

// ------------------------------ Strict inequality

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  return Mask128<T, N>{vec_cmpgt(a.raw, b.raw)};
}

// ------------------------------ Weak inequality

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return Mask128<T, N>{vec_cmpge(a.raw, b.raw)};
}

// Integer >= is the negation of the reversed strict comparison.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return Not(b > a);
}

// ------------------------------ Reversed comparisons

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
  return b > a;
}

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
  return b >= a;
}

// ================================================== MEMORY (2)

// ------------------------------ Load

// Unaligned full-vector load via a may_alias vector type, then reinterpret to
// the Altivec raw type.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {
  using LoadRaw = typename detail::Raw128<T>::UnalignedRawVec;
  const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p);
  using ResultRaw = typename detail::Raw128<T>::type;
  return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)};
}

// For < 128 bit, LoadU == Load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
  return LoadU(d, p);
}

#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
// These targets have length-limited loads, so advertise a native LoadN.
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

// Loads min(max_lanes_to_load, Lanes(d)) lanes from p; remaining lanes are
// zero. Bytes past the loaded length are never accessed.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
                        size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  // If the count is a compile-time constant, resolve the two trivial cases
  // (load nothing / load everything) without the length-limited instruction.
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
    return Zero(d);
  }

  if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
    return LoadU(d, p);
  }
#endif

  const size_t num_of_bytes_to_load =
      HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
  const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
  // vec_load_len takes the index of the highest byte to load, hence the -1
  // (and the explicit guard against a zero-byte load).
  return (num_of_bytes_to_load > 0)
             ? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
                              const_cast<unsigned char*>(
                                  reinterpret_cast<const unsigned char*>(p)),
                              static_cast<unsigned>(num_of_bytes_to_load - 1))})
             : Zero(d);
#else
  // vec_xl_len takes a byte count directly; a count of 0 is allowed.
  return BitCast(
      d,
      VFromD<decltype(du8)>{vec_xl_len(
          const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
          num_of_bytes_to_load)});
#endif
}

// As LoadN, but lanes beyond the loaded count come from `no` instead of zero.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
                          size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
    return no;
  }

  if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
    return LoadU(d, p);
  }
#endif

  return IfThenElse(FirstN(d, max_lanes_to_load),
                    LoadN(d, p, max_lanes_to_load), no);
}

#endif  // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14

// Returns a vector with lane i=[0, N) set to "first" + i.
namespace detail {

// Iota0(d): vector with lane i equal to i, built from a compile-time vector
// constant per lane size/type.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned char kU8Iota0 = {0, 1, 2,  3,  4,  5,  6,  7,
                                               8, 9, 10, 11, 12, 13, 14, 15};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0});
}

template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0});
}

template <class D, HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0});
}

template <class D, HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned long long kU64Iota0 = {0, 1};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0});
}

// Float variants need no BitCast: the constant already has the lane type.
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
  return VFromD<D>{kF32Iota0};
}

template <class D, HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  constexpr __vector double kF64Iota0 = {0.0, 1.0};
  return VFromD<D>{kF64Iota0};
}

}  // namespace detail

// Lane i = first + i (with the usual wrap-around for narrow integer lanes).
template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
}

// ------------------------------ FirstN (Iota, Lt)

// Mask with the first `num` lanes true. Compares in the unsigned domain so
// the result is well-defined for all lane types.
template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  return RebindMask(d, Iota(du, 0) < Set(du, static_cast<TU>(num)));
}

// ------------------------------ MaskedLoad
// Loads the full vector, then zeroes lanes where the mask is false. The whole
// vector is read, so p must be valid for a full-width load.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

// ------------------------------ MaskedLoadOr
// As MaskedLoad, but false lanes come from `v` instead of zero.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const T* HWY_RESTRICT p) {
  return IfThenElse(m, LoadU(d, p), v);
}

// ------------------------------ Store

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
  *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);

#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

// Unaligned store via the may_alias unaligned vector type.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
  using StoreRaw = typename detail::Raw128<T>::UnalignedRawVec;
  *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw);
}

// Partial vectors: extract the active bytes as one integer and copy them, so
// no bytes beyond the vector's logical size are written.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
  using BitsT = UnsignedFromSize<d.MaxBytes()>;

  const Repartition<BitsT, decltype(d)> d_bits;
  const BitsT bits = GetLane(BitCast(d_bits, v));
  CopyBytes<d.MaxBytes()>(&bits, p);
}

// For < 128 bit, StoreU == Store.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14

// These targets have length-limited stores, so advertise a native StoreN.
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

// Stores min(max_lanes_to_store, Lanes(d)) lanes; no bytes past that length
// are written.
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  // Constant-count fast paths: store nothing, or the full vector.
  if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) {
    return;
  }

  if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_store >= HWY_MAX_LANES_D(D)) {
    StoreU(v, d, p);
    return;
  }
#endif

  const size_t num_of_bytes_to_store =
      HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
  const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
  // vec_store_len takes the index of the highest byte to store, hence the -1
  // (and the guard against a zero-byte store).
  if (num_of_bytes_to_store > 0) {
    vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
                  static_cast<unsigned>(num_of_bytes_to_store - 1));
  }
#else
  // vec_xst_len takes a byte count directly; a count of 0 is allowed.
  vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
              num_of_bytes_to_store);
#endif
}
#endif

// ------------------------------ BlendedStore

// Read-modify-write: loads the full vector, blends in the masked lanes and
// stores everything back. Not suitable when untouched lanes must not be
// re-written (e.g. concurrent writers).
template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  const VFromD<D> old = LoadU(d, p);
  StoreU(IfThenElse(RebindMask(d, m), v, old), d, p);
}

// ================================================== ARITHMETIC

namespace detail {
// If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
// rebinds D to MakeUnsigned<TFromD<D>>.
// Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
// detail::RebindToUnsignedIfNotFloat<D> is the same as D.
template <class D>
using RebindToUnsignedIfNotFloat =
    hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
            RebindToUnsigned<D>, D>;
}  // namespace detail

// ------------------------------ Addition

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;

  // If T is an integer type, do an unsigned vec_add to avoid undefined behavior
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
                                              BitCast(d_arith, b).raw});
#else
  return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}

// ------------------------------ Subtraction

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;

  // If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
                                              BitCast(d_arith, b).raw});
#else
  return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}

// ------------------------------ SumsOf8
// Sums each group of 8 consecutive u8 lanes into a u64 lane.
template <class V, HWY_IF_U8(TFromV<V>)>
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
  return SumsOf2(SumsOf4(v));
}

template <class V, HWY_IF_I8(TFromV<V>)>
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> di8;
  const RebindToUnsigned<decltype(di8)> du8;
  const RepartitionToWideX3<decltype(di8)> di64;

  // Bias each i8 lane into u8 by flipping the sign bit, sum as unsigned, then
  // remove the accumulated bias: 8 lanes * 128 = 1024 per u64 lane.
  return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
         Set(di64, int64_t{-1024});
#else
  return SumsOf2(SumsOf4(v));
#endif
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

#if HWY_S390X_HAVE_Z14
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
// other integer SIMD instruction sets

// Unsigned: clamp the addend so the sum cannot wrap (Not(a) == max - a).
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Add(a, Min(b, Not(a)));
}

// Signed: detect overflow (operands same sign, sum different sign) and
// substitute the saturated value with the sign of `a`.
template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

#else  // VSX

#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_adds(a.raw, b.raw)};
}
#endif  // HWY_S390X_HAVE_Z14

#if HWY_PPC_HAVE_10

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

// I64 saturated add via overflow detection (TernaryLogic<0x42> computes the
// same predicate as AndNot(Xor(a, b), Xor(a, sum)) in one op).
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

#endif  // HWY_PPC_HAVE_10

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

#if HWY_S390X_HAVE_Z14
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most
// other integer SIMD instruction sets

// Unsigned: clamp the subtrahend so the difference cannot go below zero.
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Sub(a, Min(a, b));
}

// Signed: overflow iff operands have different signs and the difference's
// sign differs from a's.
template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}

#else  // VSX

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_subs(a.raw, b.raw)};
}
#endif  // HWY_S390X_HAVE_Z14

#if HWY_PPC_HAVE_10

// I64 saturated sub; TernaryLogic<0x18> is the one-op overflow predicate.
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}

#endif  // HWY_PPC_HAVE_10

// ------------------------------ AverageRound

// Returns (a + b + 1) / 2

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif

#if HWY_S390X_HAVE_Z14
// Z14 additionally has 64-bit vec_avg, so enable it for all lane sizes.
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI64
#endif

#define HWY_PPC_IF_AVERAGE_ROUND_T(T) void* = nullptr
#else  // !HWY_S390X_HAVE_Z14
#define HWY_PPC_IF_AVERAGE_ROUND_T(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#endif  // HWY_S390X_HAVE_Z14

template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_PPC_IF_AVERAGE_ROUND_T(T)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_avg(a.raw, b.raw)};
}

#undef HWY_PPC_IF_AVERAGE_ROUND_T

// ------------------------------ Multiplication

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;

  // If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
                                              BitCast(d_arith, b).raw});
#else
  return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}

// Returns the upper sizeof(T)*8 bits of a * b in each lane.

// Per-target selection of which lane sizes have a native vec_mulh. The
// "EnableIf<!IsSame<T, T>()>" form is a never-matching constraint, i.e. the
// corresponding overload is disabled on that target.
#if HWY_S390X_HAVE_Z14
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
#elif HWY_PPC_HAVE_10
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
#else
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#endif

#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
}
#endif

// Single-lane fallback: widen via MulEven, then extract the upper half of the
// double-wide product.
template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
  const auto p_even = MulEven(a, b);

#if HWY_IS_LITTLE_ENDIAN
  // On LE, the high half of the product is the upper bytes; rotate it down.
  const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
  return Vec128<T, 1>{
      vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
#else
  // On BE, the high half is already in the first lane position.
  const DFromV<decltype(a)> d;
  return ResizeBitCast(d, p_even);
#endif
}

// Multi-lane fallback: compute even and odd double-wide products, then pick
// the half-lane holding each product's upper bits.
template <typename T, size_t N,
          HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;

  const auto p_even = BitCast(d, MulEven(a, b));
  const auto p_odd = BitCast(d, MulOdd(a, b));

#if HWY_IS_LITTLE_ENDIAN
  return InterleaveOdd(d, p_even, p_odd);
#else
  return InterleaveEven(d, p_even, p_odd);
#endif
}

#if !HWY_PPC_HAVE_10
// 64-bit MulHigh without vec_mulh: per-lane scalar 128-bit multiply.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
  T p_hi;
  Mul128(GetLane(a), GetLane(b), &p_hi);
  return Set(Full64<T>(), p_hi);
}

template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const Half<decltype(d)> dh;
  return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
                 MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
}
#endif  // !HWY_PPC_HAVE_10

#undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
#undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH

// Multiplies even lanes (0, 2, ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
1743 template <typename T, size_t N, 1744 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 1745 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1746 HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a, 1747 Vec128<T, N> b) { 1748 return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)}; 1749 } 1750 1751 // Multiplies odd lanes (1, 3, ..) and places the double-wide result into 1752 // even and the upper half into its odd neighbor lane. 1753 template <typename T, size_t N, 1754 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 1755 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1756 HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a, 1757 Vec128<T, N> b) { 1758 return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)}; 1759 } 1760 1761 // ------------------------------ Rol/Ror 1762 1763 #ifdef HWY_NATIVE_ROL_ROR_8 1764 #undef HWY_NATIVE_ROL_ROR_8 1765 #else 1766 #define HWY_NATIVE_ROL_ROR_8 1767 #endif 1768 1769 #ifdef HWY_NATIVE_ROL_ROR_16 1770 #undef HWY_NATIVE_ROL_ROR_16 1771 #else 1772 #define HWY_NATIVE_ROL_ROR_16 1773 #endif 1774 1775 #ifdef HWY_NATIVE_ROL_ROR_32_64 1776 #undef HWY_NATIVE_ROL_ROR_32_64 1777 #else 1778 #define HWY_NATIVE_ROL_ROR_32_64 1779 #endif 1780 1781 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1782 HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) { 1783 const DFromV<decltype(a)> d; 1784 const RebindToUnsigned<decltype(d)> du; 1785 return BitCast( 1786 d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)}); 1787 } 1788 1789 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1790 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 1791 const DFromV<decltype(a)> d; 1792 const RebindToSigned<decltype(d)> di; 1793 return Rol(a, BitCast(d, Neg(BitCast(di, b)))); 1794 } 1795 1796 // ------------------------------ RotateRight 1797 template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1798 HWY_API Vec128<T, N> RotateRight(const Vec128<T, 
N> v) {  // (cont.) tail of RotateRight<kBits>(Vec128<T, N> v) from above
  const DFromV<decltype(v)> d;
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");

  // Rotate right by kBits == rotate left by (lane width - kBits).
  return (kBits == 0)
             ? v
             : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
                                            kBits)));
}

// ------------------------------ RotateLeftSame/RotateRightSame

// Per-target flags: this target provides native runtime-variable rotates for
// all integer lane sizes, so prevent generic_ops-inl.h from emulating them.
#ifdef HWY_NATIVE_ROL_ROR_SAME_8
#undef HWY_NATIVE_ROL_ROR_SAME_8
#else
#define HWY_NATIVE_ROL_ROR_SAME_8
#endif

#ifdef HWY_NATIVE_ROL_ROR_SAME_16
#undef HWY_NATIVE_ROL_ROR_SAME_16
#else
#define HWY_NATIVE_ROL_ROR_SAME_16
#endif

#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
#else
#define HWY_NATIVE_ROL_ROR_SAME_32_64
#endif

// Rotate all lanes left by the same runtime-variable count: broadcast `bits`
// and delegate to Rol.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
  const DFromV<decltype(v)> d;
  return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
}

// Rotate right == rotate left by the negated count (modular arithmetic on the
// unsigned count; Rol is assumed to use the count modulo the lane width —
// matches vec_rl semantics).
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
  const DFromV<decltype(v)> d;
  return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
}

// ------------------------------ IfNegativeThenElse

// Per lane: returns yes if v < 0, else no. Only the sign bit of v is tested.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");

  const DFromV<decltype(v)> d;
#if HWY_PPC_HAVE_10
  // POWER10 vec_blendv selects per-byte based on the sign (MSB) of the mask.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{vec_blendv(
             BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
#else
  // Pre-POWER10: build an all-ones/all-zeros mask from the sign bit.
  const RebindToSigned<decltype(d)> di;
  return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
#endif
}

#if HWY_PPC_HAVE_10
// Per-target flags: native sign-based select-with-zero on POWER10.
#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#else
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#endif

#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#else
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#endif

// Per lane: yes if v < 0, else 0.
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenElseZero(V v, V yes) {
  const DFromV<decltype(v)> d;
  return IfNegativeThenElse(v, yes, Zero(d));
}

// Per lane: 0 if v < 0, else no.
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenZeroElse(V v, V no) {
  const DFromV<decltype(v)> d;
  return IfNegativeThenElse(v, Zero(d), no);
}
#endif

// generic_ops takes care of integer T.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};
}

// Returns add - mul * x
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  // NOTE: the vec_nmsub operation below computes -(mul * x - add),
  // which is equivalent to add - mul * x in the round-to-nearest
  // and round-towards-zero rounding modes
  return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};
}

// Returns mul * x - sub
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};
}
// Returns -mul * x - sub
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  // NOTE: The vec_nmadd operation below computes -(mul * x + sub),
  // which is equivalent to -mul * x - sub in the round-to-nearest
  // and round-towards-zero rounding modes
  return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
}

// ------------------------------ Floating-point div
// Approximate reciprocal

// Per-target flag: a double-precision approximate reciprocal is available.
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif

// Full-precision lane-wise division.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  // Z14: rely on the GCC/Clang vector extension operator.
  return Vec128<T, N>{a.raw / b.raw};
#else
  return Vec128<T, N>{vec_div(a.raw, b.raw)};
#endif
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  // Z14 has no reciprocal-estimate instruction: compute 1/v exactly.
  const DFromV<decltype(v)> d;
  return Set(d, T(1.0)) / v;
#else
  return Vec128<T, N>{vec_re(v.raw)};
#endif
}

// ------------------------------ Floating-point square root

#if HWY_S390X_HAVE_Z14
// Approximate reciprocal square root
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  const auto half = v * Set(d, 0.5f);
  // Initial guess based on log2(f)
  // (classic "fast inverse square root" bit trick with magic 0x5F3759DF)
  const auto guess = BitCast(
      d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
  // One Newton-Raphson iteration
  return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
}
#else  // VSX

// Per-target flag: double-precision rsqrt estimate is available on VSX.
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif

// Approximate reciprocal square root
template <class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
  return Vec128<T, N>{vec_rsqrte(v.raw)};
}
#endif  // HWY_S390X_HAVE_Z14

// Full precision square root
template <class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  return Vec128<T, N>{vec_sqrt(v.raw)};
}

// ------------------------------ GetBiasedExponent

#if HWY_PPC_HAVE_9

#ifdef HWY_NATIVE_GET_BIASED_EXPONENT
#undef HWY_NATIVE_GET_BIASED_EXPONENT
#else
#define HWY_NATIVE_GET_BIASED_EXPONENT
#endif

// Extracts the (biased) IEEE exponent field of each f32/f64 lane.
template <class V, HWY_IF_FLOAT3264_V(V)>
HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
  return VFromD<RebindToUnsigned<DFromV<V>>>{vec_extract_exp(v.raw)};
}

#endif  // HWY_PPC_HAVE_9

// ------------------------------ Min (Gt, IfThenElse)

// Lane-wise minimum via vec_min.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_min(a.raw, b.raw)};
}

// ------------------------------ Max (Gt, IfThenElse)

// Lane-wise maximum via vec_max.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_max(a.raw, b.raw)};
}

// ------------------------------- Integer AbsDiff for PPC9/PPC10

#if HWY_PPC_HAVE_9
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

// POWER9 vec_absd: native |a - b| for u8/u16/u32 lanes.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API V AbsDiff(const V a, const V b) {
  return V{vec_absd(a.raw, b.raw)};
}

// u64: no vec_absd; body continues in the next chunk as max - min.
template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V AbsDiff(const V a, const V b) {
  // |a - b| == max(a, b) - min(a, b); avoids overflow for u64.
  return Sub(Max(a, b), Min(a, b));
}

// Signed lanes: same max - min formulation (result is representable since
// max >= min).
template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V AbsDiff(const V a, const V b) {
  return Sub(Max(a, b), Min(a, b));
}

#endif  // HWY_PPC_HAVE_9

// ------------------------------ Integer Div for PPC10
#if HWY_PPC_HAVE_10
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif

template <size_t N>
HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  // Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int32_t>() && b[i] == -1)

  // Clang will also optimize out I32 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed int raw_result;
  __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};
}

template <size_t N>
HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0

  // Clang will also optimize out U32 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned int raw_result;
  __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};
}

template <size_t N>
HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int64_t>() && b[i] == -1)

  // Clang will also optimize out I64 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed long long raw_result;
  __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};
}

template <size_t N>
HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0

  // Clang will also optimize out U64 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned long long raw_result;
  __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};
}

// 8/16-bit full vectors: widen both halves, divide, then demote back.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
                          PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
}

// 8/16-bit partial vectors: a single promote/divide/demote suffices.
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<T>, decltype(d)> dw;
  return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
}

template <size_t N>
HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int32_t>() && b[i] == -1)

  // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed int raw_result;
  __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};
}

template <size_t N>
HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0

  // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned int raw_result;
  __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};
}

template <size_t N>
HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int64_t>() && b[i] == -1)

  // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed long long raw_result;
  __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};
}

template <size_t N>
HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0

  // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned long long raw_result;
  __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};
}

// 8/16-bit full vectors: widen, take remainder, demote back.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
                          PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
}

// 8/16-bit partial vectors.
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<T>, decltype(d)> dw;
  return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
}
#endif

// ================================================== MEMORY (3)

// ------------------------------ Non-temporal stores

// No true non-temporal store instruction is used here: hint via
// __builtin_prefetch (rw=1 i.e. prepare-for-write, locality=0 i.e. minimal
// temporal locality), then perform a regular aligned Store.
template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  __builtin_prefetch(aligned, 1, 0);
  Store(v, d, aligned);
}

// ------------------------------ Scatter in generic_ops-inl.h
// ------------------------------ Gather in generic_ops-inl.h

// ================================================== SWIZZLE (2)

// ------------------------------ LowerHalf

// Returns upper/lower half of a vector.
// Lower half: the raw vector is unchanged; only the compile-time lane count
// (the d8/tag type) shrinks.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
  return VFromD<D>{v.raw};
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>{v.raw};
}

// ------------------------------ ShiftLeftBytes

// NOTE: The ShiftLeftBytes operation moves the elements of v to the right
// by kBytes bytes and zeroes out the first kBytes bytes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both
// little-endian and big-endian targets)

template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  const auto zeros = Zero(d);
  // vec_sld concatenates its operands and extracts 16 bytes at the given
  // offset; operand order/offset differ by endianness.
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#else
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#endif
}

// Overload that infers the tag from v.
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

// NOTE: The ShiftLeftLanes operation moves the elements of v to the right
// by kLanes lanes and zeroes out the first kLanes lanes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both
// little-endian and big-endian targets)

template <int kLanes, class D, typename T = TFromD<D>>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// Overload that infers the tag from v.
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes

// NOTE: The ShiftRightBytes operation moves the elements of v to the left
// by kBytes bytes and zeroes out the last kBytes bytes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftRightBytes operation on both
// little-endian and big-endian targets)

template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;

  // For partial vectors, clear upper lanes so we shift in zeros.
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    VFromD<decltype(dfull)> vfull{v.raw};
    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
  }

  const auto zeros = Zero(d);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#else
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#endif
}

// ------------------------------ ShiftRightLanes

// NOTE: The ShiftRightLanes operation moves the elements of v to the left
// by kLanes lanes and zeroes out the last kLanes lanes of v on both
// little-endian and big-endian PPC targets
// (same behavior as the HWY_EMU128 ShiftRightLanes operation on both
// little-endian and big-endian targets)

template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Shift the upper half down into the lower half, then reinterpret as the
// half-width vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
}

// ------------------------------ ExtractLane
template <typename T, size_t N>
HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
  // GCC/Clang vector extension: subscripting indexes lanes directly.
  return static_cast<T>(v.raw[i]);
}

// ------------------------------ InsertLane
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
#if HWY_IS_LITTLE_ENDIAN
  typename detail::Raw128<T>::type raw_result = v.raw;
  raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
  return Vec128<T, N>{raw_result};
#else
  // On ppc64be without this, mul_test fails, but swizzle_test passes.
  // (store/modify/reload round-trip instead of direct lane assignment)
  DFromV<decltype(v)> d;
  alignas(16) T lanes[16 / sizeof(T)];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
#endif
}

// ------------------------------ CombineShiftRightBytes

// NOTE: The CombineShiftRightBytes operation below moves the elements of lo to
// the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes()
// - kBytes) bytes on both little-endian and big-endian PPC targets.

template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
  constexpr size_t kSize = 16;
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)};
#else
  return Vec128<T>{vec_sld(lo.raw, hi.raw, kBytes)};
#endif
}

// Partial vectors: widen to full 128-bit, align lo into the most-significant
// bytes, then reuse the full-vector implementation.
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = Vec128<uint8_t>;
  const DFromV<V8> dfull8;
  const Repartition<TFromD<D>, decltype(dfull8)> dfull;
  const V8 hi8{BitCast(d8, hi).raw};
  // Move into most-significant bytes
  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
  return VFromD<D>{BitCast(dfull, r).raw};
}

// ------------------------------ Broadcast/splat any lane

template <int kLane, typename T, size_t N>
HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{vec_splat(v.raw, kLane)};
}

// ------------------------------ TableLookupLanes (Shuffle01)

// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
// Lane-permutation indices, stored pre-expanded to per-byte shuffle indices
// so TableLookupLanes can be a single vec_perm/TableLookupBytes.
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
  __vector unsigned char raw;
};

namespace detail {

// Helpers that build the per-byte shuffle control from per-lane indices:
// IndicesFromVecBroadcastLaneBytes replicates each lane index across all of
// that lane's bytes; IndicesFromVecByteOffsets supplies the 0..sizeof(T)-1
// byte offsets to add after scaling.

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Iota(d8, 0);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  // Pick the byte of each 16-bit lane that holds the (small) index value:
  // low byte on LE, high byte on BE.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Zero(d8);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1,
                                                   0, 1, 0, 1, 0, 1, 0, 1};
  return VFromD<decltype(d8)>{kByteOffsets};
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3,
                                                   0, 1, 2, 3, 0, 1, 2, 3};
  return VFromD<decltype(d8)>{kByteOffsets};
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7,
                                                   0, 1, 2, 3, 4, 5, 6, 7};
  return VFromD<decltype(d8)>{kByteOffsets};
}

}  // namespace detail

// 8-bit lanes: lane indices are already byte indices.
template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  // Indices up to 2*MaxLanes are allowed (two-table lookups).
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d8, vec).raw};
}

// Wider lanes: expand per-lane indices into per-byte shuffle indices.
template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  // Multiply by sizeof(T) (a power of two) via shift, then add byte offsets.
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
}

// Loads indices from memory and converts them to Indices128.
template <class D, typename TI>
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
    D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

// Permutes lanes of v according to pre-expanded byte indices.
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, TableLookupBytes(v, VFromD<decltype(d8)>{idx.raw}));
}

// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
                                      Indices128<T, 1> /* idx */) {
  return v;
}

// Partial vectors: combine a and b into one full vector, then permute.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  const Repartition<uint8_t, decltype(dt)> dt_u8;
  // TableLookupLanes currently requires table and index vectors to be the same
  // size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  // Duplicate the indices so MSAN sees the full vector as initialized.
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep LowerHalf of the result, which is valid in idx.
  const Indices128<T, N * 2> idx2{idx.raw};
#endif
  return LowerHalf(
      d, TableLookupBytes(Combine(dt, b, a),
                          BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw})));
}

// Full vectors: vec_perm natively selects from two source vectors.
template <typename T>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T> idx) {
  return Vec128<T>{vec_perm(a.raw, b.raw, idx.raw)};
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <class D>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  return v;
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301)

// Single lane: no change
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
  return v;
}

// 32-bit x2: shuffle
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
  return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
}

// 16-bit x4: shuffle
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
  const __vector unsigned char kShuffle = {6,  7,  4,  5,  2,  3, 0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)};
}

// 16-bit x2: rotate bytes
template <class D, typename T =
TFromD<D>, HWY_IF_T_SIZE(T, 2)> 2606 HWY_API Vec32<T> Reverse(D d, Vec32<T> v) { 2607 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32; 2608 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); 2609 } 2610 2611 // ------------------------------- ReverseLaneBytes 2612 2613 #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \ 2614 ((!HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 710) || \ 2615 (HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 900) || \ 2616 HWY_COMPILER_CLANG >= 400) 2617 2618 // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes. 2619 #ifdef HWY_NATIVE_REVERSE_LANE_BYTES 2620 #undef HWY_NATIVE_REVERSE_LANE_BYTES 2621 #else 2622 #define HWY_NATIVE_REVERSE_LANE_BYTES 2623 #endif 2624 2625 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 2626 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))> 2627 HWY_API V ReverseLaneBytes(V v) { 2628 return V{vec_revb(v.raw)}; 2629 } 2630 2631 // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
2632 #ifdef HWY_NATIVE_REVERSE2_8 2633 #undef HWY_NATIVE_REVERSE2_8 2634 #else 2635 #define HWY_NATIVE_REVERSE2_8 2636 #endif 2637 2638 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 2639 HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { 2640 const Repartition<uint16_t, decltype(d)> du16; 2641 return BitCast(d, ReverseLaneBytes(BitCast(du16, v))); 2642 } 2643 2644 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 2645 HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { 2646 const Repartition<uint32_t, decltype(d)> du32; 2647 return BitCast(d, ReverseLaneBytes(BitCast(du32, v))); 2648 } 2649 2650 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 2651 HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { 2652 const Repartition<uint64_t, decltype(d)> du64; 2653 return BitCast(d, ReverseLaneBytes(BitCast(du64, v))); 2654 } 2655 2656 #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14 2657 2658 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 2659 HWY_API Vec16<T> Reverse(D d, Vec16<T> v) { 2660 return Reverse2(d, v); 2661 } 2662 2663 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 2664 HWY_API Vec32<T> Reverse(D d, Vec32<T> v) { 2665 return Reverse4(d, v); 2666 } 2667 2668 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 2669 HWY_API Vec64<T> Reverse(D d, Vec64<T> v) { 2670 return Reverse8(d, v); 2671 } 2672 2673 // ------------------------------ Reverse2 2674 2675 // Single lane: no change 2676 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> 2677 HWY_API Vec128<T, 1> Reverse2(D /* tag */, Vec128<T, 1> v) { 2678 return v; 2679 } 2680 2681 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> 2682 HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { 2683 const Repartition<uint32_t, decltype(d)> du32; 2684 return BitCast(d, RotateRight<16>(BitCast(du32, v))); 2685 } 2686 2687 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> 2688 HWY_API VFromD<D> Reverse2(D d, 
VFromD<D> v) { 2689 const Repartition<uint64_t, decltype(d)> du64; 2690 return BitCast(d, RotateRight<32>(BitCast(du64, v))); 2691 } 2692 2693 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> 2694 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { 2695 return Shuffle01(v); 2696 } 2697 2698 // ------------------------------ Reverse4 2699 2700 template <class D, HWY_IF_T_SIZE_D(D, 2)> 2701 HWY_API VFromD<D> Reverse4(D /*d*/, VFromD<D> v) { 2702 const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1, 2703 14, 15, 12, 13, 10, 11, 8, 9}; 2704 return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)}; 2705 } 2706 2707 template <class D, HWY_IF_T_SIZE_D(D, 4)> 2708 HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { 2709 return Reverse(d, v); 2710 } 2711 2712 template <class D, HWY_IF_T_SIZE_D(D, 8)> 2713 HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) { 2714 HWY_ASSERT(0); // don't have 4 u64 lanes 2715 } 2716 2717 // ------------------------------ Reverse8 2718 2719 template <class D, HWY_IF_T_SIZE_D(D, 2)> 2720 HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { 2721 return Reverse(d, v); 2722 } 2723 2724 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> 2725 HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) { 2726 HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit 2727 } 2728 2729 // ------------------------------ InterleaveLower 2730 2731 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides 2732 // the least-significant lane) and "b". To concatenate two half-width integers 2733 // into one, use ZipLower/Upper instead (also works with scalar). 
2734 2735 template <typename T, size_t N> 2736 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 2737 return Vec128<T, N>{vec_mergeh(a.raw, b.raw)}; 2738 } 2739 2740 // Additional overload for the optional tag 2741 template <class D> 2742 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { 2743 return InterleaveLower(a, b); 2744 } 2745 2746 // ------------------------------ InterleaveUpper (UpperHalf) 2747 2748 // Full 2749 template <class D, typename T = TFromD<D>> 2750 HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) { 2751 return Vec128<T>{vec_mergel(a.raw, b.raw)}; 2752 } 2753 2754 // Partial 2755 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 2756 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { 2757 const Half<decltype(d)> d2; 2758 return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw}, 2759 VFromD<D>{UpperHalf(d2, b).raw}); 2760 } 2761 2762 // ------------------------------ ZipLower/ZipUpper (InterleaveLower) 2763 2764 // Same as Interleave*, except that the return lanes are double-width integers; 2765 // this is necessary because the single-lane scalar cannot return two values. 
// Zip: interleave then reinterpret pairs of narrow lanes as one wide lane.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}

// ------------------------------ Per4LaneBlkShufDupSet4xU32

// Used by hwy/ops/generic_ops-inl.h to implement Per4LaneBlockShuffle
namespace detail {

#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif

// Builds a u32x4 vector {x0, x1, x2, x3} and resizes/bitcasts it to D.
template <class D>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  const __vector unsigned int raw = {x0, x1, x2, x3};
  return ResizeBitCast(d, Vec128<uint32_t>{raw});
}

}  // namespace detail

// ------------------------------ SlideUpLanes

// Moves lanes toward higher indices by `amt`, shifting in zeros. Implemented
// as a whole-vector shift by (amt * lane bits); the shift direction depends
// on the byte order of the target (see the #if branches below).
template <class D>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  // vec_slo/vec_sro/vec_srb take the bit count from a lane of the second
  // operand, hence the splat of amt*8*sizeof(T) into a u32 vector.
  const auto v_shift_amt =
      BitCast(Full128<uint8_t>(),
              Set(Full128<uint32_t>(),
                  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));

#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else  // VSX
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif  // HWY_IS_LITTLE_ENDIAN
#endif  // HWY_S390X_HAVE_Z14
}

// ------------------------------ SlideDownLanes

// Partial vectors (<= 8 bytes): treat the whole vector as a single integer
// lane and shift it by amt*lane bits; direction again depends on endianness.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  using TU = UnsignedFromSize<d.MaxBytes()>;
  const Repartition<TU, decltype(d)> du;
  const auto v_shift_amt =
      Set(du, static_cast<TU>(amt * sizeof(TFromD<D>) * 8));

#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, BitCast(du, v) >> v_shift_amt);
#else
  return BitCast(d, BitCast(du, v) << v_shift_amt);
#endif
}

// Full vectors: whole-vector shift in the direction opposite to SlideUpLanes.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const auto v_shift_amt =
      BitCast(Full128<uint8_t>(),
              Set(Full128<uint32_t>(),
                  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));

#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else  // VSX
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif  // HWY_IS_LITTLE_ENDIAN
#endif  // HWY_S390X_HAVE_Z14
}

// ================================================== COMBINE

// ------------------------------ Combine (InterleaveLower)

// N = N/2 + N/2 (upper half undefined)
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  // Treat half-width input as one lane, and expand to two lanes.
  using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
  using Raw = typename detail::Raw128<TFromV<VU>>::type;
  const VU lo{reinterpret_cast<Raw>(lo_half.raw)};
  const VU hi{reinterpret_cast<Raw>(hi_half.raw)};
  return BitCast(d, InterleaveLower(lo, hi));
}

// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)

// Widens `lo` to the full vector width, zeroing the new upper lanes.
template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
  const Half<D> dh;
  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
}

// ------------------------------ Concat full (InterleaveLower)

// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
}

// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
}

// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}

// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperLower(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
  // Selects bytes 0-7 of lo and bytes 8-15 of hi (vec_perm indices 16-31
  // address the second operand).
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  4,  5,  6,  7,
                                           24, 25, 26, 27, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
}

// ------------------------------ Concat partial (Combine, LowerHalf)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
}

// ------------------------------ TruncateTo

// Single lane: the narrow result occupies the low-order bytes of the wide
// lane; on big-endian they sit at the far end, hence the vec_sld.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr,
          HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<FromT, 1> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{reinterpret_cast<Raw>(v.raw)};
#else
  return VFromD<D>{reinterpret_cast<Raw>(
      vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD<D>)))};
#endif
}

namespace detail {

// Packs the low halves of the lanes of lo and hi into one vector (vec_pack
// keeps the least-significant half of each wide lane, modulo-style).
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Truncate2To(
    D /* tag */, Vec128<FromT, Repartition<FromT, D>().MaxLanes()> lo,
    Vec128<FromT, Repartition<FromT, D>().MaxLanes()> hi) {
  return VFromD<D>{vec_pack(lo.raw, hi.raw)};
}

}  // namespace detail

// 2:1 truncation: pack v with itself; only the lower result half is used.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* d */,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_pack(v.raw, v.raw)};
}

// 4:1 or 8:1 truncation: recursively halve the lane size.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr,
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D d,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, decltype(d)> d2;
  return TruncateTo(d, TruncateTo(d2, v));
}

// ------------------------------ ConcatOdd (TruncateTo)

// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  // Right-shift 8 bits per u16 so we can pack.
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#else
  // Big-endian: the odd byte is already in the low half of each u16.
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}

// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatOdd(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}

// 16-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#else
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 16-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)};
}

// 32-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_IS_LITTLE_ENDIAN
  (void)d;
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  12, 13, 14, 15,
                                           20, 21, 22, 23, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
#else
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint64_t, decltype(d)> dw;
  return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
#endif
}

// Any type x2
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveUpper(d, lo, hi);
}

// ------------------------------ ConcatEven (TruncateTo)

// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  // Little-endian: the even byte is already in the low half of each u16.
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#else
  // Right-shift 8 bits per u16 so we can pack.
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}

// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatEven(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}

// 16-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  // Isolate lower 16 bits per u32 so we can pack.
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#else
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}

// 16-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)};
}

// 32-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_IS_LITTLE_ENDIAN
  const Repartition<uint64_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
#else
  (void)d;
  constexpr __vector unsigned char kShuffle = {0,  1,  2,  3,  8,  9,  10, 11,
                                               16, 17, 18, 19, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
#endif
}

// Any T x2
template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveLower(d, lo, hi);
}

// ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd)
#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif

// Truncates a then b and concatenates the results in order. The low half of
// each wide lane is the even narrow lane on little-endian, the odd one on
// big-endian, hence the ConcatEven/ConcatOdd split.
template <class D, HWY_IF_UNSIGNED_D(D), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedTruncate2To(D d, V a, V b) {
#if HWY_IS_LITTLE_ENDIAN
  return ConcatEven(d, BitCast(d, b), BitCast(d, a));
#else
  return ConcatOdd(d, BitCast(d, b), BitCast(d, a));
#endif
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T>
HWY_API Vec128<T, 1> DupEven(Vec128<T, 1> v) {
  return v;
}

template <typename T>
HWY_API Vec128<T, 2> DupEven(Vec128<T, 2> v) {
  return InterleaveLower(DFromV<decltype(v)>(), v, v);
}

// 8-bit: duplicate each even byte into the following odd slot.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {0, 0, 2,  2,  4,  4,  6,  6,
                                               8, 8, 10, 10, 12, 12, 14, 14};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}

// 16-bit: duplicate each even lane's two bytes into the odd lane.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4,  5,  4,  5,
                                               8, 9, 8, 9, 12, 13, 12, 13};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}

// 32-bit: vec_mergee on VSX; Z14 lacks it, so use a byte shuffle there.
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> DupEven(Vec128<T> v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
                                        11, 8, 9, 10, 11)));
#else
  return Vec128<T>{vec_mergee(v.raw, v.raw)};
#endif
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {1, 1, 3,  3,  5,  5,  7,  7,
                                               9, 9, 11, 11, 13, 13, 15, 15};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {2,  3,  2,  3,  6,  7,  6,  7,
                                               10, 11, 10, 11, 14, 15, 14, 15};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}

// 32-bit: vec_mergeo on VSX; byte shuffle on Z14.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
                                        15, 12, 13, 14, 15)));
#else
  return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
#endif
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
}

// ------------------------------ OddEven (IfThenElse)

// Returns a vector with b's even lanes and a's odd lanes; the byte masks
// below select b (mask set) for even-lane bytes.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                       0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N>{mask}), b, a);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                       0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 2>{mask}), b, a);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0,
                                       0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 4>{mask}), b, a);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  // Same as ConcatUpperLower for full vectors; do not call that because this
  // is more efficient for 64x1 vectors.
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
}

// ------------------------------ InterleaveEven

// Result lanes: a0, b0, a2, b2, ... (even lanes of a and b interleaved).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  // Indices 16+ address the second table (b) in TwoTablesLookupLanes.
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
                          10, 26, 12, 28, 14, 30)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
                                                      16, 17, 4, 5, 20, 21, 8,
                                                      9, 24, 25, 12, 13, 28, 29)
                                      .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}

// 32-bit: vec_mergee does this directly on VSX; Z14 needs the table lookup.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
#if HWY_S390X_HAVE_Z14
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
                                                      2, 3, 16, 17, 18, 19, 8,
                                                      9, 10, 11, 24, 25, 26, 27)
                                      .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
#else
  (void)d;
  return VFromD<D>{vec_mergee(a.raw, b.raw)};
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveOdd

// Result lanes: a1, b1, a3, b3, ... (odd lanes of a and b interleaved).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
                          11, 27, 13, 29, 15, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
                          11, 26, 27, 14, 15, 30, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}

// 32-bit: vec_mergeo on VSX; table lookup on Z14.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
#if HWY_S390X_HAVE_Z14
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
                          13, 14, 15, 28, 29, 30, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
#else
  (void)d;
  return VFromD<D>{vec_mergeo(a.raw, b.raw)};
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  return InterleaveUpper(d, a, b);
}

// ------------------------------ OddEvenBlocks
// 128-bit vectors have a single block, so the "even" input is the result.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks
// No-op: there is only one 128-bit block.
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}

// ------------------------------ InterleaveEvenBlocks
template <class D, class V = VFromD<D>>
HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
  return a;
}
// ------------------------------ InterleaveOddBlocks
template <class D, class V = VFromD<D>>
HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
  return a;
}

// ------------------------------ MulFixedPoint15 (OddEven)

#if HWY_S390X_HAVE_Z14
// Single lane: widen, add the rounding constant, then take the high 16 bits
// of (product << 1) via the i32->i16 reinterpretation.
HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
  const DFromV<decltype(a)> di16;
  const RepartitionToWide<decltype(di16)> di32;

  const auto round_up_incr = Set(di32, 0x4000);
  const auto i32_product = MulEven(a, b) + round_up_incr;

  return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
}
// Multiple lanes: compute even and odd widened products separately, round,
// and recombine the 16-bit results with OddEven.
template <size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> di16;
  const RepartitionToWide<decltype(di16)> di32;

  const auto round_up_incr = Set(di32, 0x4000);
  const auto even_product = MulEven(a, b) + round_up_incr;
  const auto odd_product = MulOdd(a, b) + round_up_incr;

  return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
                 BitCast(di16, ShiftLeft<1>(even_product)));
}
#else
// VSX: vec_mradds is a native rounding multiply-high (add zero).
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const Vec128<int16_t> zero = Zero(Full128<int16_t>());
  return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
}
#endif

// ------------------------------ Shl

namespace detail {
// Per-lane variable left shift.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw << bits.raw};
#else
  return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
#endif
}

// Signed left shift is the same as unsigned.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return BitCast(di,
                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
}

}  // namespace detail

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
}

// ------------------------------ Shr

namespace detail {
// Per-lane variable logical right shift.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> bits.raw};
#else
  return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
#endif
}

// Per-lane variable arithmetic right shift (vec_sra takes unsigned counts).
template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> bits.raw};
#else
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
#endif
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shr(hwy::TypeTag<T>(), v, bits);
}

// ------------------------------ MulEven/Odd 64x64 (UpperHalf)

// 64x64 -> 128-bit product of the even (lane 0) inputs.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using V64 = typename detail::Raw128<T>::type;
  const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  // Fallback: scalar 64x64->128 multiply of lane 0.
  alignas(16) T mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
  return Load(Full128<T>(), mul);
#endif
}

// 64x64 -> 128-bit product of the odd (lane 1) inputs.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using V64 = typename detail::Raw128<T>::type;
  const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  // Fallback: scalar 64x64->128 multiply of lane 1 (the upper halves).
  alignas(16) T mul[2];
  const Full64<T> d2;
  mul[0] =
      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
  return Load(Full128<T>(), mul);
#endif
}

// ------------------------------ PromoteEvenTo/PromoteOddTo
#include "hwy/ops/inside-inl.h"

// ------------------------------ WidenMulPairwiseAdd

// bf16: widen even/odd lanes to f32, multiply pairwise and add.
template <class DF, HWY_IF_F32_D(DF),
          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
}

// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
template <class D32, HWY_IF_UI32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
#if HWY_S390X_HAVE_Z14
  (void)d32;
  return MulEven(a, b) + MulOdd(a, b);
#else
  return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
#endif
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
// Pairwise widening multiply-accumulate; sum1 is unused because vec_msum
// already folds both partial products into sum0.
template <class D32, HWY_IF_UI32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
                                              VFromD<D32> sum0,
                                              VFromD<D32>& /*sum1*/) {
#if HWY_S390X_HAVE_Z14
  return MulEven(a, b) + MulOdd(a, b) + sum0;
#else
  return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
#endif
}

// ------------------------------ RearrangeToOddPlusEven
template <size_t N>
HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(Vec128<int32_t, N> sum0,
                                                  Vec128<int32_t, N> /*sum1*/) {
  return sum0;  // invariant already holds
}

template <size_t N>
HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
    Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
  return sum0;  // invariant already holds
}

// Generic case: both partial sums must be combined.
template <class VW>
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  return Add(sum0, sum1);
}

// ------------------------------ SatWidenMulPairwiseAccumulate
#if !HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#else
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#endif

// i16 pairwise widening multiply with saturating accumulate (vec_msums).
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
    DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
  return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
}

#endif  // !HWY_S390X_HAVE_Z14

// ------------------------------ SumOfMulQuadAccumulate
#if !HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#endif
// u8 x u8: vec_msum sums four byte products per u32 lane.
template <class DU32, HWY_IF_U32_D(DU32)>
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  return VFromD<DU32>{vec_msum(a.raw, b.raw, sum.raw)};
}

#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif

// u8 x i8: vec_msum has a (signed, unsigned) overload, hence b_i first.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  return VFromD<DI32>{vec_msum(b_i.raw, a_u.raw, sum.raw)};
}

#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#endif
// i8 x i8: compute u8 x i8, then subtract the correction term for lanes
// where a is negative (a_u = a + 256 there, so subtract 256*b summed per
// quad, i.e. SumsOf4(b masked by a's sign) shifted left by 8).
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
                                            VFromD<Repartition<int8_t, DI32>> a,
                                            VFromD<Repartition<int8_t, DI32>> b,
                                            VFromD<DI32> sum) {
  const Repartition<uint8_t, decltype(di32)> du8;

  const auto result_sum_0 =
      SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
  const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
  return result_sum_0 - result_sum_1;
}

#endif  // !HWY_S390X_HAVE_Z14

// ================================================== CONVERT

// ------------------------------ Promotions (part w/ narrow lanes -> full)

// Unsigned to signed/unsigned: zero-extend.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D /* d */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  // First pretend the input has twice the lanes - the upper half will be
  // ignored by ZipLower.
  const Rebind<FromT, Twice<D>> d2;
  const VFromD<decltype(d2)> twice{v.raw};
  // Then cast to narrow as expected by ZipLower, in case the sign of FromT
  // differs from that of D.
  const RepartitionToNarrow<D> dn;

  // Zero-extension: interleave with zeros; the zero half goes into the
  // more-significant position, which depends on endianness.
#if HWY_IS_LITTLE_ENDIAN
  return ZipLower(BitCast(dn, twice), Zero(dn));
#else
  return ZipLower(Zero(dn), BitCast(dn, twice));
#endif
}

// Signed: replicate sign bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D /* d */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))};
}

// 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE(FromT, 1)>
HWY_API VFromD<D> PromoteTo(D d32,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const DFromV<decltype(v)> d8;
  const Rebind<MakeWide<FromT>, decltype(d8)> d16;
  return PromoteTo(d32, PromoteTo(d16, v));
}

// 8-bit or 16-bit to 64-bit: First, promote to MakeWide<FromT>, and then
// convert to 64-bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 8), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(FromT),
          HWY_IF_T_SIZE_ONE_OF(FromT, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d64,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeWide<FromT>, decltype(d64)> dw;
  return PromoteTo(d64, PromoteTo(dw, v));
}

#if HWY_PPC_HAVE_9

// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

// F16 -> F32: PPC9 has a dedicated instruction via this intrinsic.
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)};
}

#endif  // HWY_PPC_HAVE_9

// BF16 -> F32: a bf16 is the upper 16 bits of an f32, so widen to 32 bits and
// shift into the upper half.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  // Duplicate each lane so the vec_doubleo/vec_doublee below (which convert
  // only odd or even f32 lanes) see the source values in the right positions.
  const __vector float raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
    HWY_COMPILER_GCC_ACTUAL < 1000
  // Workaround for compiler errors with GCC 9 or earlier on Z14
  return VFromD<D>{__builtin_s390_vflls(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}

template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
#if HWY_S390X_HAVE_Z14
  // Z14: widen to i64 first, then convert to f64.
  const RebindToSigned<decltype(df64)> di64;
  return ConvertTo(df64, PromoteTo(di64, v));
#else  // VSX
  (void)df64;
  const __vector signed int raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}

template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
#if HWY_S390X_HAVE_Z14
  // Z14: widen to u64 first, then convert to f64.
  const RebindToUnsigned<decltype(df64)> du64;
  return ConvertTo(df64, PromoteTo(du64, v));
#else  // VSX
  (void)df64;
  const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}

#if !HWY_S390X_HAVE_Z14
namespace detail {

// Replaces NaN inputs with zero before float->int conversion; see below.
template <class V>
static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
#if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
  // Workaround for QEMU 7/8 VSX float to int conversion bug
  return IfThenElseZero(v == v, v);
#else
  return v;
#endif
}

// Wrapper for the xvcvspsxds instruction (F32 -> I64 conversion of alternate
// lanes), selecting whichever intrinsic/builtin the compiler provides.
template <class VF32>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<int64_t, DFromV<VF32>>>
VsxXvcvspsxds(VF32 vf32) {
  using VI64 = VFromD<Repartition<int64_t, DFromV<VF32>>>;
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
  // Use __builtin_vsx_xvcvspsxds if it is available (which is the case with
  // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
  return VI64{__builtin_vsx_xvcvspsxds(vf32.raw)};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
  // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
  // vec_signedo intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
  // removed from GCC in GCC 15
  return VI64{vec_signedo(vf32.raw)};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
  // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
  // vec_signede intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
  // removed from GCC in GCC 15
  return VI64{vec_signede(vf32.raw)};
#else
  // Inline assembly fallback for older versions of Clang that do not have the
  // __builtin_vsx_xvcvspsxds intrinsic
  __vector signed long long raw_result;
  __asm__("xvcvspsxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
  return VI64{raw_result};
#endif
}

// Wrapper for the xvcvspuxds instruction (F32 -> U64 conversion of alternate
// lanes), selecting whichever intrinsic/builtin the compiler provides.
template <class VF32>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<uint64_t, DFromV<VF32>>>
VsxXvcvspuxds(VF32 vf32) {
  using VU64 = VFromD<Repartition<uint64_t, DFromV<VF32>>>;
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
  // Use __builtin_vsx_xvcvspuxds if it is available (which is the case with
  // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
  return VU64{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(vf32.raw))};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
  // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
  // vec_unsignedo intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
  // removed from GCC in GCC 15
  return VU64{vec_unsignedo(vf32.raw)};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
  // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
  // vec_unsignede intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
  // removed from GCC in GCC 15
  return VU64{vec_unsignede(vf32.raw)};
#else
  // Inline assembly fallback for older versions of Clang that do not have the
  // __builtin_vsx_xvcvspuxds intrinsic
  __vector unsigned long long raw_result;
  __asm__("xvcvspuxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
  return VU64{raw_result};
#endif
}

}  // namespace detail
#endif  // !HWY_S390X_HAVE_Z14

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
#if !HWY_S390X_HAVE_Z14
  // Duplicate the source lanes so the conversion of alternate lanes yields
  // the promoted lower half.
  const Repartition<float, decltype(di64)> dt_f32;
  const auto vt_f32 = ResizeBitCast(dt_f32, v);
  return detail::VsxXvcvspsxds(
      detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
#else
  const RebindToFloat<decltype(di64)> df64;
  return ConvertTo(di64, PromoteTo(df64, v));
#endif
}

template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
#if !HWY_S390X_HAVE_Z14
  const Repartition<float, decltype(du64)> dt_f32;
  const auto vt_f32 = ResizeBitCast(dt_f32, v);
  return detail::VsxXvcvspuxds(
      detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
#else
  const RebindToFloat<decltype(du64)> df64;
  return ConvertTo(du64, PromoteTo(df64, v));
#endif
}

// ------------------------------ PromoteUpperTo

#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif

// Unsigned to signed/unsigned: zero-extend.
template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
HWY_API VFromD<D> PromoteUpperTo(D d, Vec128<FromT> v) {
  const RebindToUnsigned<D> du;
  const RepartitionToNarrow<decltype(du)> dn;

#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, ZipUpper(du, v, Zero(dn)));
#else
  return BitCast(d, ZipUpper(du, Zero(dn), v));
#endif
}

// Signed: replicate sign bit.
template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
HWY_API VFromD<D> PromoteUpperTo(D /* d */, Vec128<FromT> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackl(v.raw))};
}

// F16 to F32
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<float16_t> v) {
#if HWY_PPC_HAVE_9
  (void)df32;
  return VFromD<D>{vec_extract_fp32_from_shortl(v.raw)};
#else
  // No direct instruction: promote the upper half via PromoteTo.
  const Rebind<float16_t, decltype(df32)> dh;
  return PromoteTo(df32, UpperHalf(dh, v));
#endif
}

// BF16 to F32
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<bfloat16_t> v) {
  const Repartition<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
  // Duplicate the upper-half lanes so vec_doubleo/vec_doublee (which convert
  // only odd or even f32 lanes) produce the promoted upper half.
  const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
    HWY_COMPILER_GCC_ACTUAL < 1000
  // Workaround for compiler error with GCC 9 or earlier on Z14
  return VFromD<D>{__builtin_s390_vflls(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToSigned<decltype(df64)> di64;
  return ConvertTo(df64, PromoteUpperTo(di64, v));
#else  // VSX
  (void)df64;
  const __vector signed int raw_v =
      InterleaveUpper(Full128<int32_t>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(df64)> du64;
  return ConvertTo(df64, PromoteUpperTo(du64, v));
#else  // VSX
  (void)df64;
  const __vector unsigned int raw_v =
      InterleaveUpper(Full128<uint32_t>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
#if !HWY_S390X_HAVE_Z14
  (void)di64;
  return detail::VsxXvcvspsxds(
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
#else
  const RebindToFloat<decltype(di64)> df64;
  return ConvertTo(di64, PromoteUpperTo(df64, v));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
#if !HWY_S390X_HAVE_Z14
  (void)du64;
  return detail::VsxXvcvspuxds(
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
#else
  const RebindToFloat<decltype(du64)> df64;
  return ConvertTo(du64, PromoteUpperTo(df64, v));
#endif
}

// Generic version for <=64 bit input/output
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}

// ------------------------------ PromoteEvenTo/PromoteOddTo

namespace detail {

// Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)

#if HWY_IS_LITTLE_ENDIAN
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<4> /*to_lane_size_tag*/,
                                   hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_signexti(v.raw)};
}
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_signextll(v.raw)};
}
#else
// On big endian, vec_signexti/vec_signextll operate on the opposite lanes,
// hence they implement PromoteOddTo.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<4> /*to_lane_size_tag*/,
                                  hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                  V v) {
  return VFromD<D>{vec_signexti(v.raw)};
}
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                  V v) {
  return VFromD<D>{vec_signextll(v.raw)};
}
#endif  // HWY_IS_LITTLE_ENDIAN

#endif  // HWY_PPC_HAVE_9

// I32/U32/F32->F64 PromoteEvenTo
#if HWY_S390X_HAVE_Z14
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_doublee(v.raw)};
}
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
  return ConvertTo(d_to, PromoteEvenTo(dw, v));
}
#else  // VSX
template <class D, class V, class FromTypeTag>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   FromTypeTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_doublee(v.raw)};
}
#endif  // HWY_S390X_HAVE_Z14

// F32->I64 PromoteEvenTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::FloatTag /*from_type_tag*/, D d_to,
                                   V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // VsxXvcvspsxds expects the source values to be in the odd lanes on
  // little-endian PPC, and the Shuffle2103 operation below will shift the even
  // lanes of normalized_v into the odd lanes.
  return VsxXvcvspsxds(Shuffle2103(normalized_v));
#else
  // VsxXvcvspsxds expects the source values to be in the even lanes on
  // big-endian PPC.
  return VsxXvcvspsxds(normalized_v);
#endif
#else
  // Z14: promote to f64 first, then convert to integer.
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                       hwy::FloatTag(), df64, v));
#endif
}

// F32->U64 PromoteEvenTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::FloatTag /*from_type_tag*/, D d_to,
                                   V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // VsxXvcvspuxds expects the source values to be in the odd lanes
  // on little-endian PPC, and the Shuffle2103 operation below will shift the
  // even lanes of normalized_v into the odd lanes.
  return VsxXvcvspuxds(Shuffle2103(normalized_v));
#else
  // VsxXvcvspuxds expects the source values to be in the even lanes
  // on big-endian PPC.
  return VsxXvcvspuxds(normalized_v);
#endif
#else
  // Z14: promote to f64 first, then convert to integer.
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                       hwy::FloatTag(), df64, v));
#endif
}

// I32/U32/F32->F64 PromoteOddTo
#if HWY_S390X_HAVE_Z14
// Rotate the odd lanes into even position (vec_sld by 4 bytes), then reuse
// PromoteEvenTo.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::FloatTag /*from_type_tag*/, D d_to,
                                  V v) {
  return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
                       d_to, V{vec_sld(v.raw, v.raw, 4)});
}
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
  return ConvertTo(d_to, PromoteOddTo(dw, v));
}
#else
template <class D, class V, class FromTypeTag>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  FromTypeTag /*from_type_tag*/, D /*d_to*/,
                                  V v) {
  return VFromD<D>{vec_doubleo(v.raw)};
}
#endif

// F32->I64 PromoteOddTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::FloatTag /*from_type_tag*/, D d_to,
                                  V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // VsxXvcvspsxds expects the source values to be in the odd lanes
  // on little-endian PPC
  return VsxXvcvspsxds(normalized_v);
#else
  // VsxXvcvspsxds expects the source values to be in the even lanes
  // on big-endian PPC, and the Shuffle0321 operation below will shift the odd
  // lanes of normalized_v into the even lanes.
  return VsxXvcvspsxds(Shuffle0321(normalized_v));
#endif
#else
  // Z14: promote to f64 first, then convert to integer.
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                      hwy::FloatTag(), df64, v));
#endif
}

// F32->U64 PromoteOddTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::FloatTag /*from_type_tag*/, D d_to,
                                  V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // VsxXvcvspuxds expects the source values to be in the odd lanes
  // on little-endian PPC
  return VsxXvcvspuxds(normalized_v);
#else
  // VsxXvcvspuxds expects the source values to be in the even lanes
  // on big-endian PPC, and the Shuffle0321 operation below will shift the odd
  // lanes of normalized_v into the even lanes.
  return VsxXvcvspuxds(Shuffle0321(normalized_v));
#endif
#else
  // Z14: promote to f64 first, then convert to integer.
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                      hwy::FloatTag(), df64, v));
#endif
}

}  // namespace detail

// ------------------------------ Demotions (full -> part w/ narrow lanes)

// Signed -> unsigned: saturating pack (vec_packsu clamps negatives to 0).
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_SIGNED(FromT), HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packsu(v.raw, v.raw)};
}

// Signed -> signed: saturating pack.
template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}

// Unsigned -> unsigned: saturating pack.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}

// 4x (or more) narrowing: demote in halving steps.
template <class D, class FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

// Signed -> unsigned 4x narrowing: intermediate step is unsigned so negative
// inputs saturate to 0 in the first demotion.
template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp))

// We already toggled HWY_NATIVE_F16C above.

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  // Avoid vec_pack_to_short_fp32 on Clang because its implementation is buggy.
#if HWY_COMPILER_GCC_ACTUAL
  (void)df16;
  return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)};
#elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
  // Work around bug in the clang implementation of vec_pack_to_short_fp32
  // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets
  // if the __builtin_vsx_xvcvsphp intrinsic is available
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint32_t, D> du;
  const VFromD<decltype(du)> bits16{
      reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))};
  return BitCast(df16, TruncateTo(du16, bits16));
#else
#error "Only define the function if we have a native implementation"
#endif
}

#endif  // HWY_PPC_HAVE_9

#if HWY_PPC_HAVE_9

#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif

namespace detail {

// On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
// vector with the resulting F16 bits in the lower 16 bits of U64 lane 0

// On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
// an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
  // Inline assembly is needed for the PPC9 xscvdphp instruction as there is
  // currently no intrinsic available for the PPC9 xscvdphp instruction
  __vector unsigned long long raw_result;
  __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
  return Vec128<uint64_t>{raw_result};
}

}  // namespace detail

// Single-lane F64 -> F16.
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint64_t, decltype(df16)> du64;

  const Full128<double> df64_full;
#if HWY_IS_LITTLE_ENDIAN
  // Place the source value in the lane VsxXscvdphp reads on LE (lane 1).
  const auto bits16_as_u64 =
      UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
#else
  const auto bits16_as_u64 =
      LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
#endif

  return BitCast(df16, TruncateTo(du16, bits16_as_u64))
}

// Two-lane F64 -> F16: convert each lane separately, then interleave the two
// 16-bit results.
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint64_t, decltype(df16)> du64;
  const Rebind<double, decltype(df16)> df64;

#if HWY_IS_LITTLE_ENDIAN
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64 =
      InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
#else
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
  const auto bits64_as_u64 =
      InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
#endif

  return BitCast(df16, TruncateTo(du16, bits64_as_u64));
}

#elif HWY_S390X_HAVE_Z14

#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif

namespace detail {

// F64 -> F32 with round-to-odd (vledb rounding mode 3), used as the first of
// the two rounding steps in the F64 -> F16 demotion.
template <class DF32, HWY_IF_F32_D(DF32)>
static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
    DF32 df32, VFromD<Rebind<double, DF32>> v) {
  const Twice<DF32> dt_f32;

  __vector float raw_f32_in_even;
  __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));

  const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
  return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const Rebind<float, decltype(df16)> df32;
  return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
}

#endif  // HWY_PPC_HAVE_9 (PPC9 path) / HWY_S390X_HAVE_Z14 (Z14 path)

#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)

#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

namespace detail {

// VsxXvcvspbf16 converts a F32 vector to a BF16 vector, bitcasted to an U32
// vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
template <class D, HWY_IF_BF16_D(D)>
static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
    D dbf16, VFromD<Rebind<float, D>> v) {
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;

  using VU32 = __vector unsigned int;

  // Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
  // conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
  // __vector unsigned char argument (at least as of GCC 13 and Clang 17)
  return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
      __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
}

}  // namespace detail

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
}

#endif  // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)

// Specializations for partial vectors because vec_packs sets lanes above 2*N.
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  // Pack the full vector, then gather the valid (even) u32 groups so the
  // result occupies the lower half.
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;

  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
}

// Signed -> unsigned variants (vec_packsu).
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
          HWY_IF_UNSIGNED_D(DN), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> 4461 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { 4462 const Twice<decltype(dn)> dn_full; 4463 const Repartition<uint32_t, decltype(dn_full)> du32_full; 4464 4465 const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)}; 4466 const auto vu32_full = BitCast(du32_full, v_full); 4467 return LowerHalf( 4468 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); 4469 } 4470 template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN), 4471 HWY_IF_SIGNED_V(V), 4472 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), 4473 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> 4474 HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) { 4475 return VFromD<DN>{vec_packsu(a.raw, b.raw)}; 4476 } 4477 4478 template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), 4479 HWY_IF_UNSIGNED_D(DN), HWY_IF_UNSIGNED_V(V), 4480 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), 4481 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> 4482 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { 4483 const DFromV<decltype(a)> d; 4484 const Twice<decltype(d)> dt; 4485 return DemoteTo(dn, Combine(dt, b, a)); 4486 } 4487 template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN), 4488 HWY_IF_UNSIGNED_V(V), 4489 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), 4490 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> 4491 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { 4492 const Twice<decltype(dn)> dn_full; 4493 const Repartition<uint32_t, decltype(dn_full)> du32_full; 4494 4495 const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)}; 4496 const auto vu32_full = BitCast(du32_full, v_full); 4497 return LowerHalf( 4498 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); 4499 } 4500 template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN), 4501 HWY_IF_UNSIGNED_V(V), 4502 HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)), 4503 
HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)> 4504 HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) { 4505 return VFromD<DN>{vec_packs(a.raw, b.raw)}; 4506 } 4507 4508 #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16) 4509 template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>), 4510 HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)> 4511 HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) { 4512 const RebindToUnsigned<decltype(dbf16)> du16; 4513 const Half<decltype(dbf16)> dh_bf16; 4514 return BitCast(dbf16, 4515 OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a), 4516 detail::VsxXvcvspbf16(dh_bf16, b))); 4517 } 4518 #endif 4519 4520 template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V, 4521 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 4522 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), 4523 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> 4524 HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { 4525 return ReorderDemote2To(d, a, b); 4526 } 4527 4528 #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16) 4529 template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>), 4530 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> 4531 HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { 4532 return ReorderDemote2To(d, a, b); 4533 } 4534 #endif 4535 4536 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> 4537 HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) { 4538 #if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \ 4539 HWY_COMPILER_GCC_ACTUAL < 1000 4540 // Workaround for compiler error with GCC 9 or earlier on Z14 4541 return Vec32<float>{__builtin_s390_vflrd(v.raw, 0, 0)}; 4542 #else 4543 return Vec32<float>{vec_floate(v.raw)}; 4544 #endif 4545 } 4546 4547 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> 4548 HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) { 4549 #if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \ 4550 HWY_COMPILER_GCC_ACTUAL < 1000 4551 // Workaround for 
compiler error with GCC 9 or earlier on Z14 4552 const Vec128<float> f64_to_f32{__builtin_s390_vflrd(v.raw, 0, 0)}; 4553 #elif HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN 4554 const Vec128<float> f64_to_f32{vec_floate(v.raw)}; 4555 #else 4556 const Vec128<float> f64_to_f32{vec_floato(v.raw)}; 4557 #endif 4558 4559 #if HWY_S390X_HAVE_Z14 4560 const Twice<decltype(d)> dt; 4561 return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32)); 4562 #else 4563 const RebindToUnsigned<D> du; 4564 const Rebind<uint64_t, D> du64; 4565 return Vec64<float>{ 4566 BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw}; 4567 #endif 4568 } 4569 4570 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)> 4571 HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) { 4572 #if HWY_S390X_HAVE_Z14 4573 const Rebind<int64_t, decltype(di32)> di64; 4574 return DemoteTo(di32, ConvertTo(di64, v)); 4575 #else 4576 (void)di32; 4577 return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)}; 4578 #endif 4579 } 4580 4581 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)> 4582 HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) { 4583 #if HWY_S390X_HAVE_Z14 4584 const Rebind<int64_t, decltype(di32)> di64; 4585 return DemoteTo(di32, ConvertTo(di64, v)); 4586 #else 4587 (void)di32; 4588 4589 #if HWY_IS_LITTLE_ENDIAN 4590 const Vec128<int32_t> f64_to_i32{ 4591 vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)}; 4592 #else 4593 const Vec128<int32_t> f64_to_i32{ 4594 vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)}; 4595 #endif 4596 4597 const Rebind<int64_t, D> di64; 4598 const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32); 4599 return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)}; 4600 #endif 4601 } 4602 4603 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)> 4604 HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) { 4605 #if HWY_S390X_HAVE_Z14 4606 const Rebind<uint64_t, decltype(du32)> du64; 4607 return DemoteTo(du32, ConvertTo(du64, v)); 4608 
#else 4609 (void)du32; 4610 return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)}; 4611 #endif 4612 } 4613 4614 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)> 4615 HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) { 4616 #if HWY_S390X_HAVE_Z14 4617 const Rebind<uint64_t, decltype(du32)> du64; 4618 return DemoteTo(du32, ConvertTo(du64, v)); 4619 #else 4620 (void)du32; 4621 #if HWY_IS_LITTLE_ENDIAN 4622 const Vec128<uint32_t> f64_to_u32{ 4623 vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)}; 4624 #else 4625 const Vec128<uint32_t> f64_to_u32{ 4626 vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)}; 4627 #endif 4628 4629 const Rebind<uint64_t, D> du64; 4630 const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32); 4631 return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)}; 4632 #endif 4633 } 4634 4635 #if HWY_S390X_HAVE_Z14 4636 namespace detail { 4637 4638 template <class V, HWY_IF_I64(TFromV<V>)> 4639 HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) { 4640 __vector double raw_result; 4641 // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14 4642 __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw)); 4643 return VFromD<RebindToFloat<DFromV<V>>>{raw_result}; 4644 } 4645 4646 template <class V, HWY_IF_U64(TFromV<V>)> 4647 HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) { 4648 __vector double raw_result; 4649 // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14 4650 __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw)); 4651 return VFromD<RebindToFloat<DFromV<V>>>{raw_result}; 4652 } 4653 4654 } // namespace detail 4655 #endif // HWY_S390X_HAVE_Z14 4656 4657 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> 4658 HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) { 4659 #if HWY_S390X_HAVE_Z14 4660 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); 4661 #else // VSX 4662 (void)df32; 4663 return 
Vec32<float>{vec_floate(v.raw)}; 4664 #endif 4665 } 4666 4667 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> 4668 HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) { 4669 #if HWY_S390X_HAVE_Z14 4670 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); 4671 #else // VSX 4672 #if HWY_IS_LITTLE_ENDIAN 4673 const Vec128<float> i64_to_f32{vec_floate(v.raw)}; 4674 #else 4675 const Vec128<float> i64_to_f32{vec_floato(v.raw)}; 4676 #endif 4677 4678 const RebindToUnsigned<decltype(df32)> du32; 4679 const Rebind<uint64_t, decltype(df32)> du64; 4680 return Vec64<float>{ 4681 BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw}; 4682 #endif 4683 } 4684 4685 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> 4686 HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) { 4687 #if HWY_S390X_HAVE_Z14 4688 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); 4689 #else // VSX 4690 (void)df32; 4691 return Vec32<float>{vec_floate(v.raw)}; 4692 #endif 4693 } 4694 4695 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> 4696 HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) { 4697 #if HWY_S390X_HAVE_Z14 4698 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); 4699 #else // VSX 4700 #if HWY_IS_LITTLE_ENDIAN 4701 const Vec128<float> u64_to_f32{vec_floate(v.raw)}; 4702 #else 4703 const Vec128<float> u64_to_f32{vec_floato(v.raw)}; 4704 #endif 4705 4706 const RebindToUnsigned<decltype(df32)> du; 4707 const Rebind<uint64_t, decltype(df32)> du64; 4708 return Vec64<float>{ 4709 BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw}; 4710 #endif 4711 } 4712 4713 // For already range-limited input [0, 255]. 
// Demotes u32 -> u8 via two TruncateTo steps (u32 -> u16 -> u8); correct only
// for inputs already limited to [0, 255] (see comment above).
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  const Rebind<uint16_t, DFromV<decltype(v)>> du16;
  const Rebind<uint8_t, decltype(du16)> du8;
  return TruncateTo(du8, TruncateTo(du16, v));
}
// ------------------------------ Integer <=> fp (ShiftRight, OddEven)

// Note: altivec.h vec_ct* currently contain C casts which trigger
// -Wdeprecate-lax-vec-conv-all warnings, so disable them.

#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 lacks a direct I32/U32 -> F32 conversion; widen to F64 and demote.
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D df32,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<double, decltype(df32)> df64;
  return DemoteTo(df32, PromoteTo(df64, v));
}
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
  const RepartitionToWide<decltype(df32)> df64;

#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000
  // Workaround for compiler error with GCC 9 or earlier on Z14
  const VFromD<D> vf32_lo{
      __builtin_s390_vflrd(PromoteLowerTo(df64, v).raw, 0, 0)};
  const VFromD<D> vf32_hi{
      __builtin_s390_vflrd(PromoteUpperTo(df64, v).raw, 0, 0)};
#else
  const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
  const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
#endif
  return ConcatEven(df32, vf32_hi, vf32_lo);
}
#else  // Z15 or PPC
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
#if HWY_S390X_HAVE_Z15
  return VFromD<D>{vec_float(v.raw)};
#else
  return VFromD<D>{vec_ctf(v.raw, 0)};
#endif
  HWY_DIAGNOSTICS(pop)
}
#endif  // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15

// I64/U64 -> F64 conversion.
template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
          HWY_IF_T_SIZE_D(D, sizeof(FromT))>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_double(v.raw)};
}

// Truncates (rounds toward zero).
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 lacks a direct F32 -> I32 conversion; widen to I64 and demote.
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D di32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const Rebind<int64_t, decltype(di32)> di64;
  return DemoteTo(di32, PromoteTo(di64, v));
}
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D di32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const RepartitionToWide<decltype(di32)> di64;
  return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
                          PromoteUpperTo(di64, v));
}
#else  // Z15 or PPC
template <class D, HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  // If v is a compile-time constant, fold the saturating conversion at
  // compile time instead of emitting the vector instruction.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr int32_t kMinI32 = LimitsMin<int32_t>();
    constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= -2147483648.0f)
            ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
                                          : kMaxI32)
            : ((v.raw[0] < 0) ? kMinI32 : 0),
        (v.raw[1] >= -2147483648.0f)
            ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
                                          : kMaxI32)
            : ((v.raw[1] < 0) ? kMinI32 : 0),
        (v.raw[2] >= -2147483648.0f)
            ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
                                          : kMaxI32)
            : ((v.raw[2] < 0) ? kMinI32 : 0),
        (v.raw[3] >= -2147483648.0f)
            ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
                                          : kMaxI32)
            : ((v.raw[3] < 0) ? kMinI32 : 0));
  }
#endif

#if HWY_S390X_HAVE_Z15
  // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
  // the range of an int32_t
  __vector signed int raw_result;
  __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<D>{raw_result};
#else
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
  return VFromD<D>{vec_cts(v.raw, 0)};
  HWY_DIAGNOSTICS(pop)
#endif  // HWY_S390X_HAVE_Z15
}
#endif  // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15

// F64 -> I64, truncating toward zero.
template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
  // Compile-time fold of the saturating conversion for constant inputs.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr int64_t kMinI64 = LimitsMin<int64_t>();
    constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
    return Dup128VecFromValues(D(),
                               (v.raw[0] >= -9223372036854775808.0)
                                   ? ((v.raw[0] < 9223372036854775808.0)
                                          ? static_cast<int64_t>(v.raw[0])
                                          : kMaxI64)
                                   : ((v.raw[0] < 0) ? kMinI64 : 0LL),
                               (v.raw[1] >= -9223372036854775808.0)
                                   ? ((v.raw[1] < 9223372036854775808.0)
                                          ? static_cast<int64_t>(v.raw[1])
                                          : kMaxI64)
                                   : ((v.raw[1] < 0) ? kMinI64 : 0LL));
  }
#endif

  // Use inline assembly to avoid undefined behavior if v[i] is not within the
  // range of an int64_t
  __vector signed long long raw_result;
#if HWY_S390X_HAVE_Z14
  __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
#else
  __asm__("xvcvdpsxds %x0,%x1"
          : "=wa"(raw_result)
          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
#endif
  return VFromD<D>{raw_result};
}

#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 lacks a direct F32 -> U32 conversion; widen to U64 and demote.
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D du32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const Rebind<uint64_t, decltype(du32)> du64;
  return DemoteTo(du32, PromoteTo(du64, v));
}
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D du32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const RepartitionToWide<decltype(du32)> du64;
  return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
                          PromoteUpperTo(du64, v));
}
#else  // Z15 or VSX
template <class D, HWY_IF_U32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  // Compile-time fold of the saturating conversion for constant inputs.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= 0.0f)
            ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
                                          : kMaxU32)
            : 0,
        (v.raw[1] >= 0.0f)
            ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
                                          : kMaxU32)
            : 0,
        (v.raw[2] >= 0.0f)
            ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
                                          : kMaxU32)
            : 0,
        (v.raw[3] >= 0.0f)
            ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
                                          : kMaxU32)
            : 0);
  }
#endif

#if HWY_S390X_HAVE_Z15
  // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
  // the range of an uint32_t
  __vector unsigned int raw_result;
  __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<D>{raw_result};
#else  // VSX
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
  VFromD<D> result{vec_ctu(v.raw, 0)};
  HWY_DIAGNOSTICS(pop)
  return result;
#endif  // HWY_S390X_HAVE_Z15
}
#endif  // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15

// F64 -> U64, truncating toward zero.
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif

#if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
  // Compile-time fold of the saturating conversion for constant inputs.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
                                 ? static_cast<uint64_t>(v.raw[0])
                                 : kMaxU64)
                          : 0,
        (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
                                 ? static_cast<uint64_t>(v.raw[1])
                                 : kMaxU64)
                          : 0);
  }
#endif

  // Use inline assembly to avoid undefined behavior if v[i] is not within the
  // range of an uint64_t
  __vector unsigned long long raw_result;
#if HWY_S390X_HAVE_Z14
  __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
#else  // VSX
  __asm__("xvcvdpuxds %x0,%x1"
          : "=wa"(raw_result)
          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
#endif
  // NOTE(review): the HWY_DIAGNOSTICS(push) above appears to have no matching
  // HWY_DIAGNOSTICS(pop) on these return paths — confirm this is intentional.
  return VFromD<D>{raw_result};
}

// ------------------------------ Floating-point rounding (ConvertTo)

// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
  return Vec128<float, N>{vec_round(v.raw)};
}

template <size_t N>
HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
#if HWY_S390X_HAVE_Z14
  return Vec128<double, N>{vec_round(v.raw)};
#else
  return Vec128<double, N>{vec_rint(v.raw)};
#endif
}

// Round to nearest, then convert to the same-width signed integer.
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<MakeSigned<T>, N> NearestInt(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return ConvertTo(di, Round(v));
}

// Round F64 to nearest, then demote to I32.
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
                                        VFromD<Rebind<double, DI32>> v) {
  return DemoteTo(di32, Round(v));
}

// Toward zero, aka truncate
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  return Vec128<T, N>{vec_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Ceil(Vec128<T, N> v) {
  return Vec128<T, N>{vec_ceil(v.raw)};
}

// Toward -infinity, aka floor
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Floor(Vec128<T, N> v) {
  return Vec128<T, N>{vec_floor(v.raw)};
}

// ------------------------------ Floating-point classification

// NaN is the only value that compares unequal to itself.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  return v != v;
}

template <typename T, size_t N>
HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  using TU = MakeUnsigned<T>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(
      d,
      Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  using TU = MakeUnsigned<T>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent<max.
  return RebindMask(
      d,
      Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
}

// ================================================== CRYPTO

#if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)

// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
5054 #ifdef HWY_NATIVE_AES 5055 #undef HWY_NATIVE_AES 5056 #else 5057 #define HWY_NATIVE_AES 5058 #endif 5059 5060 namespace detail { 5061 #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600 5062 using CipherTag = Full128<uint64_t>; 5063 #else 5064 using CipherTag = Full128<uint8_t>; 5065 #endif // !HWY_COMPILER_CLANG 5066 using CipherVec = VFromD<CipherTag>; 5067 } // namespace detail 5068 5069 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state, 5070 Vec128<uint8_t> round_key) { 5071 const detail::CipherTag dc; 5072 const Full128<uint8_t> du8; 5073 #if HWY_IS_LITTLE_ENDIAN 5074 return Reverse(du8, 5075 BitCast(du8, detail::CipherVec{vec_cipher_be( 5076 BitCast(dc, Reverse(du8, state)).raw, 5077 BitCast(dc, Reverse(du8, round_key)).raw)})); 5078 #else 5079 return BitCast(du8, detail::CipherVec{vec_cipher_be( 5080 BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); 5081 #endif 5082 } 5083 5084 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state, 5085 Vec128<uint8_t> round_key) { 5086 const detail::CipherTag dc; 5087 const Full128<uint8_t> du8; 5088 #if HWY_IS_LITTLE_ENDIAN 5089 return Reverse(du8, 5090 BitCast(du8, detail::CipherVec{vec_cipherlast_be( 5091 BitCast(dc, Reverse(du8, state)).raw, 5092 BitCast(dc, Reverse(du8, round_key)).raw)})); 5093 #else 5094 return BitCast(du8, detail::CipherVec{vec_cipherlast_be( 5095 BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); 5096 #endif 5097 } 5098 5099 HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state, 5100 Vec128<uint8_t> round_key) { 5101 const detail::CipherTag dc; 5102 const Full128<uint8_t> du8; 5103 #if HWY_IS_LITTLE_ENDIAN 5104 return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be( 5105 BitCast(dc, Reverse(du8, state)).raw, 5106 Zero(dc).raw)})), 5107 round_key); 5108 #else 5109 return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be( 5110 BitCast(dc, state).raw, Zero(dc).raw)}), 5111 round_key); 5112 #endif 5113 } 5114 5115 HWY_API Vec128<uint8_t> 
AESLastRoundInv(Vec128<uint8_t> state, 5116 Vec128<uint8_t> round_key) { 5117 const detail::CipherTag dc; 5118 const Full128<uint8_t> du8; 5119 #if HWY_IS_LITTLE_ENDIAN 5120 return Reverse(du8, 5121 BitCast(du8, detail::CipherVec{vec_ncipherlast_be( 5122 BitCast(dc, Reverse(du8, state)).raw, 5123 BitCast(dc, Reverse(du8, round_key)).raw)})); 5124 #else 5125 return BitCast(du8, detail::CipherVec{vec_ncipherlast_be( 5126 BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); 5127 #endif 5128 } 5129 5130 HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) { 5131 const Full128<uint8_t> du8; 5132 const auto zero = Zero(du8); 5133 5134 // PPC8/PPC9/PPC10 does not have a single instruction for the AES 5135 // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do. 5136 5137 // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10 5138 // by doing an AESLastRound operation with a zero round_key followed by an 5139 // AESRoundInv operation with a zero round_key. 
5140 return AESRoundInv(AESLastRound(state, zero), zero); 5141 } 5142 5143 template <uint8_t kRcon> 5144 HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) { 5145 constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0, 5146 0, 0, 0, 0, kRcon, 0, 0, 0}; 5147 constexpr __vector unsigned char kRotWordShuffle = { 5148 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12}; 5149 const detail::CipherTag dc; 5150 const Full128<uint8_t> du8; 5151 const auto sub_word_result = 5152 BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)}); 5153 const auto rot_word_result = 5154 TableLookupBytes(sub_word_result, Vec128<uint8_t>{kRotWordShuffle}); 5155 return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask}); 5156 } 5157 5158 template <size_t N> 5159 HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a, 5160 Vec128<uint64_t, N> b) { 5161 // NOTE: Lane 1 of both a and b need to be zeroed out for the 5162 // vec_pmsum_be operation below as the vec_pmsum_be operation 5163 // does a carryless multiplication of each 64-bit half and then 5164 // adds the two halves using an bitwise XOR operation. 
5165 5166 const DFromV<decltype(a)> d; 5167 const auto zero = Zero(d); 5168 5169 using VU64 = __vector unsigned long long; 5170 const VU64 pmsum_result = reinterpret_cast<VU64>( 5171 vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw)); 5172 5173 #if HWY_IS_LITTLE_ENDIAN 5174 return Vec128<uint64_t, N>{pmsum_result}; 5175 #else 5176 // Need to swap the two halves of pmsum_result on big-endian targets as 5177 // the upper 64 bits of the carryless multiplication result are in lane 0 of 5178 // pmsum_result and the lower 64 bits of the carryless multiplication result 5179 // are in lane 1 of mul128_result 5180 return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)}; 5181 #endif 5182 } 5183 5184 template <size_t N> 5185 HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a, 5186 Vec128<uint64_t, N> b) { 5187 // NOTE: Lane 0 of both a and b need to be zeroed out for the 5188 // vec_pmsum_be operation below as the vec_pmsum_be operation 5189 // does a carryless multiplication of each 64-bit half and then 5190 // adds the two halves using an bitwise XOR operation. 
5191 5192 const DFromV<decltype(a)> d; 5193 const auto zero = Zero(d); 5194 5195 using VU64 = __vector unsigned long long; 5196 const VU64 pmsum_result = reinterpret_cast<VU64>( 5197 vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw))); 5198 5199 #if HWY_IS_LITTLE_ENDIAN 5200 return Vec128<uint64_t, N>{pmsum_result}; 5201 #else 5202 // Need to swap the two halves of pmsum_result on big-endian targets as 5203 // the upper 64 bits of the carryless multiplication result are in lane 0 of 5204 // pmsum_result and the lower 64 bits of the carryless multiplication result 5205 // are in lane 1 of mul128_result 5206 return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)}; 5207 #endif 5208 } 5209 5210 #endif // !defined(HWY_DISABLE_PPC8_CRYPTO) 5211 5212 // ================================================== MISC 5213 5214 // ------------------------------ LoadMaskBits (TestBit) 5215 5216 namespace detail { 5217 5218 template <class D, HWY_IF_T_SIZE_D(D, 1)> 5219 HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { 5220 #if HWY_PPC_HAVE_10 5221 const Vec128<uint8_t> mask_vec{vec_genbm(mask_bits)}; 5222 5223 #if HWY_IS_LITTLE_ENDIAN 5224 return MFromD<D>{MaskFromVec(mask_vec).raw}; 5225 #else 5226 return MFromD<D>{MaskFromVec(Reverse(Full128<uint8_t>(), mask_vec)).raw}; 5227 #endif // HWY_IS_LITTLE_ENDIAN 5228 5229 #else // PPC9 or earlier 5230 const Full128<uint8_t> du8; 5231 const Full128<uint16_t> du16; 5232 const Vec128<uint8_t> vbits = 5233 BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))); 5234 5235 // Replicate bytes 8x such that each byte contains the bit that governs it. 
5236 #if HWY_IS_LITTLE_ENDIAN 5237 const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0, 5238 1, 1, 1, 1, 1, 1, 1, 1}; 5239 #else 5240 const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1, 5241 0, 0, 0, 0, 0, 0, 0, 0}; 5242 #endif // HWY_IS_LITTLE_ENDIAN 5243 5244 const Vec128<uint8_t> rep8{vec_perm(vbits.raw, vbits.raw, kRep8)}; 5245 const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128, 5246 1, 2, 4, 8, 16, 32, 64, 128}; 5247 return MFromD<D>{TestBit(rep8, Vec128<uint8_t>{kBit}).raw}; 5248 #endif // HWY_PPC_HAVE_10 5249 } 5250 5251 template <class D, HWY_IF_T_SIZE_D(D, 2)> 5252 HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { 5253 #if HWY_PPC_HAVE_10 5254 const Vec128<uint16_t> mask_vec{vec_genhm(mask_bits)}; 5255 5256 #if HWY_IS_LITTLE_ENDIAN 5257 return MFromD<D>{MaskFromVec(mask_vec).raw}; 5258 #else 5259 return MFromD<D>{MaskFromVec(Reverse(Full128<uint16_t>(), mask_vec)).raw}; 5260 #endif // HWY_IS_LITTLE_ENDIAN 5261 5262 #else // PPC9 or earlier 5263 const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128}; 5264 const auto vmask_bits = 5265 Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits)); 5266 return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw}; 5267 #endif // HWY_PPC_HAVE_10 5268 } 5269 5270 template <class D, HWY_IF_T_SIZE_D(D, 4)> 5271 HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { 5272 #if HWY_PPC_HAVE_10 5273 const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)}; 5274 5275 #if HWY_IS_LITTLE_ENDIAN 5276 return MFromD<D>{MaskFromVec(mask_vec).raw}; 5277 #else 5278 return MFromD<D>{MaskFromVec(Reverse(Full128<uint32_t>(), mask_vec)).raw}; 5279 #endif // HWY_IS_LITTLE_ENDIAN 5280 5281 #else // PPC9 or earlier 5282 const __vector unsigned int kBit = {1, 2, 4, 8}; 5283 const auto vmask_bits = 5284 Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits)); 5285 return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw}; 5286 #endif // HWY_PPC_HAVE_10 
5287 } 5288 5289 template <class D, HWY_IF_T_SIZE_D(D, 8)> 5290 HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) { 5291 #if HWY_PPC_HAVE_10 5292 const Vec128<uint64_t> mask_vec{vec_gendm(mask_bits)}; 5293 5294 #if HWY_IS_LITTLE_ENDIAN 5295 return MFromD<D>{MaskFromVec(mask_vec).raw}; 5296 #else 5297 return MFromD<D>{MaskFromVec(Reverse(Full128<uint64_t>(), mask_vec)).raw}; 5298 #endif // HWY_IS_LITTLE_ENDIAN 5299 5300 #else // PPC9 or earlier 5301 const __vector unsigned long long kBit = {1, 2}; 5302 const auto vmask_bits = 5303 Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits)); 5304 return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw}; 5305 #endif // HWY_PPC_HAVE_10 5306 } 5307 5308 } // namespace detail 5309 5310 // `p` points to at least 8 readable bytes, not all of which need be valid. 5311 template <class D, HWY_IF_LANES_LE_D(D, 8)> 5312 HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { 5313 // If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t 5314 uint64_t mask_bits = bits[0]; 5315 5316 constexpr size_t kN = MaxLanes(d); 5317 if (kN < 8) mask_bits &= (1u << kN) - 1; 5318 5319 return detail::LoadMaskBits128(d, mask_bits); 5320 } 5321 5322 template <class D, HWY_IF_LANES_D(D, 16)> 5323 HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { 5324 // First, copy the mask bits to a uint16_t as there as there are at most 5325 // 16 lanes in a vector. 5326 5327 // Copying the mask bits to a uint16_t first will also ensure that the 5328 // mask bits are loaded into the lower 16 bits on big-endian PPC targets. 
  uint16_t u16_mask_bits;
  CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits);

#if HWY_IS_LITTLE_ENDIAN
  return detail::LoadMaskBits128(d, u16_mask_bits);
#else
  // On big-endian targets, u16_mask_bits needs to be byte swapped as bits
  // contains the mask bits in little-endian byte order

  // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a
  // single lhbrx instruction on big-endian PPC targets when optimizations
  // are enabled.
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits));
#else
  // Portable byte swap of the 16-bit value.
  return detail::LoadMaskBits128(
      d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8)));
#endif
#endif
}

// Compress retains the relative order of selected lanes ("is a partition")
// for all lane sizes except 8-bit.
template <typename T>
struct CompressIsPartition {
  // generic_ops-inl does not guarantee IsPartition for 8-bit.
  enum { value = (sizeof(T) != 1) };
};

// ------------------------------ Dup128MaskFromMaskBits

// Expands the low MaxLanes(d) bits of mask_bits into a mask (one 128-bit
// block here, so no replication is needed).
template <class D>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  if (kN < 8) mask_bits &= (1u << kN) - 1;
  return detail::LoadMaskBits128(d, mask_bits);
}

// ------------------------------ StoreMaskBits

namespace detail {

// Returns the lowest N of the mask bits: for partial vectors, clears the
// bits at and above MaxLanes(d); full vectors pass through unchanged.
template <class D>
constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
  return (d.MaxBytes() == 16) ? mask_bits
                              : mask_bits & ((1ull << d.MaxLanes()) - 1);
}

#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
// fallback for missing vec_extractm: gathers the bits whose (bit) indices are
// given in bit_shuffle, via vec_vbpermq (or vec_bperm_u128 on Z14).
template <size_t N>
HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
                                    __vector unsigned char bit_shuffle) {
  // clang POWER8 and 9 targets appear to differ in their return type of
  // vec_vbpermq: unsigned or signed, so cast to avoid a warning.
  using VU64 = detail::Raw128<uint64_t>::type;
#if HWY_S390X_HAVE_Z14
  const Vec128<uint64_t> extracted{
      reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
#else
  const Vec128<uint64_t> extracted{
      reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
#endif
  // The gathered bits land in one 64-bit element; which one depends on
  // endianness.
  return extracted.raw[HWY_IS_LITTLE_ENDIAN];
}

#endif  // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN

}  // namespace detail

// Returns a bitfield with bit i set iff lane i of the mask is true.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));

#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return detail::OnlyActive(d,
                            static_cast<uint64_t>(vec_extractm(sign_bits.raw)));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  // Bit index of each lane's MSB (sign bit), in output-bit order.
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
                                              56, 48, 40, 32, 24, 16, 8, 0};
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif  // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
}

// As above, for 16-bit lanes (one output bit per u16 lane).
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;

  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));

#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return detail::OnlyActive(
      d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  (void)du;  // du is only needed on the PPC10 little-endian path
#if HWY_IS_LITTLE_ENDIAN
  // 128 = out-of-range index; presumably yields a zero bit (vbpermq) - the
  // unused upper output bits are masked by OnlyActive anyway.
  const __vector unsigned char kBitShuffle = {
      112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {
      128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
#endif
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif  // HWY_PPC_HAVE_10
}

// As above, for 32-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;

  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));

#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return detail::OnlyActive(
      d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  (void)du;
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              96, 64, 32, 0};
#endif
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif  // HWY_PPC_HAVE_10
}

// As above, for 64-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;

  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));

#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return detail::OnlyActive(
      d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  (void)du;
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 64, 0};
#endif
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif  // HWY_PPC_HAVE_10
}

// `p` points to at least 8 writable bytes.
template <class D, HWY_IF_LANES_LE_D(D, 8)>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask
  // to an uint8_t and store the result in bits[0].
  bits[0] = static_cast<uint8_t>(BitsFromMask(d, mask));
  return sizeof(uint8_t);
}

template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  const auto mask_bits = BitsFromMask(d, mask);

  // First convert mask_bits to a uint16_t as we only want to store
  // the lower 16 bits of mask_bits as there are 16 lanes in mask.

  // Converting mask_bits to a uint16_t first will also ensure that
  // the lower 16 bits of mask_bits are stored instead of the upper 16 bits
  // of mask_bits on big-endian PPC targets.
5511 5512 // GCC will also optimize the byte swap and CopyBytes operations below 5513 // to a single sthbrx instruction when optimizations are enabled on 5514 // big-endian PPC targets 5515 #if HWY_HAS_BUILTIN(__builtin_bswap16) 5516 const uint16_t u16_mask_bits = 5517 __builtin_bswap16(static_cast<uint16_t>(mask_bits)); 5518 #else 5519 const uint16_t u16_mask_bits = static_cast<uint16_t>( 5520 (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8)); 5521 #endif 5522 #endif 5523 5524 CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits); 5525 return sizeof(uint16_t); 5526 } 5527 5528 // ------------------------------ Mask testing 5529 5530 template <class D, HWY_IF_V_SIZE_D(D, 16)> 5531 HWY_API bool AllFalse(D d, MFromD<D> mask) { 5532 const RebindToUnsigned<decltype(d)> du; 5533 return static_cast<bool>( 5534 vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw)); 5535 } 5536 5537 template <class D, HWY_IF_V_SIZE_D(D, 16)> 5538 HWY_API bool AllTrue(D d, MFromD<D> mask) { 5539 const RebindToUnsigned<decltype(d)> du; 5540 using TU = TFromD<decltype(du)>; 5541 return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, 5542 Set(du, hwy::LimitsMax<TU>()).raw)); 5543 } 5544 5545 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 5546 HWY_API bool AllFalse(D d, MFromD<D> mask) { 5547 const Full128<TFromD<D>> d_full; 5548 constexpr size_t kN = MaxLanes(d); 5549 return AllFalse(d_full, 5550 And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN))); 5551 } 5552 5553 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 5554 HWY_API bool AllTrue(D d, MFromD<D> mask) { 5555 const Full128<TFromD<D>> d_full; 5556 constexpr size_t kN = MaxLanes(d); 5557 return AllTrue( 5558 d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN)))); 5559 } 5560 5561 template <class D> 5562 HWY_API size_t CountTrue(D d, MFromD<D> mask) { 5563 return PopCount(BitsFromMask(d, mask)); 5564 } 5565 5566 #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) 
namespace detail {

// Number of leading bytes (from lane 0) whose least-significant bit is
// clear; 16 if no byte has its LSB set. Used below on mask vectors, where
// every byte of a true lane is 0xFF.
template <class V>
static HWY_INLINE size_t VsxCntlzLsbb(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  // Use inline assembly to work around bug in GCC 11 and earlier on
  // little-endian PPC9
  // NOTE(review): the trailing-zero mnemonic (vctzlsbb) in a "count leading"
  // helper looks intentional - presumably those GCC versions do not correct
  // the intrinsic for little-endian lane order - confirm against the GCC bug.
  int idx;
  __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  return static_cast<size_t>(vec_cntlz_lsbb(v.raw));
#endif
}

// Number of trailing bytes (from the last lane) whose least-significant bit
// is clear; 16 if no byte has its LSB set.
template <class V>
static HWY_INLINE size_t VsxCnttzLsbb(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  // Use inline assembly to work around bug in GCC 11 and earlier on
  // little-endian PPC9
  // NOTE(review): vclzlsbb here is the counterpart of the swapped mnemonic
  // in VsxCntlzLsbb above; see the note there.
  int idx;
  __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  return static_cast<size_t>(vec_cnttz_lsbb(v.raw));
#endif
}

}  // namespace detail
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)

// Returns the index of the first true lane; mask must not be all-false.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    // Every byte of a true lane is 0xFF, so the byte count divides evenly
    // into a lane index.
    return detail::VsxCntlzLsbb(bytes) / sizeof(T);
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask));
}

// Returns the index of the first true lane, or -1 if none is true.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T);
    // idx == kN means no true lane was found.
    return idx == kN ? -1 : static_cast<intptr_t>(idx);
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  const uint64_t mask_bits = BitsFromMask(d, mask);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
}

// Returns the index of the last true lane; mask must not be all-false.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    // Count from the end, then convert to a forward lane index.
    const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
    return 16 / sizeof(T) - 1 - idx;
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  return 63 - Num0BitsAboveMS1Bit_Nonzero64(BitsFromMask(d, mask));
}

// Returns the index of the last true lane, or -1 if none is true.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
    return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  const uint64_t mask_bits = BitsFromMask(d, mask);
  return mask_bits ?
intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
                   : -1;
}

// ------------------------------ Compress, CompressBits

namespace detail {

#if HWY_PPC_HAVE_10
// Returns the PPC10 permute-control vector for compressing (kIsCompress) or
// expanding 8-bit lanes according to the mask, via xxgenpcvbm.
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  // Immediate mode operand: bit 0 set for compress, bit 1 set for
  // little-endian lane order.
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);

  // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
  // compiler bugs on little-endian PPC10
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvbm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
// As above for 16-bit lanes, via xxgenpcvhm.
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);

  // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
  // compiler bugs on little-endian PPC10
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvhm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
// As above for 32-bit lanes, via xxgenpcvwm.
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);

  // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
  // compiler bugs on little-endian PPC10
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvwm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
#endif

// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Rebind<uint8_t, decltype(d)> d8;
  const Twice<decltype(d8)> d8t;
  const RebindToUnsigned<decltype(d)> du;

  // To reduce cache footprint, store lane indices and convert to byte indices
  // (2*lane + 0..1), with the doubling baked into the table. It's not clear
  // that the additional cost of unpacking nibbles is worthwhile.
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintCompress16x8Tables
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
      2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
      4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
      2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
      6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, //
      2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, //
      4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, //
      2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
      8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, //
      2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, //
      4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, //
      2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, //
      6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, //
      2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, //
      4, 6, 8, 0,
2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 5734 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5735 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 5736 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 5737 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 5738 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 5739 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 5740 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 5741 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 5742 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 5743 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 5744 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 5745 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 5746 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 5747 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 5748 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 5749 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 5750 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5751 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 5752 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 5753 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 5754 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 5755 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 5756 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 5757 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 5758 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 5759 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 5760 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 5761 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 5762 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 5763 
6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 5764 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 5765 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 5766 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 5767 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 5768 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 5769 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 5770 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 5771 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 5772 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 5773 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 5774 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 5775 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 5776 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 5777 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 5778 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 5779 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 5780 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 5781 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 5782 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5783 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 5784 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 5785 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 5786 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 5787 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 5788 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 5789 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 5790 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 5791 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 5792 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 
12, // 5793 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 5794 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 5795 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 5796 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 5797 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 5798 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 5799 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 5800 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 5801 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 5802 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 5803 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 5804 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 5805 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 5806 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 5807 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 5808 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 5809 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 5810 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 5811 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 5812 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 5813 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 5814 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 5815 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 5816 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 5817 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 5818 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 5819 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 5820 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 5821 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 5822 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 
12, 14, 8, 10, // 5823 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 5824 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 5825 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 5826 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 5827 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 5828 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 5829 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 5830 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 5831 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 5832 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 5833 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 5834 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 5835 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 5836 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 5837 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 5838 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 5839 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 5840 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 5841 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 5842 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 5843 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 5844 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 5845 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 5846 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 5847 5848 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; 5849 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); 5850 constexpr uint16_t kPairIndexIncrement = 5851 HWY_IS_LITTLE_ENDIAN ? 
0x0100 : 0x0001; 5852 5853 return BitCast(d, pairs + Set(du, kPairIndexIncrement)); 5854 } 5855 5856 template <class D, HWY_IF_T_SIZE_D(D, 2)> 5857 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { 5858 HWY_DASSERT(mask_bits < 256); 5859 const Rebind<uint8_t, decltype(d)> d8; 5860 const Twice<decltype(d8)> d8t; 5861 const RebindToUnsigned<decltype(d)> du; 5862 5863 // To reduce cache footprint, store lane indices and convert to byte indices 5864 // (2*lane + 0..1), with the doubling baked into the table. It's not clear 5865 // that the additional cost of unpacking nibbles is worthwhile. 5866 alignas(16) static constexpr uint8_t table[2048] = { 5867 // PrintCompressNot16x8Tables 5868 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 5869 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 5870 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 5871 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 5872 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 5873 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 5874 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 5875 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 5876 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 5877 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 5878 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 5879 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 5880 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 5881 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 5882 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 5883 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 5884 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 5885 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 5886 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 5887 0, 6, 8, 12, 
14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 5888 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 5889 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 5890 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 5891 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 5892 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 5893 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 5894 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 5895 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 5896 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 5897 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 5898 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 5899 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 5900 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 5901 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 5902 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 5903 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 5904 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 5905 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 5906 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 5907 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 5908 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 5909 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 5910 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 5911 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 5912 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 5913 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 5914 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 5915 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 5916 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 5917 0, 
4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 5918 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 5919 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 5920 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 5921 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 5922 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 5923 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 5924 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 5925 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 5926 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 5927 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 5928 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 5929 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 5930 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 5931 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 5932 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 5933 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 5934 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 5935 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 5936 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 5937 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 5938 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 5939 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 5940 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 5941 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 5942 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 5943 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 5944 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 5945 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 5946 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, 
// 5947 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 5948 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 5949 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 5950 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 5951 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 5952 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 5953 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 5954 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 5955 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 5956 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 5957 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 5958 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 5959 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 5960 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 5961 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 5962 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 5963 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 5964 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 5965 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 5966 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 5967 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 5968 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 5969 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 5970 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 5971 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 5972 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 5973 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 5974 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 5975 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 5976 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 
8, 12, 14, // 5977 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 5978 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 5979 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 5980 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 5981 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 5982 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 5983 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 5984 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 5985 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 5986 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 5987 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 5988 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 5989 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 5990 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 5991 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 5992 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 5993 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 5994 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 5995 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 5996 5997 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; 5998 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); 5999 constexpr uint16_t kPairIndexIncrement = 6000 HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001; 6001 6002 return BitCast(d, pairs + Set(du, kPairIndexIncrement)); 6003 } 6004 6005 template <class D, HWY_IF_T_SIZE_D(D, 4)> 6006 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { 6007 HWY_DASSERT(mask_bits < 16); 6008 6009 // There are only 4 lanes, so we can afford to load the index vector directly. 
  // Row i of this 16x16 table is the TableLookupBytes shuffle that moves the
  // 4-byte lanes whose bit is set in mask i to the front, in ascending order;
  // the remaining lanes follow.
  alignas(16) static constexpr uint8_t u8_indices[256] = {
      // PrintCompress32x4Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,  //
      0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,  //
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  //
      0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,  //
      4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,  //
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,  //
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,  //
      0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,  //
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

// As IndicesFromBits128, but for the inverted mask: lanes whose bit is CLEAR
// in mask_bits are moved to the front (used by CompressNotBits below).
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);

  // There are only 4 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[256] = {
      // PrintCompressNot32x4Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
      14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
      12, 13, 14, 15};

  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

// 64-bit lanes: shuffle indices that move the (up to 2) lanes whose mask bit
// is set to the front.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      // PrintCompress64x2Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

// 64-bit lanes, inverted mask: lanes whose bit is CLEAR come first.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      // PrintCompressNot64x2Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

// Moves the lanes of v selected by set bits of mask_bits (bit i = lane i) to
// the front, via a byte shuffle with the table-derived indices above.
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  HWY_DASSERT(mask_bits < (1ull << N));
  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
}

// As CompressBits, but moves the lanes whose bit is CLEAR to the front.
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  HWY_DASSERT(mask_bits < (1ull << N));
  const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
}

}  // namespace detail

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

// Two lanes: conditional swap
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
  const Full128<T> d;
  const Vec128<T> m = VecFromMask(d, mask);
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  // swap is all-ones exactly when mask[0]=0 and mask[1]=1.
  const Vec128<T> swap = AndNot(maskL, maskH);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

#if HWY_PPC_HAVE_10
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

// General case, 1 byte
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  return TableLookupBytes(
      v, detail::CompressOrExpandIndicesFromMask<true>(d, mask));
}
#endif

// General case, 2 or 4 bytes
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  return detail::CompressBits(v, BitsFromMask(d, mask));
}

// ------------------------------ CompressNot

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

// Two lanes: conditional swap
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
  const Full128<T> d;
  const Vec128<T> m = VecFromMask(d, mask);
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  // Inverse of the Compress case above: swap when mask[0]=1 and mask[1]=0.
  const Vec128<T> swap = AndNot(maskH, maskL);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

#if HWY_PPC_HAVE_10
// General case, 1 byte
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  return TableLookupBytes(
      v, detail::CompressOrExpandIndicesFromMask<true>(d, Not(mask)));
}
#endif

// General case, 2 or 4 bytes
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  // For partial vectors, we cannot pull the Not() into the table because
  // BitsFromMask clears the upper bits.
  if (N < 16 / sizeof(T)) {
    return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
  }
  return detail::CompressNotBits(v, BitsFromMask(d, mask));
}

// ------------------------------ CompressBlocksNot
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /* m */) {
  return v;
}

#if HWY_PPC_HAVE_10
// 1-byte lanes: expand the bit array into a mask, then Compress.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  const DFromV<decltype(v)> d;
  return Compress(v, LoadMaskBits(d, bits));
}
#endif

template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
  // convert bits[0] to a uint64_t
  uint64_t mask_bits = bits[0];
  if (N < 8) {
    // Ignore the bits beyond the last lane.
    mask_bits &= (1ull << N) - 1;
  }

  return detail::CompressBits(v, mask_bits);
}

// ------------------------------ CompressStore, CompressBitsStore

#if HWY_PPC_HAVE_10
// Stores all lanes (compressed lanes first), returns CountTrue(m).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  const size_t count = CountTrue(d, m);
  const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
  const auto compressed = TableLookupBytes(v, indices);
  StoreU(compressed, d, unaligned);
  return count;
}
#endif

template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;

  const uint64_t mask_bits = BitsFromMask(d, m);
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  const size_t count = PopCount(mask_bits);

  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  StoreU(compressed, d, unaligned);
  return count;
}

#if HWY_PPC_HAVE_10
// As CompressStore, but only the first `count` lanes are written (StoreN).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const size_t count = CountTrue(d, m);
  const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
  const auto compressed = TableLookupBytes(v, indices);
  StoreN(compressed, d, unaligned, count);
  return count;
}
#endif

template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;

  const uint64_t mask_bits = BitsFromMask(d, m);
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  const size_t count = PopCount(mask_bits);

  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
  StoreN(compressed, d, unaligned, count);
#else
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
#endif
  return count;
}

#if HWY_PPC_HAVE_10
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}
#endif

template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;

  // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
  // convert bits[0] to a uint64_t
  uint64_t mask_bits = bits[0];
  constexpr size_t kN = MaxLanes(d);
  if (kN < 8) {
    mask_bits &= (1ull << kN) - 1;
  }
  const size_t count = PopCount(mask_bits);

  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  StoreU(compressed, d, unaligned);

  return count;
}

// ------------------------------ Expand
#if HWY_PPC_HAVE_10
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif

template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const auto idx = detail::CompressOrExpandIndicesFromMask<false>(d, mask);
  // Zero the mask=false lanes left over from the shuffle.
  return IfThenElseZero(mask, TableLookupBytes(v, idx));
}
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  // Same as Compress, just zero out the mask=false lanes.
  return IfThenElseZero(mask, Compress(v, mask));
}

// For single-element vectors, this is at least as fast as native.
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  return IfThenElseZero(mask, v);
}

// Unaligned load followed by Expand.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}
#endif  // HWY_PPC_HAVE_10

// ------------------------------ StoreInterleaved2/3/4

// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
// generic_ops-inl.h.

// ------------------------------ Additional mask logical operations
namespace detail {

#if HWY_IS_LITTLE_ENDIAN
// No-op on little-endian targets.
template <class V>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  return v;
}
// No-op on little-endian targets.
template <class V>
HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
  return v;
}
#else
// Big-endian: reverse the lanes within each 64-bit block (8 x 1-byte lanes).
template <class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse8(d, v);
}
// Big-endian: reverse the lanes within each 64-bit block (4 x 2-byte lanes).
template <class V, HWY_IF_T_SIZE_V(V, 2)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse4(d, v);
}
// Big-endian: reverse the lanes within each 64-bit block (2 x 4-byte lanes).
template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);
}
// 64-bit lanes: a 64-bit block is a single lane, nothing to reverse.
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  return v;
}
// Big-endian: reverse all lanes of the 128-bit vector.
template <class V>
HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse(d, v);
}
#endif

// Returns a - b where the two vectors are treated as 128-bit integers.
// Uses vec_sub_u128 / vec_sub on __int128 vectors where available, otherwise
// falls back to 64-bit subtraction with manual borrow propagation.
template <class V>
HWY_INLINE V I128Subtract(V a, V b) {
#if HWY_S390X_HAVE_Z14
#if HWY_COMPILER_CLANG
  // Workaround for bug in vec_sub_u128 in Clang vecintrin.h
  typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      reinterpret_cast<VU128>(a.raw) - reinterpret_cast<VU128>(b.raw))};
#else  // !HWY_COMPILER_CLANG
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
                   reinterpret_cast<__vector unsigned char>(b.raw)))};
#endif  // HWY_COMPILER_CLANG
#elif defined(__SIZEOF_INT128__)
  using VU128 = __vector unsigned __int128;
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
#else
  const DFromV<decltype(a)> d;
  const Repartition<uint64_t, decltype(d)> du64;

  const auto u64_a = BitCast(du64, a);
  const auto u64_b = BitCast(du64, b);

  const auto diff_u64 = u64_a - u64_b;
  // All-ones in a 64-bit half that borrowed from the upper half.
  const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b);

  // Move the low half's borrow into the upper half's position.
#if HWY_IS_LITTLE_ENDIAN
  const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64);
#else
  const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64);
#endif

  // Adding all-ones (-1) to the upper half applies the borrow.
  const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted);
#endif

  return diff_i128;
}

}  // namespace detail

// SetAtOrAfterFirst: true at and after the first true lane of `mask`
// (all-false if mask is all-false).

// Single lane: no-op.
template <class T>
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  return mask;
}
template <class T>
HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const auto vmask = VecFromMask(d, mask);
  // InterleaveLower duplicates lane 0 into lane 1; OR propagates it upward.
  return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
}
template <class T, size_t N, HWY_IF_LANES_GT(N, 2),
          HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Full64<T> d_full64;

  // View the mask as a little-endian 64-bit integer; negating it sets all
  // bits at and above the lowest set bit, i.e. at and after the first lane.
  const auto vmask = VecFromMask(d, mask);
  const auto vmask_le64 =
      BitCast(Full64<int64_t>(),
              detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
  const auto neg_vmask_le64 = Neg(vmask_le64);
  const auto neg_vmask = ResizeBitCast(
      d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));

  return MaskFromVec(Or(vmask, neg_vmask));
}
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
  const Full128<T> d;
  auto vmask = VecFromMask(d, mask);

  // Same idea as above, but the full vector requires 128-bit negation.
  const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
  const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
  const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);

  return MaskFromVec(BitCast(d, Or(vmask, neg_vmask)));
}

// SetBeforeFirst: true strictly before the first true lane.
template <class T, size_t N>
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  return Not(SetAtOrAfterFirst(mask));
}

// SetOnlyFirst: true only at the first true lane, if any.
template <class T>
HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
  return mask;
}
template <class T>
HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = BitCast(di, VecFromMask(d, mask));
  const auto zero = Zero(di);
  // vmask2[1] is true iff lane 0 of the mask was false.
  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
}
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Full64<T> d_full64;
  const RebindToSigned<decltype(d)> di;

  // x & -x isolates the lowest set bit of the little-endian mask integer.
  const auto vmask = VecFromMask(d, mask);
  const auto vmask_le64 =
      BitCast(Full64<int64_t>(),
              detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
  const auto neg_vmask_le64 = Neg(vmask_le64);
  const auto neg_vmask = ResizeBitCast(
      d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));

  const auto first_vmask = BitCast(di, And(vmask, neg_vmask));
  return MaskFromVec(BitCast(d, Or(first_vmask, Neg(first_vmask))));
}
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
  const Full128<T> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = VecFromMask(d, mask);
  const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
  const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
  const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);

  return MaskFromVec(BitCast(d, Neg(BitCast(di, And(vmask, neg_vmask)))));
}

// SetAtOrBeforeFirst: true at and before the first true lane.
template <class T>
HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  const FixedTag<T, 1> d;
  const RebindToSigned<decltype(d)> di;
  using TI = MakeSigned<T>;

  // With one lane, the result is always all-true.
  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
}
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  // Shifting the mask up by one lane turns "before first" into "at or before".
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
}

// ------------------------------ SumsOf2 and SumsOf4
namespace detail {

#if !HWY_S390X_HAVE_Z14
// Casts nominally int32_t result to D.
// Wrapper for vec_vsum4sbs (sum each group of 4 signed bytes of `a` plus the
// corresponding i32 lane of `b`, with signed saturation).
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  // If both operands are compile-time constants, evaluate the saturating sums
  // at compile time so the result can be constant-propagated.
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Accumulate in int64_t so the pre-saturation sum cannot overflow.
    const int64_t sum0 =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[0]);
    const int64_t sum1 =
        static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
        static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
        static_cast<int64_t>(b[1]);
    const int64_t sum2 =
        static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
        static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
        static_cast<int64_t>(b[2]);
    const int64_t sum3 =
        static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
        static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
        static_cast<int64_t>(b[3]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
    using Raw = typename detail::Raw128<int32_t>::type;
    // (sign == (sum >> 31)) detects that sum fits in int32_t; otherwise
    // saturate to INT32_MAX (sign ^ 0x7FFFFFFF yields INT32_MIN/MAX).
    return BitCast(
        d,
        VFromD<decltype(di32)>{Raw{
            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
            (sign3 == (sum3 >> 31))
                ? static_cast<int32_t>(sum3)
                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
  }
}

// Casts nominally uint32_t result to D.
// Wrapper for vec_vsum4ubs (sum each group of 4 unsigned bytes of `a` plus the
// corresponding u32 lane of `b`, with unsigned saturation).
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
                                     __vector unsigned int b) {
  const Repartition<uint32_t, D> du32;
#ifdef __OPTIMIZE__
  // If both operands are compile-time constants, evaluate the saturating sums
  // at compile time so the result can be constant-propagated.
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Accumulate in uint64_t so the pre-saturation sum cannot overflow.
    const uint64_t sum0 =
        static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
        static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
        static_cast<uint64_t>(b[0]);
    const uint64_t sum1 =
        static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
        static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
        static_cast<uint64_t>(b[1]);
    const uint64_t sum2 =
        static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
        static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
        static_cast<uint64_t>(b[2]);
    const uint64_t sum3 =
        static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
        static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
        static_cast<uint64_t>(b[3]);
    // Saturate each lane to UINT32_MAX.
    return BitCast(
        d,
        VFromD<decltype(du32)>{(__vector unsigned int){
            static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
                                                          : 0xFFFFFFFFu)}});
  } else  // NOLINT
#endif
  {
    return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
  }
}

// Casts nominally int32_t result to D.
// Wrapper for the vsum2sws instruction (sum each pair of i32 lanes of `a`
// plus one lane of `b`, with signed saturation).
template <class D>
HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  const Repartition<uint64_t, D> du64;
  // Lane of b that vsum2sws adds into each result; differs by endianness.
  constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
  // If the operands are compile-time constants, evaluate the saturating sums
  // at compile time so the result can be constant-propagated.
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
      __builtin_constant_p(b[kDestLaneOffset + 2])) {
    // Accumulate in int64_t so the pre-saturation sum cannot overflow.
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
                         static_cast<int64_t>(a[1]) +
                         static_cast<int64_t>(b[kDestLaneOffset]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
                         static_cast<int64_t>(a[3]) +
                         static_cast<int64_t>(b[kDestLaneOffset + 2]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    // (sign == (sum >> 31)) detects that sum fits in int32_t; otherwise
    // saturate (sign ^ 0x7FFFFFFF yields INT32_MIN/MAX).
    return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
                          (sign0 == (sum0 >> 31))
                              ? static_cast<uint32_t>(sum0)
                              : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
                          (sign1 == (sum1 >> 31))
                              ? static_cast<uint32_t>(sum1)
                              : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    __vector signed int sum;

    // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
    // on little-endian PowerPC targets as the result of the vsum2sws
    // instruction will already be in the correct lanes on little-endian
    // PowerPC targets.
    __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));

    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}

// Casts nominally int32_t result to D.
// Wrapper for vec_vsum4shs (sum each pair of i16 lanes of `a` plus the
// corresponding i32 lane of `b`, with signed saturation).
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4shs(D d, __vector signed short a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  // If both operands are compile-time constants, evaluate the saturating sums
  // at compile time so the result can be constant-propagated.
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Accumulate in int64_t so the pre-saturation sum cannot overflow.
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
                         static_cast<int64_t>(a[1]) +
                         static_cast<int64_t>(b[0]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
                         static_cast<int64_t>(a[3]) +
                         static_cast<int64_t>(b[1]);
    const int64_t sum2 = static_cast<int64_t>(a[4]) +
                         static_cast<int64_t>(a[5]) +
                         static_cast<int64_t>(b[2]);
    const int64_t sum3 = static_cast<int64_t>(a[6]) +
                         static_cast<int64_t>(a[7]) +
                         static_cast<int64_t>(b[3]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
    using Raw = typename detail::Raw128<int32_t>::type;
    // (sign == (sum >> 31)) detects that sum fits in int32_t; otherwise
    // saturate (sign ^ 0x7FFFFFFF yields INT32_MIN/MAX).
    return BitCast(
        d,
        VFromD<decltype(di32)>{Raw{
            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
            (sign3 == (sum3 >> 31))
                ? static_cast<int32_t>(sum3)
                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4shs(a, b)});
  }
}

// Casts nominally int32_t result to D.
// Wrapper for the vsumsws instruction (saturating sum of all four i32 lanes
// of `a` plus one lane of `b`; result lane position depends on endianness).
template <class D>
HWY_INLINE VFromD<D> AltivecVsumsws(D d, __vector signed int a,
                                    __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  // Lane of b added into the sum, and lane of the result holding it.
  constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  // If the operands are compile-time constants, evaluate the saturating sum
  // at compile time so the result can be constant-propagated.
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) {
    // Accumulate in int64_t so the pre-saturation sum cannot overflow.
    const int64_t sum =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[kDestLaneOffset]);
    const int32_t sign = static_cast<int32_t>(sum >> 63);
#if HWY_IS_LITTLE_ENDIAN
    return BitCast(
        d, VFromD<decltype(di32)>{(__vector signed int){
               (sign == (sum >> 31)) ? static_cast<int32_t>(sum)
                                     : static_cast<int32_t>(sign ^ 0x7FFFFFFF),
               0, 0, 0}});
#else
    return BitCast(d, VFromD<decltype(di32)>{(__vector signed int){
                          0, 0, 0,
                          (sign == (sum >> 31))
                              ? static_cast<int32_t>(sum)
                              : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}});
#endif
  } else  // NOLINT
#endif
  {
    __vector signed int sum;

    // Inline assembly is used for vsumsws to avoid unnecessary shuffling
    // on little-endian PowerPC targets as the result of the vsumsws
    // instruction will already be in the correct lanes on little-endian
    // PowerPC targets.
    __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));

    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}

// Sums adjacent pairs of u16 lanes into i32 lanes. Biases the inputs by
// -32768 (via Xor with the sign bit) and adds back 2*32768 = 65536 so that
// the signed vsum4shs cannot saturate.
template <size_t N>
HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di16;
  const RepartitionToWide<decltype(di16)> di32;
  return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
                         Set(di32, 65536).raw);
}
#endif  // !HWY_S390X_HAVE_Z14

// U16->U32 SumsOf2
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;

#if HWY_S390X_HAVE_Z14
  return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
#else
  return BitCast(dw, AltivecU16SumsOf2(v));
#endif
}

// I16->I32 SumsOf2
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;

#if HWY_S390X_HAVE_Z14
  // Bias to unsigned, sum, then undo the 2*32768 bias per pair.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
                             BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw, int32_t{-65536});
#else
  return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
#endif
}

#if HWY_S390X_HAVE_Z14
// U32->U64 SumsOf2
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
}

// I32->I64 SumsOf2
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;

  // Bias to unsigned, sum, then undo the 2*2^31 bias per pair.
  return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
                             BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw, int64_t{-4294967296LL});
}
#endif

// U8->U32 SumsOf4
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWideX2<decltype(d)> dw2;

#if HWY_S390X_HAVE_Z14
  return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
#else
  return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
#endif
}

// I8->I32 SumsOf4
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWideX2<decltype(d)> dw2;

#if HWY_S390X_HAVE_Z14
  // Bias to unsigned, sum, then undo the 4*128 = 512 bias per group.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
                              BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw2, int32_t{-512});
#else
  return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
#endif
}

// U16->U64 SumsOf4
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  const RepartitionToWide<decltype(dw)> dw2;

#if HWY_S390X_HAVE_Z14
  return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
#else
  // First sum pairs to i32, then sum pairs of i32 to i64.
  const RebindToSigned<decltype(dw)> dw_i;
  return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
#endif
}

// I16->I64 SumsOf4
template
<class V> 6860 HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4( 6861 hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) { 6862 const DFromV<V> d; 6863 const RepartitionToWide<decltype(d)> dw; 6864 const RepartitionToWide<decltype(dw)> dw2; 6865 6866 #if HWY_S390X_HAVE_Z14 6867 const RebindToUnsigned<decltype(d)> du; 6868 return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), 6869 BitCast(du, Xor(v, SignBit(d))))) + 6870 Set(dw2, int64_t{-131072}); 6871 #else // VSX 6872 const auto sums_of_4_in_lo32 = 6873 AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw); 6874 6875 #if HWY_IS_LITTLE_ENDIAN 6876 return PromoteEvenTo(dw2, sums_of_4_in_lo32); 6877 #else 6878 return PromoteOddTo(dw2, sums_of_4_in_lo32); 6879 #endif // HWY_IS_LITTLE_ENDIAN 6880 #endif // HWY_S390X_HAVE_Z14 6881 } 6882 6883 } // namespace detail 6884 6885 // ------------------------------ SumOfLanes 6886 6887 // We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16); 6888 // enable generic for the rest. 
#undef HWY_IF_SUM_OF_LANES_D
#if HWY_S390X_HAVE_Z14
#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
#else
#define HWY_IF_SUM_OF_LANES_D(D) \
  HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
#endif

#if HWY_S390X_HAVE_Z14
namespace detail {

#if HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_s390_vsumqf) && \
    HWY_HAS_BUILTIN(__builtin_s390_vsumqg)
// Workaround for bug in vec_sum_u128 in Clang vecintrin.h
// Sums all u32 lanes into a single u128, returned bitcast to lanes of T.
template <class T, HWY_IF_UI32(T)>
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
  typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VU128 sum = {__builtin_s390_vsumqf(BitCast(du, v).raw, Zero(du).raw)};
  return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
}
// Sums both u64 lanes into a single u128, returned bitcast to lanes of T.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
  typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VU128 sum = {__builtin_s390_vsumqg(BitCast(du, v).raw, Zero(du).raw)};
  return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
}
#else
// Sums all u32 or u64 lanes into a single u128 via vec_sum_u128.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
}
#endif

}  // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
  // On big-endian Z, u64 lane 1 holds the low 64 bits of the u128 sum.
  return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
}

#endif

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
  // Index of the u16 lane holding the low bits of the widened i32 sum.
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
  // Index of the u16 lane holding the low bits of the widened i64 sum.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
  // Index of the u16 lane holding the low bits of the full sum.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
#if HWY_S390X_HAVE_Z14
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
                        hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
#else  // VSX
  // Pairwise-reduce to i32 (no saturation possible for u16 inputs), then
  // saturating-sum the four i32 lanes; the total of eight u16 fits in i32.
  const auto zero = Zero(Full128<int32_t>());
  return Broadcast<kSumLaneIdx>(
      detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  // Wrap-around sum is the same in unsigned arithmetic; reuse the u16 path.
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  return Broadcast<kSumLaneIdx>(
      BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  // Wrap-around sum is the same in unsigned arithmetic; reuse the u16 path.
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  // Wrap-around sum is the same in unsigned arithmetic; reuse the u16 path.
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  const Full128<int32_t> di32;
  const auto zero = Zero(di32);
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
  // Index of the u8 lane holding the low bits of the widened i32 sum.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v)));
}

template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
  // Zero-pad to 4 lanes and reuse the 4-lane reduction.
  const Twice<decltype(du8)> dt_u8;
  return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
  // Index of the u8 lane holding the low bits of the full sum.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;

#if HWY_S390X_HAVE_Z14
  return Broadcast<kSumLaneIdx>(
      BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
                       hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
#else
  // Quad-reduce u8 lanes to i32, then saturating-sum the four i32 lanes;
  // the total of 16 u8 (at most 4080) fits in i32.
  const Full128<uint32_t> du32;
  const RebindToSigned<decltype(du32)> di32;
  const Vec128<uint32_t> zero = Zero(du32);
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
      BitCast(di32, zero).raw));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  // Wrap-around sum is the same in unsigned arithmetic; reuse the u8 path.
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
  // Zero-pad to 4 lanes and reuse the 4-lane reduction.
  const Twice<decltype(di8)> dt_i8;
  return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
}

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  // Wrap-around sum is the same in unsigned arithmetic; reuse the u8 path.
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  // Wrap-around sum is the same in unsigned arithmetic; reuse the u8 path.
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
  const Full128<int32_t> di32;
  const Vec128<int32_t> zero = Zero(di32);
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
#endif
}

#if HWY_S390X_HAVE_Z14
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
  // SumsOf2 widens to a single i64; lane 1 of the 32-bit view holds its low
  // 32 bits on big-endian Z.
  const RebindToUnsigned<decltype(d32)> du32;
  return Broadcast<1>(
      BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
                                   BitCast(du32, v))));
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
  // On big-endian Z, u32 lane 3 holds the low 32 bits of the u128 sum.
  return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
}
#endif

// generic_ops defines MinOfLanes and MaxOfLanes.

// ------------------------------ ReduceSum for N=4 I8/U8

// GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
// I8/U8 ReduceSum implementation in generic_ops-inl.h
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
  return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
}

// ------------------------------ BitShuffle

#ifdef HWY_NATIVE_BITSHUFFLE
#undef HWY_NATIVE_BITSHUFFLE
#else
#define HWY_NATIVE_BITSHUFFLE
#endif

// For each u64 lane of v, gathers the 8 bits selected by the corresponding
// 8 u8 lanes of idx (each idx byte selects one of the 64 bits) into the low
// byte of the result lane. Implemented with the bit-permute instructions
// (vec_bperm / vbpermq), whose bit numbering is MSB-first — hence the Xor
// with 0x3F/0x7F masks below to convert from LSB-first bit indices.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
HWY_API V BitShuffle(V v, VI idx) {
  const DFromV<decltype(v)> d64;
  const RebindToUnsigned<decltype(d64)> du64;
  const Repartition<uint8_t, decltype(d64)> du8;

  const Full128<TFromD<decltype(du64)>> d_full_u64;
  const Full128<TFromD<decltype(du8)>> d_full_u8;

  using RawVU64 = __vector unsigned long long;

#if HWY_PPC_HAVE_9

#if HWY_IS_LITTLE_ENDIAN
  (void)d_full_u64;
  auto bit_idx = ResizeBitCast(d_full_u8, idx);
#else
  // Big-endian: reverse the bytes within each u64 so the per-byte indices
  // line up with the instruction's lane order.
  auto bit_idx =
      BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
#endif

  // Convert LSB-first bit indices to the MSB-first numbering of vec_bperm.
  bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));

  return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
                          vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
#else  // !HWY_PPC_HAVE_9

  // Pre-PPC9 / Z14: only a 128-bit bit-permute is available. 0x7F on one
  // half forces those selections out of range (producing 0 bits) so each
  // half only selects from its own u64; 0x3F converts to MSB-first indices.
#if HWY_IS_LITTLE_ENDIAN
  const auto bit_idx_xor_mask = BitCast(
      d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
                                     uint64_t{0x3F3F3F3F3F3F3F3Fu}));
  const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
  constexpr int kBitShufResultByteShrAmt = 8;
#else
  const auto bit_idx_xor_mask = BitCast(
      d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
                                     uint64_t{0x7F7F7F7F7F7F7F7Fu}));
  const auto bit_idx =
      Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
          bit_idx_xor_mask);
  constexpr int kBitShufResultByteShrAmt = 6;
#endif

#if HWY_S390X_HAVE_Z14
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
#elif defined(__SIZEOF_INT128__)
  using RawVU128 = __vector unsigned __int128;
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
#else
  using RawVU128 = __vector unsigned char;
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
#endif

  // Move the two gathered bytes to adjacent positions, then zero-extend each
  // into its own u64 lane.
  return ResizeBitCast(
      d64, PromoteTo(d_full_u64,
                     ResizeBitCast(
                         Rebind<uint8_t, decltype(d_full_u64)>(),
                         CombineShiftRightBytes<kBitShufResultByteShrAmt>(
                             d_full_u64, bit_shuf_result, bit_shuf_result))));
#endif  // HWY_PPC_HAVE_9
}

// ------------------------------ Lt128

namespace detail {

// Returns vector-mask for Lt128.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
#if HWY_IS_LITTLE_ENDIAN
  const VU128 a_u128 = reinterpret_cast<VU128>(a.raw);
  const VU128 b_u128 = reinterpret_cast<VU128>(b.raw);
#else
  // NOTE: Need to swap the halves of both a and b on big-endian targets
  // as the upper 64 bits of a and b are in lane 1 and the lower 64 bits
  // of a and b are in lane 0 whereas the vec_cmplt operation below expects
  // the upper 64 bits in lane 0 and the lower 64 bits in lane 1 on
  // big-endian PPC targets.
  const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8));
  const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8));
#endif
  return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))};
#else  // !HWY_PPC_HAVE_10
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL  | out = cH | (=H & cL)
  //  0  0  0  0  |  0
  //  0  0  0  1  |  0
  //  0  0  1  0  |  1
  //  0  0  1  1  |  1
  //  0  1  0  0  |  0
  //  0  1  0  1  |  0
  //  0  1  1  0  |  1
  //  1  0  0  0  |  0
  //  1  0  0  1  |  1
  //  1  1  0  0  |  0
  const auto eqHL = Eq(a, b);
  const V ltHL = VecFromMask(d, Lt(a, b));
  const V ltLX = ShiftLeftLanes<1>(ltHL);
  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
  return InterleaveUpper(d, vecHx, vecHx);
#endif
}

// Returns vector-mask for Eq128.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  // Full 128-bit compare; lane order does not matter for equality.
  return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  // Equal iff both halves are equal: AND each lane's result with its
  // neighbor's.
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
#endif
}

// Returns vector-mask for Ne128.
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  // Full 128-bit compare; lane order does not matter for inequality.
  return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  // Unequal iff either half differs: OR each lane's result with its
  // neighbor's.
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
#endif
}

// Returns vector-mask for Lt128Upper: compares only the upper u64 of each
// 128-bit pair and broadcasts that lane's result to both lanes.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128UpperVec(D d, V a, V b) {
  const V ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

// Returns vector-mask for Eq128Upper (upper u64 only, broadcast to both).
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128UpperVec(D d, V a, V b) {
  const V eqHL = VecFromMask(d, Eq(a, b));
  return InterleaveUpper(d, eqHL, eqHL);
}

// Returns vector-mask for Ne128Upper (upper u64 only, broadcast to both).
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128UpperVec(D d, V a, V b) {
  const V neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128(D d, V a, V b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128(D d, V a, V b) {
  return MaskFromVec(detail::Eq128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128(D d, V a, V b) {
  return MaskFromVec(detail::Ne128Vec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}

template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128Upper(D d, V a, V b) {
  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}

// ------------------------------ Min128, Max128 (Lt128)

// Avoids the extra MaskFromVec in Lt128.
template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

// Per-lane count of leading zero bits.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
  // constant
  __asm__("" : "+v"(v.raw));
#endif

  return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
#else
  return V{vec_cntlz(v.raw)};
#endif
}

// Per-lane index of the highest set bit; for a zero lane, the result is
// (bits-1) - bits = all-ones (sign-extended -1).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}

#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
// Per-lane count of trailing zero bits, using the native instruction.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  // Older GCC only provides the vec_vctz spelling.
  return V{vec_vctz(v.raw)};
#else
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
  // constant
  __asm__("" : "+v"(v.raw));
#endif

  return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
#else
  return V{vec_cnttz(v.raw)};
#endif  // HWY_S390X_HAVE_Z14
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}
#else
// Per-lane count of trailing zero bits without a native instruction:
// isolate the lowest set bit (x & -x) and take its index; a zero input
// yields a negative index (sign bit set), which selects the lane width.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;

  const auto vi = BitCast(di, v);
  const auto lowest_bit = And(vi, Neg(vi));
  constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
  const auto bit_idx = HighestSetBitIndex(lowest_bit);
  return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
                               Set(di, kNumOfBitsInT), bit_idx));
}
#endif
7413 7414 #undef HWY_PPC_HAVE_9 7415 #undef HWY_PPC_HAVE_10 7416 #undef HWY_S390X_HAVE_Z14 7417 #undef HWY_S390X_HAVE_Z15 7418 7419 // NOLINTNEXTLINE(google-readability-namespace-comments) 7420 } // namespace HWY_NAMESPACE 7421 } // namespace hwy 7422 HWY_AFTER_NAMESPACE();