x86_128-inl.h (525219B)
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL 17 // operations when compiling for those targets. 18 // External include guard in highway.h - see comment there. 19 20 // Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL 21 #include "hwy/base.h" 22 23 // Avoid uninitialized warnings in GCC's emmintrin.h - see 24 // https://github.com/google/highway/issues/710 and pull/902 25 HWY_DIAGNOSTICS(push) 26 #if HWY_COMPILER_GCC_ACTUAL 27 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") 28 HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, 29 ignored "-Wmaybe-uninitialized") 30 #endif 31 32 #include <emmintrin.h> 33 #include <stdio.h> 34 #if HWY_TARGET == HWY_SSSE3 35 #include <tmmintrin.h> // SSSE3 36 #elif HWY_TARGET <= HWY_SSE4 37 #include <smmintrin.h> // SSE4 38 #ifndef HWY_DISABLE_PCLMUL_AES 39 #include <wmmintrin.h> // CLMUL 40 #endif 41 #endif 42 43 #include "hwy/ops/shared-inl.h" 44 45 HWY_BEFORE_NAMESPACE(); 46 namespace hwy { 47 namespace HWY_NAMESPACE { 48 namespace detail { 49 50 // Enable generic functions for whichever of (f16, bf16) are not supported. 51 #if !HWY_HAVE_FLOAT16 52 #define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D) 53 #else 54 #define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D) 55 #endif 56 57 #undef HWY_AVX3_HAVE_F32_TO_BF16C 58 #if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \ 59 (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \ 60 HWY_AVX3_ENABLE_AVX512BF16 61 #define HWY_AVX3_HAVE_F32_TO_BF16C 1 62 #else 63 #define HWY_AVX3_HAVE_F32_TO_BF16C 0 64 #endif 65 66 #undef HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT 67 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64 68 #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "v" 69 #else 70 #define HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT "x" 71 #endif 72 73 #undef HWY_X86_HAVE_AVX10_2_OPS 74 #if HWY_TARGET_IS_AVX10_2 && \ 75 (HWY_COMPILER_GCC_ACTUAL >= 1501 || \ 76 (HWY_COMPILER3_CLANG >= 200103 && HWY_COMPILER_CLANG != 2100)) 77 #define HWY_X86_HAVE_AVX10_2_OPS 1 78 #else 79 #define HWY_X86_HAVE_AVX10_2_OPS 0 80 #endif 81 82 template <typename T> 83 struct Raw128 { 84 using type = __m128i; 85 }; 86 #if HWY_HAVE_FLOAT16 87 template <> 88 struct Raw128<float16_t> { 89 using type = __m128h; 90 }; 91 #endif // HWY_HAVE_FLOAT16 92 template <> 93 struct Raw128<float> { 94 using type = __m128; 95 }; 96 template <> 97 struct Raw128<double> { 98 using type = __m128d; 99 }; 100 101 } // namespace detail 102 103 template <typename T, size_t N = 16 / sizeof(T)> 104 class Vec128 { 105 using Raw = typename detail::Raw128<T>::type; 106 107 public: 108 using PrivateT = T; // only for DFromV 109 static constexpr size_t kPrivateN = N; // only for DFromV 110 111 // Compound assignment. Only usable if there is a corresponding non-member 112 // binary operator overload. For example, only f32 and f64 support division. 113 HWY_INLINE Vec128& operator*=(const Vec128 other) { 114 return *this = (*this * other); 115 } 116 HWY_INLINE Vec128& operator/=(const Vec128 other) { 117 return *this = (*this / other); 118 } 119 HWY_INLINE Vec128& operator+=(const Vec128 other) { 120 return *this = (*this + other); 121 } 122 HWY_INLINE Vec128& operator-=(const Vec128 other) { 123 return *this = (*this - other); 124 } 125 HWY_INLINE Vec128& operator%=(const Vec128 other) { 126 return *this = (*this % other); 127 } 128 HWY_INLINE Vec128& operator&=(const Vec128 other) { 129 return *this = (*this & other); 130 } 131 HWY_INLINE Vec128& operator|=(const Vec128 other) { 132 return *this = (*this | other); 133 } 134 HWY_INLINE Vec128& operator^=(const Vec128 other) { 135 return *this = (*this ^ other); 136 } 137 138 Raw raw; 139 }; 140 141 template <typename T> 142 using Vec64 = Vec128<T, 8 / sizeof(T)>; 143 144 template <typename T> 145 using Vec32 = Vec128<T, 4 / sizeof(T)>; 146 147 template <typename T> 148 using Vec16 = Vec128<T, 2 / sizeof(T)>; 149 150 namespace detail { 151 152 #if HWY_TARGET <= HWY_AVX3 153 154 // Template arg: sizeof(lane type) 155 template <size_t size> 156 struct RawMask128T {}; 157 template <> 158 struct RawMask128T<1> { 159 using type = __mmask16; 160 }; 161 template <> 162 struct RawMask128T<2> { 163 using type = __mmask8; 164 }; 165 template <> 166 struct RawMask128T<4> { 167 using type = __mmask8; 168 }; 169 template <> 170 struct RawMask128T<8> { 171 using type = __mmask8; 172 }; 173 174 template <typename T> 175 using RawMask128 = typename RawMask128T<sizeof(T)>::type; 176 177 #else // AVX2 or earlier 178 179 template <typename T> 180 using RawMask128 = typename Raw128<T>::type; 181 182 #endif // HWY_TARGET <= HWY_AVX3 183 184 } // namespace detail 185 186 template <typename T, size_t N = 16 / sizeof(T)> 187 struct Mask128 { 188 using Raw = typename detail::RawMask128<T>; 189 190 using PrivateT = T; // only for DFromM 191 static constexpr size_t kPrivateN = N; // only for DFromM 192 193 #if HWY_TARGET <= HWY_AVX3 194 static Mask128<T, N> FromBits(uint64_t mask_bits) { 195 return Mask128<T, N>{static_cast<Raw>(mask_bits)}; 196 } 197 #else 198 // Lanes are either FF..FF or 0. 199 #endif 200 201 Raw raw; 202 }; 203 204 template <class V> 205 using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; 206 207 template <class M> 208 using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; 209 210 template <class V> 211 using TFromV = typename V::PrivateT; 212 213 // ------------------------------ Zero 214 215 // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. 216 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> 217 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 218 return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()}; 219 } 220 #if HWY_HAVE_FLOAT16 221 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> 222 HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 223 return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()}; 224 } 225 #endif // HWY_HAVE_FLOAT16 226 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 227 HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 228 return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()}; 229 } 230 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 231 HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 232 return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()}; 233 } 234 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)> 235 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 236 return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()}; 237 } 238 239 // Using the existing Zero function instead of a dedicated function for 240 // deduction avoids having to forward-declare Vec256 here. 241 template <class D> 242 using VFromD = decltype(Zero(D())); 243 244 // ------------------------------ BitCast 245 246 namespace detail { 247 248 HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; } 249 #if HWY_HAVE_FLOAT16 250 HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); } 251 #endif // HWY_HAVE_FLOAT16 252 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); } 253 HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); } 254 255 #if HWY_AVX3_HAVE_F32_TO_BF16C 256 HWY_INLINE __m128i BitCastToInteger(__m128bh v) { 257 // Need to use reinterpret_cast on GCC/Clang or BitCastScalar on MSVC to 258 // bit cast a __m128bh to a __m128i as there is currently no intrinsic 259 // available (as of GCC 13 and Clang 17) that can bit cast a __m128bh vector 260 // to a __m128i vector 261 262 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG 263 // On GCC or Clang, use reinterpret_cast to bit cast a __m128bh to a __m128i 264 return reinterpret_cast<__m128i>(v); 265 #else 266 // On MSVC, use BitCastScalar to bit cast a __m128bh to a __m128i as MSVC does 267 // not allow reinterpret_cast, static_cast, or a C-style cast to be used to 268 // bit cast from one SSE/AVX vector type to a different SSE/AVX vector type 269 return BitCastScalar<__m128i>(v); 270 #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG 271 } 272 #endif // HWY_AVX3_HAVE_F32_TO_BF16C 273 274 template <typename T, size_t N> 275 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) { 276 return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)}; 277 } 278 279 // Cannot rely on function overloading because return types differ. 280 template <typename T> 281 struct BitCastFromInteger128 { 282 HWY_INLINE __m128i operator()(__m128i v) { return v; } 283 }; 284 #if HWY_HAVE_FLOAT16 285 template <> 286 struct BitCastFromInteger128<float16_t> { 287 HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); } 288 }; 289 #endif // HWY_HAVE_FLOAT16 290 template <> 291 struct BitCastFromInteger128<float> { 292 HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); } 293 }; 294 template <> 295 struct BitCastFromInteger128<double> { 296 HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); } 297 }; 298 299 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 300 HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, 301 Vec128<uint8_t, D().MaxBytes()> v) { 302 return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)}; 303 } 304 305 } // namespace detail 306 307 template <class D, typename FromT, HWY_IF_V_SIZE_LE_D(D, 16)> 308 HWY_API VFromD<D> BitCast(D d, 309 Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) { 310 return detail::BitCastFromByte(d, detail::BitCastToByte(v)); 311 } 312 313 // ------------------------------ Set 314 315 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 316 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { 317 return VFromD<D>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT 318 } 319 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)> 320 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { 321 return VFromD<D>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT 322 } 323 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> 324 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { 325 return VFromD<D>{_mm_set1_epi32(static_cast<int>(t))}; 326 } 327 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> 328 HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) { 329 return VFromD<D>{_mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT 330 } 331 #if HWY_HAVE_FLOAT16 332 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> 333 HWY_API VFromD<D> Set(D /* tag */, float16_t t) { 334 return VFromD<D>{_mm_set1_ph(t)}; 335 } 336 #endif // HWY_HAVE_FLOAT16 337 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 338 HWY_API VFromD<D> Set(D /* tag */, float t) { 339 return VFromD<D>{_mm_set1_ps(t)}; 340 } 341 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 342 HWY_API VFromD<D> Set(D /* tag */, double t) { 343 return VFromD<D>{_mm_set1_pd(t)}; 344 } 345 346 // Generic for all vector lengths. 347 template <class D, HWY_X86_IF_EMULATED_D(D)> 348 HWY_API VFromD<D> Set(D df, TFromD<D> t) { 349 const RebindToUnsigned<decltype(df)> du; 350 static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16"); 351 uint16_t bits; 352 CopyBytes<2>(&t, &bits); 353 return BitCast(df, Set(du, bits)); 354 } 355 356 // ------------------------------ Undefined 357 358 HWY_DIAGNOSTICS(push) 359 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") 360 361 // Returns a vector with uninitialized elements. 362 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> 363 HWY_API VFromD<D> Undefined(D /* tag */) { 364 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC 365 // generate an XOR instruction. 366 return VFromD<D>{_mm_undefined_si128()}; 367 } 368 #if HWY_HAVE_FLOAT16 369 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> 370 HWY_API VFromD<D> Undefined(D /* tag */) { 371 return VFromD<D>{_mm_undefined_ph()}; 372 } 373 #endif // HWY_HAVE_FLOAT16 374 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 375 HWY_API VFromD<D> Undefined(D /* tag */) { 376 return VFromD<D>{_mm_undefined_ps()}; 377 } 378 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 379 HWY_API VFromD<D> Undefined(D /* tag */) { 380 return VFromD<D>{_mm_undefined_pd()}; 381 } 382 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)> 383 HWY_API VFromD<D> Undefined(D /* tag */) { 384 return VFromD<D>{_mm_undefined_si128()}; 385 } 386 387 HWY_DIAGNOSTICS(pop) 388 389 // ------------------------------ GetLane 390 391 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 392 HWY_API T GetLane(const Vec128<T, N> v) { 393 return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF); 394 } 395 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 396 HWY_API T GetLane(const Vec128<T, N> v) { 397 const DFromV<decltype(v)> d; 398 const RebindToUnsigned<decltype(d)> du; 399 const uint16_t bits = 400 static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF); 401 return BitCastScalar<T>(bits); 402 } 403 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 404 HWY_API T GetLane(const Vec128<T, N> v) { 405 return static_cast<T>(_mm_cvtsi128_si32(v.raw)); 406 } 407 template <size_t N> 408 HWY_API float GetLane(const Vec128<float, N> v) { 409 return _mm_cvtss_f32(v.raw); 410 } 411 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 412 HWY_API T GetLane(const Vec128<T, N> v) { 413 #if HWY_ARCH_X86_32 414 const DFromV<decltype(v)> d; 415 alignas(16) T lanes[2]; 416 Store(v, d, lanes); 417 return lanes[0]; 418 #else 419 return static_cast<T>(_mm_cvtsi128_si64(v.raw)); 420 #endif 421 } 422 template <size_t N> 423 HWY_API double GetLane(const Vec128<double, N> v) { 424 return _mm_cvtsd_f64(v.raw); 425 } 426 427 // ------------------------------ ResizeBitCast 428 429 template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), 430 HWY_IF_V_SIZE_LE_D(D, 16)> 431 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { 432 const Repartition<uint8_t, decltype(d)> du8; 433 return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)}); 434 } 435 436 // ------------------------------ Dup128VecFromValues 437 438 template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> 439 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, 440 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, 441 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7, 442 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10, 443 TFromD<D> t11, TFromD<D> t12, 444 TFromD<D> t13, TFromD<D> t14, 445 TFromD<D> t15) { 446 return VFromD<D>{_mm_setr_epi8( 447 static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2), 448 static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5), 449 static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8), 450 static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11), 451 static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14), 452 static_cast<char>(t15))}; 453 } 454 455 template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> 456 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, 457 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, 458 TFromD<D> t5, TFromD<D> t6, 459 TFromD<D> t7) { 460 return VFromD<D>{ 461 _mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1), 462 static_cast<int16_t>(t2), static_cast<int16_t>(t3), 463 static_cast<int16_t>(t4), static_cast<int16_t>(t5), 464 static_cast<int16_t>(t6), static_cast<int16_t>(t7))}; 465 } 466 467 // Generic for all vector lengths 468 template <class D, HWY_IF_BF16_D(D)> 469 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, 470 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, 471 TFromD<D> t5, TFromD<D> t6, 472 TFromD<D> t7) { 473 const RebindToSigned<decltype(d)> di; 474 return BitCast(d, 475 Dup128VecFromValues( 476 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1), 477 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3), 478 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5), 479 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7))); 480 } 481 482 #if HWY_HAVE_FLOAT16 483 template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> 484 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, 485 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, 486 TFromD<D> t5, TFromD<D> t6, 487 TFromD<D> t7) { 488 return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)}; 489 } 490 #else 491 // Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true 492 template <class D, HWY_IF_F16_D(D)> 493 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1, 494 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4, 495 TFromD<D> t5, TFromD<D> t6, 496 TFromD<D> t7) { 497 const RebindToSigned<decltype(d)> di; 498 return BitCast(d, 499 Dup128VecFromValues( 500 di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1), 501 BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3), 502 BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5), 503 BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7))); 504 } 505 #endif // HWY_HAVE_FLOAT16 506 507 template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> 508 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, 509 TFromD<D> t2, TFromD<D> t3) { 510 return VFromD<D>{ 511 _mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1), 512 static_cast<int32_t>(t2), static_cast<int32_t>(t3))}; 513 } 514 515 template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> 516 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1, 517 TFromD<D> t2, TFromD<D> t3) { 518 return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)}; 519 } 520 521 template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> 522 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) { 523 // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic 524 // available 525 return VFromD<D>{ 526 _mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))}; 527 } 528 529 template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)> 530 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) { 531 return VFromD<D>{_mm_setr_pd(t0, t1)}; 532 } 533 534 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 535 namespace detail { 536 537 template <class RawV> 538 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( 539 hwy::SizeTag<1> /* num_of_lanes_tag*/, RawV v) { 540 return __builtin_constant_p(v[0]); 541 } 542 543 template <class RawV> 544 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( 545 hwy::SizeTag<2> /* num_of_lanes_tag*/, RawV v) { 546 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]); 547 } 548 549 template <class RawV> 550 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( 551 hwy::SizeTag<4> /* num_of_lanes_tag*/, RawV v) { 552 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && 553 __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]); 554 } 555 556 template <class RawV> 557 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( 558 hwy::SizeTag<8> /* num_of_lanes_tag*/, RawV v) { 559 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && 560 __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && 561 __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && 562 __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]); 563 } 564 565 template <class RawV> 566 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( 567 hwy::SizeTag<16> /* num_of_lanes_tag*/, RawV v) { 568 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && 569 __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && 570 __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && 571 __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) && 572 __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) && 573 __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) && 574 __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) && 575 __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]); 576 } 577 578 #if HWY_TARGET <= HWY_AVX2 579 template <class RawV> 580 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantRawX86Vec( 581 hwy::SizeTag<32> /* num_of_lanes_tag*/, RawV v) { 582 return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && 583 __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && 584 __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && 585 __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) && 586 __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) && 587 __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) && 588 __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) && 589 __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]) && 590 __builtin_constant_p(v[16]) && __builtin_constant_p(v[17]) && 591 __builtin_constant_p(v[18]) && __builtin_constant_p(v[19]) && 592 __builtin_constant_p(v[20]) && __builtin_constant_p(v[21]) && 593 __builtin_constant_p(v[22]) && __builtin_constant_p(v[23]) && 594 __builtin_constant_p(v[24]) && __builtin_constant_p(v[25]) && 595 __builtin_constant_p(v[26]) && __builtin_constant_p(v[27]) && 596 __builtin_constant_p(v[28]) && __builtin_constant_p(v[29]) && 597 __builtin_constant_p(v[30]) && __builtin_constant_p(v[31]); 598 } 599 #endif 600 601 template <size_t kNumOfLanes, class V> 602 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86Vec( 603 hwy::SizeTag<kNumOfLanes> num_of_lanes_tag, V v) { 604 using T = TFromV<V>; 605 #if HWY_HAVE_FLOAT16 && HWY_HAVE_SCALAR_F16_TYPE 606 using F16VecLaneT = hwy::float16_t::Native; 607 #else 608 using F16VecLaneT = uint16_t; 609 #endif 610 using RawVecLaneT = If<hwy::IsSame<T, hwy::float16_t>(), F16VecLaneT, 611 If<hwy::IsSame<T, hwy::bfloat16_t>(), uint16_t, T>>; 612 613 // Suppress the -Wignored-attributes warning that is emitted by 614 // RemoveCvRef<decltype(v.raw)> with GCC 615 HWY_DIAGNOSTICS(push) 616 HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes") 617 typedef RawVecLaneT GccRawVec 618 __attribute__((__vector_size__(sizeof(RemoveCvRef<decltype(v.raw)>)))); 619 HWY_DIAGNOSTICS(pop) 620 621 return IsConstantRawX86Vec(num_of_lanes_tag, 622 reinterpret_cast<GccRawVec>(v.raw)); 623 } 624 625 template <class TTo, class V> 626 static HWY_INLINE HWY_MAYBE_UNUSED bool IsConstantX86VecForF2IConv(V v) { 627 constexpr size_t kNumOfLanesInRawSrcVec = 628 HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TFromV<V>)); 629 constexpr size_t kNumOfLanesInRawResultVec = 630 HWY_MAX(HWY_MAX_LANES_V(V), 16 / sizeof(TTo)); 631 constexpr size_t kNumOfLanesToCheck = 632 HWY_MIN(kNumOfLanesInRawSrcVec, kNumOfLanesInRawResultVec); 633 634 return IsConstantX86Vec(hwy::SizeTag<kNumOfLanesToCheck>(), v); 635 } 636 637 } // namespace detail 638 #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 639 640 // ================================================== LOGICAL 641 642 // ------------------------------ And 643 644 template <typename T, size_t N> 645 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) { 646 const DFromV<decltype(a)> d; // for float16_t 647 const RebindToUnsigned<decltype(d)> du; 648 return BitCast(d, VFromD<decltype(du)>{ 649 _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); 650 } 651 template <size_t N> 652 HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) { 653 return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)}; 654 } 655 template <size_t N> 656 HWY_API Vec128<double, N> And(Vec128<double, N> a, Vec128<double, N> b) { 657 return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)}; 658 } 659 660 // ------------------------------ AndNot 661 662 // Returns ~not_mask & mask. 663 template <typename T, size_t N> 664 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) { 665 const DFromV<decltype(mask)> d; // for float16_t 666 const RebindToUnsigned<decltype(d)> du; 667 return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128( 668 BitCast(du, not_mask).raw, BitCast(du, mask).raw)}); 669 } 670 template <size_t N> 671 HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask, 672 Vec128<float, N> mask) { 673 return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)}; 674 } 675 template <size_t N> 676 HWY_API Vec128<double, N> AndNot(Vec128<double, N> not_mask, 677 Vec128<double, N> mask) { 678 return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)}; 679 } 680 681 // ------------------------------ Or 682 683 template <typename T, size_t N> 684 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) { 685 const DFromV<decltype(a)> d; // for float16_t 686 const RebindToUnsigned<decltype(d)> du; 687 return BitCast(d, VFromD<decltype(du)>{ 688 _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); 689 } 690 691 template <size_t N> 692 HWY_API Vec128<float, N> Or(Vec128<float, N> a, Vec128<float, N> b) { 693 return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)}; 694 } 695 template <size_t N> 696 HWY_API Vec128<double, N> Or(Vec128<double, N> a, Vec128<double, N> b) { 697 return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)}; 698 } 699 700 // ------------------------------ Xor 701 702 template <typename T, size_t N> 703 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) { 704 const DFromV<decltype(a)> d; // for float16_t 705 const RebindToUnsigned<decltype(d)> du; 706 return BitCast(d, VFromD<decltype(du)>{ 707 _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); 708 } 709 710 template <size_t N> 711 HWY_API Vec128<float, N> Xor(Vec128<float, N> a, Vec128<float, N> b) { 712 return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)}; 713 } 714 template <size_t N> 715 HWY_API Vec128<double, N> Xor(Vec128<double, N> a, Vec128<double, N> b) { 716 return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)}; 717 } 718 719 // ------------------------------ Not 720 template <typename T, size_t N> 721 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) { 722 const DFromV<decltype(v)> d; 723 const RebindToUnsigned<decltype(d)> du; 724 using VU = VFromD<decltype(du)>; 725 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 726 const __m128i vu = BitCast(du, v).raw; 727 return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)}); 728 #else 729 return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)})); 730 #endif 731 } 732 733 // ------------------------------ Xor3 734 template <typename T, size_t N> 735 HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { 736 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 737 const DFromV<decltype(x1)> d; 738 const RebindToUnsigned<decltype(d)> du; 739 using VU = VFromD<decltype(du)>; 740 const __m128i ret = _mm_ternarylogic_epi64( 741 BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); 742 return BitCast(d, VU{ret}); 743 #else 744 return Xor(x1, Xor(x2, x3)); 745 #endif 746 } 747 748 // ------------------------------ Or3 749 template <typename T, size_t N> 750 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) { 751 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 752 const DFromV<decltype(o1)> d; 753 const RebindToUnsigned<decltype(d)> du; 754 using VU = VFromD<decltype(du)>; 755 const __m128i ret = _mm_ternarylogic_epi64( 756 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); 757 return BitCast(d, VU{ret}); 758 #else 759 return Or(o1, Or(o2, o3)); 760 #endif 761 } 762 763 // ------------------------------ OrAnd 764 template <typename T, size_t N> 765 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) { 766 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 767 const DFromV<decltype(o)> d; 768 const RebindToUnsigned<decltype(d)> du; 769 using VU = VFromD<decltype(du)>; 770 const __m128i ret = _mm_ternarylogic_epi64( 771 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); 772 return BitCast(d, VU{ret}); 773 #else 774 return Or(o, And(a1, a2)); 775 #endif 776 } 777 778 // ------------------------------ IfVecThenElse 779 template <typename T, size_t N> 780 HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes, 781 Vec128<T, N> no) { 782 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 783 const DFromV<decltype(no)> d; 784 const RebindToUnsigned<decltype(d)> du; 785 using VU = VFromD<decltype(du)>; 786 return BitCast( 787 d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw, 788 BitCast(du, no).raw, 0xCA)}); 789 #else 790 return IfThenElse(MaskFromVec(mask), yes, no); 791 #endif 792 } 793 794 // ------------------------------ BitwiseIfThenElse 795 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 796 797 #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE 798 #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE 799 #else 800 #define HWY_NATIVE_BITWISE_IF_THEN_ELSE 801 #endif 802 803 template <class V> 804 HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { 805 return IfVecThenElse(mask, yes, no); 806 } 807 808 #endif 809 810 // ------------------------------ Operator overloads (internal-only if float) 811 812 template <typename T, size_t N> 813 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) { 814 return And(a, b); 815 } 816 817 template <typename T, size_t N> 818 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) { 819 return Or(a, b); 820 } 821 822 template <typename T, size_t N> 823 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) { 824 return Xor(a, b); 825 } 826 827 // ------------------------------ PopulationCount 828 829 // 8/16 require BITALG, 32/64 require VPOPCNTDQ. 830 #if HWY_TARGET <= HWY_AVX3_DL 831 832 #ifdef HWY_NATIVE_POPCNT 833 #undef HWY_NATIVE_POPCNT 834 #else 835 #define HWY_NATIVE_POPCNT 836 #endif 837 838 namespace detail { 839 840 template <typename T, size_t N> 841 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */, 842 Vec128<T, N> v) { 843 return Vec128<T, N>{_mm_popcnt_epi8(v.raw)}; 844 } 845 template <typename T, size_t N> 846 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */, 847 Vec128<T, N> v) { 848 return Vec128<T, N>{_mm_popcnt_epi16(v.raw)}; 849 } 850 template <typename T, size_t N> 851 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */, 852 Vec128<T, N> v) { 853 return Vec128<T, N>{_mm_popcnt_epi32(v.raw)}; 854 } 855 template <typename T, size_t N> 856 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */, 857 Vec128<T, N> v) { 858 return Vec128<T, N>{_mm_popcnt_epi64(v.raw)}; 859 } 860 861 } // namespace detail 862 863 template <typename T, size_t N> 864 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) { 865 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v); 866 } 867 868 #endif // HWY_TARGET <= HWY_AVX3_DL 869 870 // ================================================== SIGN 871 872 // ------------------------------ Neg 873 874 // Tag dispatch instead of SFINAE for MSVC 2017 compatibility 875 namespace detail { 876 877 template <typename T, size_t N> 878 HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) { 879 return Xor(v, SignBit(DFromV<decltype(v)>())); 880 } 881 882 template <typename T, size_t N> 883 HWY_INLINE Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, const Vec128<T, N> v) { 884 return Xor(v, SignBit(DFromV<decltype(v)>())); 885 } 886 887 template <typename T, size_t N> 888 HWY_INLINE Vec128<T, N> Neg(hwy::SignedTag /*tag*/, const Vec128<T, N> v) { 889 return Zero(DFromV<decltype(v)>()) - v; 890 } 891 892 } // namespace detail 893 894 template <typename T, size_t N> 895 HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) { 896 return detail::Neg(hwy::TypeTag<T>(), v); 897 } 898 899 // ------------------------------ Floating-point Abs 900 // Generic for all vector lengths 901 template <class V, HWY_IF_FLOAT(TFromV<V>)> 902 HWY_API V Abs(V v) { 903 const DFromV<decltype(v)> d; 904 const RebindToSigned<decltype(d)> di; 905 using TI = TFromD<decltype(di)>; 906 return v & BitCast(d, Set(di, static_cast<TI>(~SignMask<TI>()))); 907 } 908 909 // ------------------------------ CopySign 910 // Generic for all vector lengths. 911 template <class V> 912 HWY_API V CopySign(const V magn, const V sign) { 913 static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point"); 914 915 const DFromV<decltype(magn)> d; 916 const auto msb = SignBit(d); 917 918 // Truth table for msb, magn, sign | bitwise msb ? sign : mag 919 // 0 0 0 | 0 920 // 0 0 1 | 0 921 // 0 1 0 | 1 922 // 0 1 1 | 1 923 // 1 0 0 | 0 924 // 1 0 1 | 1 925 // 1 1 0 | 0 926 // 1 1 1 | 1 927 return BitwiseIfThenElse(msb, sign, magn); 928 } 929 930 // ------------------------------ CopySignToAbs 931 // Generic for all vector lengths. 932 template <class V> 933 HWY_API V CopySignToAbs(const V abs, const V sign) { 934 const DFromV<decltype(abs)> d; 935 return OrAnd(abs, SignBit(d), sign); 936 } 937 938 // ================================================== MASK 939 940 #if HWY_TARGET <= HWY_AVX3 941 // ------------------------------ MaskFromVec 942 943 namespace detail { 944 945 template <typename T, size_t N> 946 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/, 947 const Vec128<T, N> v) { 948 return Mask128<T, N>{_mm_movepi8_mask(v.raw)}; 949 } 950 template <typename T, size_t N> 951 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/, 952 const Vec128<T, N> v) { 953 return Mask128<T, N>{_mm_movepi16_mask(v.raw)}; 954 } 955 template <typename T, size_t N> 956 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/, 957 const Vec128<T, N> v) { 958 return Mask128<T, N>{_mm_movepi32_mask(v.raw)}; 959 } 960 template <typename T, size_t N> 961 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/, 962 const Vec128<T, N> v) { 963 return Mask128<T, N>{_mm_movepi64_mask(v.raw)}; 964 } 965 966 } // namespace detail 967 968 template <typename T, size_t N> 969 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { 970 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v); 971 } 972 // There do not seem to be native floating-point versions of these instructions. 973 #if HWY_HAVE_FLOAT16 974 template <size_t N> 975 HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) { 976 const RebindToSigned<DFromV<decltype(v)>> di; 977 return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw}; 978 } 979 #endif 980 template <size_t N> 981 HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) { 982 const RebindToSigned<DFromV<decltype(v)>> di; 983 return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw}; 984 } 985 template <size_t N> 986 HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) { 987 const RebindToSigned<DFromV<decltype(v)>> di; 988 return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw}; 989 } 990 991 template <class D> 992 using MFromD = decltype(MaskFromVec(VFromD<D>())); 993 994 // ------------------------------ MaskFalse (MFromD) 995 996 #ifdef HWY_NATIVE_MASK_FALSE 997 #undef HWY_NATIVE_MASK_FALSE 998 #else 999 #define HWY_NATIVE_MASK_FALSE 1000 #endif 1001 1002 // Generic for all vector lengths 1003 template <class D> 1004 HWY_API MFromD<D> MaskFalse(D /*d*/) { 1005 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)}; 1006 } 1007 1008 // ------------------------------ SetMask 1009 #ifdef HWY_NATIVE_SET_MASK 1010 #undef HWY_NATIVE_SET_MASK 1011 #else 1012 #define HWY_NATIVE_SET_MASK 1013 #endif 1014 1015 template <class D> 1016 HWY_API MFromD<D> SetMask(D /*d*/, bool val) { 1017 constexpr uint64_t kMask = (HWY_MAX_LANES_D(D) < 64) 1018 ? ((1ULL << (HWY_MAX_LANES_D(D) & 63)) - 1ULL) 1019 : LimitsMax<uint64_t>(); 1020 1021 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>( 1022 static_cast<uint64_t>(-static_cast<int64_t>(val)) & kMask)}; 1023 } 1024 1025 // ------------------------------ IsNegative (MFromD) 1026 #ifdef HWY_NATIVE_IS_NEGATIVE 1027 #undef HWY_NATIVE_IS_NEGATIVE 1028 #else 1029 #define HWY_NATIVE_IS_NEGATIVE 1030 #endif 1031 1032 // Generic for all vector lengths 1033 template <class V, HWY_IF_NOT_UNSIGNED_V(V)> 1034 HWY_API MFromD<DFromV<V>> IsNegative(V v) { 1035 return MaskFromVec(v); 1036 } 1037 1038 // ------------------------------ PromoteMaskTo (MFromD) 1039 1040 #ifdef HWY_NATIVE_PROMOTE_MASK_TO 1041 #undef HWY_NATIVE_PROMOTE_MASK_TO 1042 #else 1043 #define HWY_NATIVE_PROMOTE_MASK_TO 1044 #endif 1045 1046 // AVX3 PromoteMaskTo is generic for all vector lengths 1047 template <class DTo, class DFrom, 1048 HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)), 1049 class DFrom_2 = Rebind<TFromD<DFrom>, DTo>, 1050 hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr> 1051 HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, 1052 MFromD<DFrom> m) { 1053 return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)}; 1054 } 1055 1056 // ------------------------------ DemoteMaskTo (MFromD) 1057 1058 #ifdef HWY_NATIVE_DEMOTE_MASK_TO 1059 #undef HWY_NATIVE_DEMOTE_MASK_TO 1060 #else 1061 #define HWY_NATIVE_DEMOTE_MASK_TO 1062 #endif 1063 1064 // AVX3 DemoteMaskTo is generic for all vector lengths 1065 template <class DTo, class DFrom, 1066 HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1), 1067 class DFrom_2 = Rebind<TFromD<DFrom>, DTo>, 1068 hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr> 1069 HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, 1070 MFromD<DFrom> m) { 1071 return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)}; 1072 } 1073 1074 // ------------------------------ CombineMasks (MFromD) 1075 1076 #ifdef HWY_NATIVE_COMBINE_MASKS 1077 #undef HWY_NATIVE_COMBINE_MASKS 1078 #else 1079 #define HWY_NATIVE_COMBINE_MASKS 1080 #endif 1081 1082 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently. 1083 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) 1084 #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \ 1085 HWY_COMPILER_CLANG >= 800 1086 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1 1087 #else 1088 #define HWY_COMPILER_HAS_MASK_INTRINSICS 0 1089 #endif 1090 #endif // HWY_COMPILER_HAS_MASK_INTRINSICS 1091 1092 template <class D, HWY_IF_LANES_D(D, 2)> 1093 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, 1094 MFromD<Half<D>> lo) { 1095 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1096 const __mmask8 combined_mask = _kor_mask8( 1097 _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1), 1098 _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1))); 1099 #else 1100 const auto combined_mask = 1101 (static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1); 1102 #endif 1103 1104 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; 1105 } 1106 1107 template <class D, HWY_IF_LANES_D(D, 4)> 1108 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, 1109 MFromD<Half<D>> lo) { 1110 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1111 const __mmask8 combined_mask = _kor_mask8( 1112 _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2), 1113 _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3))); 1114 #else 1115 const auto combined_mask = 1116 (static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3); 1117 #endif 1118 1119 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; 1120 } 1121 1122 template <class D, HWY_IF_LANES_D(D, 8)> 1123 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, 1124 MFromD<Half<D>> lo) { 1125 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1126 const __mmask8 combined_mask = _kor_mask8( 1127 _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4), 1128 _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15))); 1129 #else 1130 const auto combined_mask = 1131 (static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u); 1132 #endif 1133 1134 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; 1135 } 1136 1137 template <class D, HWY_IF_LANES_D(D, 16)> 1138 HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi, 1139 MFromD<Half<D>> lo) { 1140 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1141 const __mmask16 combined_mask = _mm512_kunpackb( 1142 static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw)); 1143 #else 1144 const auto combined_mask = 1145 ((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu)); 1146 #endif 1147 1148 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)}; 1149 } 1150 1151 // ------------------------------ LowerHalfOfMask (MFromD) 1152 1153 #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK 1154 #undef HWY_NATIVE_LOWER_HALF_OF_MASK 1155 #else 1156 #define HWY_NATIVE_LOWER_HALF_OF_MASK 1157 #endif 1158 1159 // Generic for all vector lengths 1160 template <class D> 1161 HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) { 1162 using RawM = decltype(MFromD<D>().raw); 1163 constexpr size_t kN = MaxLanes(d); 1164 constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8; 1165 1166 MFromD<D> result_mask{static_cast<RawM>(m.raw)}; 1167 1168 if (kN < kNumOfBitsInRawMask) { 1169 result_mask = 1170 And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)}); 1171 } 1172 1173 return result_mask; 1174 } 1175 1176 // ------------------------------ UpperHalfOfMask (MFromD) 1177 1178 #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK 1179 #undef HWY_NATIVE_UPPER_HALF_OF_MASK 1180 #else 1181 #define HWY_NATIVE_UPPER_HALF_OF_MASK 1182 #endif 1183 1184 template <class D, HWY_IF_LANES_D(D, 1)> 1185 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { 1186 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1187 const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1); 1188 #else 1189 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1; 1190 #endif 1191 1192 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; 1193 } 1194 1195 template <class D, HWY_IF_LANES_D(D, 2)> 1196 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { 1197 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1198 const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2); 1199 #else 1200 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2; 1201 #endif 1202 1203 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; 1204 } 1205 1206 template <class D, HWY_IF_LANES_D(D, 4)> 1207 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { 1208 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1209 const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4); 1210 #else 1211 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4; 1212 #endif 1213 1214 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; 1215 } 1216 1217 template <class D, HWY_IF_LANES_D(D, 8)> 1218 HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) { 1219 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1220 const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8); 1221 #else 1222 const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8; 1223 #endif 1224 1225 return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)}; 1226 } 1227 1228 // ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks) 1229 1230 #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO 1231 #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO 1232 #else 1233 #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO 1234 #endif 1235 1236 // Generic for all vector lengths 1237 template <class DTo, class DFrom, 1238 HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2), 1239 class DTo_2 = Repartition<TFromD<DTo>, DFrom>, 1240 hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr> 1241 HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/, 1242 MFromD<DFrom> a, MFromD<DFrom> b) { 1243 using MH = MFromD<Half<DTo>>; 1244 using RawMH = decltype(MH().raw); 1245 1246 return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)}, 1247 MH{static_cast<RawMH>(a.raw)}); 1248 } 1249 1250 // ------------------------------ Slide mask up/down 1251 #ifdef HWY_NATIVE_SLIDE_MASK 1252 #undef HWY_NATIVE_SLIDE_MASK 1253 #else 1254 #define HWY_NATIVE_SLIDE_MASK 1255 #endif 1256 1257 template <class D, HWY_IF_LANES_LE_D(D, 8)> 1258 HWY_API MFromD<D> SlideMask1Up(D d, MFromD<D> m) { 1259 using RawM = decltype(MFromD<D>().raw); 1260 constexpr size_t kN = MaxLanes(d); 1261 constexpr unsigned kValidLanesMask = (1u << kN) - 1u; 1262 1263 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1264 MFromD<D> result_mask{ 1265 static_cast<RawM>(_kshiftli_mask8(static_cast<__mmask8>(m.raw), 1))}; 1266 1267 if (kN < 8) { 1268 result_mask = 1269 And(result_mask, MFromD<D>{static_cast<RawM>(kValidLanesMask)}); 1270 } 1271 #else 1272 MFromD<D> result_mask{ 1273 static_cast<RawM>((static_cast<unsigned>(m.raw) << 1) & kValidLanesMask)}; 1274 #endif 1275 1276 return result_mask; 1277 } 1278 1279 template <class D, HWY_IF_LANES_D(D, 16)> 1280 HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) { 1281 using RawM = decltype(MFromD<D>().raw); 1282 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1283 return MFromD<D>{ 1284 static_cast<RawM>(_kshiftli_mask16(static_cast<__mmask16>(m.raw), 1))}; 1285 #else 1286 return MFromD<D>{static_cast<RawM>(static_cast<unsigned>(m.raw) << 1)}; 1287 #endif 1288 } 1289 1290 template <class D, HWY_IF_LANES_LE_D(D, 8)> 1291 HWY_API MFromD<D> SlideMask1Down(D d, MFromD<D> m) { 1292 using RawM = decltype(MFromD<D>().raw); 1293 constexpr size_t kN = MaxLanes(d); 1294 constexpr unsigned kValidLanesMask = (1u << kN) - 1u; 1295 1296 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1297 if (kN < 8) { 1298 m = And(m, MFromD<D>{static_cast<RawM>(kValidLanesMask)}); 1299 } 1300 1301 return MFromD<D>{ 1302 static_cast<RawM>(_kshiftri_mask8(static_cast<__mmask8>(m.raw), 1))}; 1303 #else 1304 return MFromD<D>{ 1305 static_cast<RawM>((static_cast<unsigned>(m.raw) & kValidLanesMask) >> 1)}; 1306 #endif 1307 } 1308 1309 template <class D, HWY_IF_LANES_D(D, 16)> 1310 HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) { 1311 using RawM = decltype(MFromD<D>().raw); 1312 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1313 return MFromD<D>{ 1314 static_cast<RawM>(_kshiftri_mask16(static_cast<__mmask16>(m.raw), 1))}; 1315 #else 1316 return MFromD<D>{ 1317 static_cast<RawM>((static_cast<unsigned>(m.raw) & 0xFFFFu) >> 1)}; 1318 #endif 1319 } 1320 1321 // Generic for all vector lengths 1322 template <class D> 1323 HWY_API MFromD<D> SlideMaskUpLanes(D d, MFromD<D> m, size_t amt) { 1324 using RawM = decltype(MFromD<D>().raw); 1325 constexpr size_t kN = MaxLanes(d); 1326 constexpr uint64_t kValidLanesMask = 1327 static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL); 1328 1329 return MFromD<D>{static_cast<RawM>( 1330 (static_cast<uint64_t>(m.raw) << (amt & 63)) & kValidLanesMask)}; 1331 } 1332 1333 // Generic for all vector lengths 1334 template <class D> 1335 HWY_API MFromD<D> SlideMaskDownLanes(D d, MFromD<D> m, size_t amt) { 1336 using RawM = decltype(MFromD<D>().raw); 1337 constexpr size_t kN = MaxLanes(d); 1338 constexpr uint64_t kValidLanesMask = 1339 static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL); 1340 1341 return MFromD<D>{static_cast<RawM>( 1342 (static_cast<uint64_t>(m.raw) & kValidLanesMask) >> (amt & 63))}; 1343 } 1344 1345 // ------------------------------ VecFromMask 1346 1347 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 1348 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { 1349 return Vec128<T, N>{_mm_movm_epi8(v.raw)}; 1350 } 1351 1352 template <typename T, size_t N, HWY_IF_UI16(T)> 1353 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { 1354 return Vec128<T, N>{_mm_movm_epi16(v.raw)}; 1355 } 1356 1357 template <typename T, size_t N, HWY_IF_UI32(T)> 1358 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { 1359 return Vec128<T, N>{_mm_movm_epi32(v.raw)}; 1360 } 1361 1362 template <typename T, size_t N, HWY_IF_UI64(T)> 1363 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { 1364 return Vec128<T, N>{_mm_movm_epi64(v.raw)}; 1365 } 1366 1367 #if HWY_HAVE_FLOAT16 1368 template <size_t N> 1369 HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) { 1370 return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))}; 1371 } 1372 #endif // HWY_HAVE_FLOAT16 1373 1374 template <size_t N> 1375 HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) { 1376 return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))}; 1377 } 1378 1379 template <size_t N> 1380 HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) { 1381 return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))}; 1382 } 1383 1384 // Generic for all vector lengths. 1385 template <class D> 1386 HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) { 1387 return VecFromMask(v); 1388 } 1389 1390 // ------------------------------ RebindMask (MaskFromVec) 1391 1392 template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)> 1393 HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) { 1394 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); 1395 return MFromD<DTo>{m.raw}; 1396 } 1397 1398 // ------------------------------ IfThenElse 1399 1400 namespace detail { 1401 1402 template <typename T, size_t N> 1403 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */, 1404 Mask128<T, N> mask, Vec128<T, N> yes, 1405 Vec128<T, N> no) { 1406 return Vec128<T, N>{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)}; 1407 } 1408 template <typename T, size_t N> 1409 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */, 1410 Mask128<T, N> mask, Vec128<T, N> yes, 1411 Vec128<T, N> no) { 1412 return Vec128<T, N>{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)}; 1413 } 1414 template <typename T, size_t N> 1415 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */, 1416 Mask128<T, N> mask, Vec128<T, N> yes, 1417 Vec128<T, N> no) { 1418 return Vec128<T, N>{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)}; 1419 } 1420 template <typename T, size_t N> 1421 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */, 1422 Mask128<T, N> mask, Vec128<T, N> yes, 1423 Vec128<T, N> no) { 1424 return Vec128<T, N>{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)}; 1425 } 1426 1427 } // namespace detail 1428 1429 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1430 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, 1431 Vec128<T, N> no) { 1432 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no); 1433 } 1434 1435 #if HWY_HAVE_FLOAT16 1436 template <size_t N> 1437 HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask, 1438 Vec128<float16_t, N> yes, 1439 Vec128<float16_t, N> no) { 1440 return Vec128<float16_t, N>{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)}; 1441 } 1442 #endif // HWY_HAVE_FLOAT16 1443 1444 // Generic for all vector lengths. 1445 template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)> 1446 HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) { 1447 const RebindToUnsigned<D> du; 1448 return BitCast( 1449 D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no))); 1450 } 1451 1452 template <size_t N> 1453 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask, 1454 Vec128<float, N> yes, Vec128<float, N> no) { 1455 return Vec128<float, N>{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)}; 1456 } 1457 1458 template <size_t N> 1459 HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask, 1460 Vec128<double, N> yes, 1461 Vec128<double, N> no) { 1462 return Vec128<double, N>{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)}; 1463 } 1464 1465 namespace detail { 1466 1467 template <typename T, size_t N> 1468 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */, 1469 Mask128<T, N> mask, Vec128<T, N> yes) { 1470 return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)}; 1471 } 1472 template <typename T, size_t N> 1473 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */, 1474 Mask128<T, N> mask, Vec128<T, N> yes) { 1475 return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)}; 1476 } 1477 template <typename T, size_t N> 1478 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */, 1479 Mask128<T, N> mask, Vec128<T, N> yes) { 1480 return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)}; 1481 } 1482 template <typename T, size_t N> 1483 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */, 1484 Mask128<T, N> mask, Vec128<T, N> yes) { 1485 return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)}; 1486 } 1487 1488 } // namespace detail 1489 1490 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1491 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { 1492 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes); 1493 } 1494 1495 template <size_t N> 1496 HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask, 1497 Vec128<float, N> yes) { 1498 return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)}; 1499 } 1500 1501 template <size_t N> 1502 HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask, 1503 Vec128<double, N> yes) { 1504 return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)}; 1505 } 1506 1507 // Generic for all vector lengths. 1508 template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)> 1509 HWY_API V IfThenElseZero(MFromD<D> mask, V yes) { 1510 const RebindToUnsigned<D> du; 1511 return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes))); 1512 } 1513 1514 namespace detail { 1515 1516 template <typename T, size_t N> 1517 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */, 1518 Mask128<T, N> mask, Vec128<T, N> no) { 1519 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. 1520 return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; 1521 } 1522 template <typename T, size_t N> 1523 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */, 1524 Mask128<T, N> mask, Vec128<T, N> no) { 1525 return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; 1526 } 1527 template <typename T, size_t N> 1528 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */, 1529 Mask128<T, N> mask, Vec128<T, N> no) { 1530 return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; 1531 } 1532 template <typename T, size_t N> 1533 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */, 1534 Mask128<T, N> mask, Vec128<T, N> no) { 1535 return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; 1536 } 1537 1538 } // namespace detail 1539 1540 template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 1541 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { 1542 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no); 1543 } 1544 1545 template <size_t N> 1546 HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask, 1547 Vec128<float, N> no) { 1548 return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; 1549 } 1550 1551 template <size_t N> 1552 HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask, 1553 Vec128<double, N> no) { 1554 return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; 1555 } 1556 1557 // Generic for all vector lengths. 1558 template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)> 1559 HWY_API V IfThenZeroElse(MFromD<D> mask, V no) { 1560 const RebindToUnsigned<D> du; 1561 return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no))); 1562 } 1563 1564 // ------------------------------ Mask logical 1565 1566 namespace detail { 1567 1568 template <typename T, size_t N> 1569 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, 1570 const Mask128<T, N> b) { 1571 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1572 return Mask128<T, N>{_kand_mask16(a.raw, b.raw)}; 1573 #else 1574 return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)}; 1575 #endif 1576 } 1577 template <typename T, size_t N> 1578 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, 1579 const Mask128<T, N> b) { 1580 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1581 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; 1582 #else 1583 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; 1584 #endif 1585 } 1586 template <typename T, size_t N> 1587 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, 1588 const Mask128<T, N> b) { 1589 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1590 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; 1591 #else 1592 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; 1593 #endif 1594 } 1595 template <typename T, size_t N> 1596 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, 1597 const Mask128<T, N> b) { 1598 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1599 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)}; 1600 #else 1601 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)}; 1602 #endif 1603 } 1604 1605 template <typename T, size_t N> 1606 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, 1607 const Mask128<T, N> b) { 1608 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1609 return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)}; 1610 #else 1611 return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)}; 1612 #endif 1613 } 1614 template <typename T, size_t N> 1615 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, 1616 const Mask128<T, N> b) { 1617 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1618 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; 1619 #else 1620 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; 1621 #endif 1622 } 1623 template <typename T, size_t N> 1624 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, 1625 const Mask128<T, N> b) { 1626 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1627 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; 1628 #else 1629 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; 1630 #endif 1631 } 1632 template <typename T, size_t N> 1633 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, 1634 const Mask128<T, N> b) { 1635 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1636 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)}; 1637 #else 1638 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)}; 1639 #endif 1640 } 1641 1642 template <typename T, size_t N> 1643 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, 1644 const Mask128<T, N> b) { 1645 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1646 return Mask128<T, N>{_kor_mask16(a.raw, b.raw)}; 1647 #else 1648 return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)}; 1649 #endif 1650 } 1651 template <typename T, size_t N> 1652 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, 1653 const Mask128<T, N> b) { 1654 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1655 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; 1656 #else 1657 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; 1658 #endif 1659 } 1660 template <typename T, size_t N> 1661 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, 1662 const Mask128<T, N> b) { 1663 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1664 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; 1665 #else 1666 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; 1667 #endif 1668 } 1669 template <typename T, size_t N> 1670 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, 1671 const Mask128<T, N> b) { 1672 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1673 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)}; 1674 #else 1675 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)}; 1676 #endif 1677 } 1678 1679 template <typename T, size_t N> 1680 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a, 1681 const Mask128<T, N> b) { 1682 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1683 return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)}; 1684 #else 1685 return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)}; 1686 #endif 1687 } 1688 template <typename T, size_t N> 1689 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a, 1690 const Mask128<T, N> b) { 1691 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1692 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; 1693 #else 1694 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; 1695 #endif 1696 } 1697 template <typename T, size_t N> 1698 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a, 1699 const Mask128<T, N> b) { 1700 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1701 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; 1702 #else 1703 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; 1704 #endif 1705 } 1706 template <typename T, size_t N> 1707 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a, 1708 const Mask128<T, N> b) { 1709 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1710 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)}; 1711 #else 1712 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)}; 1713 #endif 1714 } 1715 1716 template <typename T, size_t N> 1717 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/, 1718 const Mask128<T, N> a, 1719 const Mask128<T, N> b) { 1720 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1721 return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)}; 1722 #else 1723 return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; 1724 #endif 1725 } 1726 template <typename T, size_t N> 1727 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/, 1728 const Mask128<T, N> a, 1729 const Mask128<T, N> b) { 1730 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1731 return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)}; 1732 #else 1733 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; 1734 #endif 1735 } 1736 template <typename T, size_t N> 1737 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/, 1738 const Mask128<T, N> a, 1739 const Mask128<T, N> b) { 1740 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1741 return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; 1742 #else 1743 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; 1744 #endif 1745 } 1746 template <typename T, size_t N> 1747 HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/, 1748 const Mask128<T, N> a, 1749 const Mask128<T, N> b) { 1750 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1751 return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)}; 1752 #else 1753 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)}; 1754 #endif 1755 } 1756 1757 // UnmaskedNot returns ~m.raw without zeroing out any invalid bits 1758 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 1759 HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) { 1760 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1761 return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))}; 1762 #else 1763 return Mask128<T, N>{static_cast<__mmask16>(~m.raw)}; 1764 #endif 1765 } 1766 1767 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 1768 HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) { 1769 #if HWY_COMPILER_HAS_MASK_INTRINSICS 1770 return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))}; 1771 #else 1772 return Mask128<T, N>{static_cast<__mmask8>(~m.raw)}; 1773 #endif 1774 } 1775 1776 template <typename T> 1777 HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) { 1778 // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid 1779 return UnmaskedNot(m); 1780 } 1781 template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)> 1782 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) { 1783 // sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there 1784 // are fewer than 16 valid bits in m 1785 1786 // Return (~m) & ((1ull << N) - 1) 1787 return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); 1788 } 1789 template <typename T> 1790 HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) { 1791 // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid 1792 return UnmaskedNot(m); 1793 } 1794 template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)> 1795 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) { 1796 // sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there 1797 // are fewer than 8 valid bits in m 1798 1799 // Return (~m) & ((1ull << N) - 1) 1800 return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); 1801 } 1802 template <typename T, size_t N> 1803 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) { 1804 // sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most 1805 // 4 valid bits in m 1806 1807 // Return (~m) & ((1ull << N) - 1) 1808 return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); 1809 } 1810 template <typename T, size_t N> 1811 HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) { 1812 // sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most 1813 // 2 valid bits in m 1814 1815 // Return (~m) & ((1ull << N) - 1) 1816 return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1)); 1817 } 1818 1819 } // namespace detail 1820 1821 template <typename T, size_t N> 1822 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { 1823 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b); 1824 } 1825 1826 template <typename T, size_t N> 1827 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { 1828 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b); 1829 } 1830 1831 template <typename T, size_t N> 1832 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { 1833 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b); 1834 } 1835 1836 template <typename T, size_t N> 1837 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { 1838 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b); 1839 } 1840 1841 template <typename T, size_t N> 1842 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { 1843 // Flip only the valid bits 1844 return detail::Not(hwy::SizeTag<sizeof(T)>(), m); 1845 } 1846 1847 template <typename T, size_t N> 1848 HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { 1849 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b); 1850 } 1851 1852 #else // AVX2 or below 1853 1854 // ------------------------------ Mask 1855 1856 // Mask and Vec are the same (true = FF..FF). 1857 template <typename T, size_t N> 1858 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { 1859 return Mask128<T, N>{v.raw}; 1860 } 1861 1862 template <class D> 1863 using MFromD = decltype(MaskFromVec(VFromD<D>())); 1864 1865 template <typename T, size_t N> 1866 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) { 1867 return Vec128<T, N>{v.raw}; 1868 } 1869 1870 // Generic for all vector lengths. 1871 template <class D> 1872 HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) { 1873 return VecFromMask(v); 1874 } 1875 1876 #if HWY_TARGET >= HWY_SSSE3 1877 1878 // mask ? yes : no 1879 template <typename T, size_t N> 1880 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, 1881 Vec128<T, N> no) { 1882 const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask); 1883 return Or(And(vmask, yes), AndNot(vmask, no)); 1884 } 1885 1886 #else // HWY_TARGET < HWY_SSSE3 1887 1888 // mask ? yes : no 1889 template <typename T, size_t N> 1890 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes, 1891 Vec128<T, N> no) { 1892 return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)}; 1893 } 1894 template <size_t N> 1895 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask, 1896 Vec128<float, N> yes, Vec128<float, N> no) { 1897 return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)}; 1898 } 1899 template <size_t N> 1900 HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask, 1901 Vec128<double, N> yes, 1902 Vec128<double, N> no) { 1903 return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)}; 1904 } 1905 1906 #endif // HWY_TARGET >= HWY_SSSE3 1907 1908 // mask ? yes : 0 1909 template <typename T, size_t N> 1910 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { 1911 return yes & VecFromMask(DFromV<decltype(yes)>(), mask); 1912 } 1913 1914 // mask ? 0 : no 1915 template <typename T, size_t N> 1916 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) { 1917 return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no); 1918 } 1919 1920 // ------------------------------ Mask logical 1921 1922 template <typename T, size_t N> 1923 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) { 1924 const Simd<T, N, 0> d; 1925 return MaskFromVec(Not(VecFromMask(d, m))); 1926 } 1927 1928 template <typename T, size_t N> 1929 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) { 1930 const Simd<T, N, 0> d; 1931 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); 1932 } 1933 1934 template <typename T, size_t N> 1935 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) { 1936 const Simd<T, N, 0> d; 1937 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); 1938 } 1939 1940 template <typename T, size_t N> 1941 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) { 1942 const Simd<T, N, 0> d; 1943 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); 1944 } 1945 1946 template <typename T, size_t N> 1947 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) { 1948 const Simd<T, N, 0> d; 1949 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); 1950 } 1951 1952 template <typename T, size_t N> 1953 HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) { 1954 const Simd<T, N, 0> d; 1955 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); 1956 } 1957 1958 #endif // HWY_TARGET <= HWY_AVX3 1959 1960 // ------------------------------ ShiftLeft 1961 1962 template <int kBits, size_t N> 1963 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) { 1964 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)}; 1965 } 1966 1967 template <int kBits, size_t N> 1968 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) { 1969 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)}; 1970 } 1971 1972 template <int kBits, size_t N> 1973 HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) { 1974 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)}; 1975 } 1976 1977 template <int kBits, size_t N> 1978 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) { 1979 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)}; 1980 } 1981 template <int kBits, size_t N> 1982 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) { 1983 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)}; 1984 } 1985 template <int kBits, size_t N> 1986 HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) { 1987 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)}; 1988 } 1989 1990 #if HWY_TARGET <= HWY_AVX3_DL 1991 1992 namespace detail { 1993 template <typename T, size_t N> 1994 HWY_API Vec128<T, N> GaloisAffine( 1995 Vec128<T, N> v, VFromD<Repartition<uint64_t, Simd<T, N, 0>>> matrix) { 1996 return Vec128<T, N>{_mm_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)}; 1997 } 1998 } // namespace detail 1999 2000 #else // HWY_TARGET > HWY_AVX3_DL 2001 2002 template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 2003 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) { 2004 const DFromV<decltype(v)> d8; 2005 // Use raw instead of BitCast to support N=1. 2006 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw}; 2007 return kBits == 1 2008 ? (v + v) 2009 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF))); 2010 } 2011 2012 #endif // HWY_TARGET > HWY_AVX3_DL 2013 2014 // ------------------------------ ShiftRight 2015 2016 template <int kBits, size_t N> 2017 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) { 2018 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)}; 2019 } 2020 template <int kBits, size_t N> 2021 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) { 2022 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)}; 2023 } 2024 template <int kBits, size_t N> 2025 HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) { 2026 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)}; 2027 } 2028 2029 template <int kBits, size_t N> 2030 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) { 2031 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)}; 2032 } 2033 template <int kBits, size_t N> 2034 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) { 2035 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)}; 2036 } 2037 2038 #if HWY_TARGET > HWY_AVX3_DL 2039 2040 template <int kBits, size_t N> 2041 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) { 2042 const DFromV<decltype(v)> d8; 2043 // Use raw instead of BitCast to support N=1. 2044 const Vec128<uint8_t, N> shifted{ 2045 ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw}; 2046 return shifted & Set(d8, 0xFF >> kBits); 2047 } 2048 2049 template <int kBits, size_t N> 2050 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) { 2051 const DFromV<decltype(v)> di; 2052 const RebindToUnsigned<decltype(di)> du; 2053 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); 2054 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); 2055 return (shifted ^ shifted_sign) - shifted_sign; 2056 } 2057 2058 #endif // HWY_TARGET > HWY_AVX3_DL 2059 2060 // i64 is implemented after BroadcastSignBit. 2061 2062 // ================================================== MEMORY (1) 2063 2064 // Clang static analysis claims the memory immediately after a partial vector 2065 // store is uninitialized, and also flags the input to partial loads (at least 2066 // for loadl_pd) as "garbage". Since 2025-07, MSAN began raising errors. We 2067 // work around this by using CopyBytes instead of intrinsics, but only for MSAN 2068 // and static analyzer builds to avoid potentially bad code generation. 2069 // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7. 2070 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE 2071 #if HWY_IS_MSAN || (defined(__clang_analyzer__) || \ 2072 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)) 2073 #define HWY_SAFE_PARTIAL_LOAD_STORE 1 2074 #else 2075 #define HWY_SAFE_PARTIAL_LOAD_STORE 0 2076 #endif 2077 #endif // HWY_SAFE_PARTIAL_LOAD_STORE 2078 2079 // ------------------------------ Load 2080 2081 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> 2082 HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) { 2083 return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))}; 2084 } 2085 #if HWY_HAVE_FLOAT16 2086 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> 2087 HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) { 2088 return Vec128<float16_t>{_mm_load_ph(aligned)}; 2089 } 2090 #endif // HWY_HAVE_FLOAT16 2091 // Generic for all vector lengths greater than or equal to 16 bytes. 2092 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> 2093 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) { 2094 const RebindToUnsigned<decltype(d)> du; 2095 return BitCast(d, Load(du, detail::U16LanePointer(aligned))); 2096 } 2097 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 2098 HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) { 2099 return Vec128<float>{_mm_load_ps(aligned)}; 2100 } 2101 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 2102 HWY_API Vec128<double> Load(D /* tag */, const double* HWY_RESTRICT aligned) { 2103 return Vec128<double>{_mm_load_pd(aligned)}; 2104 } 2105 2106 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> 2107 HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) { 2108 return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))}; 2109 } 2110 #if HWY_HAVE_FLOAT16 2111 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> 2112 HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) { 2113 return Vec128<float16_t>{_mm_loadu_ph(p)}; 2114 } 2115 #endif // HWY_HAVE_FLOAT16 2116 // Generic for all vector lengths greater than or equal to 16 bytes. 2117 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> 2118 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { 2119 const RebindToUnsigned<decltype(d)> du; 2120 return BitCast(d, LoadU(du, detail::U16LanePointer(p))); 2121 } 2122 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 2123 HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) { 2124 return Vec128<float>{_mm_loadu_ps(p)}; 2125 } 2126 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 2127 HWY_API Vec128<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) { 2128 return Vec128<double>{_mm_loadu_pd(p)}; 2129 } 2130 2131 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)> 2132 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) { 2133 const RebindToUnsigned<decltype(d)> du; // for float16_t 2134 #if HWY_SAFE_PARTIAL_LOAD_STORE 2135 __m128i v = _mm_setzero_si128(); 2136 CopyBytes<8>(p, &v); // not same size 2137 #else 2138 const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p)); 2139 #endif 2140 return BitCast(d, VFromD<decltype(du)>{v}); 2141 } 2142 2143 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> 2144 HWY_API Vec64<float> Load(D /* tag */, const float* HWY_RESTRICT p) { 2145 #if HWY_SAFE_PARTIAL_LOAD_STORE 2146 __m128 v = _mm_setzero_ps(); 2147 CopyBytes<8>(p, &v); // not same size 2148 return Vec64<float>{v}; 2149 #else 2150 const __m128 hi = _mm_setzero_ps(); 2151 return Vec64<float>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))}; 2152 #endif 2153 } 2154 2155 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> 2156 HWY_API Vec64<double> Load(D /* tag */, const double* HWY_RESTRICT p) { 2157 #if HWY_SAFE_PARTIAL_LOAD_STORE 2158 __m128d v = _mm_setzero_pd(); 2159 CopyBytes<8>(p, &v); // not same size 2160 return Vec64<double>{v}; 2161 #else 2162 return Vec64<double>{_mm_load_sd(p)}; 2163 #endif 2164 } 2165 2166 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> 2167 HWY_API Vec32<float> Load(D /* tag */, const float* HWY_RESTRICT p) { 2168 #if HWY_SAFE_PARTIAL_LOAD_STORE 2169 __m128 v = _mm_setzero_ps(); 2170 CopyBytes<4>(p, &v); // not same size 2171 return Vec32<float>{v}; 2172 #else 2173 return Vec32<float>{_mm_load_ss(p)}; 2174 #endif 2175 } 2176 2177 // Any <= 32 bit except <float, 1> 2178 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)> 2179 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) { 2180 const RebindToUnsigned<decltype(d)> du; // for float16_t 2181 // Clang ArgumentPromotionPass seems to break this code. We can unpoison 2182 // before SetTableIndices -> LoadU -> Load and the memory is poisoned again. 2183 detail::MaybeUnpoison(p, Lanes(d)); 2184 2185 #if HWY_SAFE_PARTIAL_LOAD_STORE 2186 __m128i v = Zero(Full128<TFromD<decltype(du)>>()).raw; 2187 CopyBytes<d.MaxBytes()>(p, &v); // not same size as VFromD 2188 #else 2189 int32_t bits = 0; 2190 CopyBytes<d.MaxBytes()>(p, &bits); // not same size as VFromD 2191 const __m128i v = _mm_cvtsi32_si128(bits); 2192 #endif 2193 return BitCast(d, VFromD<decltype(du)>{v}); 2194 } 2195 2196 // For < 128 bit, LoadU == Load. 2197 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 2198 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { 2199 return Load(d, p); 2200 } 2201 2202 // 128-bit SIMD => nothing to duplicate, same as an unaligned load. 2203 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 2204 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) { 2205 return LoadU(d, p); 2206 } 2207 2208 // ------------------------------ Store 2209 2210 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> 2211 HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) { 2212 _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); 2213 } 2214 #if HWY_HAVE_FLOAT16 2215 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> 2216 HWY_API void Store(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT aligned) { 2217 _mm_store_ph(aligned, v.raw); 2218 } 2219 #endif // HWY_HAVE_FLOAT16 2220 // Generic for all vector lengths greater than or equal to 16 bytes. 2221 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> 2222 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { 2223 const RebindToUnsigned<decltype(d)> du; 2224 Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned)); 2225 } 2226 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 2227 HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) { 2228 _mm_store_ps(aligned, v.raw); 2229 } 2230 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 2231 HWY_API void Store(Vec128<double> v, D /* tag */, 2232 double* HWY_RESTRICT aligned) { 2233 _mm_store_pd(aligned, v.raw); 2234 } 2235 2236 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)> 2237 HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) { 2238 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw); 2239 } 2240 #if HWY_HAVE_FLOAT16 2241 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)> 2242 HWY_API void StoreU(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT p) { 2243 _mm_storeu_ph(p, v.raw); 2244 } 2245 #endif // HWY_HAVE_FLOAT16 2246 // Generic for all vector lengths greater than or equal to 16 bytes. 2247 template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)> 2248 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { 2249 const RebindToUnsigned<decltype(d)> du; 2250 StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p)); 2251 } 2252 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 2253 HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) { 2254 _mm_storeu_ps(p, v.raw); 2255 } 2256 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 2257 HWY_API void StoreU(Vec128<double> v, D /* tag */, double* HWY_RESTRICT p) { 2258 _mm_storeu_pd(p, v.raw); 2259 } 2260 2261 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)> 2262 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { 2263 #if HWY_SAFE_PARTIAL_LOAD_STORE 2264 (void)d; 2265 CopyBytes<8>(&v, p); // not same size 2266 #else 2267 const RebindToUnsigned<decltype(d)> du; // for float16_t 2268 _mm_storel_epi64(reinterpret_cast<__m128i*>(p), BitCast(du, v).raw); 2269 #endif 2270 } 2271 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> 2272 HWY_API void Store(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) { 2273 #if HWY_SAFE_PARTIAL_LOAD_STORE 2274 CopyBytes<8>(&v, p); // not same size 2275 #else 2276 _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw); 2277 #endif 2278 } 2279 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> 2280 HWY_API void Store(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) { 2281 #if HWY_SAFE_PARTIAL_LOAD_STORE 2282 CopyBytes<8>(&v, p); // not same size 2283 #else 2284 _mm_storel_pd(p, v.raw); 2285 #endif 2286 } 2287 2288 // Any <= 32 bit except <float, 1> 2289 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_NOT_FLOAT3264_D(D)> 2290 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { 2291 CopyBytes<d.MaxBytes()>(&v, p); // not same size 2292 } 2293 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)> 2294 HWY_API void Store(Vec32<float> v, D /* tag */, float* HWY_RESTRICT p) { 2295 #if HWY_SAFE_PARTIAL_LOAD_STORE 2296 CopyBytes<4>(&v, p); // not same size 2297 #else 2298 _mm_store_ss(p, v.raw); 2299 #endif 2300 } 2301 2302 // For < 128 bit, StoreU == Store. 2303 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 2304 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { 2305 Store(v, d, p); 2306 } 2307 2308 // ================================================== SWIZZLE (1) 2309 2310 // ------------------------------ TableLookupBytes 2311 template <typename T, size_t N, typename TI, size_t NI> 2312 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes, 2313 const Vec128<TI, NI> from) { 2314 const DFromV<decltype(from)> d; 2315 const Repartition<uint8_t, decltype(d)> du8; 2316 2317 const DFromV<decltype(bytes)> d_bytes; 2318 const Repartition<uint8_t, decltype(d_bytes)> du8_bytes; 2319 #if HWY_TARGET == HWY_SSE2 2320 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) 2321 typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16))); 2322 (void)d; 2323 (void)du8; 2324 (void)d_bytes; 2325 (void)du8_bytes; 2326 return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>( 2327 __builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw), 2328 reinterpret_cast<GccU8RawVectType>(from.raw)))}; 2329 #else 2330 const Full128<uint8_t> du8_full; 2331 2332 alignas(16) uint8_t result_bytes[16]; 2333 alignas(16) uint8_t u8_bytes[16]; 2334 alignas(16) uint8_t from_bytes[16]; 2335 2336 Store(Vec128<uint8_t>{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes); 2337 Store(Vec128<uint8_t>{BitCast(du8, from).raw}, du8_full, from_bytes); 2338 2339 for (int i = 0; i < 16; i++) { 2340 result_bytes[i] = u8_bytes[from_bytes[i] & 15]; 2341 } 2342 2343 return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw}); 2344 #endif 2345 #else // SSSE3 or newer 2346 return BitCast( 2347 d, VFromD<decltype(du8)>{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw, 2348 BitCast(du8, from).raw)}); 2349 #endif 2350 } 2351 2352 // ------------------------------ TableLookupBytesOr0 2353 // For all vector widths; x86 anyway zeroes if >= 0x80 on SSSE3/SSE4/AVX2/AVX3 2354 template <class V, class VI> 2355 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { 2356 #if HWY_TARGET == HWY_SSE2 2357 const DFromV<decltype(from)> d; 2358 const Repartition<int8_t, decltype(d)> di8; 2359 2360 const auto di8_from = BitCast(di8, from); 2361 return BitCast(d, IfThenZeroElse(di8_from < Zero(di8), 2362 TableLookupBytes(bytes, di8_from))); 2363 #else 2364 return TableLookupBytes(bytes, from); 2365 #endif 2366 } 2367 2368 // ------------------------------ Shuffles (ShiftRight, TableLookupBytes) 2369 2370 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). 2371 // Shuffle0321 rotates one lane to the right (the previous least-significant 2372 // lane is now most-significant). These could also be implemented via 2373 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. 2374 2375 // Swap 32-bit halves in 64-bit halves. 2376 template <typename T, size_t N> 2377 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) { 2378 static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); 2379 static_assert(N == 2 || N == 4, "Does not make sense for N=1"); 2380 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)}; 2381 } 2382 template <size_t N> 2383 HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) { 2384 static_assert(N == 2 || N == 4, "Does not make sense for N=1"); 2385 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)}; 2386 } 2387 2388 // These are used by generic_ops-inl to implement LoadInterleaved3. As with 2389 // Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output 2390 // comes from the first argument. 2391 namespace detail { 2392 2393 template <typename T, HWY_IF_T_SIZE(T, 1)> 2394 HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) { 2395 const DFromV<decltype(a)> d; 2396 const Twice<decltype(d)> d2; 2397 const auto ba = Combine(d2, b, a); 2398 #if HWY_TARGET == HWY_SSE2 2399 Vec32<uint16_t> ba_shuffled{ 2400 _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; 2401 return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled))); 2402 #else 2403 const RebindToUnsigned<decltype(d2)> d2_u; 2404 const auto shuffle_idx = 2405 BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 2406 0, 0, 0, 0)); 2407 return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw}; 2408 #endif 2409 } 2410 template <typename T, HWY_IF_T_SIZE(T, 2)> 2411 HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) { 2412 const DFromV<decltype(a)> d; 2413 const Twice<decltype(d)> d2; 2414 const auto ba = Combine(d2, b, a); 2415 #if HWY_TARGET == HWY_SSE2 2416 Vec64<uint32_t> ba_shuffled{ 2417 _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; 2418 return Vec64<T>{ 2419 _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))}; 2420 #else 2421 const RebindToUnsigned<decltype(d2)> d2_u; 2422 const auto shuffle_idx = BitCast( 2423 d2, 2424 Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0)); 2425 return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw}; 2426 #endif 2427 } 2428 template <typename T, HWY_IF_T_SIZE(T, 4)> 2429 HWY_API Vec128<T> ShuffleTwo2301(const Vec128<T> a, const Vec128<T> b) { 2430 const DFromV<decltype(a)> d; 2431 const RebindToFloat<decltype(d)> df; 2432 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); 2433 return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, 2434 BitCast(df, b).raw, m)}); 2435 } 2436 2437 template <typename T, HWY_IF_T_SIZE(T, 1)> 2438 HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) { 2439 const DFromV<decltype(a)> d; 2440 #if HWY_TARGET == HWY_SSE2 2441 const auto zero = Zero(d); 2442 const Rebind<int16_t, decltype(d)> di16; 2443 const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16( 2444 _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; 2445 const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16( 2446 _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; 2447 const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); 2448 return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; 2449 #else 2450 const Twice<decltype(d)> d2; 2451 const auto ba = Combine(d2, b, a); 2452 const RebindToUnsigned<decltype(d2)> d2_u; 2453 const auto shuffle_idx = 2454 BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2455 0, 0, 0, 0)); 2456 return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw}; 2457 #endif 2458 } 2459 template <typename T, HWY_IF_T_SIZE(T, 2)> 2460 HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) { 2461 const DFromV<decltype(a)> d; 2462 #if HWY_TARGET == HWY_SSE2 2463 const Vec32<T> a_shuffled{ 2464 _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))}; 2465 const Vec32<T> b_shuffled{ 2466 _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))}; 2467 return Combine(d, b_shuffled, a_shuffled); 2468 #else 2469 const Twice<decltype(d)> d2; 2470 const auto ba = Combine(d2, b, a); 2471 const RebindToUnsigned<decltype(d2)> d2_u; 2472 const auto shuffle_idx = BitCast( 2473 d2, 2474 Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0)); 2475 return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw}; 2476 #endif 2477 } 2478 template <typename T, HWY_IF_T_SIZE(T, 4)> 2479 HWY_API Vec128<T> ShuffleTwo1230(const Vec128<T> a, const Vec128<T> b) { 2480 const DFromV<decltype(a)> d; 2481 const RebindToFloat<decltype(d)> df; 2482 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); 2483 return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, 2484 BitCast(df, b).raw, m)}); 2485 } 2486 2487 template <typename T, HWY_IF_T_SIZE(T, 1)> 2488 HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) { 2489 const DFromV<decltype(a)> d; 2490 #if HWY_TARGET == HWY_SSE2 2491 const auto zero = Zero(d); 2492 const Rebind<int16_t, decltype(d)> di16; 2493 const Vec32<int16_t> a_shuffled{_mm_shufflelo_epi16( 2494 _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; 2495 const Vec32<int16_t> b_shuffled{_mm_shufflelo_epi16( 2496 _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; 2497 const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); 2498 return Vec32<T>{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; 2499 #else 2500 const Twice<decltype(d)> d2; 2501 const auto ba = Combine(d2, b, a); 2502 const RebindToUnsigned<decltype(d2)> d2_u; 2503 const auto shuffle_idx = 2504 BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 2505 0, 0, 0, 0)); 2506 return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw}; 2507 #endif 2508 } 2509 template <typename T, HWY_IF_T_SIZE(T, 2)> 2510 HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) { 2511 const DFromV<decltype(a)> d; 2512 #if HWY_TARGET == HWY_SSE2 2513 const Vec32<T> a_shuffled{ 2514 _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))}; 2515 const Vec32<T> b_shuffled{ 2516 _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))}; 2517 return Combine(d, b_shuffled, a_shuffled); 2518 #else 2519 const Twice<decltype(d)> d2; 2520 const auto ba = Combine(d2, b, a); 2521 const RebindToUnsigned<decltype(d2)> d2_u; 2522 const auto shuffle_idx = BitCast( 2523 d2, 2524 Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0)); 2525 return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw}; 2526 #endif 2527 } 2528 template <typename T, HWY_IF_T_SIZE(T, 4)> 2529 HWY_API Vec128<T> ShuffleTwo3012(const Vec128<T> a, const Vec128<T> b) { 2530 const DFromV<decltype(a)> d; 2531 const RebindToFloat<decltype(d)> df; 2532 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); 2533 return BitCast(d, Vec128<float>{_mm_shuffle_ps(BitCast(df, a).raw, 2534 BitCast(df, b).raw, m)}); 2535 } 2536 2537 } // namespace detail 2538 2539 // Swap 64-bit halves 2540 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) { 2541 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; 2542 } 2543 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) { 2544 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; 2545 } 2546 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) { 2547 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)}; 2548 } 2549 HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) { 2550 return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; 2551 } 2552 HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) { 2553 return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)}; 2554 } 2555 HWY_API Vec128<double> Shuffle01(const Vec128<double> v) { 2556 return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)}; 2557 } 2558 2559 // Rotate right 32 bits 2560 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) { 2561 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)}; 2562 } 2563 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) { 2564 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)}; 2565 } 2566 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) { 2567 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)}; 2568 } 2569 // Rotate left 32 bits 2570 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) { 2571 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)}; 2572 } 2573 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) { 2574 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)}; 2575 } 2576 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) { 2577 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)}; 2578 } 2579 2580 // Reverse 2581 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) { 2582 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)}; 2583 } 2584 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) { 2585 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)}; 2586 } 2587 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) { 2588 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)}; 2589 } 2590 2591 // ================================================== COMPARE 2592 2593 #if HWY_TARGET <= HWY_AVX3 2594 2595 // Comparisons set a mask bit to 1 if the condition is true, else 0. 2596 2597 // ------------------------------ TestBit 2598 2599 namespace detail { 2600 2601 template <typename T, size_t N> 2602 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v, 2603 const Vec128<T, N> bit) { 2604 return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)}; 2605 } 2606 template <typename T, size_t N> 2607 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v, 2608 const Vec128<T, N> bit) { 2609 return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)}; 2610 } 2611 template <typename T, size_t N> 2612 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v, 2613 const Vec128<T, N> bit) { 2614 return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)}; 2615 } 2616 template <typename T, size_t N> 2617 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v, 2618 const Vec128<T, N> bit) { 2619 return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)}; 2620 } 2621 2622 } // namespace detail 2623 2624 template <typename T, size_t N> 2625 HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) { 2626 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); 2627 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit); 2628 } 2629 2630 // ------------------------------ Equality 2631 2632 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 2633 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { 2634 return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)}; 2635 } 2636 2637 template <typename T, size_t N, HWY_IF_UI16(T)> 2638 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { 2639 return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)}; 2640 } 2641 2642 template <typename T, size_t N, HWY_IF_UI32(T)> 2643 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { 2644 return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)}; 2645 } 2646 2647 template <typename T, size_t N, HWY_IF_UI64(T)> 2648 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) { 2649 return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)}; 2650 } 2651 2652 #if HWY_HAVE_FLOAT16 2653 template <size_t N> 2654 HWY_API Mask128<float16_t, N> operator==(Vec128<float16_t, N> a, 2655 Vec128<float16_t, N> b) { 2656 // Work around warnings in the intrinsic definitions (passing -1 as a mask). 2657 HWY_DIAGNOSTICS(push) 2658 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 2659 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)}; 2660 HWY_DIAGNOSTICS(pop) 2661 } 2662 #endif // HWY_HAVE_FLOAT16 2663 template <size_t N> 2664 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) { 2665 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; 2666 } 2667 2668 template <size_t N> 2669 HWY_API Mask128<double, N> operator==(Vec128<double, N> a, 2670 Vec128<double, N> b) { 2671 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; 2672 } 2673 2674 // ------------------------------ Inequality 2675 2676 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 2677 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { 2678 return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)}; 2679 } 2680 2681 template <typename T, size_t N, HWY_IF_UI16(T)> 2682 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { 2683 return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)}; 2684 } 2685 2686 template <typename T, size_t N, HWY_IF_UI32(T)> 2687 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { 2688 return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)}; 2689 } 2690 2691 template <typename T, size_t N, HWY_IF_UI64(T)> 2692 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) { 2693 return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)}; 2694 } 2695 2696 #if HWY_HAVE_FLOAT16 2697 template <size_t N> 2698 HWY_API Mask128<float16_t, N> operator!=(Vec128<float16_t, N> a, 2699 Vec128<float16_t, N> b) { 2700 // Work around warnings in the intrinsic definitions (passing -1 as a mask). 2701 HWY_DIAGNOSTICS(push) 2702 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 2703 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; 2704 HWY_DIAGNOSTICS(pop) 2705 } 2706 #endif // HWY_HAVE_FLOAT16 2707 template <size_t N> 2708 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) { 2709 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; 2710 } 2711 2712 template <size_t N> 2713 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a, 2714 Vec128<double, N> b) { 2715 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; 2716 } 2717 2718 // ------------------------------ Strict inequality 2719 2720 // Signed/float < 2721 template <size_t N> 2722 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 2723 return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)}; 2724 } 2725 template <size_t N> 2726 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a, 2727 Vec128<int16_t, N> b) { 2728 return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)}; 2729 } 2730 template <size_t N> 2731 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a, 2732 Vec128<int32_t, N> b) { 2733 return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)}; 2734 } 2735 template <size_t N> 2736 HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a, 2737 Vec128<int64_t, N> b) { 2738 return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)}; 2739 } 2740 2741 template <size_t N> 2742 HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a, 2743 Vec128<uint8_t, N> b) { 2744 return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)}; 2745 } 2746 template <size_t N> 2747 HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a, 2748 Vec128<uint16_t, N> b) { 2749 return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)}; 2750 } 2751 template <size_t N> 2752 HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a, 2753 Vec128<uint32_t, N> b) { 2754 return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)}; 2755 } 2756 template <size_t N> 2757 HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a, 2758 Vec128<uint64_t, N> b) { 2759 return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)}; 2760 } 2761 2762 #if HWY_HAVE_FLOAT16 2763 template <size_t N> 2764 HWY_API Mask128<float16_t, N> operator>(Vec128<float16_t, N> a, 2765 Vec128<float16_t, N> b) { 2766 // Work around warnings in the intrinsic definitions (passing -1 as a mask). 2767 HWY_DIAGNOSTICS(push) 2768 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 2769 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)}; 2770 HWY_DIAGNOSTICS(pop) 2771 } 2772 #endif // HWY_HAVE_FLOAT16 2773 template <size_t N> 2774 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) { 2775 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; 2776 } 2777 template <size_t N> 2778 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) { 2779 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; 2780 } 2781 2782 // ------------------------------ Weak inequality 2783 2784 #if HWY_HAVE_FLOAT16 2785 template <size_t N> 2786 HWY_API Mask128<float16_t, N> operator>=(Vec128<float16_t, N> a, 2787 Vec128<float16_t, N> b) { 2788 // Work around warnings in the intrinsic definitions (passing -1 as a mask). 2789 HWY_DIAGNOSTICS(push) 2790 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 2791 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)}; 2792 HWY_DIAGNOSTICS(pop) 2793 } 2794 #endif // HWY_HAVE_FLOAT16 2795 template <size_t N> 2796 HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) { 2797 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; 2798 } 2799 template <size_t N> 2800 HWY_API Mask128<double, N> operator>=(Vec128<double, N> a, 2801 Vec128<double, N> b) { 2802 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; 2803 } 2804 2805 template <size_t N> 2806 HWY_API Mask128<int8_t, N> operator>=(Vec128<int8_t, N> a, 2807 Vec128<int8_t, N> b) { 2808 return Mask128<int8_t, N>{_mm_cmpge_epi8_mask(a.raw, b.raw)}; 2809 } 2810 template <size_t N> 2811 HWY_API Mask128<int16_t, N> operator>=(Vec128<int16_t, N> a, 2812 Vec128<int16_t, N> b) { 2813 return Mask128<int16_t, N>{_mm_cmpge_epi16_mask(a.raw, b.raw)}; 2814 } 2815 template <size_t N> 2816 HWY_API Mask128<int32_t, N> operator>=(Vec128<int32_t, N> a, 2817 Vec128<int32_t, N> b) { 2818 return Mask128<int32_t, N>{_mm_cmpge_epi32_mask(a.raw, b.raw)}; 2819 } 2820 template <size_t N> 2821 HWY_API Mask128<int64_t, N> operator>=(Vec128<int64_t, N> a, 2822 Vec128<int64_t, N> b) { 2823 return Mask128<int64_t, N>{_mm_cmpge_epi64_mask(a.raw, b.raw)}; 2824 } 2825 2826 template <size_t N> 2827 HWY_API Mask128<uint8_t, N> operator>=(Vec128<uint8_t, N> a, 2828 Vec128<uint8_t, N> b) { 2829 return Mask128<uint8_t, N>{_mm_cmpge_epu8_mask(a.raw, b.raw)}; 2830 } 2831 template <size_t N> 2832 HWY_API Mask128<uint16_t, N> operator>=(Vec128<uint16_t, N> a, 2833 Vec128<uint16_t, N> b) { 2834 return Mask128<uint16_t, N>{_mm_cmpge_epu16_mask(a.raw, b.raw)}; 2835 } 2836 template <size_t N> 2837 HWY_API Mask128<uint32_t, N> operator>=(Vec128<uint32_t, N> a, 2838 Vec128<uint32_t, N> b) { 2839 return Mask128<uint32_t, N>{_mm_cmpge_epu32_mask(a.raw, b.raw)}; 2840 } 2841 template <size_t N> 2842 HWY_API Mask128<uint64_t, N> operator>=(Vec128<uint64_t, N> a, 2843 Vec128<uint64_t, N> b) { 2844 return Mask128<uint64_t, N>{_mm_cmpge_epu64_mask(a.raw, b.raw)}; 2845 } 2846 2847 #else // AVX2 or below 2848 2849 // Comparisons fill a lane with 1-bits if the condition is true, else 0. 2850 2851 template <class DTo, typename TFrom, size_t NFrom, HWY_IF_V_SIZE_LE_D(DTo, 16)> 2852 HWY_API MFromD<DTo> RebindMask(DTo dto, Mask128<TFrom, NFrom> m) { 2853 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); 2854 const Simd<TFrom, NFrom, 0> d; 2855 return MaskFromVec(BitCast(dto, VecFromMask(d, m))); 2856 } 2857 2858 template <typename T, size_t N> 2859 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { 2860 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); 2861 return (v & bit) == bit; 2862 } 2863 2864 // ------------------------------ Equality 2865 2866 // Unsigned 2867 template <size_t N> 2868 HWY_API Mask128<uint8_t, N> operator==(Vec128<uint8_t, N> a, 2869 Vec128<uint8_t, N> b) { 2870 return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)}; 2871 } 2872 template <size_t N> 2873 HWY_API Mask128<uint16_t, N> operator==(Vec128<uint16_t, N> a, 2874 Vec128<uint16_t, N> b) { 2875 return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)}; 2876 } 2877 template <size_t N> 2878 HWY_API Mask128<uint32_t, N> operator==(Vec128<uint32_t, N> a, 2879 Vec128<uint32_t, N> b) { 2880 return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)}; 2881 } 2882 template <size_t N> 2883 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a, 2884 const Vec128<uint64_t, N> b) { 2885 #if HWY_TARGET >= HWY_SSSE3 2886 const DFromV<decltype(a)> d64; 2887 const RepartitionToNarrow<decltype(d64)> d32; 2888 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); 2889 const auto cmp64 = cmp32 & Shuffle2301(cmp32); 2890 return MaskFromVec(BitCast(d64, cmp64)); 2891 #else 2892 return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)}; 2893 #endif 2894 } 2895 2896 // Signed 2897 template <size_t N> 2898 HWY_API Mask128<int8_t, N> operator==(Vec128<int8_t, N> a, 2899 Vec128<int8_t, N> b) { 2900 return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)}; 2901 } 2902 template <size_t N> 2903 HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a, 2904 Vec128<int16_t, N> b) { 2905 return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)}; 2906 } 2907 template <size_t N> 2908 HWY_API Mask128<int32_t, N> operator==(Vec128<int32_t, N> a, 2909 Vec128<int32_t, N> b) { 2910 return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)}; 2911 } 2912 template <size_t N> 2913 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a, 2914 const Vec128<int64_t, N> b) { 2915 // Same as signed ==; avoid duplicating the SSSE3 version. 2916 const DFromV<decltype(a)> d; 2917 RebindToUnsigned<decltype(d)> du; 2918 return RebindMask(d, BitCast(du, a) == BitCast(du, b)); 2919 } 2920 2921 // Float 2922 template <size_t N> 2923 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) { 2924 return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)}; 2925 } 2926 template <size_t N> 2927 HWY_API Mask128<double, N> operator==(Vec128<double, N> a, 2928 Vec128<double, N> b) { 2929 return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)}; 2930 } 2931 2932 // ------------------------------ Inequality 2933 2934 // This cannot have T as a template argument, otherwise it is not more 2935 // specialized than rewritten operator== in C++20, leading to compile 2936 // errors: https://gcc.godbolt.org/z/xsrPhPvPT. 2937 template <size_t N> 2938 HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a, 2939 Vec128<uint8_t, N> b) { 2940 return Not(a == b); 2941 } 2942 template <size_t N> 2943 HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a, 2944 Vec128<uint16_t, N> b) { 2945 return Not(a == b); 2946 } 2947 template <size_t N> 2948 HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a, 2949 Vec128<uint32_t, N> b) { 2950 return Not(a == b); 2951 } 2952 template <size_t N> 2953 HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a, 2954 Vec128<uint64_t, N> b) { 2955 return Not(a == b); 2956 } 2957 template <size_t N> 2958 HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a, 2959 Vec128<int8_t, N> b) { 2960 return Not(a == b); 2961 } 2962 template <size_t N> 2963 HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a, 2964 Vec128<int16_t, N> b) { 2965 return Not(a == b); 2966 } 2967 template <size_t N> 2968 HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a, 2969 Vec128<int32_t, N> b) { 2970 return Not(a == b); 2971 } 2972 template <size_t N> 2973 HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a, 2974 Vec128<int64_t, N> b) { 2975 return Not(a == b); 2976 } 2977 2978 template <size_t N> 2979 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) { 2980 return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)}; 2981 } 2982 template <size_t N> 2983 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a, 2984 Vec128<double, N> b) { 2985 return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)}; 2986 } 2987 2988 // ------------------------------ Strict inequality 2989 2990 namespace detail { 2991 2992 template <size_t N> 2993 HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a, 2994 Vec128<int8_t, N> b) { 2995 return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)}; 2996 } 2997 template <size_t N> 2998 HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a, 2999 Vec128<int16_t, N> b) { 3000 return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)}; 3001 } 3002 template <size_t N> 3003 HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a, 3004 Vec128<int32_t, N> b) { 3005 return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)}; 3006 } 3007 3008 template <size_t N> 3009 HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/, 3010 const Vec128<int64_t, N> a, 3011 const Vec128<int64_t, N> b) { 3012 #if HWY_TARGET >= HWY_SSSE3 3013 // See https://stackoverflow.com/questions/65166174/: 3014 const DFromV<decltype(a)> d; 3015 const RepartitionToNarrow<decltype(d)> d32; 3016 const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw}; 3017 const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw}; 3018 // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper: 3019 // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0. 3020 const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw; 3021 // Duplicate upper to lower half. 3022 return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))}; 3023 #else 3024 return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2 3025 #endif 3026 } 3027 3028 template <typename T, size_t N> 3029 HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a, 3030 Vec128<T, N> b) { 3031 const DFromV<decltype(a)> du; 3032 const RebindToSigned<decltype(du)> di; 3033 const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1); 3034 const auto sa = BitCast(di, Xor(a, msb)); 3035 const auto sb = BitCast(di, Xor(b, msb)); 3036 return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); 3037 } 3038 3039 template <size_t N> 3040 HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a, 3041 Vec128<float, N> b) { 3042 return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)}; 3043 } 3044 template <size_t N> 3045 HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a, 3046 Vec128<double, N> b) { 3047 return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)}; 3048 } 3049 3050 } // namespace detail 3051 3052 template <typename T, size_t N> 3053 HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { 3054 return detail::Gt(hwy::TypeTag<T>(), a, b); 3055 } 3056 3057 // ------------------------------ Weak inequality 3058 3059 namespace detail { 3060 template <typename T, size_t N> 3061 HWY_INLINE Mask128<T, N> Ge(hwy::SignedTag tag, Vec128<T, N> a, 3062 Vec128<T, N> b) { 3063 return Not(Gt(tag, b, a)); 3064 } 3065 3066 template <typename T, size_t N> 3067 HWY_INLINE Mask128<T, N> Ge(hwy::UnsignedTag tag, Vec128<T, N> a, 3068 Vec128<T, N> b) { 3069 return Not(Gt(tag, b, a)); 3070 } 3071 3072 template <size_t N> 3073 HWY_INLINE Mask128<float, N> Ge(hwy::FloatTag /*tag*/, Vec128<float, N> a, 3074 Vec128<float, N> b) { 3075 return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)}; 3076 } 3077 template <size_t N> 3078 HWY_INLINE Mask128<double, N> Ge(hwy::FloatTag /*tag*/, Vec128<double, N> a, 3079 Vec128<double, N> b) { 3080 return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)}; 3081 } 3082 3083 } // namespace detail 3084 3085 template <typename T, size_t N> 3086 HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { 3087 return detail::Ge(hwy::TypeTag<T>(), a, b); 3088 } 3089 3090 #endif // HWY_TARGET <= HWY_AVX3 3091 3092 // ------------------------------ Reversed comparisons 3093 3094 template <typename T, size_t N> 3095 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) { 3096 return b > a; 3097 } 3098 3099 template <typename T, size_t N> 3100 HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) { 3101 return b >= a; 3102 } 3103 3104 // ------------------------------ Iota (Load) 3105 3106 namespace detail { 3107 3108 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 3109 HWY_INLINE VFromD<D> Iota0(D /*d*/) { 3110 return VFromD<D>{_mm_set_epi8( 3111 static_cast<char>(15), static_cast<char>(14), static_cast<char>(13), 3112 static_cast<char>(12), static_cast<char>(11), static_cast<char>(10), 3113 static_cast<char>(9), static_cast<char>(8), static_cast<char>(7), 3114 static_cast<char>(6), static_cast<char>(5), static_cast<char>(4), 3115 static_cast<char>(3), static_cast<char>(2), static_cast<char>(1), 3116 static_cast<char>(0))}; 3117 } 3118 3119 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)> 3120 HWY_INLINE VFromD<D> Iota0(D /*d*/) { 3121 return VFromD<D>{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4}, 3122 int16_t{3}, int16_t{2}, int16_t{1}, 3123 int16_t{0})}; 3124 } 3125 3126 #if HWY_HAVE_FLOAT16 3127 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> 3128 HWY_INLINE VFromD<D> Iota0(D /*d*/) { 3129 return VFromD<D>{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5}, 3130 float16_t{4}, float16_t{3}, float16_t{2}, 3131 float16_t{1}, float16_t{0})}; 3132 } 3133 #endif // HWY_HAVE_FLOAT16 3134 3135 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> 3136 HWY_INLINE VFromD<D> Iota0(D /*d*/) { 3137 return VFromD<D>{ 3138 _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})}; 3139 } 3140 3141 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> 3142 HWY_INLINE VFromD<D> Iota0(D /*d*/) { 3143 return VFromD<D>{_mm_set_epi64x(int64_t{1}, int64_t{0})}; 3144 } 3145 3146 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 3147 HWY_INLINE VFromD<D> Iota0(D /*d*/) { 3148 return VFromD<D>{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)}; 3149 } 3150 3151 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 3152 HWY_INLINE VFromD<D> Iota0(D /*d*/) { 3153 return VFromD<D>{_mm_set_pd(1.0, 0.0)}; 3154 } 3155 3156 #if HWY_COMPILER_MSVC 3157 template <class V, HWY_IF_V_SIZE_V(V, 1)> 3158 static HWY_INLINE V MaskOutVec128Iota(V v) { 3159 const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)}; 3160 return v & mask_out_mask; 3161 } 3162 template <class V, HWY_IF_V_SIZE_V(V, 2)> 3163 static HWY_INLINE V MaskOutVec128Iota(V v) { 3164 #if HWY_TARGET <= HWY_SSE4 3165 return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)}; 3166 #else 3167 const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)}; 3168 return v & mask_out_mask; 3169 #endif 3170 } 3171 template <class V, HWY_IF_V_SIZE_V(V, 4)> 3172 static HWY_INLINE V MaskOutVec128Iota(V v) { 3173 const DFromV<decltype(v)> d; 3174 const Repartition<float, decltype(d)> df; 3175 using VF = VFromD<decltype(df)>; 3176 return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)}); 3177 } 3178 template <class V, HWY_IF_V_SIZE_V(V, 8)> 3179 static HWY_INLINE V MaskOutVec128Iota(V v) { 3180 const DFromV<decltype(v)> d; 3181 const RebindToUnsigned<decltype(d)> du; 3182 using VU = VFromD<decltype(du)>; 3183 return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)}); 3184 } 3185 template <class V, HWY_IF_V_SIZE_GT_V(V, 8)> 3186 static HWY_INLINE V MaskOutVec128Iota(V v) { 3187 return v; 3188 } 3189 #endif 3190 3191 } // namespace detail 3192 3193 template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)> 3194 HWY_API VFromD<D> Iota(D d, const T2 first) { 3195 const auto result_iota = 3196 detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first)); 3197 #if HWY_COMPILER_MSVC 3198 return detail::MaskOutVec128Iota(result_iota); 3199 #else 3200 return result_iota; 3201 #endif 3202 } 3203 3204 // ------------------------------ FirstN (Iota, Lt) 3205 3206 template <class D, class M = MFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> 3207 HWY_API M FirstN(D d, size_t num) { 3208 constexpr size_t kN = MaxLanes(d); 3209 // For AVX3, this ensures `num` <= 255 as required by bzhi, which only looks 3210 // at the lower 8 bits; for AVX2 and below, this ensures `num` fits in TI. 3211 num = HWY_MIN(num, kN); 3212 #if HWY_TARGET <= HWY_AVX3 3213 #if HWY_ARCH_X86_64 3214 const uint64_t all = (1ull << kN) - 1; 3215 return M::FromBits(_bzhi_u64(all, num)); 3216 #else 3217 const uint32_t all = static_cast<uint32_t>((1ull << kN) - 1); 3218 return M::FromBits(_bzhi_u32(all, static_cast<uint32_t>(num))); 3219 #endif // HWY_ARCH_X86_64 3220 #else // HWY_TARGET > HWY_AVX3 3221 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper. 3222 using TI = TFromD<decltype(di)>; 3223 return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num))); 3224 #endif // HWY_TARGET <= HWY_AVX3 3225 } 3226 3227 // ------------------------------ InterleaveLower 3228 3229 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides 3230 // the least-significant lane) and "b". To concatenate two half-width integers 3231 // into one, use ZipLower/Upper instead (also works with scalar). 3232 3233 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 3234 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 3235 return Vec128<T, N>{_mm_unpacklo_epi8(a.raw, b.raw)}; 3236 } 3237 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 3238 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 3239 const DFromV<decltype(a)> d; 3240 const RebindToUnsigned<decltype(d)> du; 3241 using VU = VFromD<decltype(du)>; // for float16_t 3242 return BitCast( 3243 d, VU{_mm_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); 3244 } 3245 template <typename T, size_t N, HWY_IF_UI32(T)> 3246 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 3247 return Vec128<T, N>{_mm_unpacklo_epi32(a.raw, b.raw)}; 3248 } 3249 template <typename T, size_t N, HWY_IF_UI64(T)> 3250 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 3251 return Vec128<T, N>{_mm_unpacklo_epi64(a.raw, b.raw)}; 3252 } 3253 3254 template <size_t N> 3255 HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a, 3256 Vec128<float, N> b) { 3257 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)}; 3258 } 3259 template <size_t N> 3260 HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a, 3261 Vec128<double, N> b) { 3262 return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)}; 3263 } 3264 3265 // Generic for all vector lengths. 3266 template <class D> 3267 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { 3268 return InterleaveLower(a, b); 3269 } 3270 3271 // ================================================== MEMORY (2) 3272 3273 // ------------------------------ MaskedLoad 3274 3275 #if HWY_TARGET <= HWY_AVX3 3276 3277 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 3278 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, 3279 const TFromD<D>* HWY_RESTRICT p) { 3280 return VFromD<D>{_mm_maskz_loadu_epi8(m.raw, p)}; 3281 } 3282 3283 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 3284 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, 3285 const TFromD<D>* HWY_RESTRICT p) { 3286 const RebindToUnsigned<decltype(d)> du; // for float16_t 3287 return BitCast(d, VFromD<decltype(du)>{_mm_maskz_loadu_epi16(m.raw, p)}); 3288 } 3289 3290 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> 3291 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, 3292 const TFromD<D>* HWY_RESTRICT p) { 3293 return VFromD<D>{_mm_maskz_loadu_epi32(m.raw, p)}; 3294 } 3295 3296 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> 3297 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, 3298 const TFromD<D>* HWY_RESTRICT p) { 3299 return VFromD<D>{_mm_maskz_loadu_epi64(m.raw, p)}; 3300 } 3301 3302 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 3303 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, 3304 const float* HWY_RESTRICT p) { 3305 return VFromD<D>{_mm_maskz_loadu_ps(m.raw, p)}; 3306 } 3307 3308 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 3309 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, 3310 const double* HWY_RESTRICT p) { 3311 return VFromD<D>{_mm_maskz_loadu_pd(m.raw, p)}; 3312 } 3313 3314 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 3315 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, 3316 const TFromD<D>* HWY_RESTRICT p) { 3317 return VFromD<D>{_mm_mask_loadu_epi8(v.raw, m.raw, p)}; 3318 } 3319 3320 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 3321 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, 3322 const TFromD<D>* HWY_RESTRICT p) { 3323 const RebindToUnsigned<decltype(d)> du; // for float16_t 3324 return BitCast(d, VFromD<decltype(du)>{ 3325 _mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)}); 3326 } 3327 3328 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> 3329 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, 3330 const TFromD<D>* HWY_RESTRICT p) { 3331 return VFromD<D>{_mm_mask_loadu_epi32(v.raw, m.raw, p)}; 3332 } 3333 3334 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> 3335 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, 3336 const TFromD<D>* HWY_RESTRICT p) { 3337 return VFromD<D>{_mm_mask_loadu_epi64(v.raw, m.raw, p)}; 3338 } 3339 3340 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 3341 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, 3342 const float* HWY_RESTRICT p) { 3343 return VFromD<D>{_mm_mask_loadu_ps(v.raw, m.raw, p)}; 3344 } 3345 3346 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 3347 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */, 3348 const double* HWY_RESTRICT p) { 3349 return VFromD<D>{_mm_mask_loadu_pd(v.raw, m.raw, p)}; 3350 } 3351 3352 #elif HWY_TARGET == HWY_AVX2 3353 3354 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> 3355 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, 3356 const TFromD<D>* HWY_RESTRICT p) { 3357 auto p_p = reinterpret_cast<const int*>(p); // NOLINT 3358 return VFromD<D>{_mm_maskload_epi32(p_p, m.raw)}; 3359 } 3360 3361 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> 3362 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D /* tag */, 3363 const TFromD<D>* HWY_RESTRICT p) { 3364 auto p_p = reinterpret_cast<const long long*>(p); // NOLINT 3365 return VFromD<D>{_mm_maskload_epi64(p_p, m.raw)}; 3366 } 3367 3368 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 3369 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const float* HWY_RESTRICT p) { 3370 const RebindToSigned<decltype(d)> di; 3371 return VFromD<D>{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)}; 3372 } 3373 3374 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 3375 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const double* HWY_RESTRICT p) { 3376 const RebindToSigned<decltype(d)> di; 3377 return VFromD<D>{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)}; 3378 } 3379 3380 // There is no maskload_epi8/16, so blend instead. 3381 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), 3382 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> 3383 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, 3384 const TFromD<D>* HWY_RESTRICT p) { 3385 return IfThenElseZero(m, LoadU(d, p)); 3386 } 3387 3388 #else // <= SSE4 3389 3390 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). 3391 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 3392 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, 3393 const TFromD<D>* HWY_RESTRICT p) { 3394 return IfThenElseZero(m, LoadU(d, p)); 3395 } 3396 3397 #endif 3398 3399 // ------------------------------ MaskedLoadOr 3400 3401 #if HWY_TARGET > HWY_AVX3 // else: native 3402 3403 // Generic for all vector lengths. 3404 template <class D> 3405 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, 3406 const TFromD<D>* HWY_RESTRICT p) { 3407 return IfThenElse(m, LoadU(d, p), v); 3408 } 3409 3410 #endif // HWY_TARGET > HWY_AVX3 3411 3412 // ------------------------------ LoadN (InterleaveLower) 3413 3414 #if HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT 3415 3416 #ifdef HWY_NATIVE_LOAD_N 3417 #undef HWY_NATIVE_LOAD_N 3418 #else 3419 #define HWY_NATIVE_LOAD_N 3420 #endif 3421 3422 // Generic for all vector lengths. 3423 template <class D, HWY_IF_T_SIZE_ONE_OF_D( 3424 D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) | 3425 (1 << 4) | (1 << 8))> 3426 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, 3427 size_t num_lanes) { 3428 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> 3429 d_full; 3430 return ResizeBitCast(d, MaskedLoad(FirstN(d_full, num_lanes), d_full, p)); 3431 } 3432 3433 // Generic for all vector lengths. 3434 template <class D, HWY_IF_T_SIZE_ONE_OF_D( 3435 D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) | 3436 (1 << 4) | (1 << 8))> 3437 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p, 3438 size_t num_lanes) { 3439 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> 3440 d_full; 3441 return ResizeBitCast(d, MaskedLoadOr(ResizeBitCast(d_full, no), 3442 FirstN(d_full, num_lanes), d_full, p)); 3443 } 3444 3445 #if HWY_TARGET > HWY_AVX3 3446 namespace detail { 3447 3448 // 'Leading' means the part that fits in 32-bit lanes. With 2-byte vectors, 3449 // there are none, so return the remainder (v_trailing). 3450 template <class D, HWY_IF_V_SIZE_LE_D(D, 2)> 3451 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN( 3452 VFromD<D> /*load_mask*/, D /*d*/, const TFromD<D>* HWY_RESTRICT /*p*/, 3453 VFromD<D> v_trailing) { 3454 return v_trailing; 3455 } 3456 3457 template <class D, HWY_IF_V_SIZE_LE_D(D, 2)> 3458 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr( 3459 VFromD<D> /*no*/, VFromD<D> /*load_mask*/, D /*d*/, 3460 const TFromD<D>* HWY_RESTRICT /*p*/, VFromD<D> v_trailing) { 3461 return v_trailing; 3462 } 3463 3464 template <class D, HWY_IF_V_SIZE_GT_D(D, 2)> 3465 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingN(VFromD<D> load_mask, D d, 3466 const TFromD<D>* HWY_RESTRICT p, 3467 VFromD<D> v_trailing) { 3468 using DI32 = Repartition<int32_t, D>; 3469 const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full; 3470 3471 // ResizeBitCast of load_mask to di32 is okay below if 3472 // d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past 3473 // the first (lowest-index) lanes of load_mask.raw will have already been 3474 // zeroed out by FirstN. 3475 return ResizeBitCast( 3476 d, IfNegativeThenElse( 3477 ResizeBitCast(di32_full, load_mask), 3478 MaskedLoad(MaskFromVec(ResizeBitCast(di32_full, load_mask)), 3479 di32_full, reinterpret_cast<const int32_t*>(p)), 3480 ResizeBitCast(di32_full, v_trailing))); 3481 } 3482 3483 template <class D, HWY_IF_V_SIZE_GT_D(D, 2)> 3484 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadLeadingNOr(VFromD<D> no, 3485 VFromD<D> load_mask, D d, 3486 const TFromD<D>* HWY_RESTRICT p, 3487 VFromD<D> v_trailing) { 3488 using DI32 = Repartition<int32_t, D>; 3489 const FixedTag<int32_t, HWY_MAX(HWY_MAX_LANES_D(DI32), 4)> di32_full; 3490 3491 // ResizeBitCast of load_mask to di32 is okay below if 3492 // d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past 3493 // the first (lowest-index) lanes of load_mask.raw will have already been 3494 // zeroed out by FirstN. 3495 return ResizeBitCast( 3496 d, IfNegativeThenElse( 3497 ResizeBitCast(di32_full, load_mask), 3498 MaskedLoadOr(ResizeBitCast(di32_full, no), 3499 MaskFromVec(ResizeBitCast(di32_full, load_mask)), 3500 di32_full, reinterpret_cast<const int32_t*>(p)), 3501 ResizeBitCast(di32_full, v_trailing))); 3502 } 3503 3504 // Single lane: load or default value. 3505 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), 3506 HWY_IF_LANES_D(D, 1)> 3507 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d, 3508 const TFromD<D>* HWY_RESTRICT p, 3509 size_t num_lanes) { 3510 return (num_lanes > 0) ? LoadU(d, p) : Zero(d); 3511 } 3512 3513 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), 3514 HWY_IF_LANES_D(D, 1)> 3515 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( 3516 VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p, 3517 size_t num_lanes) { 3518 return (num_lanes > 0) ? LoadU(d, p) : no; 3519 } 3520 3521 // Two lanes: load 1, 2, or default. 3522 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)> 3523 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> /*load_mask*/, D d, 3524 const TFromD<D>* HWY_RESTRICT p, 3525 size_t num_lanes) { 3526 if (num_lanes > 1) { 3527 return LoadU(d, p); 3528 } else { 3529 const FixedTag<TFromD<D>, 1> d1; 3530 return (num_lanes == 1) ? ResizeBitCast(d, LoadU(d1, p)) : Zero(d); 3531 } 3532 } 3533 3534 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_D(D, 2)> 3535 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( 3536 VFromD<D> no, VFromD<D> /*load_mask*/, D d, const TFromD<D>* HWY_RESTRICT p, 3537 size_t num_lanes) { 3538 if (num_lanes > 1) { 3539 return LoadU(d, p); 3540 } else { 3541 if (num_lanes == 0) return no; 3542 // Load one, upper lane is default. 3543 const FixedTag<TFromD<D>, 1> d1; 3544 return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no); 3545 } 3546 } 3547 3548 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)> 3549 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d, 3550 const TFromD<D>* HWY_RESTRICT p, 3551 size_t num_lanes) { 3552 const size_t trailing_n = num_lanes & 3; 3553 if (trailing_n == 0) return Zero(d); 3554 3555 VFromD<D> v_trailing = And(load_mask, Set(d, p[num_lanes - 1])); 3556 3557 if ((trailing_n & 2) != 0) { 3558 const Repartition<int16_t, decltype(d)> di16; 3559 int16_t i16_bits; 3560 CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits); 3561 v_trailing = BitCast( 3562 d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits), 3563 BitCast(di16, v_trailing))); 3564 } 3565 3566 return v_trailing; 3567 } 3568 3569 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 2)> 3570 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( 3571 VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p, 3572 size_t num_lanes) { 3573 const size_t trailing_n = num_lanes & 3; 3574 if (trailing_n == 0) return no; 3575 3576 VFromD<D> v_trailing = IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no); 3577 3578 if ((trailing_n & 2) != 0) { 3579 const Repartition<int16_t, decltype(d)> di16; 3580 int16_t i16_bits; 3581 CopyBytes<sizeof(int16_t)>(p + num_lanes - trailing_n, &i16_bits); 3582 v_trailing = BitCast( 3583 d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits), 3584 BitCast(di16, v_trailing))); 3585 } 3586 3587 return v_trailing; 3588 } 3589 3590 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)> 3591 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingN(VFromD<D> load_mask, D d, 3592 const TFromD<D>* HWY_RESTRICT p, 3593 size_t num_lanes) { 3594 if ((num_lanes & 1) != 0) { 3595 return And(load_mask, Set(d, p[num_lanes - 1])); 3596 } else { 3597 return Zero(d); 3598 } 3599 } 3600 3601 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)> 3602 HWY_INLINE VFromD<D> AVX2UIF8Or16LoadTrailingNOr( 3603 VFromD<D> no, VFromD<D> load_mask, D d, const TFromD<D>* HWY_RESTRICT p, 3604 size_t num_lanes) { 3605 if ((num_lanes & 1) != 0) { 3606 return IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no); 3607 } else { 3608 return no; 3609 } 3610 } 3611 3612 } // namespace detail 3613 3614 // Generic for all vector lengths. 3615 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> 3616 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, size_t N) { 3617 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> 3618 d_full; 3619 3620 const VFromD<D> load_mask = 3621 ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N))); 3622 const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D)); 3623 const VFromD<D> v_trailing = 3624 detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_lanes); 3625 3626 #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD 3627 if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) && 3628 num_lanes < (4 / sizeof(TFromD<D>))) { 3629 return v_trailing; 3630 } 3631 #endif 3632 3633 return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing); 3634 } 3635 3636 // Generic for all vector lengths. 3637 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> 3638 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p, 3639 size_t N) { 3640 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> 3641 d_full; 3642 3643 const VFromD<D> load_mask = 3644 ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N))); 3645 const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D)); 3646 const VFromD<D> v_trailing = 3647 detail::AVX2UIF8Or16LoadTrailingNOr(no, load_mask, d, p, num_lanes); 3648 3649 #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD 3650 if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD<D>))) && 3651 num_lanes < (4 / sizeof(TFromD<D>))) { 3652 return v_trailing; 3653 } 3654 #endif 3655 3656 return detail::AVX2UIF8Or16LoadLeadingNOr(no, load_mask, d, p, v_trailing); 3657 } 3658 3659 #endif // HWY_TARGET > HWY_AVX3 3660 #endif // HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT 3661 3662 // ------------------------------ BlendedStore 3663 3664 namespace detail { 3665 3666 // There is no maskload_epi8/16 with which we could safely implement 3667 // BlendedStore. Manual blending is also unsafe because loading a full vector 3668 // that crosses the array end causes asan faults. Resort to scalar code; the 3669 // caller should instead use memcpy, assuming m is FirstN(d, n). 3670 template <class D> 3671 HWY_API void ScalarMaskedStore(VFromD<D> v, MFromD<D> m, D d, 3672 TFromD<D>* HWY_RESTRICT p) { 3673 const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t. 3674 using TI = TFromD<decltype(di)>; 3675 alignas(16) TI buf[MaxLanes(d)]; 3676 alignas(16) TI mask[MaxLanes(d)]; 3677 Store(BitCast(di, v), di, buf); 3678 Store(BitCast(di, VecFromMask(d, m)), di, mask); 3679 for (size_t i = 0; i < MaxLanes(d); ++i) { 3680 if (mask[i]) { 3681 CopySameSize(buf + i, p + i); 3682 } 3683 } 3684 } 3685 } // namespace detail 3686 3687 #if HWY_TARGET <= HWY_AVX3 3688 3689 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 3690 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, 3691 TFromD<D>* HWY_RESTRICT p) { 3692 _mm_mask_storeu_epi8(p, m.raw, v.raw); 3693 } 3694 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 3695 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, 3696 TFromD<D>* HWY_RESTRICT p) { 3697 const RebindToUnsigned<decltype(d)> du; // for float16_t 3698 _mm_mask_storeu_epi16(reinterpret_cast<uint16_t*>(p), RebindMask(du, m).raw, 3699 BitCast(du, v).raw); 3700 } 3701 3702 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)> 3703 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, 3704 TFromD<D>* HWY_RESTRICT p) { 3705 auto pi = reinterpret_cast<int*>(p); // NOLINT 3706 _mm_mask_storeu_epi32(pi, m.raw, v.raw); 3707 } 3708 3709 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)> 3710 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D /* tag */, 3711 TFromD<D>* HWY_RESTRICT p) { 3712 auto pi = reinterpret_cast<long long*>(p); // NOLINT 3713 _mm_mask_storeu_epi64(pi, m.raw, v.raw); 3714 } 3715 3716 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 3717 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, float* HWY_RESTRICT p) { 3718 _mm_mask_storeu_ps(p, m.raw, v.raw); 3719 } 3720 3721 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 3722 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D, double* HWY_RESTRICT p) { 3723 _mm_mask_storeu_pd(p, m.raw, v.raw); 3724 } 3725 3726 #elif HWY_TARGET == HWY_AVX2 3727 3728 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), 3729 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> 3730 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, 3731 TFromD<D>* HWY_RESTRICT p) { 3732 detail::ScalarMaskedStore(v, m, d, p); 3733 } 3734 3735 namespace detail { 3736 3737 template <class D, class V, class M, HWY_IF_UI32_D(D)> 3738 HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) { 3739 auto pi = reinterpret_cast<int*>(p); // NOLINT 3740 _mm_maskstore_epi32(pi, m.raw, v.raw); 3741 } 3742 3743 template <class D, class V, class M, HWY_IF_UI64_D(D)> 3744 HWY_INLINE void NativeBlendedStore(V v, M m, TFromD<D>* HWY_RESTRICT p) { 3745 auto pi = reinterpret_cast<long long*>(p); // NOLINT 3746 _mm_maskstore_epi64(pi, m.raw, v.raw); 3747 } 3748 3749 template <class D, class V, class M, HWY_IF_F32_D(D)> 3750 HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) { 3751 _mm_maskstore_ps(p, m.raw, v.raw); 3752 } 3753 3754 template <class D, class V, class M, HWY_IF_F64_D(D)> 3755 HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) { 3756 _mm_maskstore_pd(p, m.raw, v.raw); 3757 } 3758 3759 } // namespace detail 3760 3761 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), 3762 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> 3763 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, 3764 TFromD<D>* HWY_RESTRICT p) { 3765 const RebindToSigned<decltype(d)> di; 3766 // For partial vectors, avoid writing other lanes by zeroing their mask. 3767 if (d.MaxBytes() < 16) { 3768 const Full128<TFromD<D>> dfull; 3769 const Mask128<TFromD<D>> mfull{m.raw}; 3770 m = MFromD<D>{And(mfull, FirstN(dfull, MaxLanes(d))).raw}; 3771 } 3772 3773 // Float/double require, and unsigned ints tolerate, signed int masks. 3774 detail::NativeBlendedStore<D>(v, RebindMask(di, m), p); 3775 } 3776 3777 #else // <= SSE4 3778 3779 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 3780 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, 3781 TFromD<D>* HWY_RESTRICT p) { 3782 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). 3783 detail::ScalarMaskedStore(v, m, d, p); 3784 } 3785 3786 #endif // SSE4 3787 3788 // ================================================== ARITHMETIC 3789 3790 // ------------------------------ Addition 3791 3792 // Unsigned 3793 template <size_t N> 3794 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a, 3795 const Vec128<uint8_t, N> b) { 3796 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)}; 3797 } 3798 template <size_t N> 3799 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a, 3800 const Vec128<uint16_t, N> b) { 3801 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)}; 3802 } 3803 template <size_t N> 3804 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a, 3805 const Vec128<uint32_t, N> b) { 3806 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)}; 3807 } 3808 template <size_t N> 3809 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a, 3810 const Vec128<uint64_t, N> b) { 3811 return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)}; 3812 } 3813 3814 // Signed 3815 template <size_t N> 3816 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a, 3817 const Vec128<int8_t, N> b) { 3818 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)}; 3819 } 3820 template <size_t N> 3821 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a, 3822 const Vec128<int16_t, N> b) { 3823 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)}; 3824 } 3825 template <size_t N> 3826 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a, 3827 const Vec128<int32_t, N> b) { 3828 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)}; 3829 } 3830 template <size_t N> 3831 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a, 3832 const Vec128<int64_t, N> b) { 3833 return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)}; 3834 } 3835 3836 // Float 3837 #if HWY_HAVE_FLOAT16 3838 template <size_t N> 3839 HWY_API Vec128<float16_t, N> operator+(const Vec128<float16_t, N> a, 3840 const Vec128<float16_t, N> b) { 3841 return Vec128<float16_t, N>{_mm_add_ph(a.raw, b.raw)}; 3842 } 3843 #endif // HWY_HAVE_FLOAT16 3844 template <size_t N> 3845 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a, 3846 const Vec128<float, N> b) { 3847 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)}; 3848 } 3849 template <size_t N> 3850 HWY_API Vec128<double, N> operator+(const Vec128<double, N> a, 3851 const Vec128<double, N> b) { 3852 return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)}; 3853 } 3854 3855 // ------------------------------ Subtraction 3856 3857 // Unsigned 3858 template <size_t N> 3859 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a, 3860 const Vec128<uint8_t, N> b) { 3861 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; 3862 } 3863 template <size_t N> 3864 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a, 3865 Vec128<uint16_t, N> b) { 3866 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; 3867 } 3868 template <size_t N> 3869 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a, 3870 const Vec128<uint32_t, N> b) { 3871 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; 3872 } 3873 template <size_t N> 3874 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a, 3875 const Vec128<uint64_t, N> b) { 3876 return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; 3877 } 3878 3879 // Signed 3880 template <size_t N> 3881 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a, 3882 const Vec128<int8_t, N> b) { 3883 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)}; 3884 } 3885 template <size_t N> 3886 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a, 3887 const Vec128<int16_t, N> b) { 3888 return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)}; 3889 } 3890 template <size_t N> 3891 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a, 3892 const Vec128<int32_t, N> b) { 3893 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)}; 3894 } 3895 template <size_t N> 3896 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a, 3897 const Vec128<int64_t, N> b) { 3898 return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)}; 3899 } 3900 3901 // Float 3902 #if HWY_HAVE_FLOAT16 3903 template <size_t N> 3904 HWY_API Vec128<float16_t, N> operator-(const Vec128<float16_t, N> a, 3905 const Vec128<float16_t, N> b) { 3906 return Vec128<float16_t, N>{_mm_sub_ph(a.raw, b.raw)}; 3907 } 3908 #endif // HWY_HAVE_FLOAT16 3909 template <size_t N> 3910 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a, 3911 const Vec128<float, N> b) { 3912 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)}; 3913 } 3914 template <size_t N> 3915 HWY_API Vec128<double, N> operator-(const Vec128<double, N> a, 3916 const Vec128<double, N> b) { 3917 return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)}; 3918 } 3919 3920 // ------------------------------ AddSub 3921 3922 #if HWY_TARGET <= HWY_SSSE3 3923 3924 #undef HWY_IF_ADDSUB_V 3925 #define HWY_IF_ADDSUB_V(V) \ 3926 HWY_IF_V_SIZE_GT_V( \ 3927 V, ((hwy::IsFloat3264<TFromV<V>>()) ? 32 : sizeof(TFromV<V>))) 3928 3929 template <size_t N, HWY_IF_LANES_GT(N, 1)> 3930 HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) { 3931 return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)}; 3932 } 3933 HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) { 3934 return Vec128<double>{_mm_addsub_pd(a.raw, b.raw)}; 3935 } 3936 #endif // HWY_TARGET <= HWY_SSSE3 3937 3938 // ------------------------------ PairwiseAdd128/PairwiseSub128 3939 3940 // Need to use the default implementation of PairwiseAdd128/PairwiseSub128 in 3941 // generic_ops-inl.h for U8/I8/F16/I64/U64 vectors and 64-byte vectors 3942 3943 #if HWY_TARGET <= HWY_SSSE3 3944 3945 #undef HWY_IF_PAIRWISE_ADD_128_D 3946 #undef HWY_IF_PAIRWISE_SUB_128_D 3947 #define HWY_IF_PAIRWISE_ADD_128_D(D) \ 3948 hwy::EnableIf<( \ 3949 HWY_MAX_LANES_D(D) > (32 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) || \ 3950 (HWY_MAX_LANES_D(D) > (8 / sizeof(hwy::HWY_NAMESPACE::TFromD<D>)) && \ 3951 !(hwy::IsSameEither<hwy::HWY_NAMESPACE::TFromD<D>, int16_t, \ 3952 uint16_t>() || \ 3953 sizeof(hwy::HWY_NAMESPACE::TFromD<D>) == 4 || \ 3954 hwy::IsSame<hwy::HWY_NAMESPACE::TFromD<D>, double>())))>* = nullptr 3955 #define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_PAIRWISE_ADD_128_D(D) 3956 3957 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)> 3958 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3959 return VFromD<D>{_mm_hadd_epi16(a.raw, b.raw)}; 3960 } 3961 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI16_D(D)> 3962 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3963 const DFromV<decltype(a)> d; 3964 const RebindToSigned<decltype(d)> di; 3965 return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi16(a.raw, b.raw)}))); 3966 } 3967 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> 3968 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3969 return VFromD<D>{_mm_hadd_epi32(a.raw, b.raw)}; 3970 } 3971 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> 3972 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3973 const DFromV<decltype(a)> d; 3974 const RebindToSigned<decltype(d)> di; 3975 return BitCast(d, Neg(BitCast(di, VFromD<D>{_mm_hsub_epi32(a.raw, b.raw)}))); 3976 } 3977 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 3978 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3979 return VFromD<D>{_mm_hadd_ps(a.raw, b.raw)}; 3980 } 3981 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 3982 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3983 return Neg(VFromD<D>{_mm_hsub_ps(a.raw, b.raw)}); 3984 } 3985 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 3986 HWY_API VFromD<D> PairwiseAdd128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3987 return VFromD<D>{_mm_hadd_pd(a.raw, b.raw)}; 3988 } 3989 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 3990 HWY_API VFromD<D> PairwiseSub128(D /*d*/, VFromD<D> a, VFromD<D> b) { 3991 return Neg(VFromD<D>{_mm_hsub_pd(a.raw, b.raw)}); 3992 } 3993 3994 #endif // HWY_TARGET <= HWY_SSSE3 3995 3996 // ------------------------------ SumsOf8 3997 template <size_t N> 3998 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) { 3999 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())}; 4000 } 4001 4002 // Generic for all vector lengths 4003 template <class V, HWY_IF_I8_D(DFromV<V>)> 4004 HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) { 4005 const DFromV<decltype(v)> d; 4006 const RebindToUnsigned<decltype(d)> du; 4007 const Repartition<int64_t, decltype(d)> di64; 4008 4009 // Adjust the values of v to be in the 0..255 range by adding 128 to each lane 4010 // of v (which is the same as an bitwise XOR of each i8 lane by 128) and then 4011 // bitcasting the Xor result to an u8 vector. 4012 const auto v_adj = BitCast(du, Xor(v, SignBit(d))); 4013 4014 // Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj) 4015 // operation to account for the adjustment made above. 4016 return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024}); 4017 } 4018 4019 #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF 4020 #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF 4021 #else 4022 #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF 4023 #endif 4024 4025 template <size_t N> 4026 HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a, 4027 const Vec128<uint8_t, N> b) { 4028 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)}; 4029 } 4030 4031 // Generic for all vector lengths 4032 template <class V, HWY_IF_I8_D(DFromV<V>)> 4033 HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) { 4034 const DFromV<V> d; 4035 const RebindToUnsigned<decltype(d)> du; 4036 const RepartitionToWideX3<decltype(d)> di64; 4037 4038 // Adjust the values of a and b to be in the 0..255 range by adding 128 to 4039 // each lane of a and b (which is the same as an bitwise XOR of each i8 lane 4040 // by 128) and then bitcasting the results of the Xor operations to u8 4041 // vectors. 4042 const auto i8_msb = SignBit(d); 4043 const auto a_adj = BitCast(du, Xor(a, i8_msb)); 4044 const auto b_adj = BitCast(du, Xor(b, i8_msb)); 4045 4046 // The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcasted to an 4047 // i64 vector as |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true 4048 return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj)); 4049 } 4050 4051 // ------------------------------ SumsOf4 4052 #if HWY_TARGET <= HWY_AVX3 4053 namespace detail { 4054 4055 template <size_t N> 4056 HWY_INLINE Vec128<uint32_t, (N + 3) / 4> SumsOf4( 4057 hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, 4058 Vec128<uint8_t, N> v) { 4059 const DFromV<decltype(v)> d; 4060 4061 // _mm_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be 4062 // zeroed out and the sums of the 4 consecutive lanes are already in the 4063 // even uint16_t lanes of the _mm_maskz_dbsad_epu8 result. 4064 return Vec128<uint32_t, (N + 3) / 4>{ 4065 _mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)}; 4066 } 4067 4068 // detail::SumsOf4 for Vec128<int8_t, N> on AVX3 is implemented in x86_512-inl.h 4069 4070 } // namespace detail 4071 #endif // HWY_TARGET <= HWY_AVX3 4072 4073 // ------------------------------ SumsOfAdjQuadAbsDiff 4074 4075 #if HWY_TARGET <= HWY_SSE4 4076 #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF 4077 #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF 4078 #else 4079 #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF 4080 #endif 4081 4082 template <int kAOffset, int kBOffset, size_t N> 4083 HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfAdjQuadAbsDiff( 4084 Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 4085 static_assert(0 <= kAOffset && kAOffset <= 1, 4086 "kAOffset must be between 0 and 1"); 4087 static_assert(0 <= kBOffset && kBOffset <= 3, 4088 "kBOffset must be between 0 and 3"); 4089 return Vec128<uint16_t, (N + 1) / 2>{ 4090 _mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)}; 4091 } 4092 4093 // Generic for all vector lengths 4094 template <int kAOffset, int kBOffset, class V, HWY_IF_I8_D(DFromV<V>)> 4095 HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfAdjQuadAbsDiff(V a, V b) { 4096 const DFromV<decltype(a)> d; 4097 const RebindToUnsigned<decltype(d)> du; 4098 const RepartitionToWide<decltype(d)> dw; 4099 4100 // Adjust the values of a and b to be in the 0..255 range by adding 128 to 4101 // each lane of a and b (which is the same as an bitwise XOR of each i8 lane 4102 // by 128) and then bitcasting the results of the Xor operations to u8 4103 // vectors. 4104 const auto i8_msb = SignBit(d); 4105 const auto a_adj = BitCast(du, Xor(a, i8_msb)); 4106 const auto b_adj = BitCast(du, Xor(b, i8_msb)); 4107 4108 // The result of SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj) can 4109 // simply be bitcasted to an i16 vector as 4110 // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true. 4111 return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj)); 4112 } 4113 #endif 4114 4115 // ------------------------------ SumsOfShuffledQuadAbsDiff 4116 4117 #if HWY_TARGET <= HWY_AVX3 4118 #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF 4119 #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF 4120 #else 4121 #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF 4122 #endif 4123 4124 template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, size_t N> 4125 HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfShuffledQuadAbsDiff( 4126 Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 4127 static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3"); 4128 static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3"); 4129 static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3"); 4130 static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3"); 4131 return Vec128<uint16_t, (N + 1) / 2>{ 4132 _mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))}; 4133 } 4134 4135 // Generic for all vector lengths 4136 template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V, 4137 HWY_IF_I8_D(DFromV<V>)> 4138 HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfShuffledQuadAbsDiff(V a, 4139 V b) { 4140 const DFromV<decltype(a)> d; 4141 const RebindToUnsigned<decltype(d)> du; 4142 const RepartitionToWide<decltype(d)> dw; 4143 4144 // Adjust the values of a and b to be in the 0..255 range by adding 128 to 4145 // each lane of a and b (which is the same as an bitwise XOR of each i8 lane 4146 // by 128) and then bitcasting the results of the Xor operations to u8 4147 // vectors. 4148 const auto i8_msb = SignBit(d); 4149 const auto a_adj = BitCast(du, Xor(a, i8_msb)); 4150 const auto b_adj = BitCast(du, Xor(b, i8_msb)); 4151 4152 // The result of 4153 // SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj) can 4154 // simply be bitcasted to an i16 vector as 4155 // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true. 4156 return BitCast( 4157 dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj)); 4158 } 4159 #endif 4160 4161 // ------------------------------ SaturatedAdd 4162 4163 // Returns a + b clamped to the destination range. 4164 4165 // Unsigned 4166 template <size_t N> 4167 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a, 4168 const Vec128<uint8_t, N> b) { 4169 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)}; 4170 } 4171 template <size_t N> 4172 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a, 4173 const Vec128<uint16_t, N> b) { 4174 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)}; 4175 } 4176 4177 // Signed 4178 template <size_t N> 4179 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a, 4180 const Vec128<int8_t, N> b) { 4181 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)}; 4182 } 4183 template <size_t N> 4184 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a, 4185 const Vec128<int16_t, N> b) { 4186 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)}; 4187 } 4188 4189 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 4190 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB 4191 #undef HWY_NATIVE_I32_SATURATED_ADDSUB 4192 #else 4193 #define HWY_NATIVE_I32_SATURATED_ADDSUB 4194 #endif 4195 4196 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB 4197 #undef HWY_NATIVE_I64_SATURATED_ADDSUB 4198 #else 4199 #define HWY_NATIVE_I64_SATURATED_ADDSUB 4200 #endif 4201 4202 template <size_t N> 4203 HWY_API Vec128<int32_t, N> SaturatedAdd(Vec128<int32_t, N> a, 4204 Vec128<int32_t, N> b) { 4205 const DFromV<decltype(a)> d; 4206 const auto sum = a + b; 4207 const auto overflow_mask = MaskFromVec( 4208 Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); 4209 const auto i32_max = Set(d, LimitsMax<int32_t>()); 4210 const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32( 4211 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; 4212 return IfThenElse(overflow_mask, overflow_result, sum); 4213 } 4214 4215 template <size_t N> 4216 HWY_API Vec128<int64_t, N> SaturatedAdd(Vec128<int64_t, N> a, 4217 Vec128<int64_t, N> b) { 4218 const DFromV<decltype(a)> d; 4219 const auto sum = a + b; 4220 const auto overflow_mask = MaskFromVec( 4221 Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); 4222 const auto i64_max = Set(d, LimitsMax<int64_t>()); 4223 const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64( 4224 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; 4225 return IfThenElse(overflow_mask, overflow_result, sum); 4226 } 4227 #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 4228 4229 // ------------------------------ SaturatedSub 4230 4231 // Returns a - b clamped to the destination range. 4232 4233 // Unsigned 4234 template <size_t N> 4235 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a, 4236 const Vec128<uint8_t, N> b) { 4237 return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)}; 4238 } 4239 template <size_t N> 4240 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a, 4241 const Vec128<uint16_t, N> b) { 4242 return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)}; 4243 } 4244 4245 // Signed 4246 template <size_t N> 4247 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a, 4248 const Vec128<int8_t, N> b) { 4249 return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)}; 4250 } 4251 template <size_t N> 4252 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a, 4253 const Vec128<int16_t, N> b) { 4254 return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)}; 4255 } 4256 4257 #if HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 4258 template <size_t N> 4259 HWY_API Vec128<int32_t, N> SaturatedSub(Vec128<int32_t, N> a, 4260 Vec128<int32_t, N> b) { 4261 const DFromV<decltype(a)> d; 4262 const auto diff = a - b; 4263 const auto overflow_mask = MaskFromVec( 4264 Vec128<int32_t, N>{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); 4265 const auto i32_max = Set(d, LimitsMax<int32_t>()); 4266 const Vec128<int32_t, N> overflow_result{_mm_mask_ternarylogic_epi32( 4267 i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; 4268 return IfThenElse(overflow_mask, overflow_result, diff); 4269 } 4270 4271 template <size_t N> 4272 HWY_API Vec128<int64_t, N> SaturatedSub(Vec128<int64_t, N> a, 4273 Vec128<int64_t, N> b) { 4274 const DFromV<decltype(a)> d; 4275 const auto diff = a - b; 4276 const auto overflow_mask = MaskFromVec( 4277 Vec128<int64_t, N>{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); 4278 const auto i64_max = Set(d, LimitsMax<int64_t>()); 4279 const Vec128<int64_t, N> overflow_result{_mm_mask_ternarylogic_epi64( 4280 i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; 4281 return IfThenElse(overflow_mask, overflow_result, diff); 4282 } 4283 #endif // HWY_TARGET <= HWY_AVX3 && !HWY_IS_MSAN 4284 4285 // ------------------------------ AverageRound 4286 4287 // Returns (a + b + 1) / 2 4288 4289 // Unsigned 4290 template <size_t N> 4291 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a, 4292 const Vec128<uint8_t, N> b) { 4293 return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)}; 4294 } 4295 template <size_t N> 4296 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a, 4297 const Vec128<uint16_t, N> b) { 4298 return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)}; 4299 } 4300 4301 // I8/I16 AverageRound is generic for all vector lengths 4302 template <class V, HWY_IF_SIGNED_V(V), 4303 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> 4304 HWY_API V AverageRound(V a, V b) { 4305 const DFromV<decltype(a)> d; 4306 const RebindToUnsigned<decltype(d)> du; 4307 const V sign_bit = SignBit(d); 4308 return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)), 4309 BitCast(du, Xor(b, sign_bit)))), 4310 sign_bit); 4311 } 4312 4313 // ------------------------------ Integer multiplication 4314 4315 template <size_t N> 4316 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a, 4317 const Vec128<uint16_t, N> b) { 4318 return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)}; 4319 } 4320 template <size_t N> 4321 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a, 4322 const Vec128<int16_t, N> b) { 4323 return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)}; 4324 } 4325 4326 // Returns the upper sizeof(T)*8 bits of a * b in each lane. 4327 template <size_t N> 4328 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a, 4329 const Vec128<uint16_t, N> b) { 4330 return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)}; 4331 } 4332 template <size_t N> 4333 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a, 4334 const Vec128<int16_t, N> b) { 4335 return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)}; 4336 } 4337 4338 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)), 4339 HWY_IF_LANES_D(DFromV<V>, 1)> 4340 HWY_API V MulHigh(V a, V b) { 4341 const DFromV<decltype(a)> d; 4342 const Full128<TFromD<decltype(d)>> d_full; 4343 return ResizeBitCast( 4344 d, Slide1Down(d_full, ResizeBitCast(d_full, MulEven(a, b)))); 4345 } 4346 4347 // I8/U8/I32/U32 MulHigh is generic for all vector lengths >= 2 lanes 4348 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4)), 4349 HWY_IF_LANES_GT_D(DFromV<V>, 1)> 4350 HWY_API V MulHigh(V a, V b) { 4351 const DFromV<decltype(a)> d; 4352 4353 const auto p_even = BitCast(d, MulEven(a, b)); 4354 const auto p_odd = BitCast(d, MulOdd(a, b)); 4355 return InterleaveOdd(d, p_even, p_odd); 4356 } 4357 4358 // Multiplies even lanes (0, 2 ..) and places the double-wide result into 4359 // even and the upper half into its odd neighbor lane. 4360 template <class V, HWY_IF_U8_D(DFromV<V>)> 4361 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) { 4362 const DFromV<decltype(a)> d; 4363 const RepartitionToWide<decltype(d)> dw; 4364 const auto lo8_mask = Set(dw, uint16_t{0x00FF}); 4365 return And(ResizeBitCast(dw, a), lo8_mask) * 4366 And(ResizeBitCast(dw, b), lo8_mask); 4367 } 4368 4369 template <class V, HWY_IF_I8_D(DFromV<V>)> 4370 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) { 4371 const DFromV<decltype(a)> d; 4372 const RepartitionToWide<decltype(d)> dw; 4373 return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) * 4374 ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b))); 4375 } 4376 4377 template <class V, HWY_IF_UI16_D(DFromV<V>)> 4378 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulEven(V a, V b) { 4379 const DFromV<decltype(a)> d; 4380 const RepartitionToWide<decltype(d)> dw; 4381 const RepartitionToNarrow<decltype(dw)> dw_as_d16; 4382 4383 const auto lo = ResizeBitCast(dw, a * b); 4384 const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b))); 4385 return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); 4386 } 4387 4388 template <size_t N> 4389 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a, 4390 const Vec128<uint32_t, N> b) { 4391 return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)}; 4392 } 4393 4394 template <size_t N> 4395 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, 4396 const Vec128<int32_t, N> b) { 4397 #if HWY_TARGET >= HWY_SSSE3 4398 const DFromV<decltype(a)> d; 4399 const RepartitionToWide<decltype(d)> dw; 4400 const RebindToUnsigned<decltype(d)> du; 4401 4402 // p[i] = (((a[i] >> 31) * (a[i] >> 31)) << 64) + 4403 // (((a[i] >> 31) * b[i]) << 32) + 4404 // (((b[i] >> 31) * a[i]) << 32) + 4405 // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) 4406 4407 // ((a[i] >> 31) * (a[i] >> 31)) << 64 does not need to be computed as the 4408 // lower 64 bits of ((a[i] >> 31) * (a[i] >> 31)) << 64 is zero. 4409 4410 // (((a[i] >> 31) * b[i]) << 32) + (((b[i] >> 31) * a[i]) << 32) == 4411 // -((((a[i] >> 31) & b[i]) + ((b[i] >> 31) & a[i])) << 32) 4412 4413 // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) can be 4414 // computed using MulEven(BitCast(du, a), BitCast(du, b)) 4415 4416 const auto neg_p_hi = ShiftLeft<32>( 4417 ResizeBitCast(dw, And(ShiftRight<31>(a), b) + And(ShiftRight<31>(b), a))); 4418 const auto p_lo = BitCast(dw, MulEven(BitCast(du, a), BitCast(du, b))); 4419 return p_lo - neg_p_hi; 4420 #else 4421 return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)}; 4422 #endif 4423 } 4424 4425 template <class V, HWY_IF_T_SIZE_V(V, 1)> 4426 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) { 4427 const DFromV<decltype(a)> d; 4428 const RepartitionToWide<decltype(d)> dw; 4429 return ShiftRight<8>(ResizeBitCast(dw, a)) * 4430 ShiftRight<8>(ResizeBitCast(dw, b)); 4431 } 4432 4433 template <class V, HWY_IF_UI16_D(DFromV<V>)> 4434 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) { 4435 const DFromV<decltype(a)> d; 4436 const RepartitionToWide<decltype(d)> dw; 4437 const RebindToUnsigned<decltype(dw)> dw_u; 4438 const RepartitionToNarrow<decltype(dw)> dw_as_d16; 4439 4440 const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b))); 4441 const auto hi = ResizeBitCast(dw, MulHigh(a, b)); 4442 return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); 4443 } 4444 4445 template <class V, HWY_IF_UI32_D(DFromV<V>)> 4446 HWY_API VFromD<RepartitionToWide<DFromV<V>>> MulOdd(V a, V b) { 4447 return MulEven(DupOdd(a), DupOdd(b)); 4448 } 4449 4450 template <size_t N> 4451 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a, 4452 const Vec128<uint32_t, N> b) { 4453 #if HWY_TARGET >= HWY_SSSE3 4454 // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency. 4455 // 64-bit right shift would also work but also needs port 5, so no benefit. 4456 // Notation: x=don't care, z=0. 4457 const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1)); 4458 const auto mullo_x2x0 = MulEven(a, b); 4459 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1)); 4460 const auto mullo_x3x1 = 4461 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1}); 4462 // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating 4463 // the latter requires one more instruction or a constant. 4464 const __m128i mul_20 = 4465 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0)); 4466 const __m128i mul_31 = 4467 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0)); 4468 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)}; 4469 #else 4470 return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)}; 4471 #endif 4472 } 4473 4474 template <size_t N> 4475 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a, 4476 const Vec128<int32_t, N> b) { 4477 // Same as unsigned; avoid duplicating the SSSE3 code. 4478 const DFromV<decltype(a)> d; 4479 const RebindToUnsigned<decltype(d)> du; 4480 return BitCast(d, BitCast(du, a) * BitCast(du, b)); 4481 } 4482 4483 #if HWY_TARGET <= HWY_AVX3 4484 // Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*. 4485 #ifdef HWY_NATIVE_MUL_64 4486 #undef HWY_NATIVE_MUL_64 4487 #else 4488 #define HWY_NATIVE_MUL_64 4489 #endif 4490 4491 template <size_t N> 4492 HWY_API Vec128<uint64_t, N> operator*(Vec128<uint64_t, N> a, 4493 Vec128<uint64_t, N> b) { 4494 return Vec128<uint64_t, N>{_mm_mullo_epi64(a.raw, b.raw)}; 4495 } 4496 template <size_t N> 4497 HWY_API Vec128<int64_t, N> operator*(Vec128<int64_t, N> a, 4498 Vec128<int64_t, N> b) { 4499 return Vec128<int64_t, N>{_mm_mullo_epi64(a.raw, b.raw)}; 4500 } 4501 #endif 4502 4503 // ------------------------------ RotateRight (ShiftRight, Or) 4504 4505 // U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8 4506 // RotateRight uses detail::GaloisAffine on AVX3_DL 4507 4508 #if HWY_TARGET > HWY_AVX3_DL 4509 template <int kBits, size_t N> 4510 HWY_API Vec128<uint8_t, N> RotateRight(const Vec128<uint8_t, N> v) { 4511 static_assert(0 <= kBits && kBits < 8, "Invalid shift count"); 4512 if (kBits == 0) return v; 4513 // AVX3 does not support 8-bit. 4514 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v)); 4515 } 4516 #endif 4517 4518 template <int kBits, size_t N> 4519 HWY_API Vec128<uint16_t, N> RotateRight(const Vec128<uint16_t, N> v) { 4520 static_assert(0 <= kBits && kBits < 16, "Invalid shift count"); 4521 if (kBits == 0) return v; 4522 #if HWY_TARGET <= HWY_AVX3_DL 4523 return Vec128<uint16_t, N>{_mm_shrdi_epi16(v.raw, v.raw, kBits)}; 4524 #else 4525 // AVX3 does not support 16-bit. 4526 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v)); 4527 #endif 4528 } 4529 4530 template <int kBits, size_t N> 4531 HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) { 4532 static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); 4533 #if HWY_TARGET <= HWY_AVX3 4534 return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)}; 4535 #else 4536 if (kBits == 0) return v; 4537 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v)); 4538 #endif 4539 } 4540 4541 template <int kBits, size_t N> 4542 HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) { 4543 static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); 4544 #if HWY_TARGET <= HWY_AVX3 4545 return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)}; 4546 #else 4547 if (kBits == 0) return v; 4548 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v)); 4549 #endif 4550 } 4551 4552 // I8/I16/I32/I64 RotateRight is generic for all vector lengths 4553 template <int kBits, class V, HWY_IF_SIGNED_V(V)> 4554 HWY_API V RotateRight(V v) { 4555 const DFromV<decltype(v)> d; 4556 const RebindToUnsigned<decltype(d)> du; 4557 return BitCast(d, RotateRight<kBits>(BitCast(du, v))); 4558 } 4559 4560 // ------------------------------ Rol/Ror 4561 #if HWY_TARGET <= HWY_AVX3_DL 4562 #ifdef HWY_NATIVE_ROL_ROR_16 4563 #undef HWY_NATIVE_ROL_ROR_16 4564 #else 4565 #define HWY_NATIVE_ROL_ROR_16 4566 #endif 4567 4568 template <class T, size_t N, HWY_IF_UI16(T)> 4569 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 4570 return Vec128<T, N>{_mm_shrdv_epi16(a.raw, a.raw, b.raw)}; 4571 } 4572 4573 // U16/I16 Rol is generic for all vector lengths on AVX3_DL 4574 template <class V, HWY_IF_UI16(TFromV<V>)> 4575 HWY_API V Rol(V a, V b) { 4576 const DFromV<decltype(a)> d; 4577 const RebindToSigned<decltype(d)> di; 4578 return Ror(a, BitCast(d, Neg(BitCast(di, b)))); 4579 } 4580 4581 #endif // HWY_TARGET <= HWY_AVX3_DL 4582 4583 #if HWY_TARGET <= HWY_AVX3 4584 4585 #ifdef HWY_NATIVE_ROL_ROR_32_64 4586 #undef HWY_NATIVE_ROL_ROR_32_64 4587 #else 4588 #define HWY_NATIVE_ROL_ROR_32_64 4589 #endif 4590 4591 template <class T, size_t N, HWY_IF_UI32(T)> 4592 HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) { 4593 return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)}; 4594 } 4595 4596 template <class T, size_t N, HWY_IF_UI32(T)> 4597 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 4598 return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)}; 4599 } 4600 4601 template <class T, size_t N, HWY_IF_UI64(T)> 4602 HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) { 4603 return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)}; 4604 } 4605 4606 template <class T, size_t N, HWY_IF_UI64(T)> 4607 HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) { 4608 return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)}; 4609 } 4610 4611 #endif 4612 4613 // ------------------------------ RotateLeftSame/RotateRightSame 4614 4615 #if HWY_TARGET <= HWY_AVX3_DL 4616 4617 #ifdef HWY_NATIVE_ROL_ROR_SAME_16 4618 #undef HWY_NATIVE_ROL_ROR_SAME_16 4619 #else 4620 #define HWY_NATIVE_ROL_ROR_SAME_16 4621 #endif 4622 4623 // Generic for all vector lengths 4624 template <class V, HWY_IF_UI16(TFromV<V>)> 4625 HWY_API V RotateLeftSame(V v, int bits) { 4626 const DFromV<decltype(v)> d; 4627 return Ror(v, 4628 Set(d, static_cast<TFromV<V>>(0u - static_cast<unsigned>(bits)))); 4629 } 4630 4631 template <class V, HWY_IF_UI16(TFromV<V>)> 4632 HWY_API V RotateRightSame(V v, int bits) { 4633 const DFromV<decltype(v)> d; 4634 return Ror(v, Set(d, static_cast<TFromV<V>>(bits))); 4635 } 4636 #endif // HWY_TARGET <= HWY_AVX3_DL 4637 4638 #if HWY_TARGET <= HWY_AVX3 4639 4640 #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64 4641 #undef HWY_NATIVE_ROL_ROR_SAME_32_64 4642 #else 4643 #define HWY_NATIVE_ROL_ROR_SAME_32_64 4644 #endif 4645 4646 // Generic for all vector lengths 4647 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 4648 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> 4649 HWY_API V RotateLeftSame(V v, int bits) { 4650 const DFromV<decltype(v)> d; 4651 return Rol(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits)))); 4652 } 4653 4654 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 4655 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> 4656 HWY_API V RotateRightSame(V v, int bits) { 4657 const DFromV<decltype(v)> d; 4658 return Ror(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits)))); 4659 } 4660 #endif // HWY_TARGET <= HWY_AVX3 4661 4662 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) 4663 4664 template <size_t N> 4665 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) { 4666 const DFromV<decltype(v)> d; 4667 return VecFromMask(v < Zero(d)); 4668 } 4669 4670 template <size_t N> 4671 HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) { 4672 return ShiftRight<15>(v); 4673 } 4674 4675 template <size_t N> 4676 HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) { 4677 return ShiftRight<31>(v); 4678 } 4679 4680 template <size_t N> 4681 HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) { 4682 const DFromV<decltype(v)> d; 4683 #if HWY_TARGET <= HWY_AVX3 4684 (void)d; 4685 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)}; 4686 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4 4687 return VecFromMask(v < Zero(d)); 4688 #else 4689 // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift 4690 // avoids generating a zero. 4691 const RepartitionToNarrow<decltype(d)> d32; 4692 const auto sign = ShiftRight<31>(BitCast(d32, v)); 4693 return Vec128<int64_t, N>{ 4694 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; 4695 #endif 4696 } 4697 4698 // ------------------------------ Integer Abs 4699 4700 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. 4701 template <size_t N> 4702 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { 4703 #if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2 4704 const DFromV<decltype(v)> d; 4705 const RebindToUnsigned<decltype(d)> du; 4706 const auto zero = Zero(du); 4707 const auto v_as_u8 = BitCast(du, v); 4708 return BitCast(d, Min(v_as_u8, zero - v_as_u8)); 4709 #else 4710 return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)}; 4711 #endif 4712 } 4713 4714 template <size_t N> 4715 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { 4716 #if HWY_TARGET == HWY_SSE2 4717 const auto zero = Zero(DFromV<decltype(v)>()); 4718 return Max(v, zero - v); 4719 #else 4720 return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)}; 4721 #endif 4722 } 4723 4724 template <size_t N> 4725 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { 4726 #if HWY_TARGET <= HWY_SSSE3 4727 return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)}; 4728 #else 4729 const auto zero = Zero(DFromV<decltype(v)>()); 4730 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); 4731 #endif 4732 } 4733 4734 #if HWY_TARGET <= HWY_AVX3 4735 template <size_t N> 4736 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) { 4737 return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)}; 4738 } 4739 #else 4740 // I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 4741 template <class V, HWY_IF_I64(TFromV<V>)> 4742 HWY_API V Abs(V v) { 4743 const auto zero = Zero(DFromV<decltype(v)>()); 4744 return IfNegativeThenElse(v, zero - v, v); 4745 } 4746 #endif 4747 4748 #ifdef HWY_NATIVE_SATURATED_ABS 4749 #undef HWY_NATIVE_SATURATED_ABS 4750 #else 4751 #define HWY_NATIVE_SATURATED_ABS 4752 #endif 4753 4754 // Generic for all vector lengths 4755 template <class V, HWY_IF_I8(TFromV<V>)> 4756 HWY_API V SaturatedAbs(V v) { 4757 const DFromV<decltype(v)> d; 4758 const RebindToUnsigned<decltype(d)> du; 4759 return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v)))); 4760 } 4761 4762 // Generic for all vector lengths 4763 template <class V, HWY_IF_I16(TFromV<V>)> 4764 HWY_API V SaturatedAbs(V v) { 4765 return Max(v, SaturatedSub(Zero(DFromV<V>()), v)); 4766 } 4767 4768 // Generic for all vector lengths 4769 template <class V, HWY_IF_I32(TFromV<V>)> 4770 HWY_API V SaturatedAbs(V v) { 4771 const auto abs_v = Abs(v); 4772 4773 #if HWY_TARGET <= HWY_SSE4 4774 const DFromV<decltype(v)> d; 4775 const RebindToUnsigned<decltype(d)> du; 4776 return BitCast(d, Min(BitCast(du, abs_v), 4777 Set(du, static_cast<uint32_t>(LimitsMax<int32_t>())))); 4778 #else 4779 return Add(abs_v, BroadcastSignBit(abs_v)); 4780 #endif 4781 } 4782 4783 // Generic for all vector lengths 4784 template <class V, HWY_IF_I64(TFromV<V>)> 4785 HWY_API V SaturatedAbs(V v) { 4786 const auto abs_v = Abs(v); 4787 return Add(abs_v, BroadcastSignBit(abs_v)); 4788 } 4789 4790 // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL 4791 // srli_epi64: the count should be unsigned int. Note that this is not the same 4792 // as the Shift3264Count in x86_512-inl.h (GCC also requires int). 4793 #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \ 4794 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) 4795 using Shift64Count = int; 4796 #else 4797 // Assume documented behavior. Clang 12, GCC 14 and MSVC 14.28.29910 match this. 4798 using Shift64Count = unsigned int; 4799 #endif 4800 4801 template <int kBits, size_t N> 4802 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) { 4803 #if HWY_TARGET <= HWY_AVX3 4804 return Vec128<int64_t, N>{ 4805 _mm_srai_epi64(v.raw, static_cast<Shift64Count>(kBits))}; 4806 #else 4807 const DFromV<decltype(v)> di; 4808 const RebindToUnsigned<decltype(di)> du; 4809 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v))); 4810 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); 4811 return right | sign; 4812 #endif 4813 } 4814 4815 // ------------------------------ IfNegativeThenElse 4816 template <size_t N> 4817 HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v, 4818 const Vec128<int8_t, N> yes, 4819 const Vec128<int8_t, N> no) { 4820 // int8: IfThenElse only looks at the MSB on SSE4 or newer 4821 #if HWY_TARGET <= HWY_SSE4 4822 const auto mask = MaskFromVec(v); 4823 #else 4824 const DFromV<decltype(v)> d; 4825 const RebindToSigned<decltype(d)> di; 4826 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); 4827 #endif 4828 4829 return IfThenElse(mask, yes, no); 4830 } 4831 4832 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 4833 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, 4834 Vec128<T, N> no) { 4835 static_assert(IsSigned<T>(), "Only works for signed/float"); 4836 4837 // 16-bit: no native blendv on AVX2 or earlier, so copy sign to lower byte's 4838 // MSB. 4839 #if HWY_TARGET <= HWY_AVX3 4840 const auto mask = MaskFromVec(v); 4841 #else 4842 const DFromV<decltype(v)> d; 4843 const RebindToSigned<decltype(d)> di; 4844 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); 4845 #endif 4846 4847 return IfThenElse(mask, yes, no); 4848 } 4849 4850 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))> 4851 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes, 4852 Vec128<T, N> no) { 4853 static_assert(IsSigned<T>(), "Only works for signed/float"); 4854 const DFromV<decltype(v)> d; 4855 4856 #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 4857 // 32/64-bit: use float IfThenElse on SSE4/AVX2, which only looks at the MSB 4858 // on SSE4 or later. 4859 const RebindToFloat<decltype(d)> df; 4860 const auto mask = MaskFromVec(BitCast(df, v)); 4861 return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no))); 4862 #else // SSE2, SSSE3, or AVX3 4863 4864 #if HWY_TARGET <= HWY_AVX3 4865 // No need to cast to float or broadcast sign bit on AVX3 as IfThenElse only 4866 // looks at the MSB on AVX3 4867 (void)d; 4868 const auto mask = MaskFromVec(v); 4869 #else 4870 const RebindToSigned<decltype(d)> di; 4871 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); 4872 #endif 4873 4874 return IfThenElse(mask, yes, no); 4875 #endif 4876 } 4877 4878 #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 4879 4880 #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO 4881 #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO 4882 #else 4883 #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO 4884 #endif 4885 4886 #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE 4887 #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE 4888 #else 4889 #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE 4890 #endif 4891 4892 // SSE4/AVX2 IfNegativeThenElseZero/IfNegativeThenZeroElse is generic for all 4893 // vector lengths 4894 template <class V, HWY_IF_NOT_UNSIGNED_V(V), 4895 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))> 4896 HWY_API V IfNegativeThenElseZero(V v, V yes) { 4897 const DFromV<decltype(v)> d; 4898 return IfNegativeThenElse(v, yes, Zero(d)); 4899 } 4900 4901 template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)> 4902 HWY_API V IfNegativeThenElseZero(V v, V yes) { 4903 return IfThenElseZero(IsNegative(v), yes); 4904 } 4905 4906 template <class V, HWY_IF_NOT_UNSIGNED_V(V), 4907 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 4) | (1 << 8))> 4908 HWY_API V IfNegativeThenZeroElse(V v, V no) { 4909 const DFromV<decltype(v)> d; 4910 return IfNegativeThenElse(v, Zero(d), no); 4911 } 4912 4913 template <class V, HWY_IF_NOT_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 2)> 4914 HWY_API V IfNegativeThenZeroElse(V v, V no) { 4915 return IfThenZeroElse(IsNegative(v), no); 4916 } 4917 4918 #endif // HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 4919 4920 // ------------------------------ IfNegativeThenNegOrUndefIfZero 4921 4922 #if HWY_TARGET <= HWY_SSSE3 4923 4924 #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG 4925 #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG 4926 #else 4927 #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG 4928 #endif 4929 4930 template <size_t N> 4931 HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask, 4932 Vec128<int8_t, N> v) { 4933 return Vec128<int8_t, N>{_mm_sign_epi8(v.raw, mask.raw)}; 4934 } 4935 4936 template <size_t N> 4937 HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero( 4938 Vec128<int16_t, N> mask, Vec128<int16_t, N> v) { 4939 return Vec128<int16_t, N>{_mm_sign_epi16(v.raw, mask.raw)}; 4940 } 4941 4942 template <size_t N> 4943 HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero( 4944 Vec128<int32_t, N> mask, Vec128<int32_t, N> v) { 4945 return Vec128<int32_t, N>{_mm_sign_epi32(v.raw, mask.raw)}; 4946 } 4947 4948 // Generic for all vector lengths 4949 template <class V, HWY_IF_I64_D(DFromV<V>)> 4950 HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { 4951 #if HWY_TARGET <= HWY_AVX3 4952 // MaskedSubOr is more efficient than IfNegativeThenElse on AVX3 4953 const DFromV<decltype(v)> d; 4954 return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v); 4955 #else 4956 // IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2 4957 return IfNegativeThenElse(mask, Neg(v), v); 4958 #endif 4959 } 4960 4961 #endif // HWY_TARGET <= HWY_SSSE3 4962 4963 // ------------------------------ ShiftLeftSame 4964 4965 template <size_t N> 4966 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v, 4967 const int bits) { 4968 #if HWY_COMPILER_GCC 4969 if (__builtin_constant_p(bits)) { 4970 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, bits)}; 4971 } 4972 #endif 4973 return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 4974 } 4975 template <size_t N> 4976 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v, 4977 const int bits) { 4978 #if HWY_COMPILER_GCC 4979 if (__builtin_constant_p(bits)) { 4980 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, bits)}; 4981 } 4982 #endif 4983 return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 4984 } 4985 template <size_t N> 4986 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v, 4987 const int bits) { 4988 #if HWY_COMPILER_GCC 4989 if (__builtin_constant_p(bits)) { 4990 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, bits)}; 4991 } 4992 #endif 4993 return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 4994 } 4995 4996 template <size_t N> 4997 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v, 4998 const int bits) { 4999 #if HWY_COMPILER_GCC 5000 if (__builtin_constant_p(bits)) { 5001 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, bits)}; 5002 } 5003 #endif 5004 return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 5005 } 5006 5007 template <size_t N> 5008 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v, 5009 const int bits) { 5010 #if HWY_COMPILER_GCC 5011 if (__builtin_constant_p(bits)) { 5012 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, bits)}; 5013 } 5014 #endif 5015 return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 5016 } 5017 5018 template <size_t N> 5019 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v, 5020 const int bits) { 5021 #if HWY_COMPILER_GCC 5022 if (__builtin_constant_p(bits)) { 5023 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, bits)}; 5024 } 5025 #endif 5026 return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 5027 } 5028 5029 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 5030 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) { 5031 const DFromV<decltype(v)> d8; 5032 // Use raw instead of BitCast to support N=1. 5033 const Vec128<T, N> shifted{ 5034 ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw}; 5035 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); 5036 } 5037 5038 // ------------------------------ ShiftRightSame (BroadcastSignBit) 5039 5040 template <size_t N> 5041 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v, 5042 const int bits) { 5043 #if HWY_COMPILER_GCC 5044 if (__builtin_constant_p(bits)) { 5045 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, bits)}; 5046 } 5047 #endif 5048 return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 5049 } 5050 template <size_t N> 5051 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v, 5052 const int bits) { 5053 #if HWY_COMPILER_GCC 5054 if (__builtin_constant_p(bits)) { 5055 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, bits)}; 5056 } 5057 #endif 5058 return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 5059 } 5060 template <size_t N> 5061 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v, 5062 const int bits) { 5063 #if HWY_COMPILER_GCC 5064 if (__builtin_constant_p(bits)) { 5065 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, bits)}; 5066 } 5067 #endif 5068 return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 5069 } 5070 5071 template <size_t N> 5072 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v, 5073 const int bits) { 5074 const DFromV<decltype(v)> d8; 5075 // Use raw instead of BitCast to support N=1. 5076 const Vec128<uint8_t, N> shifted{ 5077 ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw}; 5078 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits)); 5079 } 5080 5081 template <size_t N> 5082 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v, 5083 const int bits) { 5084 #if HWY_COMPILER_GCC 5085 if (__builtin_constant_p(bits)) { 5086 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, bits)}; 5087 } 5088 #endif 5089 return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; 5090 } 5091 5092 template <size_t N> 5093 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v, 5094 const int bits) { 5095 #if HWY_COMPILER_GCC 5096 if (__builtin_constant_p(bits)) { 5097 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, bits)}; 5098 } 5099 #endif 5100 return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; 5101 } 5102 template <size_t N> 5103 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v, 5104 const int bits) { 5105 #if HWY_TARGET <= HWY_AVX3 5106 #if HWY_COMPILER_GCC 5107 if (__builtin_constant_p(bits)) { 5108 return Vec128<int64_t, N>{ 5109 _mm_srai_epi64(v.raw, static_cast<Shift64Count>(bits))}; 5110 } 5111 #endif 5112 return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; 5113 #else 5114 const DFromV<decltype(v)> di; 5115 const RebindToUnsigned<decltype(di)> du; 5116 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); 5117 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); 5118 return right | sign; 5119 #endif 5120 } 5121 5122 template <size_t N> 5123 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) { 5124 const DFromV<decltype(v)> di; 5125 const RebindToUnsigned<decltype(di)> du; 5126 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); 5127 const auto shifted_sign = 5128 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits))); 5129 return (shifted ^ shifted_sign) - shifted_sign; 5130 } 5131 5132 // ------------------------------ Floating-point mul / div 5133 5134 #if HWY_HAVE_FLOAT16 5135 template <size_t N> 5136 HWY_API Vec128<float16_t, N> operator*(Vec128<float16_t, N> a, 5137 Vec128<float16_t, N> b) { 5138 return Vec128<float16_t, N>{_mm_mul_ph(a.raw, b.raw)}; 5139 } 5140 #endif // HWY_HAVE_FLOAT16 5141 template <size_t N> 5142 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) { 5143 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)}; 5144 } 5145 HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a, 5146 const Vec128<float, 1> b) { 5147 return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)}; 5148 } 5149 template <size_t N> 5150 HWY_API Vec128<double, N> operator*(const Vec128<double, N> a, 5151 const Vec128<double, N> b) { 5152 return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)}; 5153 } 5154 HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) { 5155 return Vec64<double>{_mm_mul_sd(a.raw, b.raw)}; 5156 } 5157 5158 #if HWY_TARGET <= HWY_AVX3 5159 5160 #ifdef HWY_NATIVE_MUL_BY_POW2 5161 #undef HWY_NATIVE_MUL_BY_POW2 5162 #else 5163 #define HWY_NATIVE_MUL_BY_POW2 5164 #endif 5165 5166 #if HWY_HAVE_FLOAT16 5167 template <size_t N> 5168 HWY_API Vec128<float16_t, N> MulByFloorPow2(Vec128<float16_t, N> a, 5169 Vec128<float16_t, N> b) { 5170 return Vec128<float16_t, N>{_mm_scalef_ph(a.raw, b.raw)}; 5171 } 5172 #endif 5173 5174 template <size_t N> 5175 HWY_API Vec128<float, N> MulByFloorPow2(Vec128<float, N> a, 5176 Vec128<float, N> b) { 5177 return Vec128<float, N>{_mm_scalef_ps(a.raw, b.raw)}; 5178 } 5179 5180 template <size_t N> 5181 HWY_API Vec128<double, N> MulByFloorPow2(Vec128<double, N> a, 5182 Vec128<double, N> b) { 5183 return Vec128<double, N>{_mm_scalef_pd(a.raw, b.raw)}; 5184 } 5185 5186 // MulByPow2 is generic for all vector lengths on AVX3 5187 template <class V, HWY_IF_FLOAT_V(V)> 5188 HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) { 5189 const DFromV<decltype(v)> d; 5190 return MulByFloorPow2(v, ConvertTo(d, exp)); 5191 } 5192 5193 #endif // HWY_TARGET <= HWY_AVX3 5194 5195 #if HWY_HAVE_FLOAT16 5196 template <size_t N> 5197 HWY_API Vec128<float16_t, N> operator/(const Vec128<float16_t, N> a, 5198 const Vec128<float16_t, N> b) { 5199 return Vec128<float16_t, N>{_mm_div_ph(a.raw, b.raw)}; 5200 } 5201 #endif // HWY_HAVE_FLOAT16 5202 template <size_t N> 5203 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a, 5204 const Vec128<float, N> b) { 5205 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)}; 5206 } 5207 HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a, 5208 const Vec128<float, 1> b) { 5209 return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)}; 5210 } 5211 template <size_t N> 5212 HWY_API Vec128<double, N> operator/(const Vec128<double, N> a, 5213 const Vec128<double, N> b) { 5214 return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)}; 5215 } 5216 HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) { 5217 return Vec64<double>{_mm_div_sd(a.raw, b.raw)}; 5218 } 5219 5220 // Approximate reciprocal 5221 #if HWY_HAVE_FLOAT16 5222 template <size_t N> 5223 HWY_API Vec128<float16_t, N> ApproximateReciprocal( 5224 const Vec128<float16_t, N> v) { 5225 return Vec128<float16_t, N>{_mm_rcp_ph(v.raw)}; 5226 } 5227 #endif // HWY_HAVE_FLOAT16 5228 template <size_t N> 5229 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { 5230 return Vec128<float, N>{_mm_rcp_ps(v.raw)}; 5231 } 5232 HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) { 5233 return Vec128<float, 1>{_mm_rcp_ss(v.raw)}; 5234 } 5235 5236 #if HWY_TARGET <= HWY_AVX3 5237 #ifdef HWY_NATIVE_F64_APPROX_RECIP 5238 #undef HWY_NATIVE_F64_APPROX_RECIP 5239 #else 5240 #define HWY_NATIVE_F64_APPROX_RECIP 5241 #endif 5242 5243 HWY_API Vec128<double> ApproximateReciprocal(Vec128<double> v) { 5244 return Vec128<double>{_mm_rcp14_pd(v.raw)}; 5245 } 5246 HWY_API Vec64<double> ApproximateReciprocal(Vec64<double> v) { 5247 return Vec64<double>{_mm_rcp14_sd(v.raw, v.raw)}; 5248 } 5249 #endif 5250 5251 // Generic for all vector lengths. 5252 template <class V, HWY_IF_FLOAT_V(V)> 5253 HWY_API V AbsDiff(V a, V b) { 5254 return Abs(a - b); 5255 } 5256 5257 // ------------------------------ GetExponent 5258 5259 #if HWY_TARGET <= HWY_AVX3 5260 5261 #ifdef HWY_NATIVE_GET_EXPONENT 5262 #undef HWY_NATIVE_GET_EXPONENT 5263 #else 5264 #define HWY_NATIVE_GET_EXPONENT 5265 #endif 5266 5267 #if HWY_HAVE_FLOAT16 5268 template <class V, HWY_IF_F16(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)> 5269 HWY_API V GetExponent(V v) { 5270 return V{_mm_getexp_ph(v.raw)}; 5271 } 5272 #endif 5273 template <class V, HWY_IF_F32(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)> 5274 HWY_API V GetExponent(V v) { 5275 return V{_mm_getexp_ps(v.raw)}; 5276 } 5277 template <class V, HWY_IF_F64(TFromV<V>), HWY_IF_V_SIZE_LE_V(V, 16)> 5278 HWY_API V GetExponent(V v) { 5279 return V{_mm_getexp_pd(v.raw)}; 5280 } 5281 5282 #endif 5283 5284 // ------------------------------ MaskedMinOr 5285 5286 #if HWY_TARGET <= HWY_AVX3 5287 5288 #ifdef HWY_NATIVE_MASKED_ARITH 5289 #undef HWY_NATIVE_MASKED_ARITH 5290 #else 5291 #define HWY_NATIVE_MASKED_ARITH 5292 #endif 5293 5294 template <typename T, size_t N, HWY_IF_U8(T)> 5295 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5296 Vec128<T, N> a, Vec128<T, N> b) { 5297 return Vec128<T, N>{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)}; 5298 } 5299 template <typename T, size_t N, HWY_IF_I8(T)> 5300 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5301 Vec128<T, N> a, Vec128<T, N> b) { 5302 return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)}; 5303 } 5304 5305 template <typename T, size_t N, HWY_IF_U16(T)> 5306 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5307 Vec128<T, N> a, Vec128<T, N> b) { 5308 return Vec128<T, N>{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)}; 5309 } 5310 template <typename T, size_t N, HWY_IF_I16(T)> 5311 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5312 Vec128<T, N> a, Vec128<T, N> b) { 5313 return Vec128<T, N>{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)}; 5314 } 5315 5316 template <typename T, size_t N, HWY_IF_U32(T)> 5317 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5318 Vec128<T, N> a, Vec128<T, N> b) { 5319 return Vec128<T, N>{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)}; 5320 } 5321 template <typename T, size_t N, HWY_IF_I32(T)> 5322 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5323 Vec128<T, N> a, Vec128<T, N> b) { 5324 return Vec128<T, N>{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)}; 5325 } 5326 5327 template <typename T, size_t N, HWY_IF_U64(T)> 5328 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5329 Vec128<T, N> a, Vec128<T, N> b) { 5330 return Vec128<T, N>{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)}; 5331 } 5332 template <typename T, size_t N, HWY_IF_I64(T)> 5333 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5334 Vec128<T, N> a, Vec128<T, N> b) { 5335 return Vec128<T, N>{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)}; 5336 } 5337 5338 template <typename T, size_t N, HWY_IF_F32(T)> 5339 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5340 Vec128<T, N> a, Vec128<T, N> b) { 5341 return Vec128<T, N>{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)}; 5342 } 5343 5344 template <typename T, size_t N, HWY_IF_F64(T)> 5345 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5346 Vec128<T, N> a, Vec128<T, N> b) { 5347 return Vec128<T, N>{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)}; 5348 } 5349 5350 #if HWY_HAVE_FLOAT16 5351 template <typename T, size_t N, HWY_IF_F16(T)> 5352 HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m, 5353 Vec128<T, N> a, Vec128<T, N> b) { 5354 return Vec128<T, N>{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)}; 5355 } 5356 #endif // HWY_HAVE_FLOAT16 5357 5358 // ------------------------------ MaskedMaxOr 5359 5360 template <typename T, size_t N, HWY_IF_U8(T)> 5361 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5362 Vec128<T, N> a, Vec128<T, N> b) { 5363 return Vec128<T, N>{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)}; 5364 } 5365 template <typename T, size_t N, HWY_IF_I8(T)> 5366 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5367 Vec128<T, N> a, Vec128<T, N> b) { 5368 return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)}; 5369 } 5370 5371 template <typename T, size_t N, HWY_IF_U16(T)> 5372 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5373 Vec128<T, N> a, Vec128<T, N> b) { 5374 return Vec128<T, N>{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)}; 5375 } 5376 template <typename T, size_t N, HWY_IF_I16(T)> 5377 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5378 Vec128<T, N> a, Vec128<T, N> b) { 5379 return Vec128<T, N>{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)}; 5380 } 5381 5382 template <typename T, size_t N, HWY_IF_U32(T)> 5383 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5384 Vec128<T, N> a, Vec128<T, N> b) { 5385 return Vec128<T, N>{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)}; 5386 } 5387 template <typename T, size_t N, HWY_IF_I32(T)> 5388 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5389 Vec128<T, N> a, Vec128<T, N> b) { 5390 return Vec128<T, N>{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)}; 5391 } 5392 5393 template <typename T, size_t N, HWY_IF_U64(T)> 5394 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5395 Vec128<T, N> a, Vec128<T, N> b) { 5396 return Vec128<T, N>{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)}; 5397 } 5398 template <typename T, size_t N, HWY_IF_I64(T)> 5399 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5400 Vec128<T, N> a, Vec128<T, N> b) { 5401 return Vec128<T, N>{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)}; 5402 } 5403 5404 template <typename T, size_t N, HWY_IF_F32(T)> 5405 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5406 Vec128<T, N> a, Vec128<T, N> b) { 5407 return Vec128<T, N>{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)}; 5408 } 5409 5410 template <typename T, size_t N, HWY_IF_F64(T)> 5411 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5412 Vec128<T, N> a, Vec128<T, N> b) { 5413 return Vec128<T, N>{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)}; 5414 } 5415 5416 #if HWY_HAVE_FLOAT16 5417 template <typename T, size_t N, HWY_IF_F16(T)> 5418 HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m, 5419 Vec128<T, N> a, Vec128<T, N> b) { 5420 return Vec128<T, N>{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)}; 5421 } 5422 #endif // HWY_HAVE_FLOAT16 5423 5424 // ------------------------------ MaskedAddOr 5425 5426 template <typename T, size_t N, HWY_IF_UI8(T)> 5427 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, 5428 Vec128<T, N> a, Vec128<T, N> b) { 5429 return Vec128<T, N>{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)}; 5430 } 5431 5432 template <typename T, size_t N, HWY_IF_UI16(T)> 5433 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, 5434 Vec128<T, N> a, Vec128<T, N> b) { 5435 return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)}; 5436 } 5437 5438 template <typename T, size_t N, HWY_IF_UI32(T)> 5439 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, 5440 Vec128<T, N> a, Vec128<T, N> b) { 5441 return Vec128<T, N>{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)}; 5442 } 5443 5444 template <typename T, size_t N, HWY_IF_UI64(T)> 5445 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, 5446 Vec128<T, N> a, Vec128<T, N> b) { 5447 return Vec128<T, N>{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)}; 5448 } 5449 5450 template <typename T, size_t N, HWY_IF_F32(T)> 5451 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, 5452 Vec128<T, N> a, Vec128<T, N> b) { 5453 return Vec128<T, N>{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)}; 5454 } 5455 5456 template <typename T, size_t N, HWY_IF_F64(T)> 5457 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, 5458 Vec128<T, N> a, Vec128<T, N> b) { 5459 return Vec128<T, N>{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)}; 5460 } 5461 5462 #if HWY_HAVE_FLOAT16 5463 template <typename T, size_t N, HWY_IF_F16(T)> 5464 HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m, 5465 Vec128<T, N> a, Vec128<T, N> b) { 5466 return Vec128<T, N>{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)}; 5467 } 5468 #endif // HWY_HAVE_FLOAT16 5469 5470 // ------------------------------ MaskedSubOr 5471 5472 template <typename T, size_t N, HWY_IF_UI8(T)> 5473 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, 5474 Vec128<T, N> a, Vec128<T, N> b) { 5475 return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)}; 5476 } 5477 5478 template <typename T, size_t N, HWY_IF_UI16(T)> 5479 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, 5480 Vec128<T, N> a, Vec128<T, N> b) { 5481 return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)}; 5482 } 5483 5484 template <typename T, size_t N, HWY_IF_UI32(T)> 5485 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, 5486 Vec128<T, N> a, Vec128<T, N> b) { 5487 return Vec128<T, N>{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)}; 5488 } 5489 5490 template <typename T, size_t N, HWY_IF_UI64(T)> 5491 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, 5492 Vec128<T, N> a, Vec128<T, N> b) { 5493 return Vec128<T, N>{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)}; 5494 } 5495 5496 template <typename T, size_t N, HWY_IF_F32(T)> 5497 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, 5498 Vec128<T, N> a, Vec128<T, N> b) { 5499 return Vec128<T, N>{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)}; 5500 } 5501 5502 template <typename T, size_t N, HWY_IF_F64(T)> 5503 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, 5504 Vec128<T, N> a, Vec128<T, N> b) { 5505 return Vec128<T, N>{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)}; 5506 } 5507 5508 #if HWY_HAVE_FLOAT16 5509 template <typename T, size_t N, HWY_IF_F16(T)> 5510 HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m, 5511 Vec128<T, N> a, Vec128<T, N> b) { 5512 return Vec128<T, N>{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)}; 5513 } 5514 #endif // HWY_HAVE_FLOAT16 5515 5516 // ------------------------------ MaskedMulOr 5517 5518 // There are no elementwise integer mask_mul. Generic for all vector lengths. 5519 template <class V, class M> 5520 HWY_API V MaskedMulOr(V no, M m, V a, V b) { 5521 return IfThenElse(m, a * b, no); 5522 } 5523 5524 template <size_t N> 5525 HWY_API Vec128<float, N> MaskedMulOr(Vec128<float, N> no, Mask128<float, N> m, 5526 Vec128<float, N> a, Vec128<float, N> b) { 5527 return Vec128<float, N>{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)}; 5528 } 5529 5530 template <size_t N> 5531 HWY_API Vec128<double, N> MaskedMulOr(Vec128<double, N> no, 5532 Mask128<double, N> m, Vec128<double, N> a, 5533 Vec128<double, N> b) { 5534 return Vec128<double, N>{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)}; 5535 } 5536 5537 #if HWY_HAVE_FLOAT16 5538 template <size_t N> 5539 HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no, 5540 Mask128<float16_t, N> m, 5541 Vec128<float16_t, N> a, 5542 Vec128<float16_t, N> b) { 5543 return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)}; 5544 } 5545 #endif // HWY_HAVE_FLOAT16 5546 5547 // ------------------------------ MaskedDivOr 5548 5549 template <size_t N> 5550 HWY_API Vec128<float, N> MaskedDivOr(Vec128<float, N> no, Mask128<float, N> m, 5551 Vec128<float, N> a, Vec128<float, N> b) { 5552 return Vec128<float, N>{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)}; 5553 } 5554 5555 template <size_t N> 5556 HWY_API Vec128<double, N> MaskedDivOr(Vec128<double, N> no, 5557 Mask128<double, N> m, Vec128<double, N> a, 5558 Vec128<double, N> b) { 5559 return Vec128<double, N>{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)}; 5560 } 5561 5562 #if HWY_HAVE_FLOAT16 5563 template <size_t N> 5564 HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no, 5565 Mask128<float16_t, N> m, 5566 Vec128<float16_t, N> a, 5567 Vec128<float16_t, N> b) { 5568 return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)}; 5569 } 5570 #endif // HWY_HAVE_FLOAT16 5571 5572 // Generic for all vector lengths 5573 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 5574 HWY_API V MaskedDivOr(V no, MFromD<DFromV<V>> m, V a, V b) { 5575 return IfThenElse(m, Div(a, b), no); 5576 } 5577 5578 // ------------------------------ MaskedModOr 5579 // Generic for all vector lengths 5580 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 5581 HWY_API V MaskedModOr(V no, MFromD<DFromV<V>> m, V a, V b) { 5582 return IfThenElse(m, Mod(a, b), no); 5583 } 5584 5585 // ------------------------------ MaskedSatAddOr 5586 5587 template <typename T, size_t N, HWY_IF_I8(T)> 5588 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, 5589 Vec128<T, N> a, Vec128<T, N> b) { 5590 return Vec128<T, N>{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)}; 5591 } 5592 5593 template <typename T, size_t N, HWY_IF_U8(T)> 5594 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, 5595 Vec128<T, N> a, Vec128<T, N> b) { 5596 return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)}; 5597 } 5598 5599 template <typename T, size_t N, HWY_IF_I16(T)> 5600 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, 5601 Vec128<T, N> a, Vec128<T, N> b) { 5602 return Vec128<T, N>{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)}; 5603 } 5604 5605 template <typename T, size_t N, HWY_IF_U16(T)> 5606 HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m, 5607 Vec128<T, N> a, Vec128<T, N> b) { 5608 return Vec128<T, N>{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)}; 5609 } 5610 5611 // ------------------------------ MaskedSatSubOr 5612 5613 template <typename T, size_t N, HWY_IF_I8(T)> 5614 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, 5615 Vec128<T, N> a, Vec128<T, N> b) { 5616 return Vec128<T, N>{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)}; 5617 } 5618 5619 template <typename T, size_t N, HWY_IF_U8(T)> 5620 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, 5621 Vec128<T, N> a, Vec128<T, N> b) { 5622 return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)}; 5623 } 5624 5625 template <typename T, size_t N, HWY_IF_I16(T)> 5626 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, 5627 Vec128<T, N> a, Vec128<T, N> b) { 5628 return Vec128<T, N>{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)}; 5629 } 5630 5631 template <typename T, size_t N, HWY_IF_U16(T)> 5632 HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m, 5633 Vec128<T, N> a, Vec128<T, N> b) { 5634 return Vec128<T, N>{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)}; 5635 } 5636 5637 #endif // HWY_TARGET <= HWY_AVX3 5638 5639 // ------------------------------ Floating-point multiply-add variants 5640 5641 #if HWY_HAVE_FLOAT16 5642 template <size_t N> 5643 HWY_API Vec128<float16_t, N> MulAdd(Vec128<float16_t, N> mul, 5644 Vec128<float16_t, N> x, 5645 Vec128<float16_t, N> add) { 5646 return Vec128<float16_t, N>{_mm_fmadd_ph(mul.raw, x.raw, add.raw)}; 5647 } 5648 5649 template <size_t N> 5650 HWY_API Vec128<float16_t, N> NegMulAdd(Vec128<float16_t, N> mul, 5651 Vec128<float16_t, N> x, 5652 Vec128<float16_t, N> add) { 5653 return Vec128<float16_t, N>{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)}; 5654 } 5655 5656 template <size_t N> 5657 HWY_API Vec128<float16_t, N> MulSub(Vec128<float16_t, N> mul, 5658 Vec128<float16_t, N> x, 5659 Vec128<float16_t, N> sub) { 5660 return Vec128<float16_t, N>{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)}; 5661 } 5662 5663 template <size_t N> 5664 HWY_API Vec128<float16_t, N> NegMulSub(Vec128<float16_t, N> mul, 5665 Vec128<float16_t, N> x, 5666 Vec128<float16_t, N> sub) { 5667 return Vec128<float16_t, N>{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)}; 5668 } 5669 5670 #endif // HWY_HAVE_FLOAT16 5671 template <size_t N> 5672 HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x, 5673 Vec128<float, N> add) { 5674 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5675 return mul * x + add; 5676 #else 5677 return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)}; 5678 #endif 5679 } 5680 template <size_t N> 5681 HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x, 5682 Vec128<double, N> add) { 5683 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5684 return mul * x + add; 5685 #else 5686 return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)}; 5687 #endif 5688 } 5689 5690 // Returns add - mul * x 5691 template <size_t N> 5692 HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x, 5693 Vec128<float, N> add) { 5694 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5695 return add - mul * x; 5696 #else 5697 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)}; 5698 #endif 5699 } 5700 template <size_t N> 5701 HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x, 5702 Vec128<double, N> add) { 5703 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5704 return add - mul * x; 5705 #else 5706 return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)}; 5707 #endif 5708 } 5709 5710 // Returns mul * x - sub 5711 template <size_t N> 5712 HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x, 5713 Vec128<float, N> sub) { 5714 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5715 return mul * x - sub; 5716 #else 5717 return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)}; 5718 #endif 5719 } 5720 template <size_t N> 5721 HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x, 5722 Vec128<double, N> sub) { 5723 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5724 return mul * x - sub; 5725 #else 5726 return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)}; 5727 #endif 5728 } 5729 5730 // Returns -mul * x - sub 5731 template <size_t N> 5732 HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x, 5733 Vec128<float, N> sub) { 5734 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5735 return Neg(mul) * x - sub; 5736 #else 5737 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)}; 5738 #endif 5739 } 5740 template <size_t N> 5741 HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x, 5742 Vec128<double, N> sub) { 5743 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5744 return Neg(mul) * x - sub; 5745 #else 5746 return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)}; 5747 #endif 5748 } 5749 5750 #if HWY_TARGET <= HWY_SSSE3 5751 5752 #undef HWY_IF_MULADDSUB_V 5753 #define HWY_IF_MULADDSUB_V(V) \ 5754 HWY_IF_LANES_GT_D(DFromV<V>, 1), \ 5755 HWY_IF_T_SIZE_ONE_OF_V( \ 5756 V, (1 << 1) | ((hwy::IsFloat<TFromV<V>>()) \ 5757 ? 0 \ 5758 : ((1 << 2) | (1 << 4) | (1 << 8)))) 5759 5760 #if HWY_HAVE_FLOAT16 5761 template <size_t N, HWY_IF_LANES_GT(N, 1)> 5762 HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul, 5763 Vec128<float16_t, N> x, 5764 Vec128<float16_t, N> sub_or_add) { 5765 return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)}; 5766 } 5767 #endif // HWY_HAVE_FLOAT16 5768 5769 template <size_t N, HWY_IF_LANES_GT(N, 1)> 5770 HWY_API Vec128<float, N> MulAddSub(Vec128<float, N> mul, Vec128<float, N> x, 5771 Vec128<float, N> sub_or_add) { 5772 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5773 return AddSub(mul * x, sub_or_add); 5774 #else 5775 return Vec128<float, N>{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)}; 5776 #endif 5777 } 5778 5779 HWY_API Vec128<double> MulAddSub(Vec128<double> mul, Vec128<double> x, 5780 Vec128<double> sub_or_add) { 5781 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) 5782 return AddSub(mul * x, sub_or_add); 5783 #else 5784 return Vec128<double>{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)}; 5785 #endif 5786 } 5787 5788 #endif // HWY_TARGET <= HWY_SSSE3 5789 5790 // ------------------------------ Floating-point square root 5791 5792 // Full precision square root 5793 #if HWY_HAVE_FLOAT16 5794 template <size_t N> 5795 HWY_API Vec128<float16_t, N> Sqrt(Vec128<float16_t, N> v) { 5796 return Vec128<float16_t, N>{_mm_sqrt_ph(v.raw)}; 5797 } 5798 #endif // HWY_HAVE_FLOAT16 5799 template <size_t N> 5800 HWY_API Vec128<float, N> Sqrt(Vec128<float, N> v) { 5801 return Vec128<float, N>{_mm_sqrt_ps(v.raw)}; 5802 } 5803 HWY_API Vec128<float, 1> Sqrt(Vec128<float, 1> v) { 5804 return Vec128<float, 1>{_mm_sqrt_ss(v.raw)}; 5805 } 5806 template <size_t N> 5807 HWY_API Vec128<double, N> Sqrt(Vec128<double, N> v) { 5808 return Vec128<double, N>{_mm_sqrt_pd(v.raw)}; 5809 } 5810 HWY_API Vec64<double> Sqrt(Vec64<double> v) { 5811 return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)}; 5812 } 5813 5814 // Approximate reciprocal square root 5815 #if HWY_HAVE_FLOAT16 5816 template <size_t N> 5817 HWY_API Vec128<float16_t, N> ApproximateReciprocalSqrt(Vec128<float16_t, N> v) { 5818 return Vec128<float16_t, N>{_mm_rsqrt_ph(v.raw)}; 5819 } 5820 #endif // HWY_HAVE_FLOAT16 5821 template <size_t N> 5822 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) { 5823 return Vec128<float, N>{_mm_rsqrt_ps(v.raw)}; 5824 } 5825 HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(Vec128<float, 1> v) { 5826 return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)}; 5827 } 5828 5829 #if HWY_TARGET <= HWY_AVX3 5830 #ifdef HWY_NATIVE_F64_APPROX_RSQRT 5831 #undef HWY_NATIVE_F64_APPROX_RSQRT 5832 #else 5833 #define HWY_NATIVE_F64_APPROX_RSQRT 5834 #endif 5835 5836 HWY_API Vec64<double> ApproximateReciprocalSqrt(Vec64<double> v) { 5837 return Vec64<double>{_mm_rsqrt14_sd(v.raw, v.raw)}; 5838 } 5839 HWY_API Vec128<double> ApproximateReciprocalSqrt(Vec128<double> v) { 5840 #if HWY_COMPILER_MSVC 5841 const DFromV<decltype(v)> d; 5842 return Vec128<double>{_mm_mask_rsqrt14_pd( 5843 Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)}; 5844 #else 5845 return Vec128<double>{_mm_rsqrt14_pd(v.raw)}; 5846 #endif 5847 } 5848 #endif 5849 5850 // ------------------------------ Min (Gt, IfThenElse) 5851 5852 namespace detail { 5853 5854 template <typename T, size_t N> 5855 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a, 5856 const Vec128<T, N> b) { 5857 const DFromV<decltype(a)> d; 5858 const RebindToUnsigned<decltype(d)> du; 5859 const RebindToSigned<decltype(d)> di; 5860 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); 5861 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); 5862 return IfThenElse(gt, b, a); 5863 } 5864 5865 } // namespace detail 5866 5867 // Unsigned 5868 template <size_t N> 5869 HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 5870 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)}; 5871 } 5872 template <size_t N> 5873 HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { 5874 #if HWY_TARGET >= HWY_SSSE3 5875 return Vec128<uint16_t, N>{ 5876 _mm_sub_epi16(a.raw, _mm_subs_epu16(a.raw, b.raw))}; 5877 #else 5878 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)}; 5879 #endif 5880 } 5881 template <size_t N> 5882 HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { 5883 #if HWY_TARGET >= HWY_SSSE3 5884 return detail::MinU(a, b); 5885 #else 5886 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)}; 5887 #endif 5888 } 5889 template <size_t N> 5890 HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { 5891 #if HWY_TARGET <= HWY_AVX3 5892 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)}; 5893 #else 5894 return detail::MinU(a, b); 5895 #endif 5896 } 5897 5898 // Signed 5899 template <size_t N> 5900 HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 5901 #if HWY_TARGET >= HWY_SSSE3 5902 return IfThenElse(a < b, a, b); 5903 #else 5904 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)}; 5905 #endif 5906 } 5907 template <size_t N> 5908 HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { 5909 return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)}; 5910 } 5911 template <size_t N> 5912 HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { 5913 #if HWY_TARGET >= HWY_SSSE3 5914 return IfThenElse(a < b, a, b); 5915 #else 5916 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)}; 5917 #endif 5918 } 5919 template <size_t N> 5920 HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { 5921 #if HWY_TARGET <= HWY_AVX3 5922 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)}; 5923 #else 5924 return IfThenElse(a < b, a, b); 5925 #endif 5926 } 5927 5928 // Float 5929 #if HWY_HAVE_FLOAT16 5930 template <size_t N> 5931 HWY_API Vec128<float16_t, N> Min(Vec128<float16_t, N> a, 5932 Vec128<float16_t, N> b) { 5933 return Vec128<float16_t, N>{_mm_min_ph(a.raw, b.raw)}; 5934 } 5935 #endif // HWY_HAVE_FLOAT16 5936 template <size_t N> 5937 HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) { 5938 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)}; 5939 } 5940 template <size_t N> 5941 HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) { 5942 return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)}; 5943 } 5944 5945 // ------------------------------ Max (Gt, IfThenElse) 5946 5947 namespace detail { 5948 template <typename T, size_t N> 5949 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a, 5950 const Vec128<T, N> b) { 5951 const DFromV<decltype(a)> d; 5952 const RebindToUnsigned<decltype(d)> du; 5953 const RebindToSigned<decltype(d)> di; 5954 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1))); 5955 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); 5956 return IfThenElse(gt, a, b); 5957 } 5958 5959 } // namespace detail 5960 5961 // Unsigned 5962 template <size_t N> 5963 HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 5964 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)}; 5965 } 5966 template <size_t N> 5967 HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { 5968 #if HWY_TARGET >= HWY_SSSE3 5969 return Vec128<uint16_t, N>{ 5970 _mm_add_epi16(a.raw, _mm_subs_epu16(b.raw, a.raw))}; 5971 #else 5972 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)}; 5973 #endif 5974 } 5975 template <size_t N> 5976 HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { 5977 #if HWY_TARGET >= HWY_SSSE3 5978 return detail::MaxU(a, b); 5979 #else 5980 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)}; 5981 #endif 5982 } 5983 template <size_t N> 5984 HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { 5985 #if HWY_TARGET <= HWY_AVX3 5986 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)}; 5987 #else 5988 return detail::MaxU(a, b); 5989 #endif 5990 } 5991 5992 // Signed 5993 template <size_t N> 5994 HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 5995 #if HWY_TARGET >= HWY_SSSE3 5996 return IfThenElse(a < b, b, a); 5997 #else 5998 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)}; 5999 #endif 6000 } 6001 template <size_t N> 6002 HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { 6003 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)}; 6004 } 6005 template <size_t N> 6006 HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { 6007 #if HWY_TARGET >= HWY_SSSE3 6008 return IfThenElse(a < b, b, a); 6009 #else 6010 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)}; 6011 #endif 6012 } 6013 template <size_t N> 6014 HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { 6015 #if HWY_TARGET <= HWY_AVX3 6016 return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)}; 6017 #else 6018 return IfThenElse(a < b, b, a); 6019 #endif 6020 } 6021 6022 // Float 6023 #if HWY_HAVE_FLOAT16 6024 template <size_t N> 6025 HWY_API Vec128<float16_t, N> Max(Vec128<float16_t, N> a, 6026 Vec128<float16_t, N> b) { 6027 return Vec128<float16_t, N>{_mm_max_ph(a.raw, b.raw)}; 6028 } 6029 #endif // HWY_HAVE_FLOAT16 6030 template <size_t N> 6031 HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) { 6032 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)}; 6033 } 6034 template <size_t N> 6035 HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) { 6036 return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)}; 6037 } 6038 6039 // ------------------------------ MinNumber and MaxNumber 6040 6041 #ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER 6042 #undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER 6043 #else 6044 #define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER 6045 #endif 6046 6047 #if HWY_X86_HAVE_AVX10_2_OPS 6048 6049 #if HWY_HAVE_FLOAT16 6050 template <size_t N> 6051 HWY_API Vec128<float16_t, N> MinNumber(Vec128<float16_t, N> a, 6052 Vec128<float16_t, N> b) { 6053 return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x14)}; 6054 } 6055 #endif 6056 template <size_t N> 6057 HWY_API Vec128<float, N> MinNumber(Vec128<float, N> a, Vec128<float, N> b) { 6058 return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x14)}; 6059 } 6060 template <size_t N> 6061 HWY_API Vec128<double, N> MinNumber(Vec128<double, N> a, Vec128<double, N> b) { 6062 return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x14)}; 6063 } 6064 6065 #if HWY_HAVE_FLOAT16 6066 template <size_t N> 6067 HWY_API Vec128<float16_t, N> MaxNumber(Vec128<float16_t, N> a, 6068 Vec128<float16_t, N> b) { 6069 return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x15)}; 6070 } 6071 #endif 6072 template <size_t N> 6073 HWY_API Vec128<float, N> MaxNumber(Vec128<float, N> a, Vec128<float, N> b) { 6074 return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x15)}; 6075 } 6076 template <size_t N> 6077 HWY_API Vec128<double, N> MaxNumber(Vec128<double, N> a, Vec128<double, N> b) { 6078 return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x15)}; 6079 } 6080 6081 #else 6082 6083 // MinNumber/MaxNumber are generic for all vector lengths on targets other 6084 // than AVX10.2 6085 template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)> 6086 HWY_API V MinNumber(V a, V b) { 6087 return Min(a, IfThenElse(IsNaN(b), a, b)); 6088 } 6089 6090 template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)> 6091 HWY_API V MaxNumber(V a, V b) { 6092 return Max(a, IfThenElse(IsNaN(b), a, b)); 6093 } 6094 6095 #endif 6096 6097 // ------------------------------ MinMagnitude and MaxMagnitude 6098 6099 #if HWY_X86_HAVE_AVX10_2_OPS 6100 6101 #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE 6102 #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE 6103 #else 6104 #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE 6105 #endif 6106 6107 #if HWY_HAVE_FLOAT16 6108 template <size_t N> 6109 HWY_API Vec128<float16_t, N> MinMagnitude(Vec128<float16_t, N> a, 6110 Vec128<float16_t, N> b) { 6111 return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x16)}; 6112 } 6113 #endif 6114 template <size_t N> 6115 HWY_API Vec128<float, N> MinMagnitude(Vec128<float, N> a, Vec128<float, N> b) { 6116 return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x16)}; 6117 } 6118 template <size_t N> 6119 HWY_API Vec128<double, N> MinMagnitude(Vec128<double, N> a, 6120 Vec128<double, N> b) { 6121 return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x16)}; 6122 } 6123 6124 #if HWY_HAVE_FLOAT16 6125 template <size_t N> 6126 HWY_API Vec128<float16_t, N> MaxMagnitude(Vec128<float16_t, N> a, 6127 Vec128<float16_t, N> b) { 6128 return Vec128<float16_t, N>{_mm_minmax_ph(a.raw, b.raw, 0x17)}; 6129 } 6130 #endif 6131 template <size_t N> 6132 HWY_API Vec128<float, N> MaxMagnitude(Vec128<float, N> a, Vec128<float, N> b) { 6133 return Vec128<float, N>{_mm_minmax_ps(a.raw, b.raw, 0x17)}; 6134 } 6135 template <size_t N> 6136 HWY_API Vec128<double, N> MaxMagnitude(Vec128<double, N> a, 6137 Vec128<double, N> b) { 6138 return Vec128<double, N>{_mm_minmax_pd(a.raw, b.raw, 0x17)}; 6139 } 6140 6141 #endif 6142 6143 // ================================================== MEMORY (3) 6144 6145 // ------------------------------ Non-temporal stores 6146 6147 // On clang6, we see incorrect code generated for _mm_stream_pi, so 6148 // round even partial vectors up to 16 bytes. 6149 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)> 6150 HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { 6151 const RebindToUnsigned<decltype(d)> du; // for float16_t 6152 _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), BitCast(du, v).raw); 6153 } 6154 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 6155 HWY_API void Stream(VFromD<D> v, D /* tag */, float* HWY_RESTRICT aligned) { 6156 _mm_stream_ps(aligned, v.raw); 6157 } 6158 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 6159 HWY_API void Stream(VFromD<D> v, D /* tag */, double* HWY_RESTRICT aligned) { 6160 _mm_stream_pd(aligned, v.raw); 6161 } 6162 6163 // ------------------------------ Scatter 6164 6165 // Work around warnings in the intrinsic definitions (passing -1 as a mask). 6166 HWY_DIAGNOSTICS(push) 6167 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 6168 6169 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*. 6170 using GatherIndex64 = long long int; // NOLINT(runtime/int) 6171 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type"); 6172 6173 #if HWY_TARGET <= HWY_AVX3 6174 6175 #ifdef HWY_NATIVE_SCATTER 6176 #undef HWY_NATIVE_SCATTER 6177 #else 6178 #define HWY_NATIVE_SCATTER 6179 #endif 6180 6181 namespace detail { 6182 6183 template <int kScale, class D, class VI, HWY_IF_UI32_D(D)> 6184 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, 6185 VI index) { 6186 if (d.MaxBytes() == 16) { 6187 _mm_i32scatter_epi32(base, index.raw, v.raw, kScale); 6188 } else { 6189 const __mmask8 mask = (1u << MaxLanes(d)) - 1; 6190 _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale); 6191 } 6192 } 6193 6194 template <int kScale, class D, class VI, HWY_IF_UI64_D(D)> 6195 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, 6196 VI index) { 6197 if (d.MaxBytes() == 16) { 6198 _mm_i64scatter_epi64(base, index.raw, v.raw, kScale); 6199 } else { 6200 const __mmask8 mask = (1u << MaxLanes(d)) - 1; 6201 _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale); 6202 } 6203 } 6204 6205 template <int kScale, class D, class VI, HWY_IF_F32_D(D)> 6206 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, float* HWY_RESTRICT base, 6207 VI index) { 6208 if (d.MaxBytes() == 16) { 6209 _mm_i32scatter_ps(base, index.raw, v.raw, kScale); 6210 } else { 6211 const __mmask8 mask = (1u << MaxLanes(d)) - 1; 6212 _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale); 6213 } 6214 } 6215 6216 template <int kScale, class D, class VI, HWY_IF_F64_D(D)> 6217 HWY_INLINE void NativeScatter128(VFromD<D> v, D d, double* HWY_RESTRICT base, 6218 VI index) { 6219 if (d.MaxBytes() == 16) { 6220 _mm_i64scatter_pd(base, index.raw, v.raw, kScale); 6221 } else { 6222 const __mmask8 mask = (1u << MaxLanes(d)) - 1; 6223 _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale); 6224 } 6225 } 6226 6227 template <int kScale, class D, class VI, HWY_IF_UI32_D(D)> 6228 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, 6229 TFromD<D>* HWY_RESTRICT base, VI index) { 6230 // For partial vectors, ensure upper mask lanes are zero to prevent faults. 6231 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); 6232 _mm_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, kScale); 6233 } 6234 6235 template <int kScale, class D, class VI, HWY_IF_UI64_D(D)> 6236 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, 6237 TFromD<D>* HWY_RESTRICT base, VI index) { 6238 // For partial vectors, ensure upper mask lanes are zero to prevent faults. 6239 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); 6240 _mm_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, kScale); 6241 } 6242 6243 template <int kScale, class D, class VI, HWY_IF_F32_D(D)> 6244 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, 6245 float* HWY_RESTRICT base, VI index) { 6246 // For partial vectors, ensure upper mask lanes are zero to prevent faults. 6247 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); 6248 _mm_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, kScale); 6249 } 6250 6251 template <int kScale, class D, class VI, HWY_IF_F64_D(D)> 6252 HWY_INLINE void NativeMaskedScatter128(VFromD<D> v, MFromD<D> m, D d, 6253 double* HWY_RESTRICT base, VI index) { 6254 // For partial vectors, ensure upper mask lanes are zero to prevent faults. 6255 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); 6256 _mm_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, kScale); 6257 } 6258 6259 } // namespace detail 6260 6261 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 6262 HWY_API void ScatterOffset(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, 6263 VFromD<RebindToSigned<D>> offset) { 6264 return detail::NativeScatter128<1>(v, d, base, offset); 6265 } 6266 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 6267 HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base, 6268 VFromD<RebindToSigned<D>> index) { 6269 return detail::NativeScatter128<sizeof(TFromD<D>)>(v, d, base, index); 6270 } 6271 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 6272 HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d, 6273 TFromD<D>* HWY_RESTRICT base, 6274 VFromD<RebindToSigned<D>> index) { 6275 return detail::NativeMaskedScatter128<sizeof(TFromD<D>)>(v, m, d, base, 6276 index); 6277 } 6278 6279 #endif // HWY_TARGET <= HWY_AVX3 6280 6281 // ------------------------------ Gather (Load/Store) 6282 6283 #if HWY_TARGET <= HWY_AVX2 6284 6285 #ifdef HWY_NATIVE_GATHER 6286 #undef HWY_NATIVE_GATHER 6287 #else 6288 #define HWY_NATIVE_GATHER 6289 #endif 6290 6291 namespace detail { 6292 6293 template <int kScale, typename T, size_t N, HWY_IF_UI32(T)> 6294 HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base, 6295 Vec128<int32_t, N> indices) { 6296 return Vec128<T, N>{_mm_i32gather_epi32( 6297 reinterpret_cast<const int32_t*>(base), indices.raw, kScale)}; 6298 } 6299 6300 template <int kScale, typename T, size_t N, HWY_IF_UI64(T)> 6301 HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base, 6302 Vec128<int64_t, N> indices) { 6303 return Vec128<T, N>{_mm_i64gather_epi64( 6304 reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)}; 6305 } 6306 6307 template <int kScale, size_t N> 6308 HWY_INLINE Vec128<float, N> NativeGather128(const float* HWY_RESTRICT base, 6309 Vec128<int32_t, N> indices) { 6310 return Vec128<float, N>{_mm_i32gather_ps(base, indices.raw, kScale)}; 6311 } 6312 6313 template <int kScale, size_t N> 6314 HWY_INLINE Vec128<double, N> NativeGather128(const double* HWY_RESTRICT base, 6315 Vec128<int64_t, N> indices) { 6316 return Vec128<double, N>{_mm_i64gather_pd(base, indices.raw, kScale)}; 6317 } 6318 6319 template <int kScale, typename T, size_t N, HWY_IF_UI32(T)> 6320 HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no, 6321 Mask128<T, N> m, 6322 const T* HWY_RESTRICT base, 6323 Vec128<int32_t, N> indices) { 6324 #if HWY_TARGET <= HWY_AVX3 6325 return Vec128<T, N>{_mm_mmask_i32gather_epi32( 6326 no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base), 6327 kScale)}; 6328 #else 6329 return Vec128<T, N>{ 6330 _mm_mask_i32gather_epi32(no.raw, reinterpret_cast<const int32_t*>(base), 6331 indices.raw, m.raw, kScale)}; 6332 #endif 6333 } 6334 6335 template <int kScale, typename T, size_t N, HWY_IF_UI64(T)> 6336 HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no, 6337 Mask128<T, N> m, 6338 const T* HWY_RESTRICT base, 6339 Vec128<int64_t, N> indices) { 6340 #if HWY_TARGET <= HWY_AVX3 6341 return Vec128<T, N>{_mm_mmask_i64gather_epi64( 6342 no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base), 6343 kScale)}; 6344 #else 6345 return Vec128<T, N>{_mm_mask_i64gather_epi64( 6346 no.raw, reinterpret_cast<const GatherIndex64*>(base), indices.raw, m.raw, 6347 kScale)}; 6348 #endif 6349 } 6350 6351 template <int kScale, size_t N> 6352 HWY_INLINE Vec128<float, N> NativeMaskedGatherOr128( 6353 Vec128<float, N> no, Mask128<float, N> m, const float* HWY_RESTRICT base, 6354 Vec128<int32_t, N> indices) { 6355 #if HWY_TARGET <= HWY_AVX3 6356 return Vec128<float, N>{ 6357 _mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)}; 6358 #else 6359 return Vec128<float, N>{ 6360 _mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)}; 6361 #endif 6362 } 6363 6364 template <int kScale, size_t N> 6365 HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128( 6366 Vec128<double, N> no, Mask128<double, N> m, const double* HWY_RESTRICT base, 6367 Vec128<int64_t, N> indices) { 6368 #if HWY_TARGET <= HWY_AVX3 6369 return Vec128<double, N>{ 6370 _mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)}; 6371 #else 6372 return Vec128<double, N>{ 6373 _mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)}; 6374 #endif 6375 } 6376 6377 } // namespace detail 6378 6379 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 6380 HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base, 6381 VFromD<RebindToSigned<D>> offsets) { 6382 return detail::NativeGather128<1>(base, offsets); 6383 } 6384 6385 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>> 6386 HWY_API VFromD<D> GatherIndex(D /*d*/, const T* HWY_RESTRICT base, 6387 VFromD<RebindToSigned<D>> indices) { 6388 return detail::NativeGather128<sizeof(T)>(base, indices); 6389 } 6390 6391 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>> 6392 HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d, 6393 const T* HWY_RESTRICT base, 6394 VFromD<RebindToSigned<D>> indices) { 6395 // For partial vectors, ensure upper mask lanes are zero to prevent faults. 6396 if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); 6397 6398 return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices); 6399 } 6400 6401 // Generic for all vector lengths. 6402 template <class D> 6403 HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d, 6404 const TFromD<D>* HWY_RESTRICT base, 6405 VFromD<RebindToSigned<D>> indices) { 6406 return MaskedGatherIndexOr(Zero(d), m, d, base, indices); 6407 } 6408 6409 #endif // HWY_TARGET <= HWY_AVX2 6410 6411 HWY_DIAGNOSTICS(pop) 6412 6413 // ================================================== SWIZZLE (2) 6414 6415 // ------------------------------ LowerHalf 6416 6417 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 6418 HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) { 6419 return VFromD<D>{v.raw}; 6420 } 6421 template <typename T, size_t N> 6422 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) { 6423 return Vec128<T, N / 2>{v.raw}; 6424 } 6425 6426 // ------------------------------ ShiftLeftBytes 6427 6428 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> 6429 HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) { 6430 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); 6431 const RebindToUnsigned<decltype(d)> du; 6432 return BitCast( 6433 d, VFromD<decltype(du)>{_mm_slli_si128(BitCast(du, v).raw, kBytes)}); 6434 } 6435 6436 // Generic for all vector lengths. 6437 template <int kBytes, class V> 6438 HWY_API V ShiftLeftBytes(const V v) { 6439 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v); 6440 } 6441 6442 // ------------------------------ ShiftLeftLanes 6443 6444 // Generic for all vector lengths. 6445 template <int kLanes, class D> 6446 HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) { 6447 const Repartition<uint8_t, decltype(d)> d8; 6448 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v))); 6449 } 6450 6451 // Generic for all vector lengths. 6452 template <int kLanes, class V> 6453 HWY_API V ShiftLeftLanes(const V v) { 6454 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v); 6455 } 6456 6457 // ------------------------------ ShiftRightBytes 6458 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 16)> 6459 HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) { 6460 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); 6461 const RebindToUnsigned<decltype(d)> du; 6462 // For partial vectors, clear upper lanes so we shift in zeros. 6463 if (d.MaxBytes() != 16) { 6464 const Full128<TFromD<D>> dfull; 6465 const VFromD<decltype(dfull)> vfull{v.raw}; 6466 v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; 6467 } 6468 return BitCast( 6469 d, VFromD<decltype(du)>{_mm_srli_si128(BitCast(du, v).raw, kBytes)}); 6470 } 6471 6472 // ------------------------------ ShiftRightLanes 6473 // Generic for all vector lengths. 6474 template <int kLanes, class D> 6475 HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) { 6476 const Repartition<uint8_t, decltype(d)> d8; 6477 constexpr size_t kBytes = kLanes * sizeof(TFromD<D>); 6478 return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v))); 6479 } 6480 6481 // ------------------------------ UpperHalf (ShiftRightBytes) 6482 6483 // Full input: copy hi into lo (smaller instruction encoding than shifts). 6484 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_FLOAT3264_D(D)> 6485 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { 6486 const Twice<RebindToUnsigned<decltype(d)>> dut; 6487 using VUT = VFromD<decltype(dut)>; // for float16_t 6488 const VUT vut = BitCast(dut, v); 6489 return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)})); 6490 } 6491 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)> 6492 HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) { 6493 return Vec64<float>{_mm_movehl_ps(v.raw, v.raw)}; 6494 } 6495 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)> 6496 HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) { 6497 return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)}; 6498 } 6499 6500 // Partial 6501 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> 6502 HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) { 6503 return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v)); 6504 } 6505 6506 // ------------------------------ ExtractLane (UpperHalf) 6507 6508 namespace detail { 6509 6510 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 6511 HWY_INLINE T ExtractLane(const Vec128<T, N> v) { 6512 static_assert(kLane < N, "Lane index out of bounds"); 6513 #if HWY_TARGET >= HWY_SSSE3 6514 const int pair = _mm_extract_epi16(v.raw, kLane / 2); 6515 constexpr int kShift = kLane & 1 ? 8 : 0; 6516 return static_cast<T>((pair >> kShift) & 0xFF); 6517 #else 6518 return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF); 6519 #endif 6520 } 6521 6522 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 6523 HWY_INLINE T ExtractLane(const Vec128<T, N> v) { 6524 static_assert(kLane < N, "Lane index out of bounds"); 6525 const DFromV<decltype(v)> d; 6526 const RebindToUnsigned<decltype(d)> du; 6527 const uint16_t lane = static_cast<uint16_t>( 6528 _mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF); 6529 return BitCastScalar<T>(lane); 6530 } 6531 6532 template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)> 6533 HWY_INLINE T ExtractLane(const Vec128<T, N> v) { 6534 static_assert(kLane < N, "Lane index out of bounds"); 6535 #if HWY_TARGET >= HWY_SSSE3 6536 return static_cast<T>(_mm_cvtsi128_si32( 6537 (kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane))); 6538 #else 6539 return static_cast<T>(_mm_extract_epi32(v.raw, kLane)); 6540 #endif 6541 } 6542 6543 template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)> 6544 HWY_INLINE T ExtractLane(const Vec128<T, N> v) { 6545 static_assert(kLane < N, "Lane index out of bounds"); 6546 #if HWY_ARCH_X86_32 6547 alignas(16) T lanes[2]; 6548 Store(v, DFromV<decltype(v)>(), lanes); 6549 return lanes[kLane]; 6550 #elif HWY_TARGET >= HWY_SSSE3 6551 return static_cast<T>( 6552 _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE))); 6553 #else 6554 return static_cast<T>(_mm_extract_epi64(v.raw, kLane)); 6555 #endif 6556 } 6557 6558 template <size_t kLane, size_t N> 6559 HWY_INLINE float ExtractLane(const Vec128<float, N> v) { 6560 static_assert(kLane < N, "Lane index out of bounds"); 6561 #if HWY_TARGET >= HWY_SSSE3 6562 return _mm_cvtss_f32((kLane == 0) ? v.raw 6563 : _mm_shuffle_ps(v.raw, v.raw, kLane)); 6564 #else 6565 // Bug in the intrinsic, returns int but should be float. 6566 const int32_t bits = _mm_extract_ps(v.raw, kLane); 6567 return BitCastScalar<float>(bits); 6568 #endif 6569 } 6570 6571 // There is no extract_pd; two overloads because there is no UpperHalf for N=1. 6572 template <size_t kLane> 6573 HWY_INLINE double ExtractLane(const Vec64<double> v) { 6574 static_assert(kLane == 0, "Lane index out of bounds"); 6575 return GetLane(v); 6576 } 6577 6578 template <size_t kLane> 6579 HWY_INLINE double ExtractLane(const Vec128<double> v) { 6580 static_assert(kLane < 2, "Lane index out of bounds"); 6581 const Half<DFromV<decltype(v)>> dh; 6582 return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v)); 6583 } 6584 6585 } // namespace detail 6586 6587 // Requires one overload per vector length because ExtractLane<3> may be a 6588 // compile error if it calls _mm_extract_epi64. 6589 template <typename T> 6590 HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) { 6591 HWY_DASSERT(i == 0); 6592 (void)i; 6593 return GetLane(v); 6594 } 6595 6596 template <typename T> 6597 HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) { 6598 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6599 if (__builtin_constant_p(i)) { 6600 switch (i) { 6601 case 0: 6602 return detail::ExtractLane<0>(v); 6603 case 1: 6604 return detail::ExtractLane<1>(v); 6605 } 6606 } 6607 #endif 6608 alignas(16) T lanes[2]; 6609 Store(v, DFromV<decltype(v)>(), lanes); 6610 return lanes[i]; 6611 } 6612 6613 template <typename T> 6614 HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) { 6615 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6616 if (__builtin_constant_p(i)) { 6617 switch (i) { 6618 case 0: 6619 return detail::ExtractLane<0>(v); 6620 case 1: 6621 return detail::ExtractLane<1>(v); 6622 case 2: 6623 return detail::ExtractLane<2>(v); 6624 case 3: 6625 return detail::ExtractLane<3>(v); 6626 } 6627 } 6628 #endif 6629 alignas(16) T lanes[4]; 6630 Store(v, DFromV<decltype(v)>(), lanes); 6631 return lanes[i]; 6632 } 6633 6634 template <typename T> 6635 HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) { 6636 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6637 if (__builtin_constant_p(i)) { 6638 switch (i) { 6639 case 0: 6640 return detail::ExtractLane<0>(v); 6641 case 1: 6642 return detail::ExtractLane<1>(v); 6643 case 2: 6644 return detail::ExtractLane<2>(v); 6645 case 3: 6646 return detail::ExtractLane<3>(v); 6647 case 4: 6648 return detail::ExtractLane<4>(v); 6649 case 5: 6650 return detail::ExtractLane<5>(v); 6651 case 6: 6652 return detail::ExtractLane<6>(v); 6653 case 7: 6654 return detail::ExtractLane<7>(v); 6655 } 6656 } 6657 #endif 6658 alignas(16) T lanes[8]; 6659 Store(v, DFromV<decltype(v)>(), lanes); 6660 return lanes[i]; 6661 } 6662 6663 template <typename T> 6664 HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) { 6665 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6666 if (__builtin_constant_p(i)) { 6667 switch (i) { 6668 case 0: 6669 return detail::ExtractLane<0>(v); 6670 case 1: 6671 return detail::ExtractLane<1>(v); 6672 case 2: 6673 return detail::ExtractLane<2>(v); 6674 case 3: 6675 return detail::ExtractLane<3>(v); 6676 case 4: 6677 return detail::ExtractLane<4>(v); 6678 case 5: 6679 return detail::ExtractLane<5>(v); 6680 case 6: 6681 return detail::ExtractLane<6>(v); 6682 case 7: 6683 return detail::ExtractLane<7>(v); 6684 case 8: 6685 return detail::ExtractLane<8>(v); 6686 case 9: 6687 return detail::ExtractLane<9>(v); 6688 case 10: 6689 return detail::ExtractLane<10>(v); 6690 case 11: 6691 return detail::ExtractLane<11>(v); 6692 case 12: 6693 return detail::ExtractLane<12>(v); 6694 case 13: 6695 return detail::ExtractLane<13>(v); 6696 case 14: 6697 return detail::ExtractLane<14>(v); 6698 case 15: 6699 return detail::ExtractLane<15>(v); 6700 } 6701 } 6702 #endif 6703 alignas(16) T lanes[16]; 6704 Store(v, DFromV<decltype(v)>(), lanes); 6705 return lanes[i]; 6706 } 6707 6708 // ------------------------------ InsertLane (UpperHalf) 6709 6710 namespace detail { 6711 6712 template <class V> 6713 HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV<V> t) { 6714 const DFromV<decltype(v)> d; 6715 6716 #if HWY_TARGET <= HWY_AVX3 6717 using RawMask = decltype(MaskFromVec(VFromD<decltype(d)>()).raw); 6718 const auto mask = MFromD<decltype(d)>{static_cast<RawMask>(uint64_t{1} << i)}; 6719 #else 6720 const RebindToUnsigned<decltype(d)> du; 6721 using TU = TFromD<decltype(du)>; 6722 const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast<TU>(i))); 6723 #endif 6724 6725 return IfThenElse(mask, Set(d, t), v); 6726 } 6727 6728 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 6729 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 6730 static_assert(kLane < N, "Lane index out of bounds"); 6731 #if HWY_TARGET >= HWY_SSSE3 6732 return InsertLaneUsingBroadcastAndBlend(v, kLane, t); 6733 #else 6734 return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)}; 6735 #endif 6736 } 6737 6738 template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 6739 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 6740 static_assert(kLane < N, "Lane index out of bounds"); 6741 const DFromV<decltype(v)> d; 6742 const RebindToUnsigned<decltype(d)> du; 6743 const uint16_t bits = BitCastScalar<uint16_t>(t); 6744 return BitCast(d, VFromD<decltype(du)>{ 6745 _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)}); 6746 } 6747 6748 template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)> 6749 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 6750 static_assert(kLane < N, "Lane index out of bounds"); 6751 #if HWY_TARGET >= HWY_SSSE3 6752 return InsertLaneUsingBroadcastAndBlend(v, kLane, t); 6753 #else 6754 const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t); 6755 return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)}; 6756 #endif 6757 } 6758 6759 template <size_t kLane, typename T, size_t N, HWY_IF_UI64(T)> 6760 HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) { 6761 static_assert(kLane < N, "Lane index out of bounds"); 6762 #if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32 6763 const DFromV<decltype(v)> d; 6764 const RebindToFloat<decltype(d)> df; 6765 const auto vt = BitCast(df, Set(d, t)); 6766 if (kLane == 0) { 6767 return BitCast( 6768 d, Vec128<double, N>{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)}); 6769 } 6770 return BitCast( 6771 d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)}); 6772 #else 6773 const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t); 6774 return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)}; 6775 #endif 6776 } 6777 6778 template <size_t kLane, size_t N> 6779 HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) { 6780 static_assert(kLane < N, "Lane index out of bounds"); 6781 #if HWY_TARGET >= HWY_SSSE3 6782 return InsertLaneUsingBroadcastAndBlend(v, kLane, t); 6783 #else 6784 return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)}; 6785 #endif 6786 } 6787 6788 // There is no insert_pd; two overloads because there is no UpperHalf for N=1. 6789 template <size_t kLane> 6790 HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) { 6791 static_assert(kLane == 0, "Lane index out of bounds"); 6792 return Set(DFromV<decltype(v)>(), t); 6793 } 6794 6795 template <size_t kLane> 6796 HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) { 6797 static_assert(kLane < 2, "Lane index out of bounds"); 6798 const DFromV<decltype(v)> d; 6799 const Vec128<double> vt = Set(d, t); 6800 if (kLane == 0) { 6801 return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)}; 6802 } 6803 return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)}; 6804 } 6805 6806 } // namespace detail 6807 6808 // Requires one overload per vector length because InsertLane<3> may be a 6809 // compile error if it calls _mm_insert_epi64. 6810 6811 template <typename T> 6812 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { 6813 HWY_DASSERT(i == 0); 6814 (void)i; 6815 return Set(DFromV<decltype(v)>(), t); 6816 } 6817 6818 template <typename T> 6819 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { 6820 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6821 if (__builtin_constant_p(i)) { 6822 switch (i) { 6823 case 0: 6824 return detail::InsertLane<0>(v, t); 6825 case 1: 6826 return detail::InsertLane<1>(v, t); 6827 } 6828 } 6829 #endif 6830 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); 6831 } 6832 6833 template <typename T> 6834 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { 6835 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6836 if (__builtin_constant_p(i)) { 6837 switch (i) { 6838 case 0: 6839 return detail::InsertLane<0>(v, t); 6840 case 1: 6841 return detail::InsertLane<1>(v, t); 6842 case 2: 6843 return detail::InsertLane<2>(v, t); 6844 case 3: 6845 return detail::InsertLane<3>(v, t); 6846 } 6847 } 6848 #endif 6849 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); 6850 } 6851 6852 template <typename T> 6853 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) { 6854 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6855 if (__builtin_constant_p(i)) { 6856 switch (i) { 6857 case 0: 6858 return detail::InsertLane<0>(v, t); 6859 case 1: 6860 return detail::InsertLane<1>(v, t); 6861 case 2: 6862 return detail::InsertLane<2>(v, t); 6863 case 3: 6864 return detail::InsertLane<3>(v, t); 6865 case 4: 6866 return detail::InsertLane<4>(v, t); 6867 case 5: 6868 return detail::InsertLane<5>(v, t); 6869 case 6: 6870 return detail::InsertLane<6>(v, t); 6871 case 7: 6872 return detail::InsertLane<7>(v, t); 6873 } 6874 } 6875 #endif 6876 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); 6877 } 6878 6879 template <typename T> 6880 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { 6881 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6882 if (__builtin_constant_p(i)) { 6883 switch (i) { 6884 case 0: 6885 return detail::InsertLane<0>(v, t); 6886 case 1: 6887 return detail::InsertLane<1>(v, t); 6888 case 2: 6889 return detail::InsertLane<2>(v, t); 6890 case 3: 6891 return detail::InsertLane<3>(v, t); 6892 case 4: 6893 return detail::InsertLane<4>(v, t); 6894 case 5: 6895 return detail::InsertLane<5>(v, t); 6896 case 6: 6897 return detail::InsertLane<6>(v, t); 6898 case 7: 6899 return detail::InsertLane<7>(v, t); 6900 case 8: 6901 return detail::InsertLane<8>(v, t); 6902 case 9: 6903 return detail::InsertLane<9>(v, t); 6904 case 10: 6905 return detail::InsertLane<10>(v, t); 6906 case 11: 6907 return detail::InsertLane<11>(v, t); 6908 case 12: 6909 return detail::InsertLane<12>(v, t); 6910 case 13: 6911 return detail::InsertLane<13>(v, t); 6912 case 14: 6913 return detail::InsertLane<14>(v, t); 6914 case 15: 6915 return detail::InsertLane<15>(v, t); 6916 } 6917 } 6918 #endif 6919 return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); 6920 } 6921 6922 // ------------------------------ CombineShiftRightBytes 6923 6924 #if HWY_TARGET == HWY_SSE2 6925 template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)> 6926 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { 6927 static_assert(0 < kBytes && kBytes < 16, "kBytes invalid"); 6928 return Or(ShiftRightBytes<kBytes>(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi)); 6929 } 6930 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> 6931 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { 6932 constexpr size_t kSize = d.MaxBytes(); 6933 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); 6934 6935 const Twice<decltype(d)> dt; 6936 return VFromD<D>{ShiftRightBytes<kBytes>(dt, Combine(dt, hi, lo)).raw}; 6937 } 6938 #else 6939 template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16)> 6940 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { 6941 const Repartition<uint8_t, decltype(d)> d8; 6942 return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8( 6943 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); 6944 } 6945 6946 template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)> 6947 HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) { 6948 constexpr size_t kSize = d.MaxBytes(); 6949 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); 6950 const Repartition<uint8_t, decltype(d)> d8; 6951 using V8 = Vec128<uint8_t>; 6952 const DFromV<V8> dfull8; 6953 const Repartition<TFromD<D>, decltype(dfull8)> dfull; 6954 const V8 hi8{BitCast(d8, hi).raw}; 6955 // Move into most-significant bytes 6956 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); 6957 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); 6958 return VFromD<D>{BitCast(dfull, r).raw}; 6959 } 6960 #endif 6961 6962 // ------------------------------ Broadcast/splat any lane 6963 6964 template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 6965 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { 6966 const DFromV<decltype(v)> d; 6967 const RebindToUnsigned<decltype(d)> du; 6968 using VU = VFromD<decltype(du)>; 6969 const VU vu = BitCast(du, v); // for float16_t 6970 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 6971 if (kLane < 4) { 6972 const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF); 6973 return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)}); 6974 } else { 6975 const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF); 6976 return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)}); 6977 } 6978 } 6979 6980 template <int kLane, typename T, size_t N, HWY_IF_UI32(T)> 6981 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { 6982 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 6983 HWY_IF_CONSTEXPR(N == 1){ 6984 return Vec128<T, N>{v}; // Workaround for MSVC compiler bug on single lane integer broadcast 6985 }else{ 6986 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; 6987 } 6988 } 6989 6990 template <int kLane, typename T, size_t N, HWY_IF_UI64(T)> 6991 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { 6992 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 6993 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; 6994 } 6995 6996 template <int kLane, size_t N> 6997 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) { 6998 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 6999 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; 7000 } 7001 7002 template <int kLane, size_t N> 7003 HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) { 7004 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 7005 return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)}; 7006 } 7007 7008 // ------------------------------ TableLookupLanes (Shuffle01) 7009 7010 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 7011 template <typename T, size_t N = 16 / sizeof(T)> 7012 struct Indices128 { 7013 __m128i raw; 7014 }; 7015 7016 template <class D, typename T = TFromD<D>, typename TI, size_t kN, 7017 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)> 7018 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { 7019 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); 7020 #if HWY_IS_DEBUG_BUILD 7021 const Rebind<TI, decltype(d)> di; 7022 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && 7023 AllTrue(di, Lt(vec, Set(di, kN * 2)))); 7024 #endif 7025 7026 // No change as byte indices are always used for 8-bit lane types 7027 (void)d; 7028 return Indices128<T, kN>{vec.raw}; 7029 } 7030 7031 template <class D, typename T = TFromD<D>, typename TI, size_t kN, 7032 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)> 7033 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { 7034 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); 7035 #if HWY_IS_DEBUG_BUILD 7036 const Rebind<TI, decltype(d)> di; 7037 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && 7038 AllTrue(di, Lt(vec, Set(di, kN * 2)))); 7039 #endif 7040 7041 #if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 7042 (void)d; 7043 return Indices128<T, kN>{vec.raw}; 7044 #else // SSSE3, SSE4, or AVX2 7045 const Repartition<uint8_t, decltype(d)> d8; 7046 using V8 = VFromD<decltype(d8)>; 7047 alignas(16) static constexpr uint8_t kByteOffsets[16] = { 7048 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; 7049 7050 // Broadcast each lane index to all 4 bytes of T 7051 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 7052 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; 7053 const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); 7054 7055 // Shift to bytes 7056 const Repartition<uint16_t, decltype(d)> d16; 7057 const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); 7058 7059 return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; 7060 #endif // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 7061 } 7062 7063 template <class D, typename T = TFromD<D>, typename TI, size_t kN, 7064 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)> 7065 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { 7066 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); 7067 #if HWY_IS_DEBUG_BUILD 7068 const Rebind<TI, decltype(d)> di; 7069 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && 7070 AllTrue(di, Lt(vec, Set(di, kN * 2)))); 7071 #endif 7072 7073 #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 7074 (void)d; 7075 return Indices128<T, kN>{vec.raw}; 7076 #else 7077 const Repartition<uint8_t, decltype(d)> d8; 7078 using V8 = VFromD<decltype(d8)>; 7079 alignas(16) static constexpr uint8_t kByteOffsets[16] = { 7080 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; 7081 7082 // Broadcast each lane index to all 4 bytes of T 7083 alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 7084 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; 7085 const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); 7086 7087 // Shift to bytes 7088 const Repartition<uint16_t, decltype(d)> d16; 7089 const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); 7090 7091 return Indices128<T, kN>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; 7092 #endif 7093 } 7094 7095 template <class D, typename T = TFromD<D>, typename TI, size_t kN, 7096 HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)> 7097 HWY_API Indices128<T, kN> IndicesFromVec(D d, Vec128<TI, kN> vec) { 7098 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); 7099 #if HWY_IS_DEBUG_BUILD 7100 const Rebind<TI, decltype(d)> di; 7101 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && 7102 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(kN * 2))))); 7103 #else 7104 (void)d; 7105 #endif 7106 7107 // No change - even without AVX3, we can shuffle+blend. 7108 return Indices128<T, kN>{vec.raw}; 7109 } 7110 7111 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI> 7112 HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices( 7113 D d, const TI* idx) { 7114 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane"); 7115 const Rebind<TI, decltype(d)> di; 7116 return IndicesFromVec(d, LoadU(di, idx)); 7117 } 7118 7119 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 7120 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { 7121 return TableLookupBytes(v, Vec128<T, N>{idx.raw}); 7122 } 7123 7124 template <typename T, size_t N, HWY_IF_UI16(T)> 7125 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { 7126 #if HWY_TARGET <= HWY_AVX3 7127 return {_mm_permutexvar_epi16(idx.raw, v.raw)}; 7128 #elif HWY_TARGET == HWY_SSE2 7129 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) 7130 typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16))); 7131 return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>( 7132 __builtin_shuffle(reinterpret_cast<GccU16RawVectType>(v.raw), 7133 reinterpret_cast<GccU16RawVectType>(idx.raw)))}; 7134 #else 7135 const Full128<T> d_full; 7136 alignas(16) T src_lanes[8]; 7137 alignas(16) uint16_t indices[8]; 7138 alignas(16) T result_lanes[8]; 7139 7140 Store(Vec128<T>{v.raw}, d_full, src_lanes); 7141 _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); 7142 7143 for (int i = 0; i < 8; i++) { 7144 result_lanes[i] = src_lanes[indices[i] & 7u]; 7145 } 7146 7147 return Vec128<T, N>{Load(d_full, result_lanes).raw}; 7148 #endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) 7149 #else 7150 return TableLookupBytes(v, Vec128<T, N>{idx.raw}); 7151 #endif 7152 } 7153 7154 #if HWY_HAVE_FLOAT16 7155 template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 2)> 7156 HWY_API Vec128<float16_t, N> TableLookupLanes(Vec128<float16_t, N> v, 7157 Indices128<float16_t, N> idx) { 7158 return {_mm_permutexvar_ph(idx.raw, v.raw)}; 7159 } 7160 #endif // HWY_HAVE_FLOAT16 7161 7162 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 7163 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) { 7164 const DFromV<decltype(v)> d; 7165 const Full128<T> d_full; 7166 const Vec128<T> v_full = ZeroExtendResizeBitCast(d_full, d, v); 7167 7168 const RebindToSigned<decltype(d)> di; 7169 const Full128<MakeSigned<T>> di_full; 7170 const VFromD<decltype(di_full)> vidx = 7171 ZeroExtendResizeBitCast(di_full, di, VFromD<decltype(di)>{idx.raw}); 7172 7173 #if HWY_TARGET <= HWY_AVX2 7174 // There is no permutevar for non-float; _mm256_permutevar8x32_epi32 is for 7175 // 256-bit vectors, hence cast to float. 7176 const Full128<float> df_full; 7177 // Workaround for MSAN false positive. 7178 HWY_IF_CONSTEXPR(HWY_IS_MSAN) PreventElision(GetLane(vidx)); 7179 const Vec128<float> perm{ 7180 _mm_permutevar_ps(BitCast(df_full, v_full).raw, vidx.raw)}; 7181 return ResizeBitCast(d, perm); 7182 #elif HWY_TARGET == HWY_SSE2 7183 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) 7184 typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); 7185 return Vec128<T, N>{reinterpret_cast<typename detail::Raw128<T>::type>( 7186 __builtin_shuffle(reinterpret_cast<GccU32RawVectType>(v_full.raw), 7187 reinterpret_cast<GccU32RawVectType>(vidx.raw)))}; 7188 #else 7189 alignas(16) T src_lanes[4]; 7190 alignas(16) int32_t indices[4]; 7191 alignas(16) T result_lanes[4]; 7192 7193 Store(v_full, d_full, src_lanes); 7194 Store(vidx, di_full, indices); 7195 7196 for (size_t i = 0; i < N; i++) { 7197 result_lanes[i] = src_lanes[static_cast<size_t>(indices[i] & 3)]; 7198 } 7199 return Load(d, result_lanes); 7200 #endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) 7201 #else // SSSE3 or SSE4 7202 return ResizeBitCast(d, TableLookupBytes(BitCast(di_full, v_full), vidx)); 7203 #endif 7204 } 7205 7206 // Single lane: no change 7207 template <typename T> 7208 HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v, 7209 Indices128<T, 1> /* idx */) { 7210 return v; 7211 } 7212 7213 template <typename T, HWY_IF_T_SIZE(T, 8)> 7214 HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) { 7215 const DFromV<decltype(v)> d; 7216 // No need for ZeroExtendResizeBitCast, we have full vectors. 7217 Vec128<int64_t> vidx{idx.raw}; 7218 7219 // Disable in MSAN builds due to false positive. Note that this affects 7220 // CompressNot, which assumes upper index bits will be ignored. 7221 #if HWY_TARGET <= HWY_AVX2 && !HWY_IS_MSAN 7222 // There is no _mm_permute[x]var_epi64. 7223 vidx += vidx; // bit1 is the decider (unusual) 7224 const RebindToFloat<decltype(d)> df; 7225 return BitCast( 7226 d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)}); 7227 #else 7228 // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit 7229 // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 7230 // to obtain an all-zero or all-one mask. 7231 const RebindToSigned<decltype(d)> di; 7232 const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1); 7233 return BitCast( 7234 d, IfVecThenElse(same, BitCast(di, v), Shuffle01(BitCast(di, v)))); 7235 #endif 7236 } 7237 7238 // ------------------------------ ReverseBlocks 7239 7240 // Single block: no change 7241 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 7242 HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { 7243 return v; 7244 } 7245 7246 // ------------------------------ Reverse (Shuffle0123, Shuffle2301) 7247 7248 // Single lane: no change 7249 template <class D, HWY_IF_LANES_D(D, 1)> 7250 HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) { 7251 return v; 7252 } 7253 7254 // 32-bit x2: shuffle 7255 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)> 7256 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) { 7257 return VFromD<D>{Shuffle2301(Vec128<TFromD<D>>{v.raw}).raw}; 7258 } 7259 7260 // 64-bit x2: shuffle 7261 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> 7262 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) { 7263 return Shuffle01(v); 7264 } 7265 7266 // 32-bit x4: shuffle 7267 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 7268 HWY_API VFromD<D> Reverse(D /* tag */, const VFromD<D> v) { 7269 return Shuffle0123(v); 7270 } 7271 7272 // 16-bit 7273 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2), 7274 HWY_IF_LANES_GT_D(D, 1)> 7275 HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { 7276 const RebindToUnsigned<decltype(d)> du; 7277 using VU = VFromD<decltype(du)>; 7278 const VU vu = BitCast(du, v); // for float16_t 7279 constexpr size_t kN = MaxLanes(d); 7280 if (kN == 1) return v; 7281 if (kN == 2) { 7282 return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))}); 7283 } 7284 if (kN == 4) { 7285 return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); 7286 } 7287 7288 #if HWY_TARGET == HWY_SSE2 7289 const VU rev4{ 7290 _mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), 7291 _MM_SHUFFLE(0, 1, 2, 3))}; 7292 return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))}); 7293 #else 7294 const RebindToSigned<decltype(d)> di; 7295 const VFromD<decltype(di)> shuffle = Dup128VecFromValues( 7296 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); 7297 return BitCast(d, TableLookupBytes(v, shuffle)); 7298 #endif 7299 } 7300 7301 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1), 7302 HWY_IF_LANES_GT_D(D, 1)> 7303 HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) { 7304 constexpr int kN = static_cast<int>(MaxLanes(d)); 7305 if (kN == 1) return v; 7306 #if HWY_TARGET <= HWY_SSSE3 7307 // NOTE: Lanes with negative shuffle control mask values are set to zero. 7308 alignas(16) static constexpr int8_t kReverse[16] = { 7309 kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, 7310 kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16}; 7311 const RebindToSigned<decltype(d)> di; 7312 const VFromD<decltype(di)> idx = Load(di, kReverse); 7313 return VFromD<D>{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)}; 7314 #else 7315 const RepartitionToWide<decltype(d)> d16; 7316 return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v)))); 7317 #endif 7318 } 7319 7320 // ------------------------------ Reverse2 7321 7322 // Single lane: no change 7323 template <class D, HWY_IF_LANES_D(D, 1)> 7324 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { 7325 return v; 7326 } 7327 7328 // Generic for all vector lengths (128-bit sufficient if SSE2). 7329 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_GT_D(D, 1)> 7330 HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) { 7331 #if HWY_TARGET <= HWY_AVX3 7332 const Repartition<uint32_t, decltype(d)> du32; 7333 return BitCast(d, RotateRight<16>(BitCast(du32, v))); 7334 #elif HWY_TARGET == HWY_SSE2 7335 const RebindToUnsigned<decltype(d)> du; 7336 using VU = VFromD<decltype(du)>; 7337 const VU vu = BitCast(du, v); // for float16_t 7338 constexpr size_t kN = MaxLanes(d); 7339 __m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1)); 7340 if (kN > 4) { 7341 shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1)); 7342 } 7343 return BitCast(d, VU{shuf_result}); 7344 #else 7345 const RebindToSigned<decltype(d)> di; 7346 const VFromD<decltype(di)> shuffle = Dup128VecFromValues( 7347 di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C); 7348 return BitCast(d, TableLookupBytes(v, shuffle)); 7349 #endif 7350 } 7351 7352 // Generic for all vector lengths. 7353 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_GT_D(D, 1)> 7354 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { 7355 return Shuffle2301(v); 7356 } 7357 7358 // Generic for all vector lengths. 7359 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)> 7360 HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) { 7361 return Shuffle01(v); 7362 } 7363 7364 // ------------------------------ Reverse4 7365 7366 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 7367 HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) { 7368 const RebindToUnsigned<decltype(d)> du; 7369 using VU = VFromD<decltype(du)>; 7370 const VU vu = BitCast(du, v); // for float16_t 7371 // 4x 16-bit: a single shufflelo suffices. 7372 constexpr size_t kN = MaxLanes(d); 7373 if (kN <= 4) { 7374 return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); 7375 } 7376 7377 #if HWY_TARGET == HWY_SSE2 7378 return BitCast(d, VU{_mm_shufflehi_epi16( 7379 _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), 7380 _MM_SHUFFLE(0, 1, 2, 3))}); 7381 #else 7382 const RebindToSigned<decltype(d)> di; 7383 const VFromD<decltype(di)> shuffle = Dup128VecFromValues( 7384 di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908); 7385 return BitCast(d, TableLookupBytes(v, shuffle)); 7386 #endif 7387 } 7388 7389 // Generic for all vector lengths. 7390 template <class D, HWY_IF_T_SIZE_D(D, 4)> 7391 HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) { 7392 return Shuffle0123(v); 7393 } 7394 7395 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> 7396 HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) { 7397 HWY_ASSERT(0); // don't have 4 u64 lanes 7398 } 7399 7400 // ------------------------------ Reverse8 7401 7402 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 7403 HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) { 7404 #if HWY_TARGET == HWY_SSE2 7405 const RepartitionToWide<decltype(d)> dw; 7406 return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); 7407 #else 7408 const RebindToSigned<decltype(d)> di; 7409 const VFromD<decltype(di)> shuffle = Dup128VecFromValues( 7410 di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); 7411 return BitCast(d, TableLookupBytes(v, shuffle)); 7412 #endif 7413 } 7414 7415 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), 7416 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> 7417 HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) { 7418 HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit 7419 } 7420 7421 // ------------------------------ ReverseBits in x86_512 7422 7423 // ------------------------------ InterleaveUpper (UpperHalf) 7424 7425 // Full 7426 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 7427 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { 7428 return VFromD<D>{_mm_unpackhi_epi8(a.raw, b.raw)}; 7429 } 7430 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 7431 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { 7432 const DFromV<decltype(a)> d; 7433 const RebindToUnsigned<decltype(d)> du; 7434 using VU = VFromD<decltype(du)>; // for float16_t 7435 return BitCast( 7436 d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); 7437 } 7438 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> 7439 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { 7440 return VFromD<D>{_mm_unpackhi_epi32(a.raw, b.raw)}; 7441 } 7442 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)> 7443 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { 7444 return VFromD<D>{_mm_unpackhi_epi64(a.raw, b.raw)}; 7445 } 7446 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 7447 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { 7448 return VFromD<D>{_mm_unpackhi_ps(a.raw, b.raw)}; 7449 } 7450 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 7451 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { 7452 return VFromD<D>{_mm_unpackhi_pd(a.raw, b.raw)}; 7453 } 7454 7455 // Partial 7456 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 7457 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { 7458 const Half<decltype(d)> d2; 7459 return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw}, 7460 VFromD<D>{UpperHalf(d2, b).raw}); 7461 } 7462 7463 // -------------------------- I8/U8 Broadcast (InterleaveLower, InterleaveUpper) 7464 7465 template <int kLane, class T, size_t N, HWY_IF_T_SIZE(T, 1)> 7466 HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) { 7467 static_assert(0 <= kLane && kLane < N, "Invalid lane"); 7468 const DFromV<decltype(v)> d; 7469 7470 #if HWY_TARGET == HWY_SSE2 7471 const Full128<T> d_full; 7472 const Vec128<T> v_full{v.raw}; 7473 const auto v_interleaved = (kLane < 8) 7474 ? InterleaveLower(d_full, v_full, v_full) 7475 : InterleaveUpper(d_full, v_full, v_full); 7476 return ResizeBitCast( 7477 d, Broadcast<kLane & 7>(BitCast(Full128<uint16_t>(), v_interleaved))); 7478 #else 7479 return TableLookupBytes(v, Set(d, static_cast<T>(kLane))); 7480 #endif 7481 } 7482 7483 // ------------------------------ ZipLower/ZipUpper (InterleaveLower) 7484 7485 // Same as Interleave*, except that the return lanes are double-width integers; 7486 // this is necessary because the single-lane scalar cannot return two values. 7487 // Generic for all vector lengths. 7488 template <class V, class DW = RepartitionToWide<DFromV<V>>> 7489 HWY_API VFromD<DW> ZipLower(V a, V b) { 7490 return BitCast(DW(), InterleaveLower(a, b)); 7491 } 7492 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> 7493 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { 7494 return BitCast(dw, InterleaveLower(D(), a, b)); 7495 } 7496 7497 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> 7498 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { 7499 return BitCast(dw, InterleaveUpper(D(), a, b)); 7500 } 7501 7502 // ================================================== CONVERT (1) 7503 7504 // ------------------------------ PromoteTo unsigned (TableLookupBytesOr0) 7505 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> 7506 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { 7507 #if HWY_TARGET >= HWY_SSSE3 7508 const __m128i zero = _mm_setzero_si128(); 7509 return VFromD<D>{_mm_unpacklo_epi8(v.raw, zero)}; 7510 #else 7511 return VFromD<D>{_mm_cvtepu8_epi16(v.raw)}; 7512 #endif 7513 } 7514 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> 7515 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { 7516 #if HWY_TARGET >= HWY_SSSE3 7517 return VFromD<D>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())}; 7518 #else 7519 return VFromD<D>{_mm_cvtepu16_epi32(v.raw)}; 7520 #endif 7521 } 7522 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 7523 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 7524 #if HWY_TARGET >= HWY_SSSE3 7525 return VFromD<D>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())}; 7526 #else 7527 return VFromD<D>{_mm_cvtepu32_epi64(v.raw)}; 7528 #endif 7529 } 7530 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> 7531 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { 7532 #if HWY_TARGET >= HWY_SSSE3 7533 const __m128i zero = _mm_setzero_si128(); 7534 const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero); 7535 return VFromD<D>{_mm_unpacklo_epi16(u16, zero)}; 7536 #else 7537 return VFromD<D>{_mm_cvtepu8_epi32(v.raw)}; 7538 #endif 7539 } 7540 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 7541 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) { 7542 #if HWY_TARGET > HWY_SSSE3 7543 const Rebind<uint32_t, decltype(d)> du32; 7544 return PromoteTo(d, PromoteTo(du32, v)); 7545 #elif HWY_TARGET == HWY_SSSE3 7546 alignas(16) static constexpr int8_t kShuffle[16] = { 7547 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}; 7548 const Repartition<int8_t, decltype(d)> di8; 7549 return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); 7550 #else 7551 (void)d; 7552 return VFromD<D>{_mm_cvtepu8_epi64(v.raw)}; 7553 #endif 7554 } 7555 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 7556 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint16_t, D>> v) { 7557 #if HWY_TARGET > HWY_SSSE3 7558 const Rebind<uint32_t, decltype(d)> du32; 7559 return PromoteTo(d, PromoteTo(du32, v)); 7560 #elif HWY_TARGET == HWY_SSSE3 7561 alignas(16) static constexpr int8_t kShuffle[16] = { 7562 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1}; 7563 const Repartition<int8_t, decltype(d)> di8; 7564 return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); 7565 #else 7566 (void)d; 7567 return VFromD<D>{_mm_cvtepu16_epi64(v.raw)}; 7568 #endif 7569 } 7570 7571 // Unsigned to signed: same plus cast. 7572 template <class D, class V, HWY_IF_SIGNED_D(D), HWY_IF_UNSIGNED_V(V), 7573 HWY_IF_LANES_GT(sizeof(TFromD<D>), sizeof(TFromV<V>)), 7574 HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))> 7575 HWY_API VFromD<D> PromoteTo(D di, V v) { 7576 const RebindToUnsigned<decltype(di)> du; 7577 return BitCast(di, PromoteTo(du, v)); 7578 } 7579 7580 // ------------------------------ PromoteTo signed (ShiftRight, ZipLower) 7581 7582 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> 7583 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { 7584 #if HWY_TARGET >= HWY_SSSE3 7585 return ShiftRight<8>(VFromD<D>{_mm_unpacklo_epi8(v.raw, v.raw)}); 7586 #else 7587 return VFromD<D>{_mm_cvtepi8_epi16(v.raw)}; 7588 #endif 7589 } 7590 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> 7591 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 7592 #if HWY_TARGET >= HWY_SSSE3 7593 return ShiftRight<16>(VFromD<D>{_mm_unpacklo_epi16(v.raw, v.raw)}); 7594 #else 7595 return VFromD<D>{_mm_cvtepi16_epi32(v.raw)}; 7596 #endif 7597 } 7598 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> 7599 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 7600 #if HWY_TARGET >= HWY_SSSE3 7601 return ShiftRight<32>(VFromD<D>{_mm_unpacklo_epi32(v.raw, v.raw)}); 7602 #else 7603 return VFromD<D>{_mm_cvtepi32_epi64(v.raw)}; 7604 #endif 7605 } 7606 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> 7607 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { 7608 #if HWY_TARGET >= HWY_SSSE3 7609 const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw); 7610 const __m128i x4 = _mm_unpacklo_epi16(x2, x2); 7611 return ShiftRight<24>(VFromD<D>{x4}); 7612 #else 7613 return VFromD<D>{_mm_cvtepi8_epi32(v.raw)}; 7614 #endif 7615 } 7616 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> 7617 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int8_t, D>> v) { 7618 #if HWY_TARGET >= HWY_SSSE3 7619 const Repartition<int32_t, decltype(d)> di32; 7620 const Half<decltype(di32)> dh_i32; 7621 const VFromD<decltype(di32)> x4{PromoteTo(dh_i32, v).raw}; 7622 const VFromD<decltype(di32)> s4{ 7623 _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))}; 7624 return ZipLower(d, x4, s4); 7625 #else 7626 (void)d; 7627 return VFromD<D>{_mm_cvtepi8_epi64(v.raw)}; 7628 #endif 7629 } 7630 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> 7631 HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) { 7632 #if HWY_TARGET >= HWY_SSSE3 7633 const Repartition<int32_t, decltype(d)> di32; 7634 const Half<decltype(di32)> dh_i32; 7635 const VFromD<decltype(di32)> x2{PromoteTo(dh_i32, v).raw}; 7636 const VFromD<decltype(di32)> s2{ 7637 _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))}; 7638 return ZipLower(d, x2, s2); 7639 #else 7640 (void)d; 7641 return VFromD<D>{_mm_cvtepi16_epi64(v.raw)}; 7642 #endif 7643 } 7644 7645 // -------------------- PromoteTo float (ShiftLeft, IfNegativeThenElse) 7646 #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) 7647 7648 // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. 7649 #ifdef HWY_NATIVE_F16C 7650 #undef HWY_NATIVE_F16C 7651 #else 7652 #define HWY_NATIVE_F16C 7653 #endif 7654 7655 // Workaround for origin tracking bug in Clang msan prior to 11.0 7656 // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid") 7657 #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100) 7658 #define HWY_INLINE_F16 HWY_NOINLINE 7659 #else 7660 #define HWY_INLINE_F16 HWY_INLINE 7661 #endif 7662 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 7663 HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) { 7664 #if HWY_HAVE_FLOAT16 7665 const RebindToUnsigned<DFromV<decltype(v)>> du16; 7666 return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)}; 7667 #else 7668 return VFromD<D>{_mm_cvtph_ps(v.raw)}; 7669 #endif 7670 } 7671 7672 #endif // HWY_NATIVE_F16C 7673 7674 #if HWY_HAVE_FLOAT16 7675 7676 #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64 7677 #undef HWY_NATIVE_PROMOTE_F16_TO_F64 7678 #else 7679 #define HWY_NATIVE_PROMOTE_F16_TO_F64 7680 #endif 7681 7682 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 7683 HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) { 7684 return VFromD<D>{_mm_cvtph_pd(v.raw)}; 7685 } 7686 7687 #endif // HWY_HAVE_FLOAT16 7688 7689 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 7690 HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { 7691 const Rebind<uint16_t, decltype(df32)> du16; 7692 const RebindToSigned<decltype(df32)> di32; 7693 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); 7694 } 7695 7696 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 7697 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { 7698 return VFromD<D>{_mm_cvtps_pd(v.raw)}; 7699 } 7700 7701 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 7702 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 7703 return VFromD<D>{_mm_cvtepi32_pd(v.raw)}; 7704 } 7705 7706 #if HWY_TARGET <= HWY_AVX3 7707 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 7708 HWY_API VFromD<D> PromoteTo(D /*df64*/, VFromD<Rebind<uint32_t, D>> v) { 7709 return VFromD<D>{_mm_cvtepu32_pd(v.raw)}; 7710 } 7711 #else 7712 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 7713 template <class D, HWY_IF_F64_D(D)> 7714 HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) { 7715 const Rebind<int32_t, decltype(df64)> di32; 7716 const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v)); 7717 return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result, 7718 Set(df64, 4294967296.0), 7719 Zero(df64)); 7720 } 7721 #endif // HWY_TARGET <= HWY_AVX3 7722 7723 // ------------------------------ Per4LaneBlockShuffle 7724 namespace detail { 7725 7726 #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 7727 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 7728 #else 7729 #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 7730 #endif 7731 7732 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 7733 HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, 7734 const uint32_t x2, 7735 const uint32_t x1, 7736 const uint32_t x0) { 7737 return ResizeBitCast( 7738 d, Vec128<uint32_t>{_mm_set_epi32( 7739 static_cast<int32_t>(x3), static_cast<int32_t>(x2), 7740 static_cast<int32_t>(x1), static_cast<int32_t>(x0))}); 7741 } 7742 7743 template <size_t kIdx3210, class V> 7744 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 7745 hwy::SizeTag<2> /*lane_size_tag*/, 7746 hwy::SizeTag<8> /*vect_size_tag*/, V v) { 7747 const DFromV<decltype(v)> d; 7748 const RebindToUnsigned<decltype(d)> du; // for float16_t 7749 return BitCast(d, 7750 VFromD<decltype(du)>{_mm_shufflelo_epi16( 7751 BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))}); 7752 } 7753 7754 #if HWY_TARGET == HWY_SSE2 7755 template <size_t kIdx3210, class V> 7756 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 7757 hwy::SizeTag<2> /*lane_size_tag*/, 7758 hwy::SizeTag<16> /*vect_size_tag*/, V v) { 7759 const DFromV<decltype(v)> d; 7760 const RebindToUnsigned<decltype(d)> du; // for float16_t 7761 constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF); 7762 return BitCast( 7763 d, VFromD<decltype(du)>{_mm_shufflehi_epi16( 7764 _mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)}); 7765 } 7766 7767 template <size_t kIdx3210, size_t kVectSize, class V, 7768 hwy::EnableIf<(kVectSize == 4 || kVectSize == 8)>* = nullptr> 7769 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, 7770 hwy::SizeTag<1> /*lane_size_tag*/, 7771 hwy::SizeTag<kVectSize> /*vect_size_tag*/, 7772 V v) { 7773 const DFromV<decltype(v)> d; 7774 const RebindToUnsigned<decltype(d)> du; 7775 const Rebind<uint16_t, decltype(d)> du16; 7776 const RebindToSigned<decltype(du16)> di16; 7777 7778 const auto vu16 = PromoteTo(du16, BitCast(du, v)); 7779 const auto shuf16_result = Per4LaneBlockShuffle( 7780 idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<kVectSize * 2>(), vu16); 7781 return BitCast(d, DemoteTo(du, BitCast(di16, shuf16_result))); 7782 } 7783 7784 template <size_t kIdx3210, size_t kVectSize, class V> 7785 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, 7786 hwy::SizeTag<1> /*lane_size_tag*/, 7787 hwy::SizeTag<16> /*vect_size_tag*/, V v) { 7788 const DFromV<decltype(v)> d; 7789 const RebindToUnsigned<decltype(d)> du; 7790 const Repartition<uint16_t, decltype(d)> du16; 7791 const RebindToSigned<decltype(du16)> di16; 7792 7793 const auto zero = Zero(d); 7794 const auto v_lo16 = BitCast(du16, InterleaveLower(d, v, zero)); 7795 const auto v_hi16 = BitCast(du16, InterleaveUpper(d, v, zero)); 7796 7797 const auto lo_shuf_result = Per4LaneBlockShuffle( 7798 idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_lo16); 7799 const auto hi_shuf_result = Per4LaneBlockShuffle( 7800 idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_hi16); 7801 7802 return BitCast(d, OrderedDemote2To(du, BitCast(di16, lo_shuf_result), 7803 BitCast(di16, hi_shuf_result))); 7804 } 7805 #endif 7806 7807 template <size_t kIdx3210, class V, HWY_IF_NOT_FLOAT(TFromV<V>)> 7808 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 7809 hwy::SizeTag<4> /*lane_size_tag*/, 7810 hwy::SizeTag<16> /*vect_size_tag*/, V v) { 7811 return V{_mm_shuffle_epi32(v.raw, static_cast<int>(kIdx3210 & 0xFF))}; 7812 } 7813 7814 template <size_t kIdx3210, class V, HWY_IF_FLOAT(TFromV<V>)> 7815 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 7816 hwy::SizeTag<4> /*lane_size_tag*/, 7817 hwy::SizeTag<16> /*vect_size_tag*/, V v) { 7818 return V{_mm_shuffle_ps(v.raw, v.raw, static_cast<int>(kIdx3210 & 0xFF))}; 7819 } 7820 7821 } // namespace detail 7822 7823 // ------------------------------ SlideUpLanes 7824 7825 namespace detail { 7826 7827 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 7828 HWY_INLINE V SlideUpLanes(V v, size_t amt) { 7829 const DFromV<decltype(v)> d; 7830 const Full64<uint64_t> du64; 7831 const auto vu64 = ResizeBitCast(du64, v); 7832 return ResizeBitCast( 7833 d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8))); 7834 } 7835 7836 #if HWY_TARGET <= HWY_SSSE3 7837 template <class V, HWY_IF_V_SIZE_V(V, 16)> 7838 HWY_INLINE V SlideUpLanes(V v, size_t amt) { 7839 const DFromV<decltype(v)> d; 7840 const Repartition<uint8_t, decltype(d)> du8; 7841 const auto idx = 7842 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>))); 7843 return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); 7844 } 7845 #else 7846 template <class V, HWY_IF_V_SIZE_V(V, 16)> 7847 HWY_INLINE V SlideUpLanes(V v, size_t amt) { 7848 const DFromV<decltype(v)> d; 7849 const Repartition<int32_t, decltype(d)> di32; 7850 const Repartition<uint64_t, decltype(d)> du64; 7851 constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>); 7852 7853 const auto vu64 = BitCast(du64, v); 7854 const auto v_hi = IfVecThenElse( 7855 BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))), 7856 BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64); 7857 const auto v_lo = ShiftLeftBytes<8>(du64, v_hi); 7858 7859 const int shl_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63); 7860 return BitCast( 7861 d, Or(ShiftLeftSame(v_hi, shl_amt), ShiftRightSame(v_lo, 64 - shl_amt))); 7862 } 7863 #endif 7864 7865 } // namespace detail 7866 7867 template <class D, HWY_IF_LANES_D(D, 1)> 7868 HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { 7869 return v; 7870 } 7871 7872 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> 7873 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 7874 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 7875 if (__builtin_constant_p(amt)) { 7876 switch (amt) { 7877 case 0: 7878 return v; 7879 case 1: 7880 return ShiftLeftLanes<1>(d, v); 7881 } 7882 } 7883 #else 7884 (void)d; 7885 #endif 7886 7887 return detail::SlideUpLanes(v, amt); 7888 } 7889 7890 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> 7891 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 7892 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 7893 if (__builtin_constant_p(amt)) { 7894 switch (amt) { 7895 case 0: 7896 return v; 7897 case 1: 7898 return ShiftLeftLanes<1>(d, v); 7899 case 2: 7900 return ShiftLeftLanes<2>(d, v); 7901 case 3: 7902 return ShiftLeftLanes<3>(d, v); 7903 } 7904 } 7905 #else 7906 (void)d; 7907 #endif 7908 7909 return detail::SlideUpLanes(v, amt); 7910 } 7911 7912 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> 7913 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 7914 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 7915 if (__builtin_constant_p(amt)) { 7916 switch (amt) { 7917 case 0: 7918 return v; 7919 case 1: 7920 return ShiftLeftLanes<1>(d, v); 7921 case 2: 7922 return ShiftLeftLanes<2>(d, v); 7923 case 3: 7924 return ShiftLeftLanes<3>(d, v); 7925 case 4: 7926 return ShiftLeftLanes<4>(d, v); 7927 case 5: 7928 return ShiftLeftLanes<5>(d, v); 7929 case 6: 7930 return ShiftLeftLanes<6>(d, v); 7931 case 7: 7932 return ShiftLeftLanes<7>(d, v); 7933 } 7934 } 7935 #else 7936 (void)d; 7937 #endif 7938 7939 return detail::SlideUpLanes(v, amt); 7940 } 7941 7942 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> 7943 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 7944 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 7945 if (__builtin_constant_p(amt)) { 7946 switch (amt) { 7947 case 0: 7948 return v; 7949 case 1: 7950 return ShiftLeftLanes<1>(d, v); 7951 case 2: 7952 return ShiftLeftLanes<2>(d, v); 7953 case 3: 7954 return ShiftLeftLanes<3>(d, v); 7955 case 4: 7956 return ShiftLeftLanes<4>(d, v); 7957 case 5: 7958 return ShiftLeftLanes<5>(d, v); 7959 case 6: 7960 return ShiftLeftLanes<6>(d, v); 7961 case 7: 7962 return ShiftLeftLanes<7>(d, v); 7963 case 8: 7964 return ShiftLeftLanes<8>(d, v); 7965 case 9: 7966 return ShiftLeftLanes<9>(d, v); 7967 case 10: 7968 return ShiftLeftLanes<10>(d, v); 7969 case 11: 7970 return ShiftLeftLanes<11>(d, v); 7971 case 12: 7972 return ShiftLeftLanes<12>(d, v); 7973 case 13: 7974 return ShiftLeftLanes<13>(d, v); 7975 case 14: 7976 return ShiftLeftLanes<14>(d, v); 7977 case 15: 7978 return ShiftLeftLanes<15>(d, v); 7979 } 7980 } 7981 #else 7982 (void)d; 7983 #endif 7984 7985 return detail::SlideUpLanes(v, amt); 7986 } 7987 7988 // ------------------------------ SlideDownLanes 7989 7990 namespace detail { 7991 7992 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 7993 HWY_INLINE V SlideDownLanes(V v, size_t amt) { 7994 const DFromV<decltype(v)> d; 7995 const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv; 7996 return BitCast(d, 7997 ShiftRightSame(BitCast(dv, v), 7998 static_cast<int>(amt * sizeof(TFromV<V>) * 8))); 7999 } 8000 8001 #if HWY_TARGET <= HWY_SSSE3 8002 template <class V, HWY_IF_V_SIZE_V(V, 16)> 8003 HWY_INLINE V SlideDownLanes(V v, size_t amt) { 8004 const DFromV<decltype(v)> d; 8005 const Repartition<int8_t, decltype(d)> di8; 8006 auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>))); 8007 idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); 8008 return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); 8009 } 8010 #else 8011 template <class V, HWY_IF_V_SIZE_V(V, 16)> 8012 HWY_INLINE V SlideDownLanes(V v, size_t amt) { 8013 const DFromV<decltype(v)> d; 8014 const Repartition<int32_t, decltype(d)> di32; 8015 const Repartition<uint64_t, decltype(d)> du64; 8016 constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV<V>); 8017 8018 const auto vu64 = BitCast(du64, v); 8019 const auto v_lo = IfVecThenElse( 8020 BitCast(du64, Set(di32, -static_cast<int32_t>(amt >= kNumOfLanesPerU64))), 8021 BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64); 8022 const auto v_hi = ShiftRightBytes<8>(du64, v_lo); 8023 8024 const int shr_amt = static_cast<int>((amt * sizeof(TFromV<V>) * 8) & 63); 8025 return BitCast( 8026 d, Or(ShiftRightSame(v_lo, shr_amt), ShiftLeftSame(v_hi, 64 - shr_amt))); 8027 } 8028 #endif 8029 8030 } // namespace detail 8031 8032 template <class D, HWY_IF_LANES_D(D, 1)> 8033 HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { 8034 return v; 8035 } 8036 8037 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> 8038 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 8039 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 8040 if (__builtin_constant_p(amt)) { 8041 switch (amt) { 8042 case 0: 8043 return v; 8044 case 1: 8045 return ShiftRightLanes<1>(d, v); 8046 } 8047 } 8048 #else 8049 (void)d; 8050 #endif 8051 8052 return detail::SlideDownLanes(v, amt); 8053 } 8054 8055 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> 8056 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 8057 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 8058 if (__builtin_constant_p(amt)) { 8059 switch (amt) { 8060 case 0: 8061 return v; 8062 case 1: 8063 return ShiftRightLanes<1>(d, v); 8064 case 2: 8065 return ShiftRightLanes<2>(d, v); 8066 case 3: 8067 return ShiftRightLanes<3>(d, v); 8068 } 8069 } 8070 #else 8071 (void)d; 8072 #endif 8073 8074 return detail::SlideDownLanes(v, amt); 8075 } 8076 8077 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> 8078 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 8079 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 8080 if (__builtin_constant_p(amt)) { 8081 switch (amt) { 8082 case 0: 8083 return v; 8084 case 1: 8085 return ShiftRightLanes<1>(d, v); 8086 case 2: 8087 return ShiftRightLanes<2>(d, v); 8088 case 3: 8089 return ShiftRightLanes<3>(d, v); 8090 case 4: 8091 return ShiftRightLanes<4>(d, v); 8092 case 5: 8093 return ShiftRightLanes<5>(d, v); 8094 case 6: 8095 return ShiftRightLanes<6>(d, v); 8096 case 7: 8097 return ShiftRightLanes<7>(d, v); 8098 } 8099 } 8100 #else 8101 (void)d; 8102 #endif 8103 8104 return detail::SlideDownLanes(v, amt); 8105 } 8106 8107 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> 8108 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 8109 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 8110 if (__builtin_constant_p(amt)) { 8111 switch (amt) { 8112 case 0: 8113 return v; 8114 case 1: 8115 return ShiftRightLanes<1>(d, v); 8116 case 2: 8117 return ShiftRightLanes<2>(d, v); 8118 case 3: 8119 return ShiftRightLanes<3>(d, v); 8120 case 4: 8121 return ShiftRightLanes<4>(d, v); 8122 case 5: 8123 return ShiftRightLanes<5>(d, v); 8124 case 6: 8125 return ShiftRightLanes<6>(d, v); 8126 case 7: 8127 return ShiftRightLanes<7>(d, v); 8128 case 8: 8129 return ShiftRightLanes<8>(d, v); 8130 case 9: 8131 return ShiftRightLanes<9>(d, v); 8132 case 10: 8133 return ShiftRightLanes<10>(d, v); 8134 case 11: 8135 return ShiftRightLanes<11>(d, v); 8136 case 12: 8137 return ShiftRightLanes<12>(d, v); 8138 case 13: 8139 return ShiftRightLanes<13>(d, v); 8140 case 14: 8141 return ShiftRightLanes<14>(d, v); 8142 case 15: 8143 return ShiftRightLanes<15>(d, v); 8144 } 8145 } 8146 #else 8147 (void)d; 8148 #endif 8149 8150 return detail::SlideDownLanes(v, amt); 8151 } 8152 8153 // ================================================== MEMORY (4) 8154 8155 // ------------------------------ StoreN (ExtractLane) 8156 8157 #if HWY_TARGET <= HWY_AVX2 8158 8159 #ifdef HWY_NATIVE_STORE_N 8160 #undef HWY_NATIVE_STORE_N 8161 #else 8162 #define HWY_NATIVE_STORE_N 8163 #endif 8164 8165 template <class D, HWY_IF_T_SIZE_ONE_OF_D( 8166 D, (HWY_TARGET <= HWY_AVX3 ? ((1 << 1) | (1 << 2)) : 0) | 8167 (1 << 4) | (1 << 8))> 8168 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p, 8169 size_t max_lanes_to_store) { 8170 const size_t num_lanes_to_store = 8171 HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); 8172 8173 #if HWY_COMPILER_MSVC 8174 // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore 8175 HWY_FENCE; 8176 #endif 8177 8178 BlendedStore(v, FirstN(d, num_lanes_to_store), d, p); 8179 8180 #if HWY_COMPILER_MSVC 8181 // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore 8182 HWY_FENCE; 8183 #endif 8184 8185 detail::MaybeUnpoison(p, num_lanes_to_store); 8186 } 8187 8188 #if HWY_TARGET > HWY_AVX3 8189 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), 8190 HWY_IF_LANES_D(D, 1)> 8191 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p, 8192 size_t max_lanes_to_store) { 8193 if (max_lanes_to_store > 0) { 8194 StoreU(v, d, p); 8195 } 8196 } 8197 8198 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), 8199 HWY_IF_LANES_D(D, 2)> 8200 HWY_API void StoreN(VFromD<D> v, D /*d*/, TFromD<D>* HWY_RESTRICT p, 8201 size_t max_lanes_to_store) { 8202 if (max_lanes_to_store >= 1) { 8203 p[static_cast<size_t>(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v); 8204 p[0] = GetLane(v); 8205 } 8206 } 8207 8208 namespace detail { 8209 8210 template <class D, HWY_IF_T_SIZE_D(D, 1)> 8211 HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/, 8212 TFromD<D>* HWY_RESTRICT p, 8213 size_t num_lanes_to_store) { 8214 // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if 8215 // (num_lanes_to_store & 3) != 0 is true 8216 const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing); 8217 if ((num_lanes_to_store & 2) != 0) { 8218 const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128)); 8219 p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128); 8220 CopyBytes<sizeof(uint16_t)>(&u16_bits, 8221 p + (num_lanes_to_store & ~size_t{3})); 8222 } else { 8223 p[num_lanes_to_store - 1] = GetLane(v_full128); 8224 } 8225 } 8226 8227 template <class D, HWY_IF_T_SIZE_D(D, 2)> 8228 HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/, 8229 TFromD<D>* p, 8230 size_t num_lanes_to_store) { 8231 // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16 8232 // vector if (num_lanes_to_store & 1) == 1 is true 8233 p[num_lanes_to_store - 1] = GetLane(v_trailing); 8234 } 8235 8236 } // namespace detail 8237 8238 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), 8239 HWY_IF_LANES_GT_D(D, 2)> 8240 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* p, size_t max_lanes_to_store) { 8241 const size_t num_lanes_to_store = 8242 HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); 8243 8244 const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))> 8245 d_full; 8246 const RebindToUnsigned<decltype(d_full)> du_full; 8247 const Repartition<int32_t, decltype(d_full)> di32_full; 8248 8249 const auto i32_store_mask = BitCast( 8250 di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store))); 8251 const auto vi32 = ResizeBitCast(di32_full, v); 8252 8253 #if HWY_COMPILER_MSVC 8254 // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore 8255 HWY_FENCE; 8256 #endif 8257 8258 BlendedStore(vi32, MaskFromVec(i32_store_mask), di32_full, 8259 reinterpret_cast<int32_t*>(p)); 8260 8261 constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>); 8262 constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1; 8263 const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask); 8264 8265 if (trailing_n != 0) { 8266 const VFromD<D> v_trailing = ResizeBitCast( 8267 d, SlideDownLanes(di32_full, vi32, 8268 num_lanes_to_store / kNumOfLanesPerI32)); 8269 detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store); 8270 } 8271 8272 #if HWY_COMPILER_MSVC 8273 // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore 8274 HWY_FENCE; 8275 #endif 8276 8277 detail::MaybeUnpoison(p, num_lanes_to_store); 8278 } 8279 #endif // HWY_TARGET > HWY_AVX3 8280 #endif // HWY_TARGET <= HWY_AVX2 8281 8282 // ================================================== COMBINE 8283 8284 // ------------------------------ Combine (InterleaveLower) 8285 8286 // N = N/2 + N/2 (upper half undefined) 8287 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>> 8288 HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) { 8289 const Half<decltype(d)> dh; 8290 const RebindToUnsigned<decltype(dh)> duh; 8291 // Treat half-width input as one lane, and expand to two lanes. 8292 using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>; 8293 const VU lo{BitCast(duh, lo_half).raw}; 8294 const VU hi{BitCast(duh, hi_half).raw}; 8295 return BitCast(d, InterleaveLower(lo, hi)); 8296 } 8297 8298 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) 8299 8300 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)> 8301 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { 8302 const RebindToUnsigned<decltype(d)> du; 8303 const Half<decltype(du)> duh; 8304 return BitCast(d, VFromD<decltype(du)>{_mm_move_epi64(BitCast(duh, lo).raw)}); 8305 } 8306 8307 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)> 8308 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { 8309 const Half<D> dh; 8310 return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw}); 8311 } 8312 8313 #if HWY_HAVE_FLOAT16 8314 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> 8315 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { 8316 const RebindToUnsigned<decltype(d)> du; 8317 const Half<decltype(du)> duh; 8318 return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo))); 8319 } 8320 #endif 8321 8322 // Generic for all vector lengths. 8323 template <class D, HWY_X86_IF_EMULATED_D(D)> 8324 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { 8325 const RebindToUnsigned<decltype(d)> du; 8326 const Half<decltype(du)> duh; 8327 return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo))); 8328 } 8329 8330 // ------------------------------ Concat full (InterleaveLower) 8331 8332 // hiH,hiL loH,loL |-> hiL,loL (= lower halves) 8333 template <class D, HWY_IF_V_SIZE_D(D, 16)> 8334 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { 8335 const Repartition<uint64_t, decltype(d)> d64; 8336 return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); 8337 } 8338 8339 // hiH,hiL loH,loL |-> hiH,loH (= upper halves) 8340 template <class D, HWY_IF_V_SIZE_D(D, 16)> 8341 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { 8342 const Repartition<uint64_t, decltype(d)> d64; 8343 return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); 8344 } 8345 8346 // hiH,hiL loH,loL |-> hiL,loH (= inner halves) 8347 template <class D, HWY_IF_V_SIZE_D(D, 16)> 8348 HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { 8349 return CombineShiftRightBytes<8>(d, hi, lo); 8350 } 8351 8352 // hiH,hiL loH,loL |-> hiH,loL (= outer halves) 8353 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)> 8354 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { 8355 const Repartition<double, decltype(d)> dd; 8356 #if HWY_TARGET >= HWY_SSSE3 8357 return BitCast( 8358 d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw, 8359 _MM_SHUFFLE2(1, 0))}); 8360 #else 8361 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle. 8362 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw, 8363 BitCast(dd, lo).raw, 1)}); 8364 #endif 8365 } 8366 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 8367 HWY_API Vec128<float> ConcatUpperLower(D d, Vec128<float> hi, 8368 Vec128<float> lo) { 8369 #if HWY_TARGET >= HWY_SSSE3 8370 (void)d; 8371 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))}; 8372 #else 8373 // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 8374 const RepartitionToWide<decltype(d)> dd; 8375 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw, 8376 BitCast(dd, lo).raw, 1)}); 8377 #endif 8378 } 8379 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 8380 HWY_API Vec128<double> ConcatUpperLower(D /* tag */, Vec128<double> hi, 8381 Vec128<double> lo) { 8382 #if HWY_TARGET >= HWY_SSSE3 8383 return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))}; 8384 #else 8385 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 8386 return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)}; 8387 #endif 8388 } 8389 8390 // ------------------------------ Concat partial (Combine, LowerHalf) 8391 8392 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 8393 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { 8394 const Half<decltype(d)> d2; 8395 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); 8396 } 8397 8398 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 8399 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { 8400 const Half<decltype(d)> d2; 8401 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); 8402 } 8403 8404 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 8405 HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi, 8406 const VFromD<D> lo) { 8407 const Half<decltype(d)> d2; 8408 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); 8409 } 8410 8411 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 8412 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { 8413 const Half<decltype(d)> d2; 8414 return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); 8415 } 8416 8417 // ------------------------------ ConcatOdd 8418 8419 // 8-bit full 8420 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 8421 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 8422 const Repartition<uint16_t, decltype(d)> dw; 8423 // Right-shift 8 bits per u16 so we can pack. 8424 const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); 8425 const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); 8426 return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)}; 8427 } 8428 8429 // 8-bit x8 8430 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> 8431 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 8432 #if HWY_TARGET == HWY_SSE2 8433 const Repartition<uint16_t, decltype(d)> dw; 8434 // Right-shift 8 bits per u16 so we can pack. 8435 const Vec64<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); 8436 const Vec64<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); 8437 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), 8438 _MM_SHUFFLE(2, 0, 2, 0))}; 8439 #else 8440 const Repartition<uint32_t, decltype(d)> du32; 8441 // Don't care about upper half, no need to zero. 8442 alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7}; 8443 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8)); 8444 const VFromD<D> L = TableLookupBytes(lo, shuf); 8445 const VFromD<D> H = TableLookupBytes(hi, shuf); 8446 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); 8447 #endif 8448 } 8449 8450 // 8-bit x4 8451 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)> 8452 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 8453 #if HWY_TARGET == HWY_SSE2 8454 const Repartition<uint16_t, decltype(d)> dw; 8455 const Twice<decltype(dw)> dw_2; 8456 // Right-shift 8 bits per u16 so we can pack. 8457 const Vec32<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi)); 8458 const Vec32<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo)); 8459 const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL); 8460 return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)}; 8461 #else 8462 const Repartition<uint16_t, decltype(d)> du16; 8463 // Don't care about upper half, no need to zero. 8464 alignas(16) const uint8_t kCompactOddU8[4] = {1, 3}; 8465 const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8)); 8466 const VFromD<D> L = TableLookupBytes(lo, shuf); 8467 const VFromD<D> H = TableLookupBytes(hi, shuf); 8468 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); 8469 #endif 8470 } 8471 8472 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 8473 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 8474 // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns 8475 // 0xFFFF8000, which correctly saturates to 0x8000. 8476 const RebindToUnsigned<decltype(d)> du; 8477 const Repartition<int32_t, decltype(d)> dw; 8478 const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi)); 8479 const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo)); 8480 return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)}); 8481 } 8482 8483 // 16-bit x4 8484 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> 8485 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 8486 #if HWY_TARGET == HWY_SSE2 8487 // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns 8488 // 0xFFFF8000, which correctly saturates to 0x8000. 8489 const Repartition<int32_t, decltype(d)> dw; 8490 const Vec64<int32_t> uH = ShiftRight<16>(BitCast(dw, hi)); 8491 const Vec64<int32_t> uL = ShiftRight<16>(BitCast(dw, lo)); 8492 return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw), 8493 _MM_SHUFFLE(2, 0, 2, 0))}; 8494 #else 8495 const Repartition<uint32_t, decltype(d)> du32; 8496 // Don't care about upper half, no need to zero. 8497 alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7}; 8498 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16)); 8499 const VFromD<D> L = TableLookupBytes(lo, shuf); 8500 const VFromD<D> H = TableLookupBytes(hi, shuf); 8501 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); 8502 #endif 8503 } 8504 8505 // 32-bit full 8506 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 8507 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 8508 const RebindToFloat<decltype(d)> df; 8509 return BitCast( 8510 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, 8511 _MM_SHUFFLE(3, 1, 3, 1))}); 8512 } 8513 8514 // Any type x2 8515 template <class D, HWY_IF_LANES_D(D, 2)> 8516 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) { 8517 return InterleaveUpper(d, lo, hi); 8518 } 8519 8520 // ------------------------------ ConcatEven (InterleaveLower) 8521 8522 // 8-bit full 8523 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 8524 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 8525 const Repartition<uint16_t, decltype(d)> dw; 8526 // Isolate lower 8 bits per u16 so we can pack. 8527 const Vec128<uint16_t> mask = Set(dw, 0x00FF); 8528 const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask); 8529 const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask); 8530 return VFromD<D>{_mm_packus_epi16(uL.raw, uH.raw)}; 8531 } 8532 8533 // 8-bit x8 8534 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)> 8535 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 8536 #if HWY_TARGET == HWY_SSE2 8537 const Repartition<uint16_t, decltype(d)> dw; 8538 // Isolate lower 8 bits per u16 so we can pack. 8539 const Vec64<uint16_t> mask = Set(dw, 0x00FF); 8540 const Vec64<uint16_t> uH = And(BitCast(dw, hi), mask); 8541 const Vec64<uint16_t> uL = And(BitCast(dw, lo), mask); 8542 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), 8543 _MM_SHUFFLE(2, 0, 2, 0))}; 8544 #else 8545 const Repartition<uint32_t, decltype(d)> du32; 8546 // Don't care about upper half, no need to zero. 8547 alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6}; 8548 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8)); 8549 const VFromD<D> L = TableLookupBytes(lo, shuf); 8550 const VFromD<D> H = TableLookupBytes(hi, shuf); 8551 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); 8552 #endif 8553 } 8554 8555 // 8-bit x4 8556 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_D(D, 1)> 8557 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 8558 #if HWY_TARGET == HWY_SSE2 8559 const Repartition<uint16_t, decltype(d)> dw; 8560 const Twice<decltype(dw)> dw_2; 8561 // Isolate lower 8 bits per u16 so we can pack. 8562 const Vec32<uint16_t> mask = Set(dw, 0x00FF); 8563 const Vec32<uint16_t> uH = And(BitCast(dw, hi), mask); 8564 const Vec32<uint16_t> uL = And(BitCast(dw, lo), mask); 8565 const Vec64<uint16_t> uHL = Combine(dw_2, uH, uL); 8566 return VFromD<D>{_mm_packus_epi16(uHL.raw, uHL.raw)}; 8567 #else 8568 const Repartition<uint16_t, decltype(d)> du16; 8569 // Don't care about upper half, no need to zero. 8570 alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2}; 8571 const VFromD<D> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8)); 8572 const VFromD<D> L = TableLookupBytes(lo, shuf); 8573 const VFromD<D> H = TableLookupBytes(hi, shuf); 8574 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); 8575 #endif 8576 } 8577 8578 // 16-bit full 8579 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 8580 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 8581 #if HWY_TARGET <= HWY_SSE4 8582 // Isolate lower 16 bits per u32 so we can pack. 8583 const RebindToUnsigned<decltype(d)> du; // for float16_t 8584 const Repartition<uint32_t, decltype(d)> dw; 8585 const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF); 8586 const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask); 8587 const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask); 8588 return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)}); 8589 #elif HWY_TARGET == HWY_SSE2 8590 const Repartition<uint32_t, decltype(d)> dw; 8591 return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), 8592 BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); 8593 #else 8594 const RebindToUnsigned<decltype(d)> du; 8595 // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two 8596 // inputs, then concatenate them. 8597 alignas(16) 8598 const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; 8599 const VFromD<D> shuf = BitCast(d, Load(du, kCompactEvenU16)); 8600 const VFromD<D> L = TableLookupBytes(lo, shuf); 8601 const VFromD<D> H = TableLookupBytes(hi, shuf); 8602 return ConcatLowerLower(d, H, L); 8603 #endif 8604 } 8605 8606 // 16-bit x4 8607 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)> 8608 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 8609 #if HWY_TARGET == HWY_SSE2 8610 const Repartition<uint32_t, decltype(d)> dw; 8611 return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), 8612 BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); 8613 #else 8614 const Repartition<uint32_t, decltype(d)> du32; 8615 // Don't care about upper half, no need to zero. 8616 alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5}; 8617 const VFromD<D> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16)); 8618 const VFromD<D> L = TableLookupBytes(lo, shuf); 8619 const VFromD<D> H = TableLookupBytes(hi, shuf); 8620 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); 8621 #endif 8622 } 8623 8624 // 32-bit full 8625 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> 8626 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 8627 const RebindToFloat<decltype(d)> df; 8628 return BitCast( 8629 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, 8630 _MM_SHUFFLE(2, 0, 2, 0))}); 8631 } 8632 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 8633 HWY_API VFromD<D> ConcatEven(D /* d */, VFromD<D> hi, VFromD<D> lo) { 8634 return VFromD<D>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; 8635 } 8636 8637 // Any T x2 8638 template <class D, HWY_IF_LANES_D(D, 2)> 8639 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) { 8640 return InterleaveLower(d, lo, hi); 8641 } 8642 8643 // ------------------------------ DupEven (InterleaveLower) 8644 8645 template <typename T> 8646 HWY_API Vec128<T, 1> DupEven(const Vec128<T, 1> v) { 8647 return v; 8648 } 8649 8650 template <typename T> 8651 HWY_API Vec128<T, 2> DupEven(const Vec128<T, 2> v) { 8652 return InterleaveLower(DFromV<decltype(v)>(), v, v); 8653 } 8654 8655 template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 2)> 8656 HWY_API V DupEven(V v) { 8657 const DFromV<decltype(v)> d; 8658 8659 #if HWY_TARGET <= HWY_SSSE3 8660 const RebindToUnsigned<decltype(d)> du; 8661 const VFromD<decltype(du)> shuffle = Dup128VecFromValues( 8662 du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); 8663 return TableLookupBytes(v, BitCast(d, shuffle)); 8664 #else 8665 const Repartition<uint16_t, decltype(d)> du16; 8666 return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})), 8667 BitCast(d, ShiftLeft<8>(BitCast(du16, v))), v); 8668 #endif 8669 } 8670 8671 template <typename T, HWY_IF_T_SIZE(T, 2)> 8672 HWY_API Vec64<T> DupEven(const Vec64<T> v) { 8673 const DFromV<decltype(v)> d; 8674 const RebindToUnsigned<decltype(d)> du; // for float16_t 8675 return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16( 8676 BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))}); 8677 } 8678 8679 // Generic for all vector lengths. 8680 template <class V, HWY_IF_T_SIZE_V(V, 2)> 8681 HWY_API V DupEven(const V v) { 8682 const DFromV<decltype(v)> d; 8683 const RebindToUnsigned<decltype(d)> du; // for float16_t 8684 #if HWY_TARGET <= HWY_SSSE3 8685 const VFromD<decltype(du)> shuffle = Dup128VecFromValues( 8686 du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c); 8687 return TableLookupBytes(v, BitCast(d, shuffle)); 8688 #else 8689 return BitCast( 8690 d, VFromD<decltype(du)>{_mm_shufflehi_epi16( 8691 _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)), 8692 _MM_SHUFFLE(2, 2, 0, 0))}); 8693 #endif 8694 } 8695 8696 template <typename T, HWY_IF_UI32(T)> 8697 HWY_API Vec128<T> DupEven(Vec128<T> v) { 8698 return Vec128<T>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; 8699 } 8700 8701 HWY_API Vec128<float> DupEven(Vec128<float> v) { 8702 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; 8703 } 8704 8705 // ------------------------------ DupOdd (InterleaveUpper) 8706 8707 template <typename T, HWY_IF_T_SIZE(T, 1)> 8708 HWY_API Vec128<T, 1> DupOdd(Vec128<T, 1> v) { 8709 return v; 8710 } 8711 8712 template <typename V, HWY_IF_T_SIZE_V(V, 1), HWY_IF_V_SIZE_GT_V(V, 1)> 8713 HWY_API V DupOdd(V v) { 8714 const DFromV<decltype(v)> d; 8715 8716 #if HWY_TARGET <= HWY_SSSE3 8717 const RebindToUnsigned<decltype(d)> du; 8718 const VFromD<decltype(du)> shuffle = Dup128VecFromValues( 8719 du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); 8720 return TableLookupBytes(v, BitCast(d, shuffle)); 8721 #else 8722 const Repartition<uint16_t, decltype(d)> du16; 8723 return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})), 8724 BitCast(d, ShiftRight<8>(BitCast(du16, v))), v); 8725 #endif 8726 } 8727 8728 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)> 8729 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 8730 const DFromV<decltype(v)> d; 8731 const RebindToUnsigned<decltype(d)> du; // for float16_t 8732 return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16( 8733 BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))}); 8734 } 8735 8736 // Generic for all vector lengths. 8737 template <typename V, HWY_IF_T_SIZE_V(V, 2), HWY_IF_V_SIZE_GT_V(V, 8)> 8738 HWY_API V DupOdd(V v) { 8739 const DFromV<decltype(v)> d; 8740 const RebindToUnsigned<decltype(d)> du; // for float16_t 8741 #if HWY_TARGET <= HWY_SSSE3 8742 const VFromD<decltype(du)> shuffle = Dup128VecFromValues( 8743 du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e); 8744 return TableLookupBytes(v, BitCast(d, shuffle)); 8745 #else 8746 return BitCast( 8747 d, VFromD<decltype(du)>{_mm_shufflehi_epi16( 8748 _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)), 8749 _MM_SHUFFLE(3, 3, 1, 1))}); 8750 #endif 8751 } 8752 8753 template <typename T, size_t N, HWY_IF_UI32(T)> 8754 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 8755 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; 8756 } 8757 template <size_t N> 8758 HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) { 8759 return Vec128<float, N>{ 8760 _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; 8761 } 8762 8763 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 8764 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { 8765 return InterleaveUpper(DFromV<decltype(v)>(), v, v); 8766 } 8767 8768 // ------------------------------ TwoTablesLookupLanes (DupEven) 8769 8770 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> 8771 HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b, 8772 Indices128<T, N> idx) { 8773 const DFromV<decltype(a)> d; 8774 const Twice<decltype(d)> dt; 8775 // TableLookupLanes currently requires table and index vectors to be the same 8776 // size, though a half-length index vector would be sufficient here. 8777 #if HWY_IS_MSAN 8778 const Vec128<T, N> idx_vec{idx.raw}; 8779 const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw}; 8780 #else 8781 // We only keep LowerHalf of the result, which is valid in idx. 8782 const Indices128<T, N * 2> idx2{idx.raw}; 8783 #endif 8784 return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); 8785 } 8786 8787 template <typename T, HWY_IF_T_SIZE(T, 1)> 8788 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, 8789 Indices128<T> idx) { 8790 #if HWY_TARGET <= HWY_AVX3_DL 8791 return Vec128<T>{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)}; 8792 #else // AVX3 or below 8793 const DFromV<decltype(a)> d; 8794 const Vec128<T> idx_vec{idx.raw}; 8795 8796 #if HWY_TARGET <= HWY_SSE4 8797 const Repartition<uint16_t, decltype(d)> du16; 8798 const auto sel_hi_mask = 8799 MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); 8800 #else 8801 const RebindToSigned<decltype(d)> di; 8802 const auto sel_hi_mask = 8803 RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15})); 8804 #endif 8805 8806 const auto lo_lookup_result = TableLookupBytes(a, idx_vec); 8807 #if HWY_TARGET <= HWY_AVX3 8808 const Vec128<T> lookup_result{_mm_mask_shuffle_epi8( 8809 lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; 8810 return lookup_result; 8811 #else 8812 const auto hi_lookup_result = TableLookupBytes(b, idx_vec); 8813 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); 8814 #endif // HWY_TARGET <= HWY_AVX3 8815 #endif // HWY_TARGET <= HWY_AVX3_DL 8816 } 8817 8818 template <typename T, HWY_IF_T_SIZE(T, 2)> 8819 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, 8820 Indices128<T> idx) { 8821 #if HWY_TARGET <= HWY_AVX3 8822 return Vec128<T>{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)}; 8823 #elif HWY_TARGET == HWY_SSE2 8824 const DFromV<decltype(a)> d; 8825 const RebindToSigned<decltype(d)> di; 8826 const Vec128<T> idx_vec{idx.raw}; 8827 const auto sel_hi_mask = 8828 RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7})); 8829 const auto lo_lookup_result = TableLookupLanes(a, idx); 8830 const auto hi_lookup_result = TableLookupLanes(b, idx); 8831 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); 8832 #else 8833 const DFromV<decltype(a)> d; 8834 const Repartition<uint8_t, decltype(d)> du8; 8835 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), 8836 Indices128<uint8_t>{idx.raw})); 8837 #endif 8838 } 8839 8840 template <typename T, HWY_IF_UI32(T)> 8841 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, 8842 Indices128<T> idx) { 8843 #if HWY_TARGET <= HWY_AVX3 8844 return Vec128<T>{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)}; 8845 #else // AVX2 or below 8846 const DFromV<decltype(a)> d; 8847 8848 #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 8849 const Vec128<T> idx_vec{idx.raw}; 8850 8851 #if HWY_TARGET <= HWY_AVX2 8852 const RebindToFloat<decltype(d)> d_sel; 8853 const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec))); 8854 #else 8855 const RebindToSigned<decltype(d)> d_sel; 8856 const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3}); 8857 #endif 8858 8859 const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx)); 8860 const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx)); 8861 return BitCast(d, 8862 IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); 8863 #else // SSSE3 or SSE4 8864 const Repartition<uint8_t, decltype(d)> du8; 8865 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), 8866 Indices128<uint8_t>{idx.raw})); 8867 #endif // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 8868 #endif // HWY_TARGET <= HWY_AVX3 8869 } 8870 8871 #if HWY_HAVE_FLOAT16 8872 HWY_API Vec128<float16_t> TwoTablesLookupLanes(Vec128<float16_t> a, 8873 Vec128<float16_t> b, 8874 Indices128<float16_t> idx) { 8875 return Vec128<float16_t>{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)}; 8876 } 8877 #endif // HWY_HAVE_FLOAT16 8878 HWY_API Vec128<float> TwoTablesLookupLanes(Vec128<float> a, Vec128<float> b, 8879 Indices128<float> idx) { 8880 #if HWY_TARGET <= HWY_AVX3 8881 return Vec128<float>{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)}; 8882 #elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 8883 const DFromV<decltype(a)> d; 8884 8885 #if HWY_TARGET <= HWY_AVX2 8886 const auto sel_hi_mask = 8887 MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128<int32_t>{idx.raw}))); 8888 #else 8889 const RebindToSigned<decltype(d)> di; 8890 const auto sel_hi_mask = 8891 RebindMask(d, Vec128<int32_t>{idx.raw} > Set(di, int32_t{3})); 8892 #endif 8893 8894 const auto lo_lookup_result = TableLookupLanes(a, idx); 8895 const auto hi_lookup_result = TableLookupLanes(b, idx); 8896 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); 8897 #else // SSSE3 or SSE4 8898 const DFromV<decltype(a)> d; 8899 const Repartition<uint8_t, decltype(d)> du8; 8900 return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), 8901 Indices128<uint8_t>{idx.raw})); 8902 #endif 8903 } 8904 8905 template <typename T, HWY_IF_UI64(T)> 8906 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, 8907 Indices128<T> idx) { 8908 #if HWY_TARGET <= HWY_AVX3 8909 return Vec128<T>{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)}; 8910 #else 8911 const DFromV<decltype(a)> d; 8912 const Vec128<T> idx_vec{idx.raw}; 8913 const Indices128<T> idx_mod{And(idx_vec, Set(d, T{1})).raw}; 8914 8915 #if HWY_TARGET <= HWY_SSE4 8916 const RebindToFloat<decltype(d)> d_sel; 8917 const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec))); 8918 #else // SSE2 or SSSE3 8919 const Repartition<int32_t, decltype(d)> di32; 8920 const RebindToSigned<decltype(d)> d_sel; 8921 const auto sel_hi_mask = MaskFromVec( 8922 BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > 8923 Set(di32, int32_t{1})))); 8924 #endif // HWY_TARGET <= HWY_SSE4 8925 8926 const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod)); 8927 const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod)); 8928 return BitCast(d, 8929 IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); 8930 #endif // HWY_TARGET <= HWY_AVX3 8931 } 8932 8933 HWY_API Vec128<double> TwoTablesLookupLanes(Vec128<double> a, Vec128<double> b, 8934 Indices128<double> idx) { 8935 #if HWY_TARGET <= HWY_AVX3 8936 return Vec128<double>{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)}; 8937 #else 8938 const DFromV<decltype(a)> d; 8939 const RebindToSigned<decltype(d)> di; 8940 const Vec128<int64_t> idx_vec{idx.raw}; 8941 const Indices128<double> idx_mod{And(idx_vec, Set(di, int64_t{1})).raw}; 8942 8943 #if HWY_TARGET <= HWY_SSE4 8944 const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec))); 8945 #else // SSE2 or SSSE3 8946 const Repartition<int32_t, decltype(d)> di32; 8947 const auto sel_hi_mask = 8948 MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > 8949 Set(di32, int32_t{1})))); 8950 #endif // HWY_TARGET <= HWY_SSE4 8951 8952 const auto lo_lookup_result = TableLookupLanes(a, idx_mod); 8953 const auto hi_lookup_result = TableLookupLanes(b, idx_mod); 8954 return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); 8955 #endif // HWY_TARGET <= HWY_AVX3 8956 } 8957 8958 // ------------------------------ OddEven (IfThenElse) 8959 8960 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 8961 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 8962 const DFromV<decltype(a)> d; 8963 const Repartition<uint8_t, decltype(d)> d8; 8964 alignas(16) static constexpr uint8_t mask[16] = { 8965 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; 8966 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); 8967 } 8968 8969 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 8970 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 8971 const DFromV<decltype(a)> d; 8972 #if HWY_TARGET >= HWY_SSSE3 8973 const Repartition<uint8_t, decltype(d)> d8; 8974 alignas(16) static constexpr uint8_t mask[16] = { 8975 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; 8976 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); 8977 #else 8978 const RebindToUnsigned<decltype(d)> du; // for float16_t 8979 return BitCast(d, VFromD<decltype(du)>{_mm_blend_epi16( 8980 BitCast(du, a).raw, BitCast(du, b).raw, 0x55)}); 8981 #endif 8982 } 8983 8984 template <typename T, size_t N, HWY_IF_UI32(T)> 8985 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 8986 #if HWY_TARGET >= HWY_SSSE3 8987 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1)); 8988 const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0)); 8989 return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)}; 8990 #else 8991 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle. 8992 const DFromV<decltype(a)> d; 8993 const RebindToFloat<decltype(d)> df; 8994 return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw, 8995 BitCast(df, b).raw, 5)}); 8996 #endif 8997 } 8998 8999 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 9000 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 9001 // Same as ConcatUpperLower for full vectors; do not call that because this 9002 // is more efficient for 64x1 vectors. 9003 const DFromV<decltype(a)> d; 9004 const RebindToFloat<decltype(d)> dd; 9005 #if HWY_TARGET >= HWY_SSSE3 9006 return BitCast( 9007 d, Vec128<double, N>{_mm_shuffle_pd( 9008 BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))}); 9009 #else 9010 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 9011 return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw, 9012 BitCast(dd, b).raw, 1)}); 9013 #endif 9014 } 9015 9016 template <size_t N> 9017 HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) { 9018 #if HWY_TARGET >= HWY_SSSE3 9019 // SHUFPS must fill the lower half of the output from one input, so we 9020 // need another shuffle. Unpack avoids another immediate byte. 9021 const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1)); 9022 const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0)); 9023 return Vec128<float, N>{_mm_unpacklo_ps(even, odd)}; 9024 #else 9025 return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)}; 9026 #endif 9027 } 9028 9029 // -------------------------- InterleaveEven 9030 9031 template <class D, HWY_IF_LANES_LE_D(D, 2)> 9032 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { 9033 return ConcatEven(d, b, a); 9034 } 9035 9036 // I8/U8 InterleaveEven is generic for all vector lengths that are >= 4 bytes 9037 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)> 9038 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { 9039 const Repartition<uint16_t, decltype(d)> du16; 9040 return OddEven(BitCast(d, ShiftLeft<8>(BitCast(du16, b))), a); 9041 } 9042 9043 // I16/U16 InterleaveEven is generic for all vector lengths that are >= 8 bytes 9044 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)> 9045 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { 9046 const Repartition<uint32_t, decltype(d)> du32; 9047 return OddEven(BitCast(d, ShiftLeft<16>(BitCast(du32, b))), a); 9048 } 9049 9050 #if HWY_TARGET <= HWY_AVX3 9051 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)> 9052 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 9053 return VFromD<D>{_mm_mask_shuffle_epi32( 9054 a.raw, static_cast<__mmask8>(0x0A), b.raw, 9055 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))}; 9056 } 9057 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)> 9058 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 9059 return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A), 9060 b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))}; 9061 } 9062 #else 9063 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)> 9064 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { 9065 const RebindToFloat<decltype(d)> df; 9066 const auto b2_b0_a2_a0 = ConcatEven(df, BitCast(df, b), BitCast(df, a)); 9067 return BitCast( 9068 d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, 9069 _MM_SHUFFLE(3, 1, 2, 0))}); 9070 } 9071 #endif 9072 9073 // -------------------------- InterleaveOdd 9074 9075 template <class D, HWY_IF_LANES_LE_D(D, 2)> 9076 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 9077 return ConcatOdd(d, b, a); 9078 } 9079 9080 // I8/U8 InterleaveOdd is generic for all vector lengths that are >= 4 bytes 9081 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)> 9082 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 9083 const Repartition<uint16_t, decltype(d)> du16; 9084 return OddEven(b, BitCast(d, ShiftRight<8>(BitCast(du16, a)))); 9085 } 9086 9087 // I16/U16 InterleaveOdd is generic for all vector lengths that are >= 8 bytes 9088 template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)> 9089 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 9090 const Repartition<uint32_t, decltype(d)> du32; 9091 return OddEven(b, BitCast(d, ShiftRight<16>(BitCast(du32, a)))); 9092 } 9093 9094 #if HWY_TARGET <= HWY_AVX3 9095 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)> 9096 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 9097 return VFromD<D>{_mm_mask_shuffle_epi32( 9098 b.raw, static_cast<__mmask8>(0x05), a.raw, 9099 static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))}; 9100 } 9101 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)> 9102 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 9103 return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05), 9104 a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))}; 9105 } 9106 #else 9107 template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)> 9108 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 9109 const RebindToFloat<decltype(d)> df; 9110 const auto b3_b1_a3_a1 = ConcatOdd(df, BitCast(df, b), BitCast(df, a)); 9111 return BitCast( 9112 d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw, 9113 _MM_SHUFFLE(3, 1, 2, 0))}); 9114 } 9115 #endif 9116 9117 // ------------------------------ OddEvenBlocks 9118 template <typename T, size_t N> 9119 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { 9120 return even; 9121 } 9122 9123 // ------------------------------ SwapAdjacentBlocks 9124 template <typename T, size_t N> 9125 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { 9126 return v; 9127 } 9128 9129 // ------------------------------ InterleaveEvenBlocks 9130 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> 9131 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { 9132 return a; 9133 } 9134 // ------------------------------ InterleaveOddBlocks 9135 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> 9136 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { 9137 return a; 9138 } 9139 9140 // ------------------------------ Shl (ZipLower, Mul) 9141 9142 // Use AVX2/3 variable shifts where available, otherwise multiply by powers of 9143 // two from loading float exponents, which is considerably faster (according 9144 // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v. 9145 9146 namespace detail { 9147 9148 #if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly 9149 template <class V> 9150 HWY_API V AVX2ShlU16Vec128(V v, V bits) { 9151 const DFromV<decltype(v)> d; 9152 const Rebind<uint32_t, decltype(d)> du32; 9153 return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); 9154 } 9155 #elif HWY_TARGET > HWY_AVX2 9156 9157 template <class D32> 9158 static HWY_INLINE VFromD<D32> Pow2ConvF32ToI32( 9159 D32 d32, VFromD<RebindToFloat<D32>> vf32) { 9160 const RebindToSigned<decltype(d32)> di32; 9161 #if HWY_COMPILER_GCC_ACTUAL 9162 // ConvertInRangeTo is safe with GCC due the inline assembly workaround used 9163 // for F32->I32 ConvertInRangeTo with GCC 9164 return BitCast(d32, ConvertInRangeTo(di32, vf32)); 9165 #else 9166 // Otherwise, use NearestIntInRange because we rely on the native 0x80..00 9167 // overflow behavior 9168 return BitCast(d32, NearestIntInRange(di32, vf32)); 9169 #endif 9170 } 9171 9172 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. 9173 template <typename T, HWY_IF_T_SIZE(T, 2)> 9174 HWY_INLINE Vec128<MakeUnsigned<T>> Pow2(const Vec128<T> v) { 9175 const DFromV<decltype(v)> d; 9176 const RebindToUnsigned<decltype(d)> du; 9177 const RepartitionToWide<decltype(d)> dw; 9178 const Rebind<float, decltype(dw)> df; 9179 const auto zero = Zero(d); 9180 // Move into exponent (this u16 will become the upper half of an f32) 9181 const auto exp = ShiftLeft<23 - 16>(v); 9182 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f 9183 // Insert 0 into lower halves for reinterpreting as binary32. 9184 const auto f0 = ZipLower(dw, zero, upper); 9185 const auto f1 = ZipUpper(dw, zero, upper); 9186 // See cvtps comment below. 9187 const VFromD<decltype(dw)> bits0 = Pow2ConvF32ToI32(dw, BitCast(df, f0)); 9188 const VFromD<decltype(dw)> bits1 = Pow2ConvF32ToI32(dw, BitCast(df, f1)); 9189 #if HWY_TARGET <= HWY_SSE4 9190 return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits1.raw)}; 9191 #else 9192 return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0)); 9193 #endif 9194 } 9195 9196 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_LANES_LE(N, 4)> 9197 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) { 9198 const DFromV<decltype(v)> d; 9199 const RebindToUnsigned<decltype(d)> du; 9200 const Twice<decltype(du)> dt_u; 9201 const RepartitionToWide<decltype(dt_u)> dt_w; 9202 const RebindToFloat<decltype(dt_w)> dt_f; 9203 // Move into exponent (this u16 will become the upper half of an f32) 9204 const auto exp = ShiftLeft<23 - 16>(v); 9205 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f 9206 // Insert 0 into lower halves for reinterpreting as binary32. 9207 const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper)); 9208 // See cvtps comment below. 9209 const VFromD<decltype(dt_w)> bits0 = 9210 Pow2ConvF32ToI32(dt_w, BitCast(dt_f, f0)); 9211 #if HWY_TARGET <= HWY_SSE4 9212 return VFromD<decltype(du)>{_mm_packus_epi32(bits0.raw, bits0.raw)}; 9213 #elif HWY_TARGET == HWY_SSSE3 9214 alignas(16) 9215 const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; 9216 return TableLookupBytes(bits0, Load(du, kCompactEvenU16)); 9217 #else 9218 const RebindToSigned<decltype(dt_w)> dt_i32; 9219 const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0))); 9220 return VFromD<decltype(du)>{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)}; 9221 #endif 9222 } 9223 9224 // Same, for 32-bit shifts. 9225 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 9226 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) { 9227 const DFromV<decltype(v)> d; 9228 const RebindToFloat<decltype(d)> df; 9229 const auto exp = ShiftLeft<23>(v); 9230 const auto f = exp + Set(d, 0x3F800000); // 1.0f 9231 // Do not use ConvertTo because we rely on the native 0x80..00 overflow 9232 // behavior. 9233 return Pow2ConvF32ToI32(d, BitCast(df, f)); 9234 } 9235 9236 #endif // HWY_TARGET > HWY_AVX2 9237 9238 template <size_t N> 9239 HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v, 9240 Vec128<uint16_t, N> bits) { 9241 #if HWY_TARGET <= HWY_AVX3 9242 return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)}; 9243 #elif HWY_TARGET == HWY_AVX2 9244 return AVX2ShlU16Vec128(v, bits); 9245 #else 9246 return v * Pow2(bits); 9247 #endif 9248 } 9249 9250 #if HWY_TARGET > HWY_AVX3 9251 HWY_API Vec16<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec16<uint16_t> v, 9252 Vec16<uint16_t> bits) { 9253 #if HWY_TARGET <= HWY_SSE4 9254 const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)}; 9255 #else 9256 const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)}); 9257 #endif 9258 return Vec16<uint16_t>{_mm_sll_epi16(v.raw, bits16.raw)}; 9259 } 9260 #endif 9261 9262 #if HWY_TARGET <= HWY_AVX3 9263 template <class V> 9264 HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { 9265 const DFromV<decltype(v)> d; 9266 const Rebind<uint16_t, decltype(d)> du16; 9267 return TruncateTo(d, PromoteTo(du16, v) << PromoteTo(du16, bits)); 9268 } 9269 #elif HWY_TARGET <= HWY_AVX2 9270 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 9271 HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { 9272 const DFromV<decltype(v)> d; 9273 const Rebind<uint32_t, decltype(d)> du32; 9274 return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); 9275 } 9276 template <class V, HWY_IF_V_SIZE_V(V, 16)> 9277 HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { 9278 const DFromV<decltype(v)> d; 9279 const Half<decltype(d)> dh; 9280 const Rebind<uint16_t, decltype(d)> du16; 9281 const Rebind<uint32_t, decltype(dh)> dh_u32; 9282 9283 const VFromD<decltype(dh_u32)> lo_shl_result = 9284 PromoteTo(dh_u32, LowerHalf(dh, v)) 9285 << PromoteTo(dh_u32, LowerHalf(dh, bits)); 9286 const VFromD<decltype(dh_u32)> hi_shl_result = 9287 PromoteTo(dh_u32, UpperHalf(dh, v)) 9288 << PromoteTo(dh_u32, UpperHalf(dh, bits)); 9289 const VFromD<decltype(du16)> u16_shl_result = ConcatEven( 9290 du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result)); 9291 return TruncateTo(d, u16_shl_result); 9292 } 9293 #endif // HWY_TARGET <= HWY_AVX3 9294 9295 // 8-bit: may use the Shl overload for uint16_t. 9296 template <size_t N> 9297 HWY_API Vec128<uint8_t, N> Shl(hwy::UnsignedTag tag, Vec128<uint8_t, N> v, 9298 Vec128<uint8_t, N> bits) { 9299 const DFromV<decltype(v)> d; 9300 #if HWY_TARGET <= HWY_AVX3_DL 9301 (void)tag; 9302 // kMask[i] = 0xFF >> i 9303 alignas(16) static constexpr uint8_t kMasks[16] = { 9304 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; 9305 // kShl[i] = 1 << i 9306 alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10, 9307 0x20, 0x40, 0x80, 0x00}; 9308 v = And(v, TableLookupBytes(Load(Full64<uint8_t>(), kMasks), bits)); 9309 const VFromD<decltype(d)> mul = 9310 TableLookupBytes(Load(Full64<uint8_t>(), kShl), bits); 9311 return VFromD<decltype(d)>{_mm_gf2p8mul_epi8(v.raw, mul.raw)}; 9312 #elif HWY_TARGET <= HWY_AVX2 9313 (void)tag; 9314 (void)d; 9315 return AVX2ShlU8Vec128(v, bits); 9316 #else 9317 const Repartition<uint16_t, decltype(d)> dw; 9318 using VW = VFromD<decltype(dw)>; 9319 const VW even_mask = Set(dw, 0x00FF); 9320 const VW odd_mask = Set(dw, 0xFF00); 9321 const VW vw = BitCast(dw, v); 9322 const VW bits16 = BitCast(dw, bits); 9323 // Shift even lanes in-place 9324 const VW evens = Shl(tag, vw, And(bits16, even_mask)); 9325 const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16)); 9326 return OddEven(BitCast(d, odds), BitCast(d, evens)); 9327 #endif 9328 } 9329 HWY_API Vec128<uint8_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint8_t, 1> v, 9330 Vec128<uint8_t, 1> bits) { 9331 #if HWY_TARGET <= HWY_SSE4 9332 const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)}; 9333 #else 9334 const Vec16<uint16_t> bits8 = 9335 And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)}); 9336 #endif 9337 return Vec128<uint8_t, 1>{_mm_sll_epi16(v.raw, bits8.raw)}; 9338 } 9339 9340 template <size_t N> 9341 HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v, 9342 Vec128<uint32_t, N> bits) { 9343 #if HWY_TARGET >= HWY_SSE4 9344 return v * Pow2(bits); 9345 #else 9346 return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)}; 9347 #endif 9348 } 9349 9350 #if HWY_TARGET >= HWY_SSE4 9351 HWY_API Vec32<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec32<uint32_t> v, 9352 const Vec32<uint32_t> bits) { 9353 #if HWY_TARGET == HWY_SSE4 9354 const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)}; 9355 #else 9356 const auto bits32 = 9357 Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits); 9358 #endif 9359 return Vec32<uint32_t>{_mm_sll_epi32(v.raw, bits32.raw)}; 9360 } 9361 #endif 9362 9363 HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v, 9364 Vec128<uint64_t> bits) { 9365 #if HWY_TARGET >= HWY_SSE4 9366 const DFromV<decltype(v)> d; 9367 // Individual shifts and combine 9368 const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)}; 9369 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); 9370 const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)}; 9371 return ConcatUpperLower(d, out1, out0); 9372 #else 9373 return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)}; 9374 #endif 9375 } 9376 HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v, 9377 Vec64<uint64_t> bits) { 9378 return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)}; 9379 } 9380 9381 // Signed left shift is the same as unsigned. 9382 template <typename T, size_t N> 9383 HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v, 9384 Vec128<T, N> bits) { 9385 const DFromV<decltype(v)> di; 9386 const RebindToUnsigned<decltype(di)> du; 9387 return BitCast(di, 9388 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); 9389 } 9390 9391 } // namespace detail 9392 9393 template <typename T, size_t N> 9394 HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) { 9395 return detail::Shl(hwy::TypeTag<T>(), v, bits); 9396 } 9397 9398 // ------------------------------ Shr (mul, mask, BroadcastSignBit) 9399 9400 // Use AVX2+ variable shifts except for SSSE3/SSE4. There, we use 9401 // widening multiplication by powers of two obtained by loading float exponents, 9402 // followed by a constant right-shift. This is still faster than a scalar or 9403 // bit-test approach: https://gcc.godbolt.org/z/9G7Y9v. 9404 9405 #if HWY_TARGET <= HWY_AVX2 9406 namespace detail { 9407 9408 #if HWY_TARGET <= HWY_AVX3 9409 template <class V> 9410 HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { 9411 const DFromV<decltype(v)> d; 9412 const Rebind<uint16_t, decltype(d)> du16; 9413 const RebindToSigned<decltype(du16)> di16; 9414 return DemoteTo(d, 9415 BitCast(di16, PromoteTo(du16, v) >> PromoteTo(du16, bits))); 9416 } 9417 #else // AVX2 9418 template <class V> 9419 HWY_INLINE V AVX2ShrU16Vec128(V v, V bits) { 9420 const DFromV<decltype(v)> d; 9421 const Rebind<uint32_t, decltype(d)> du32; 9422 const RebindToSigned<decltype(du32)> di32; 9423 return DemoteTo(d, 9424 BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); 9425 } 9426 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 9427 HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { 9428 const DFromV<decltype(v)> d; 9429 const Rebind<uint32_t, decltype(d)> du32; 9430 const RebindToSigned<decltype(du32)> di32; 9431 return DemoteTo(d, 9432 BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); 9433 } 9434 template <class V, HWY_IF_V_SIZE_V(V, 16)> 9435 HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { 9436 const DFromV<decltype(v)> d; 9437 const Half<decltype(d)> dh; 9438 const Rebind<int16_t, decltype(d)> di16; 9439 const Rebind<uint16_t, decltype(d)> du16; 9440 const Rebind<int32_t, decltype(dh)> dh_i32; 9441 const Rebind<uint32_t, decltype(dh)> dh_u32; 9442 9443 const auto lo_shr_result = 9444 BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >> 9445 PromoteTo(dh_u32, LowerHalf(dh, bits))); 9446 const auto hi_shr_result = 9447 BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >> 9448 PromoteTo(dh_u32, UpperHalf(dh, bits))); 9449 const auto i16_shr_result = 9450 BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result)); 9451 return DemoteTo(d, i16_shr_result); 9452 } 9453 #endif // HWY_TARGET <= HWY_AVX3 9454 9455 } // namespace detail 9456 #endif // HWY_TARGET <= HWY_AVX2 9457 9458 template <size_t N> 9459 HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> in, 9460 const Vec128<uint16_t, N> bits) { 9461 #if HWY_TARGET <= HWY_AVX3 9462 return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)}; 9463 #elif HWY_TARGET <= HWY_AVX2 9464 return detail::AVX2ShrU16Vec128(in, bits); 9465 #else 9466 const DFromV<decltype(in)> d; 9467 // For bits=0, we cannot mul by 2^16, so fix the result later. 9468 const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits)); 9469 // Replace output with input where bits == 0. 9470 return IfThenElse(bits == Zero(d), in, out); 9471 #endif 9472 } 9473 9474 #if HWY_TARGET > HWY_AVX3 9475 HWY_API Vec16<uint16_t> operator>>(const Vec16<uint16_t> in, 9476 const Vec16<uint16_t> bits) { 9477 #if HWY_TARGET <= HWY_SSE4 9478 const Vec16<uint16_t> bits16{_mm_cvtepu16_epi64(bits.raw)}; 9479 #else 9480 const auto bits16 = And(bits, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFFFF)}); 9481 #endif 9482 return Vec16<uint16_t>{_mm_srl_epi16(in.raw, bits16.raw)}; 9483 } 9484 #endif 9485 9486 // 8-bit uses 16-bit shifts. 9487 template <size_t N> 9488 HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> in, 9489 const Vec128<uint8_t, N> bits) { 9490 #if HWY_TARGET <= HWY_AVX2 9491 return detail::AVX2ShrU8Vec128(in, bits); 9492 #else 9493 const DFromV<decltype(in)> d; 9494 const Repartition<uint16_t, decltype(d)> dw; 9495 using VW = VFromD<decltype(dw)>; 9496 const VW mask = Set(dw, 0x00FF); 9497 const VW vw = BitCast(dw, in); 9498 const VW bits16 = BitCast(dw, bits); 9499 const VW evens = And(vw, mask) >> And(bits16, mask); 9500 // Shift odd lanes in-place 9501 const VW odds = vw >> ShiftRight<8>(bits16); 9502 return OddEven(BitCast(d, odds), BitCast(d, evens)); 9503 #endif 9504 } 9505 HWY_API Vec128<uint8_t, 1> operator>>(const Vec128<uint8_t, 1> in, 9506 const Vec128<uint8_t, 1> bits) { 9507 #if HWY_TARGET <= HWY_SSE4 9508 const Vec16<uint16_t> in8{_mm_cvtepu8_epi16(in.raw)}; 9509 const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)}; 9510 #else 9511 const Vec16<uint16_t> mask{_mm_set_epi64x(0, 0xFF)}; 9512 const Vec16<uint16_t> in8 = And(Vec16<uint16_t>{in.raw}, mask); 9513 const Vec16<uint16_t> bits8 = And(Vec16<uint16_t>{bits.raw}, mask); 9514 #endif 9515 return Vec128<uint8_t, 1>{_mm_srl_epi16(in8.raw, bits8.raw)}; 9516 } 9517 9518 template <size_t N> 9519 HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in, 9520 const Vec128<uint32_t, N> bits) { 9521 #if HWY_TARGET >= HWY_SSE4 9522 // 32x32 -> 64 bit mul, then shift right by 32. 9523 const DFromV<decltype(in)> d32; 9524 // Move odd lanes into position for the second mul. Shuffle more gracefully 9525 // handles N=1 than repartitioning to u64 and shifting 32 bits right. 9526 const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)}; 9527 // For bits=0, we cannot mul by 2^32, so fix the result later. 9528 const auto mul = detail::Pow2(Set(d32, 32) - bits); 9529 const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0 9530 const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)}; 9531 // No need to shift right, already in the correct position. 9532 const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ? 9533 const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20)); 9534 // Replace output with input where bits == 0. 9535 return IfThenElse(bits == Zero(d32), in, out); 9536 #else 9537 return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)}; 9538 #endif 9539 } 9540 9541 #if HWY_TARGET >= HWY_SSE4 9542 HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in, 9543 const Vec128<uint32_t, 1> bits) { 9544 #if HWY_TARGET == HWY_SSE4 9545 const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)}; 9546 #else 9547 const auto bits32 = 9548 Combine(Full64<uint32_t>(), Zero(Full32<uint32_t>()), bits); 9549 #endif 9550 return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits32.raw)}; 9551 } 9552 #endif 9553 9554 HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v, 9555 const Vec128<uint64_t> bits) { 9556 #if HWY_TARGET >= HWY_SSE4 9557 const DFromV<decltype(v)> d; 9558 // Individual shifts and combine 9559 const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)}; 9560 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); 9561 const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)}; 9562 return ConcatUpperLower(d, out1, out0); 9563 #else 9564 return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)}; 9565 #endif 9566 } 9567 HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v, 9568 const Vec64<uint64_t> bits) { 9569 return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)}; 9570 } 9571 9572 namespace detail { 9573 9574 #if HWY_TARGET <= HWY_AVX3 9575 template <class V> 9576 HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { 9577 const DFromV<decltype(v)> d; 9578 const Rebind<int16_t, decltype(d)> di16; 9579 return DemoteTo(d, PromoteTo(di16, v) >> PromoteTo(di16, bits)); 9580 } 9581 #elif HWY_TARGET <= HWY_AVX2 // AVX2 9582 template <class V> 9583 HWY_INLINE V AVX2ShrI16Vec128(V v, V bits) { 9584 const DFromV<decltype(v)> d; 9585 const Rebind<int32_t, decltype(d)> di32; 9586 return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); 9587 } 9588 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 9589 HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { 9590 const DFromV<decltype(v)> d; 9591 const Rebind<int32_t, decltype(d)> di32; 9592 return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); 9593 } 9594 template <class V, HWY_IF_V_SIZE_V(V, 16)> 9595 HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { 9596 const DFromV<decltype(v)> d; 9597 const Half<decltype(d)> dh; 9598 const Rebind<int16_t, decltype(d)> di16; 9599 const Rebind<int32_t, decltype(dh)> dh_i32; 9600 9601 const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >> 9602 PromoteTo(dh_i32, LowerHalf(dh, bits)); 9603 const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >> 9604 PromoteTo(dh_i32, UpperHalf(dh, bits)); 9605 const auto i16_shr_result = 9606 OrderedDemote2To(di16, lo_shr_result, hi_shr_result); 9607 return DemoteTo(d, i16_shr_result); 9608 } 9609 #endif 9610 9611 #if HWY_TARGET > HWY_AVX3 9612 // Also used in x86_256-inl.h. 9613 template <class DI, class V> 9614 HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) { 9615 const RebindToUnsigned<DI> du; 9616 const auto count = BitCast(du, count_i); // same type as value to shift 9617 // Clear sign and restore afterwards. This is preferable to shifting the MSB 9618 // downwards because Shr is somewhat more expensive than Shl. 9619 const auto sign = BroadcastSignBit(v); 9620 const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below 9621 return BitCast(di, abs >> count) ^ sign; 9622 } 9623 #endif 9624 9625 } // namespace detail 9626 9627 template <size_t N> 9628 HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v, 9629 Vec128<int16_t, N> bits) { 9630 #if HWY_TARGET <= HWY_AVX3 9631 return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)}; 9632 #elif HWY_TARGET <= HWY_AVX2 9633 return detail::AVX2ShrI16Vec128(v, bits); 9634 #else 9635 const DFromV<decltype(v)> d; 9636 return detail::SignedShr(d, v, bits); 9637 #endif 9638 } 9639 9640 #if HWY_TARGET > HWY_AVX3 9641 HWY_API Vec16<int16_t> operator>>(Vec16<int16_t> v, Vec16<int16_t> bits) { 9642 #if HWY_TARGET <= HWY_SSE4 9643 const Vec16<int16_t> bits16{_mm_cvtepu16_epi64(bits.raw)}; 9644 #else 9645 const auto bits16 = And(bits, Vec16<int16_t>{_mm_set_epi64x(0, 0xFFFF)}); 9646 #endif 9647 return Vec16<int16_t>{_mm_sra_epi16(v.raw, bits16.raw)}; 9648 } 9649 #endif 9650 9651 template <size_t N> 9652 HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v, 9653 Vec128<int8_t, N> bits) { 9654 #if HWY_TARGET <= HWY_AVX2 9655 return detail::AVX2ShrI8Vec128(v, bits); 9656 #else 9657 const DFromV<decltype(v)> d; 9658 return detail::SignedShr(d, v, bits); 9659 #endif 9660 } 9661 HWY_API Vec128<int8_t, 1> operator>>(Vec128<int8_t, 1> v, 9662 Vec128<int8_t, 1> bits) { 9663 #if HWY_TARGET <= HWY_SSE4 9664 const Vec16<int16_t> vi16{_mm_cvtepi8_epi16(v.raw)}; 9665 const Vec16<uint16_t> bits8{_mm_cvtepu8_epi64(bits.raw)}; 9666 #else 9667 const DFromV<decltype(v)> d; 9668 const Rebind<int16_t, decltype(d)> di16; 9669 const Twice<decltype(d)> dt; 9670 9671 const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v))); 9672 const Vec16<uint16_t> bits8 = 9673 And(Vec16<uint16_t>{bits.raw}, Vec16<uint16_t>{_mm_set_epi64x(0, 0xFF)}); 9674 #endif 9675 return Vec128<int8_t, 1>{_mm_sra_epi16(vi16.raw, bits8.raw)}; 9676 } 9677 9678 template <size_t N> 9679 HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v, 9680 Vec128<int32_t, N> bits) { 9681 #if HWY_TARGET <= HWY_AVX2 9682 return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)}; 9683 #else 9684 const DFromV<decltype(v)> d; 9685 return detail::SignedShr(d, v, bits); 9686 #endif 9687 } 9688 9689 #if HWY_TARGET > HWY_AVX2 9690 HWY_API Vec32<int32_t> operator>>(Vec32<int32_t> v, Vec32<int32_t> bits) { 9691 #if HWY_TARGET == HWY_SSE4 9692 const Vec32<uint32_t> bits32{_mm_cvtepu32_epi64(bits.raw)}; 9693 #else 9694 const auto bits32 = Combine(Full64<int32_t>(), Zero(Full32<int32_t>()), bits); 9695 #endif 9696 return Vec32<int32_t>{_mm_sra_epi32(v.raw, bits32.raw)}; 9697 } 9698 #endif 9699 9700 template <size_t N> 9701 HWY_API Vec128<int64_t, N> operator>>(Vec128<int64_t, N> v, 9702 Vec128<int64_t, N> bits) { 9703 #if HWY_TARGET <= HWY_AVX3 9704 return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)}; 9705 #else 9706 const DFromV<decltype(v)> d; 9707 return detail::SignedShr(d, v, bits); 9708 #endif 9709 } 9710 9711 // ------------------------------ MulEven/Odd 64x64 (UpperHalf) 9712 9713 namespace detail { 9714 9715 template <class V, HWY_IF_U64(TFromV<V>)> 9716 static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) { 9717 const DFromV<decltype(a)> du64; 9718 const RepartitionToNarrow<decltype(du64)> du32; 9719 const auto maskL = Set(du64, 0xFFFFFFFFULL); 9720 const auto a32 = BitCast(du32, a); 9721 const auto b32 = BitCast(du32, b); 9722 // Inputs for MulEven: we only need the lower 32 bits 9723 const auto aH = Shuffle2301(a32); 9724 const auto bH = Shuffle2301(b32); 9725 9726 // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need 9727 // the even (lower 64 bits of every 128-bit block) results. See 9728 // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.txt 9729 const auto aLbL = MulEven(a32, b32); 9730 const auto w3 = aLbL & maskL; 9731 9732 const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); 9733 const auto w2 = t2 & maskL; 9734 const auto w1 = ShiftRight<32>(t2); 9735 9736 const auto t = MulEven(a32, bH) + w2; 9737 const auto k = ShiftRight<32>(t); 9738 9739 mulH = MulEven(aH, bH) + w1 + k; 9740 return ShiftLeft<32>(t) + w3; 9741 } 9742 9743 template <class V, HWY_IF_I64(TFromV<V>)> 9744 static HWY_INLINE V SSE2Mul128(V a, V b, V& mulH) { 9745 const DFromV<decltype(a)> di64; 9746 const RebindToUnsigned<decltype(di64)> du64; 9747 using VU64 = VFromD<decltype(du64)>; 9748 9749 VU64 unsigned_mulH; 9750 const auto mulL = BitCast( 9751 di64, SSE2Mul128(BitCast(du64, a), BitCast(du64, b), unsigned_mulH)); 9752 mulH = BitCast(di64, unsigned_mulH) - And(BroadcastSignBit(a), b) - 9753 And(a, BroadcastSignBit(b)); 9754 return mulL; 9755 } 9756 9757 } // namespace detail 9758 9759 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 9760 9761 template <class V, HWY_IF_UI64(TFromV<V>), 9762 HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))> 9763 HWY_API V MulEven(V a, V b) { 9764 V mulH; 9765 const V mulL = detail::SSE2Mul128(a, b, mulH); 9766 return InterleaveLower(mulL, mulH); 9767 } 9768 9769 template <class V, HWY_IF_UI64(TFromV<V>), 9770 HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 16 : 8))> 9771 HWY_API V MulOdd(V a, V b) { 9772 const DFromV<decltype(a)> du64; 9773 V mulH; 9774 const V mulL = detail::SSE2Mul128(a, b, mulH); 9775 return InterleaveUpper(du64, mulL, mulH); 9776 } 9777 9778 #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 9779 9780 template <class V, HWY_IF_UI64(TFromV<V>), 9781 HWY_IF_V_SIZE_GT_V(V, (HWY_ARCH_X86_64 ? 8 : 0))> 9782 HWY_API V MulHigh(V a, V b) { 9783 V mulH; 9784 detail::SSE2Mul128(a, b, mulH); 9785 return mulH; 9786 } 9787 9788 #if HWY_ARCH_X86_64 9789 9790 template <class T, HWY_IF_UI64(T)> 9791 HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) { 9792 const DFromV<decltype(a)> d; 9793 alignas(16) T mul[2]; 9794 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); 9795 return Load(d, mul); 9796 } 9797 9798 template <class T, HWY_IF_UI64(T)> 9799 HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) { 9800 const DFromV<decltype(a)> d; 9801 const Half<decltype(d)> d2; 9802 alignas(16) T mul[2]; 9803 const T a1 = GetLane(UpperHalf(d2, a)); 9804 const T b1 = GetLane(UpperHalf(d2, b)); 9805 mul[0] = Mul128(a1, b1, &mul[1]); 9806 return Load(d, mul); 9807 } 9808 9809 template <class T, HWY_IF_UI64(T)> 9810 HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) { 9811 T hi; 9812 Mul128(GetLane(a), GetLane(b), &hi); 9813 return Vec64<T>{_mm_cvtsi64_si128(static_cast<int64_t>(hi))}; 9814 } 9815 9816 #endif // HWY_ARCH_X86_64 9817 9818 // ================================================== CONVERT (2) 9819 9820 // ------------------------------ PromoteEvenTo/PromoteOddTo 9821 9822 #if HWY_TARGET > HWY_AVX3 9823 namespace detail { 9824 9825 // I32->I64 PromoteEvenTo/PromoteOddTo 9826 9827 template <class D, HWY_IF_LANES_D(D, 1)> 9828 HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, 9829 hwy::SizeTag<8> /*to_lane_size_tag*/, 9830 hwy::SignedTag /*from_type_tag*/, D d_to, 9831 Vec64<int32_t> v) { 9832 return PromoteLowerTo(d_to, v); 9833 } 9834 9835 template <class D, HWY_IF_LANES_D(D, 2)> 9836 HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, 9837 hwy::SizeTag<8> /*to_lane_size_tag*/, 9838 hwy::SignedTag /*from_type_tag*/, D d_to, 9839 Vec128<int32_t> v) { 9840 const Repartition<int32_t, D> d_from; 9841 return PromoteLowerTo(d_to, ConcatEven(d_from, v, v)); 9842 } 9843 9844 template <class D, class V, HWY_IF_LANES_LE_D(D, 2)> 9845 HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/, 9846 hwy::SizeTag<8> /*to_lane_size_tag*/, 9847 hwy::SignedTag /*from_type_tag*/, D d_to, 9848 V v) { 9849 const Repartition<int32_t, D> d_from; 9850 return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v)); 9851 } 9852 9853 } // namespace detail 9854 #endif 9855 9856 // ------------------------------ PromoteEvenTo/PromoteOddTo 9857 #include "hwy/ops/inside-inl.h" 9858 9859 // ------------------------------ WidenMulPairwiseAdd (PromoteEvenTo) 9860 9861 #if HWY_NATIVE_DOT_BF16 9862 9863 template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16), 9864 class VBF = VFromD<Repartition<bfloat16_t, DF>>> 9865 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) { 9866 return VFromD<DF>{_mm_dpbf16_ps(Zero(df).raw, 9867 reinterpret_cast<__m128bh>(a.raw), 9868 reinterpret_cast<__m128bh>(b.raw))}; 9869 } 9870 9871 #else 9872 9873 // Generic for all vector lengths. 9874 template <class DF, HWY_IF_F32_D(DF), 9875 class VBF = VFromD<Repartition<bfloat16_t, DF>>> 9876 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) { 9877 return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), 9878 Mul(PromoteOddTo(df, a), PromoteOddTo(df, b))); 9879 } 9880 9881 #endif // HWY_NATIVE_DOT_BF16 9882 9883 // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. 9884 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), 9885 class V16 = VFromD<RepartitionToNarrow<D32>>> 9886 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { 9887 return VFromD<D32>{_mm_madd_epi16(a.raw, b.raw)}; 9888 } 9889 9890 // Generic for all vector lengths. 9891 template <class DU32, HWY_IF_U32_D(DU32), 9892 class VU16 = VFromD<RepartitionToNarrow<DU32>>> 9893 HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { 9894 const auto p_lo = a * b; 9895 const auto p_hi = MulHigh(a, b); 9896 9897 const auto p_hi1_lo0 = BitCast(du32, OddEven(p_hi, p_lo)); 9898 const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)), 9899 ShiftRight<16>(BitCast(du32, p_lo))); 9900 return Add(BitCast(du32, p_hi1_lo0), BitCast(du32, p_hi0_lo1)); 9901 } 9902 9903 // ------------------------------ SatWidenMulPairwiseAdd 9904 9905 #if HWY_TARGET <= HWY_SSSE3 9906 9907 #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD 9908 #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD 9909 #else 9910 #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD 9911 #endif 9912 9913 // Even if N=1, the input is always at least 2 lanes, hence _mm_maddubs_epi16 9914 // is safe. 9915 template <class DI16, HWY_IF_I16_D(DI16), HWY_IF_V_SIZE_LE_D(DI16, 16)> 9916 HWY_API VFromD<DI16> SatWidenMulPairwiseAdd( 9917 DI16 /* tag */, VFromD<Repartition<uint8_t, DI16>> a, 9918 VFromD<Repartition<int8_t, DI16>> b) { 9919 return VFromD<DI16>{_mm_maddubs_epi16(a.raw, b.raw)}; 9920 } 9921 9922 #endif 9923 9924 // ------------------------------ SatWidenMulPairwiseAccumulate 9925 9926 #if HWY_TARGET <= HWY_AVX3_DL 9927 9928 #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM 9929 #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM 9930 #else 9931 #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM 9932 #endif 9933 9934 // Even if N=1, the I16 vectors have at least 2 lanes, hence _mm_dpwssds_epi32 9935 // is safe. 9936 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)> 9937 HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate( 9938 DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a, 9939 VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) { 9940 return VFromD<DI32>{_mm_dpwssds_epi32(sum.raw, a.raw, b.raw)}; 9941 } 9942 9943 #endif // HWY_TARGET <= HWY_AVX3_DL 9944 9945 // ------------------------------ ReorderWidenMulAccumulate (PromoteEvenTo) 9946 9947 #if HWY_NATIVE_DOT_BF16 9948 9949 #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 9950 #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 9951 #else 9952 #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 9953 #endif 9954 9955 template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16), 9956 class VBF = VFromD<Repartition<bfloat16_t, DF>>> 9957 HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b, 9958 const VFromD<DF> sum0, 9959 VFromD<DF>& /*sum1*/) { 9960 return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw), 9961 reinterpret_cast<__m128bh>(b.raw))}; 9962 } 9963 9964 #endif // HWY_NATIVE_DOT_BF16 9965 9966 // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. 9967 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), 9968 class V16 = VFromD<RepartitionToNarrow<D32>>> 9969 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, 9970 const VFromD<D32> sum0, 9971 VFromD<D32>& /*sum1*/) { 9972 (void)d; 9973 #if HWY_TARGET <= HWY_AVX3_DL 9974 return VFromD<D32>{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; 9975 #else 9976 return sum0 + WidenMulPairwiseAdd(d, a, b); 9977 #endif 9978 } 9979 9980 template <class DU32, HWY_IF_U32_D(DU32), 9981 class VU16 = VFromD<RepartitionToNarrow<DU32>>> 9982 HWY_API VFromD<DU32> ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b, 9983 const VFromD<DU32> sum0, 9984 VFromD<DU32>& /*sum1*/) { 9985 (void)d; 9986 return sum0 + WidenMulPairwiseAdd(d, a, b); 9987 } 9988 9989 // ------------------------------ RearrangeToOddPlusEven 9990 template <size_t N> 9991 HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0, 9992 Vec128<int32_t, N> /*sum1*/) { 9993 return sum0; // invariant already holds 9994 } 9995 9996 template <size_t N> 9997 HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven( 9998 const Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) { 9999 return sum0; // invariant already holds 10000 } 10001 10002 template <class VW> 10003 HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { 10004 return Add(sum0, sum1); 10005 } 10006 10007 // ------------------------------ SumOfMulQuadAccumulate 10008 #if HWY_TARGET <= HWY_AVX3_DL 10009 10010 #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 10011 #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 10012 #else 10013 #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 10014 #endif 10015 10016 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)> 10017 HWY_API VFromD<DI32> SumOfMulQuadAccumulate( 10018 DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u, 10019 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) { 10020 return VFromD<DI32>{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; 10021 } 10022 10023 #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 10024 #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 10025 #else 10026 #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 10027 #endif 10028 10029 #if HWY_X86_HAVE_AVX10_2_OPS 10030 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)> 10031 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/, 10032 VFromD<Repartition<int8_t, DI32>> a, 10033 VFromD<Repartition<int8_t, DI32>> b, 10034 VFromD<DI32> sum) { 10035 return VFromD<DI32>{_mm_dpbssd_epi32(sum.raw, a.raw, b.raw)}; 10036 } 10037 #else // !HWY_X86_HAVE_AVX10_2_OPS 10038 template <class DI32, HWY_IF_I32_D(DI32)> 10039 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, 10040 VFromD<Repartition<int8_t, DI32>> a, 10041 VFromD<Repartition<int8_t, DI32>> b, 10042 VFromD<DI32> sum) { 10043 const Repartition<uint8_t, decltype(di32)> du8; 10044 10045 const auto a_u = BitCast(du8, a); 10046 const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum); 10047 const auto result_sum_1 = ShiftLeft<8>( 10048 SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32))); 10049 return result_sum_0 - result_sum_1; 10050 } 10051 #endif // HWY_X86_HAVE_AVX10_2_OPS 10052 10053 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 10054 #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 10055 #else 10056 #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 10057 #endif 10058 10059 #if HWY_X86_HAVE_AVX10_2_OPS 10060 template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16)> 10061 HWY_API VFromD<DU32> SumOfMulQuadAccumulate( 10062 DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a, 10063 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { 10064 return VFromD<DU32>{_mm_dpbuud_epi32(sum.raw, a.raw, b.raw)}; 10065 } 10066 #else // !HWY_X86_HAVE_AVX10_2_OPS 10067 template <class DU32, HWY_IF_U32_D(DU32)> 10068 HWY_API VFromD<DU32> SumOfMulQuadAccumulate( 10069 DU32 du32, VFromD<Repartition<uint8_t, DU32>> a, 10070 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { 10071 const Repartition<uint8_t, decltype(du32)> du8; 10072 const RebindToSigned<decltype(du8)> di8; 10073 const RebindToSigned<decltype(du32)> di32; 10074 10075 const auto b_i = BitCast(di8, b); 10076 const auto result_sum_0 = 10077 SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum)); 10078 const auto result_sum_1 = ShiftLeft<8>( 10079 SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32))); 10080 10081 return BitCast(du32, result_sum_0 - result_sum_1); 10082 } 10083 #endif // HWY_X86_HAVE_AVX10_2_OPS 10084 10085 #endif // HWY_TARGET <= HWY_AVX3_DL 10086 10087 // ------------------------------ Demotions (full -> part w/ narrow lanes) 10088 10089 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> 10090 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 10091 return VFromD<D>{_mm_packs_epi32(v.raw, v.raw)}; 10092 } 10093 10094 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 10095 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 10096 #if HWY_TARGET >= HWY_SSSE3 10097 const Rebind<int32_t, D> di32; 10098 const auto zero_if_neg = AndNot(ShiftRight<31>(v), v); 10099 const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF))); 10100 const auto clamped = Or(zero_if_neg, too_big); 10101 #if HWY_TARGET == HWY_SSE2 10102 const Rebind<uint16_t, decltype(di32)> du16; 10103 const RebindToSigned<decltype(du16)> di16; 10104 return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); 10105 #else 10106 const Repartition<uint16_t, decltype(di32)> du16; 10107 // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. 10108 alignas(16) static constexpr uint16_t kLower2Bytes[16] = { 10109 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; 10110 const auto lo2 = Load(du16, kLower2Bytes); 10111 return VFromD<D>{TableLookupBytes(BitCast(du16, clamped), lo2).raw}; 10112 #endif 10113 #else 10114 return VFromD<D>{_mm_packus_epi32(v.raw, v.raw)}; 10115 #endif 10116 } 10117 10118 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 10119 HWY_API VFromD<D> DemoteTo(D du16, VFromD<Rebind<uint32_t, D>> v) { 10120 const DFromV<decltype(v)> du32; 10121 const RebindToSigned<decltype(du32)> di32; 10122 #if HWY_TARGET >= HWY_SSSE3 10123 const auto too_big = 10124 VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32))); 10125 const auto clamped = Or(BitCast(di32, v), too_big); 10126 #if HWY_TARGET == HWY_SSE2 10127 const RebindToSigned<decltype(du16)> di16; 10128 return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); 10129 #else 10130 (void)du16; 10131 const Repartition<uint16_t, decltype(di32)> du16_full; 10132 // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. 10133 alignas(16) static constexpr uint16_t kLower2Bytes[16] = { 10134 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; 10135 const auto lo2 = Load(du16_full, kLower2Bytes); 10136 return VFromD<D>{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw}; 10137 #endif 10138 #else 10139 return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); 10140 #endif 10141 } 10142 10143 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> 10144 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 10145 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); 10146 return VFromD<D>{_mm_packus_epi16(i16, i16)}; 10147 } 10148 10149 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 10150 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 10151 return VFromD<D>{_mm_packus_epi16(v.raw, v.raw)}; 10152 } 10153 10154 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> 10155 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 10156 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); 10157 return VFromD<D>{_mm_packs_epi16(i16, i16)}; 10158 } 10159 10160 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> 10161 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 10162 return VFromD<D>{_mm_packs_epi16(v.raw, v.raw)}; 10163 } 10164 10165 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> 10166 HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint32_t, D>> v) { 10167 #if HWY_TARGET <= HWY_AVX3 10168 // NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned 10169 // integers to 8-bit unsigned integers 10170 (void)du8; 10171 return VFromD<D>{_mm_cvtusepi32_epi8(v.raw)}; 10172 #else 10173 const DFromV<decltype(v)> du32; 10174 const RebindToSigned<decltype(du32)> di32; 10175 const auto max_i32 = Set(du32, 0x7FFFFFFFu); 10176 10177 #if HWY_TARGET >= HWY_SSSE3 10178 // On SSE2/SSSE3, clamp u32 values to an i32 using the u8 Min operation 10179 // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. 10180 10181 // The u8 Min operation below leaves the lower 24 bits of each 32-bit 10182 // lane unchanged. 10183 10184 // The u8 Min operation below will leave any values that are less than or 10185 // equal to 0x7FFFFFFF unchanged. 10186 10187 // For values that are greater than or equal to 0x80000000, the u8 Min 10188 // operation below will force the upper 8 bits to 0x7F and leave the lower 10189 // 24 bits unchanged. 10190 10191 // An u8 Min operation is okay here as any clamped value that is greater than 10192 // or equal to 0x80000000 will be clamped to a value between 0x7F000000 and 10193 // 0x7FFFFFFF through the u8 Min operation below, which will then be converted 10194 // to 0xFF through the i32->u8 demotion. 10195 const Repartition<uint8_t, decltype(du32)> du32_as_du8; 10196 const auto clamped = BitCast( 10197 di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32))); 10198 #else 10199 const auto clamped = BitCast(di32, Min(v, max_i32)); 10200 #endif 10201 10202 return DemoteTo(du8, clamped); 10203 #endif 10204 } 10205 10206 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 10207 HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) { 10208 const DFromV<decltype(v)> du16; 10209 const RebindToSigned<decltype(du16)> di16; 10210 const auto max_i16 = Set(du16, 0x7FFF); 10211 10212 #if HWY_TARGET >= HWY_SSSE3 10213 // On SSE2/SSSE3, clamp u16 values to an i16 using the u8 Min operation 10214 // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. 10215 10216 // The u8 Min operation below leaves the lower 8 bits of each 16-bit 10217 // lane unchanged. 10218 10219 // The u8 Min operation below will leave any values that are less than or 10220 // equal to 0x7FFF unchanged. 10221 10222 // For values that are greater than or equal to 0x8000, the u8 Min 10223 // operation below will force the upper 8 bits to 0x7F and leave the lower 10224 // 8 bits unchanged. 10225 10226 // An u8 Min operation is okay here as any clamped value that is greater than 10227 // or equal to 0x8000 will be clamped to a value between 0x7F00 and 10228 // 0x7FFF through the u8 Min operation below, which will then be converted 10229 // to 0xFF through the i16->u8 demotion. 10230 const Repartition<uint8_t, decltype(du16)> du16_as_du8; 10231 const auto clamped = BitCast( 10232 di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16))); 10233 #else 10234 const auto clamped = BitCast(di16, Min(v, max_i16)); 10235 #endif 10236 10237 return DemoteTo(du8, clamped); 10238 } 10239 10240 #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) 10241 10242 // HWY_NATIVE_F16C was already toggled above. 10243 10244 // Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate). 10245 // clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain. 10246 HWY_DIAGNOSTICS(push) 10247 HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain") 10248 10249 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)> 10250 HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) { 10251 const RebindToUnsigned<decltype(df16)> du16; 10252 return BitCast( 10253 df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}); 10254 } 10255 10256 HWY_DIAGNOSTICS(pop) 10257 10258 #endif // F16C 10259 10260 #if HWY_HAVE_FLOAT16 10261 10262 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 10263 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 10264 #else 10265 #define HWY_NATIVE_DEMOTE_F64_TO_F16 10266 #endif 10267 10268 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)> 10269 HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) { 10270 return VFromD<D>{_mm_cvtpd_ph(v.raw)}; 10271 } 10272 10273 #endif // HWY_HAVE_FLOAT16 10274 10275 // The _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics require GCC 9 or later 10276 // or Clang 10 or later 10277 10278 // Also need GCC or Clang to bit cast the __m128bh, __m256bh, or __m512bh vector 10279 // returned by the _mm*_cvtneps_pbh and _mm*_cvtne2ps_pbh intrinsics to a 10280 // __m128i, __m256i, or __m512i as there are currently no intrinsics available 10281 // (as of GCC 13 and Clang 17) to bit cast a __m128bh, __m256bh, or __m512bh 10282 // vector to a __m128i, __m256i, or __m512i vector 10283 10284 #if HWY_AVX3_HAVE_F32_TO_BF16C 10285 #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16 10286 #undef HWY_NATIVE_DEMOTE_F32_TO_BF16 10287 #else 10288 #define HWY_NATIVE_DEMOTE_F32_TO_BF16 10289 #endif 10290 10291 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)> 10292 HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) { 10293 #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000 10294 // Inline assembly workaround for LLVM codegen bug 10295 __m128i raw_result; 10296 __asm__("vcvtneps2bf16 %1, %0" : "=v"(raw_result) : "v"(v.raw)); 10297 return VFromD<D>{raw_result}; 10298 #else 10299 // The _mm_cvtneps_pbh intrinsic returns a __m128bh vector that needs to be 10300 // bit casted to a __m128i vector 10301 return VFromD<D>{detail::BitCastToInteger(_mm_cvtneps_pbh(v.raw))}; 10302 #endif 10303 } 10304 10305 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)> 10306 HWY_API VFromD<D> ReorderDemote2To(D /*dbf16*/, Vec128<float> a, 10307 Vec128<float> b) { 10308 #if HWY_COMPILER_CLANG >= 1600 && HWY_COMPILER_CLANG < 2000 10309 // Inline assembly workaround for LLVM codegen bug 10310 __m128i raw_result; 10311 __asm__("vcvtne2ps2bf16 %2, %1, %0" 10312 : "=v"(raw_result) 10313 : "v"(b.raw), "v"(a.raw)); 10314 return VFromD<D>{raw_result}; 10315 #else 10316 // The _mm_cvtne2ps_pbh intrinsic returns a __m128bh vector that needs to be 10317 // bit casted to a __m128i vector 10318 return VFromD<D>{detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw))}; 10319 #endif 10320 } 10321 10322 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)> 10323 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<float> a, 10324 Vec64<float> b) { 10325 return VFromD<D>{_mm_shuffle_epi32( 10326 detail::BitCastToInteger(_mm_cvtne2ps_pbh(b.raw, a.raw)), 10327 _MM_SHUFFLE(2, 0, 2, 0))}; 10328 } 10329 10330 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)> 10331 HWY_API VFromD<D> ReorderDemote2To(D dbf16, Vec32<float> a, Vec32<float> b) { 10332 const DFromV<decltype(a)> d; 10333 const Twice<decltype(d)> dt; 10334 return DemoteTo(dbf16, Combine(dt, b, a)); 10335 } 10336 #endif // HWY_AVX3_HAVE_F32_TO_BF16C 10337 10338 // Specializations for partial vectors because packs_epi32 sets lanes above 2*N. 10339 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)> 10340 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) { 10341 const DFromV<decltype(a)> d; 10342 const Twice<decltype(d)> dt; 10343 return DemoteTo(dn, Combine(dt, b, a)); 10344 } 10345 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)> 10346 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int32_t> a, 10347 Vec64<int32_t> b) { 10348 return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw), 10349 _MM_SHUFFLE(2, 0, 2, 0))}; 10350 } 10351 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)> 10352 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, 10353 Vec128<int32_t> b) { 10354 return VFromD<D>{_mm_packs_epi32(a.raw, b.raw)}; 10355 } 10356 10357 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)> 10358 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec32<int32_t> a, Vec32<int32_t> b) { 10359 const DFromV<decltype(a)> d; 10360 const Twice<decltype(d)> dt; 10361 return DemoteTo(dn, Combine(dt, b, a)); 10362 } 10363 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)> 10364 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec64<int32_t> a, Vec64<int32_t> b) { 10365 #if HWY_TARGET >= HWY_SSSE3 10366 const DFromV<decltype(a)> d; 10367 const Twice<decltype(d)> dt; 10368 return DemoteTo(dn, Combine(dt, b, a)); 10369 #else 10370 (void)dn; 10371 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw), 10372 _MM_SHUFFLE(2, 0, 2, 0))}; 10373 #endif 10374 } 10375 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> 10376 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<int32_t> a, Vec128<int32_t> b) { 10377 #if HWY_TARGET >= HWY_SSSE3 10378 const Half<decltype(dn)> dnh; 10379 const auto u16_a = DemoteTo(dnh, a); 10380 const auto u16_b = DemoteTo(dnh, b); 10381 return Combine(dn, u16_b, u16_a); 10382 #else 10383 (void)dn; 10384 return VFromD<D>{_mm_packus_epi32(a.raw, b.raw)}; 10385 #endif 10386 } 10387 10388 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> 10389 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a, 10390 Vec128<uint32_t> b) { 10391 const DFromV<decltype(a)> du32; 10392 const RebindToSigned<decltype(du32)> di32; 10393 const auto max_i32 = Set(du32, 0x7FFFFFFFu); 10394 10395 #if HWY_TARGET >= HWY_SSSE3 10396 const Repartition<uint8_t, decltype(du32)> du32_as_du8; 10397 // On SSE2/SSSE3, clamp a and b using u8 Min operation 10398 const auto clamped_a = BitCast( 10399 di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32))); 10400 const auto clamped_b = BitCast( 10401 di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32))); 10402 #else 10403 const auto clamped_a = BitCast(di32, Min(a, max_i32)); 10404 const auto clamped_b = BitCast(di32, Min(b, max_i32)); 10405 #endif 10406 10407 return ReorderDemote2To(dn, clamped_a, clamped_b); 10408 } 10409 10410 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 10411 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a, 10412 VFromD<Repartition<uint32_t, D>> b) { 10413 const DFromV<decltype(a)> d; 10414 const Twice<decltype(d)> dt; 10415 return DemoteTo(dn, Combine(dt, b, a)); 10416 } 10417 10418 // Specializations for partial vectors because packs_epi32 sets lanes above 2*N. 10419 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> 10420 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, 10421 VFromD<Repartition<int16_t, D>> b) { 10422 const DFromV<decltype(a)> d; 10423 const Twice<decltype(d)> dt; 10424 return DemoteTo(dn, Combine(dt, b, a)); 10425 } 10426 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)> 10427 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a, 10428 Vec64<int16_t> b) { 10429 return VFromD<D>{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw), 10430 _MM_SHUFFLE(2, 0, 2, 0))}; 10431 } 10432 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)> 10433 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, 10434 Vec128<int16_t> b) { 10435 return VFromD<D>{_mm_packs_epi16(a.raw, b.raw)}; 10436 } 10437 10438 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> 10439 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, 10440 VFromD<Repartition<int16_t, D>> b) { 10441 const DFromV<decltype(a)> d; 10442 const Twice<decltype(d)> dt; 10443 return DemoteTo(dn, Combine(dt, b, a)); 10444 } 10445 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> 10446 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec64<int16_t> a, 10447 Vec64<int16_t> b) { 10448 return VFromD<D>{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw), 10449 _MM_SHUFFLE(2, 0, 2, 0))}; 10450 } 10451 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> 10452 HWY_API VFromD<D> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, 10453 Vec128<int16_t> b) { 10454 return VFromD<D>{_mm_packus_epi16(a.raw, b.raw)}; 10455 } 10456 10457 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> 10458 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a, 10459 Vec128<uint16_t> b) { 10460 const DFromV<decltype(a)> du16; 10461 const RebindToSigned<decltype(du16)> di16; 10462 const auto max_i16 = Set(du16, 0x7FFFu); 10463 10464 #if HWY_TARGET >= HWY_SSSE3 10465 const Repartition<uint8_t, decltype(du16)> du16_as_du8; 10466 // On SSE2/SSSE3, clamp a and b using u8 Min operation 10467 const auto clamped_a = BitCast( 10468 di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16))); 10469 const auto clamped_b = BitCast( 10470 di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16))); 10471 #else 10472 const auto clamped_a = BitCast(di16, Min(a, max_i16)); 10473 const auto clamped_b = BitCast(di16, Min(b, max_i16)); 10474 #endif 10475 10476 return ReorderDemote2To(dn, clamped_a, clamped_b); 10477 } 10478 10479 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 10480 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a, 10481 VFromD<Repartition<uint16_t, D>> b) { 10482 const DFromV<decltype(a)> d; 10483 const Twice<decltype(d)> dt; 10484 return DemoteTo(dn, Combine(dt, b, a)); 10485 } 10486 10487 template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), 10488 HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 10489 HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2), 10490 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> 10491 HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { 10492 return ReorderDemote2To(d, a, b); 10493 } 10494 10495 #if HWY_AVX3_HAVE_F32_TO_BF16C 10496 // F32 to BF16 OrderedDemote2To is generic for all vector lengths on targets 10497 // that support AVX512BF16 10498 template <class D, HWY_IF_BF16_D(D)> 10499 HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a, 10500 VFromD<Repartition<float, D>> b) { 10501 return ReorderDemote2To(dbf16, a, b); 10502 } 10503 #endif // HWY_AVX3_HAVE_F32_TO_BF16C 10504 10505 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> 10506 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { 10507 return VFromD<D>{_mm_cvtpd_ps(v.raw)}; 10508 } 10509 10510 namespace detail { 10511 10512 // Generic for all vector lengths. 10513 template <class D> 10514 HWY_INLINE VFromD<D> ClampF64ToI32Max(D d, VFromD<D> v) { 10515 // The max can be exactly represented in binary64, so clamping beforehand 10516 // prevents x86 conversion from raising an exception and returning 80..00. 10517 return Min(v, Set(d, 2147483647.0)); 10518 } 10519 10520 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 10521 template <class TTo, class TF> 10522 static constexpr HWY_INLINE TTo 10523 X86ConvertScalarFromFloat(hwy::FloatTag /* to_type_tag */, TF from_val) { 10524 return ConvertScalarTo<TTo>(from_val); 10525 } 10526 10527 template <class TTo, class TF> 10528 static HWY_BITCASTSCALAR_CONSTEXPR HWY_INLINE TTo 10529 X86ConvertScalarFromFloat(hwy::SpecialTag /* to_type_tag */, TF from_val) { 10530 return ConvertScalarTo<TTo>(from_val); 10531 } 10532 10533 template <class TTo, class TF> 10534 static HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_INLINE TTo 10535 X86ConvertScalarFromFloat(hwy::SignedTag /* to_type_tag */, TF from_val) { 10536 #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS 10537 using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float, 10538 RemoveCvRef<TF>>; 10539 #else 10540 using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; 10541 #endif 10542 10543 const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val); 10544 constexpr TTo kMinResultVal = LimitsMin<TTo>(); 10545 HWY_BITCASTSCALAR_CONSTEXPR const TFArith kMinOutOfRangePosVal = 10546 ScalarAbs(ConvertScalarTo<TFArith>(kMinResultVal)); 10547 10548 return (ScalarAbs(from_val_in_arith_type) < kMinOutOfRangePosVal) 10549 ? ConvertScalarTo<TTo>(from_val_in_arith_type) 10550 : kMinResultVal; 10551 } 10552 10553 template <class TTo, class TF> 10554 static HWY_CXX14_CONSTEXPR HWY_INLINE TTo 10555 X86ConvertScalarFromFloat(hwy::UnsignedTag /* to_type_tag */, TF from_val) { 10556 #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS 10557 using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float, 10558 RemoveCvRef<TF>>; 10559 #else 10560 using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; 10561 #endif 10562 10563 const TFArith from_val_in_arith_type = ConvertScalarTo<TFArith>(from_val); 10564 constexpr TTo kTToMsb = static_cast<TTo>(TTo{1} << (sizeof(TTo) * 8 - 1)); 10565 constexpr const TFArith kNegOne = ConvertScalarTo<TFArith>(-1.0); 10566 constexpr const TFArith kMinOutOfRangePosVal = 10567 ConvertScalarTo<TFArith>(static_cast<double>(kTToMsb) * 2.0); 10568 10569 return (from_val_in_arith_type > kNegOne && 10570 from_val_in_arith_type < kMinOutOfRangePosVal) 10571 ? ConvertScalarTo<TTo>(from_val_in_arith_type) 10572 : LimitsMax<TTo>(); 10573 } 10574 10575 template <class TTo, class TF> 10576 static constexpr HWY_INLINE HWY_MAYBE_UNUSED TTo 10577 X86ConvertScalarFromFloat(TF from_val) { 10578 return X86ConvertScalarFromFloat<TTo>(hwy::TypeTag<RemoveCvRef<TTo>>(), 10579 from_val); 10580 } 10581 10582 #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 10583 10584 } // namespace detail 10585 10586 #ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO 10587 #undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO 10588 #else 10589 #define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO 10590 #endif 10591 10592 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> 10593 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) { 10594 #if HWY_X86_HAVE_AVX10_2_OPS 10595 return VFromD<D>{_mm_cvtts_pd_epi32(v.raw)}; 10596 #elif HWY_COMPILER_GCC_ACTUAL 10597 // Workaround for undefined behavior in _mm_cvttpd_epi32 with GCC if any 10598 // values of v[i] are not within the range of an int32_t 10599 10600 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 10601 if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { 10602 typedef double GccF64RawVectType __attribute__((__vector_size__(16))); 10603 const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); 10604 return Dup128VecFromValues( 10605 D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]), 10606 detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]), int32_t{0}, 10607 int32_t{0}); 10608 } 10609 #endif 10610 10611 __m128i raw_result; 10612 __asm__("%vcvttpd2dq {%1, %0|%0, %1}" 10613 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 10614 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 10615 :); 10616 return VFromD<D>{raw_result}; 10617 #else // !HWY_COMPILER_GCC_ACTUAL 10618 return VFromD<D>{_mm_cvttpd_epi32(v.raw)}; 10619 #endif 10620 } 10621 10622 // F64 to I32 DemoteTo is generic for all vector lengths 10623 template <class D, HWY_IF_I32_D(D)> 10624 HWY_API VFromD<D> DemoteTo(D di32, VFromD<Rebind<double, D>> v) { 10625 const Rebind<double, decltype(di32)> df64; 10626 const VFromD<decltype(df64)> clamped = detail::ClampF64ToI32Max(df64, v); 10627 return DemoteInRangeTo(di32, clamped); 10628 } 10629 10630 #if HWY_TARGET <= HWY_AVX3 10631 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> 10632 HWY_API VFromD<D> DemoteInRangeTo(D /* tag */, VFromD<Rebind<double, D>> v) { 10633 #if HWY_X86_HAVE_AVX10_2_OPS 10634 return VFromD<D>{_mm_cvtts_pd_epu32(v.raw)}; 10635 #elif HWY_COMPILER_GCC_ACTUAL 10636 // Workaround for undefined behavior in _mm_cvttpd_epu32 with GCC if any 10637 // values of v[i] are not within the range of an uint32_t 10638 10639 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 10640 if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) { 10641 typedef double GccF64RawVectType __attribute__((__vector_size__(16))); 10642 const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); 10643 return Dup128VecFromValues( 10644 D(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]), 10645 detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]), uint32_t{0}, 10646 uint32_t{0}); 10647 } 10648 #endif 10649 10650 __m128i raw_result; 10651 __asm__("vcvttpd2udq {%1, %0|%0, %1}" 10652 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 10653 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 10654 :); 10655 return VFromD<D>{raw_result}; 10656 #else 10657 return VFromD<D>{_mm_cvttpd_epu32(v.raw)}; 10658 #endif 10659 } 10660 10661 // F64->U32 DemoteTo is generic for all vector lengths 10662 template <class D, HWY_IF_U32_D(D)> 10663 HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) { 10664 #if HWY_X86_HAVE_AVX10_2_OPS 10665 return DemoteInRangeTo(du32, v); 10666 #else 10667 return DemoteInRangeTo(du32, ZeroIfNegative(v)); 10668 #endif 10669 } 10670 #else // HWY_TARGET > HWY_AVX3 10671 10672 // F64 to U32 DemoteInRangeTo is generic for all vector lengths on 10673 // SSE2/SSSE3/SSE4/AVX2 10674 template <class D, HWY_IF_U32_D(D)> 10675 HWY_API VFromD<D> DemoteInRangeTo(D du32, VFromD<Rebind<double, D>> v) { 10676 const RebindToSigned<decltype(du32)> di32; 10677 const Rebind<double, decltype(du32)> df64; 10678 const RebindToUnsigned<decltype(df64)> du64; 10679 10680 const auto k2_31 = Set(df64, 2147483648.0); 10681 const auto v_is_ge_k2_31 = (v >= k2_31); 10682 const auto clamped_lo31_f64 = v - IfThenElseZero(v_is_ge_k2_31, k2_31); 10683 const auto clamped_lo31_u32 = 10684 BitCast(du32, DemoteInRangeTo(di32, clamped_lo31_f64)); 10685 const auto clamped_u32_msb = ShiftLeft<31>( 10686 TruncateTo(du32, BitCast(du64, VecFromMask(df64, v_is_ge_k2_31)))); 10687 return Or(clamped_lo31_u32, clamped_u32_msb); 10688 } 10689 10690 // F64 to U32 DemoteTo is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 10691 template <class D, HWY_IF_U32_D(D)> 10692 HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) { 10693 const Rebind<double, decltype(du32)> df64; 10694 const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0)); 10695 return DemoteInRangeTo(du32, clamped); 10696 } 10697 #endif // HWY_TARGET <= HWY_AVX3 10698 10699 #if HWY_TARGET <= HWY_AVX3 10700 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> 10701 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 10702 return VFromD<D>{_mm_cvtepi64_ps(v.raw)}; 10703 } 10704 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> 10705 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { 10706 return VFromD<D>{_mm_cvtepu64_ps(v.raw)}; 10707 } 10708 #else 10709 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 10710 template <class D, HWY_IF_F32_D(D)> 10711 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) { 10712 const Rebind<double, decltype(df32)> df64; 10713 const RebindToUnsigned<decltype(df64)> du64; 10714 const RebindToSigned<decltype(df32)> di32; 10715 const RebindToUnsigned<decltype(df32)> du32; 10716 10717 const auto k2p64_63 = Set(df64, 27670116110564327424.0); 10718 const auto f64_hi52 = 10719 Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63; 10720 const auto f64_lo12 = 10721 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), 10722 Set(du32, uint32_t{0x00000FFF})))); 10723 10724 const auto f64_sum = f64_hi52 + f64_lo12; 10725 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; 10726 10727 const auto f64_sum_is_inexact = 10728 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); 10729 const auto f64_bits_decrement = 10730 And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))), 10731 f64_sum_is_inexact); 10732 10733 const auto adj_f64_val = BitCast( 10734 df64, 10735 Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact)); 10736 10737 return DemoteTo(df32, adj_f64_val); 10738 } 10739 10740 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 10741 template <class D, HWY_IF_F32_D(D)> 10742 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) { 10743 const Rebind<double, decltype(df32)> df64; 10744 const RebindToUnsigned<decltype(df64)> du64; 10745 const RebindToSigned<decltype(df32)> di32; 10746 const RebindToUnsigned<decltype(df32)> du32; 10747 10748 const auto k2p64 = Set(df64, 18446744073709551616.0); 10749 const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64; 10750 const auto f64_lo12 = 10751 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), 10752 Set(du32, uint32_t{0x00000FFF})))); 10753 10754 const auto f64_sum = f64_hi52 + f64_lo12; 10755 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; 10756 const auto f64_sum_is_inexact = 10757 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); 10758 10759 const auto adj_f64_val = BitCast( 10760 df64, 10761 Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)), 10762 f64_sum_is_inexact)); 10763 10764 return DemoteTo(df32, adj_f64_val); 10765 } 10766 #endif 10767 10768 // For already range-limited input [0, 255]. 10769 template <size_t N> 10770 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) { 10771 #if HWY_TARGET == HWY_SSE2 10772 const RebindToSigned<DFromV<decltype(v)>> di32; 10773 const Rebind<uint8_t, decltype(di32)> du8; 10774 return DemoteTo(du8, BitCast(di32, v)); 10775 #else 10776 const DFromV<decltype(v)> d32; 10777 const Repartition<uint8_t, decltype(d32)> d8; 10778 alignas(16) static constexpr uint32_t k8From32[4] = { 10779 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u}; 10780 // Also replicate bytes into all 32 bit lanes for safety. 10781 const auto quad = TableLookupBytes(v, Load(d32, k8From32)); 10782 return LowerHalf(LowerHalf(BitCast(d8, quad))); 10783 #endif 10784 } 10785 10786 // ------------------------------ F32->UI64 PromoteTo 10787 #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO 10788 #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO 10789 #else 10790 #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO 10791 #endif 10792 10793 #if HWY_TARGET <= HWY_AVX3 10794 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> 10795 HWY_API VFromD<D> PromoteInRangeTo(D /*di64*/, VFromD<Rebind<float, D>> v) { 10796 #if HWY_X86_HAVE_AVX10_2_OPS 10797 return VFromD<D>{_mm_cvtts_ps_epi64(v.raw)}; 10798 #elif HWY_COMPILER_GCC_ACTUAL 10799 // Workaround for undefined behavior with GCC if any values of v[i] are not 10800 // within the range of an int64_t 10801 10802 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 10803 if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) { 10804 typedef float GccF32RawVectType __attribute__((__vector_size__(16))); 10805 const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); 10806 return Dup128VecFromValues( 10807 D(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]), 10808 detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1])); 10809 } 10810 #endif 10811 10812 __m128i raw_result; 10813 __asm__("vcvttps2qq {%1, %0|%0, %1}" 10814 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 10815 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 10816 :); 10817 return VFromD<D>{raw_result}; 10818 #else 10819 return VFromD<D>{_mm_cvttps_epi64(v.raw)}; 10820 #endif 10821 } 10822 10823 // Generic for all vector lengths. 10824 template <class D, HWY_IF_I64_D(D)> 10825 HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) { 10826 #if HWY_X86_HAVE_AVX10_2_OPS 10827 return PromoteInRangeTo(di64, v); 10828 #else 10829 const Rebind<float, decltype(di64)> df32; 10830 const RebindToFloat<decltype(di64)> df64; 10831 // We now avoid GCC UB in PromoteInRangeTo via assembly, see #2189 and 10832 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115115. Previously we fixed up 10833 // the result afterwards using three instructions. Now we instead check if 10834 // v >= 2^63, and if so replace the output with 2^63-1, which is likely more 10835 // efficient. Note that the previous representable f32 is less than 2^63 and 10836 // thus fits in i64. 10837 const MFromD<D> overflow = RebindMask( 10838 di64, PromoteMaskTo(df64, df32, Ge(v, Set(df32, 9.223372e18f)))); 10839 return IfThenElse(overflow, Set(di64, LimitsMax<int64_t>()), 10840 PromoteInRangeTo(di64, v)); 10841 #endif 10842 } 10843 template <class D, HWY_IF_U64_D(D)> 10844 HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) { 10845 #if HWY_X86_HAVE_AVX10_2_OPS 10846 return PromoteInRangeTo(du64, v); 10847 #else 10848 return PromoteInRangeTo(du64, ZeroIfNegative(v)); 10849 #endif 10850 } 10851 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 10852 HWY_API VFromD<D> PromoteInRangeTo(D /* tag */, VFromD<Rebind<float, D>> v) { 10853 #if HWY_X86_HAVE_AVX10_2_OPS 10854 return VFromD<D>{_mm_cvtts_ps_epu64(v.raw)}; 10855 #elif HWY_COMPILER_GCC_ACTUAL 10856 // Workaround for undefined behavior with GCC if any values of v[i] are not 10857 // within the range of an uint64_t 10858 10859 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 10860 if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) { 10861 typedef float GccF32RawVectType __attribute__((__vector_size__(16))); 10862 const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); 10863 return Dup128VecFromValues( 10864 D(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]), 10865 detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])); 10866 } 10867 #endif 10868 10869 __m128i raw_result; 10870 __asm__("vcvttps2uqq {%1, %0|%0, %1}" 10871 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 10872 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 10873 :); 10874 return VFromD<D>{raw_result}; 10875 #else 10876 return VFromD<D>{_mm_cvttps_epu64(v.raw)}; 10877 #endif 10878 } 10879 #else // AVX2 or below 10880 10881 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 10882 template <class D, HWY_IF_I64_D(D)> 10883 HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) { 10884 const Rebind<int32_t, decltype(di64)> di32; 10885 const RebindToFloat<decltype(di32)> df32; 10886 const RebindToUnsigned<decltype(di32)> du32; 10887 const Repartition<uint8_t, decltype(du32)> du32_as_du8; 10888 10889 const auto exponent_adj = BitCast( 10890 du32, 10891 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), 10892 BitCast(du32_as_du8, Set(du32, uint32_t{157}))), 10893 BitCast(du32_as_du8, Set(du32, uint32_t{32})))); 10894 const auto adj_v = 10895 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); 10896 10897 const auto f32_to_i32_result = ConvertTo(di32, adj_v); 10898 const auto lo64_or_mask = PromoteTo( 10899 di64, 10900 BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result, 10901 Set(di32, LimitsMax<int32_t>()))))); 10902 10903 return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result)) 10904 << PromoteTo(di64, exponent_adj), 10905 lo64_or_mask); 10906 } 10907 10908 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 10909 template <class D, HWY_IF_UI64_D(D)> 10910 HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) { 10911 const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32; 10912 const RebindToSigned<decltype(d32)> di32; 10913 const RebindToFloat<decltype(d32)> df32; 10914 const RebindToUnsigned<decltype(d32)> du32; 10915 const Repartition<uint8_t, decltype(d32)> du32_as_du8; 10916 10917 const auto exponent_adj = BitCast( 10918 du32, 10919 SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), 10920 BitCast(du32_as_du8, Set(du32, uint32_t{0xFFFFFF9Du})))); 10921 const auto adj_v = 10922 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); 10923 10924 const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v); 10925 return PromoteTo(d64, BitCast(d32, f32_to_i32_result)) 10926 << PromoteTo(d64, exponent_adj); 10927 } 10928 10929 namespace detail { 10930 10931 template <class DU64, HWY_IF_V_SIZE_LE_D(DU64, 16)> 10932 HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64( 10933 DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) { 10934 const Rebind<int32_t, decltype(du64)> di32; 10935 const Twice<decltype(di32)> dt_i32; 10936 10937 const auto vt_i32_overflow_mask = ResizeBitCast(dt_i32, i32_overflow_mask); 10938 return BitCast(du64, 10939 InterleaveLower(vt_i32_overflow_mask, vt_i32_overflow_mask)); 10940 } 10941 10942 template <class DU64, HWY_IF_V_SIZE_GT_D(DU64, 16)> 10943 HWY_INLINE VFromD<DU64> PromoteF32ToU64OverflowMaskToU64( 10944 DU64 du64, VFromD<Rebind<int32_t, DU64>> i32_overflow_mask) { 10945 const RebindToSigned<decltype(du64)> di64; 10946 return BitCast(du64, PromoteTo(di64, i32_overflow_mask)); 10947 } 10948 10949 } // namespace detail 10950 10951 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 10952 template <class D, HWY_IF_U64_D(D)> 10953 HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) { 10954 const Rebind<int32_t, decltype(du64)> di32; 10955 const RebindToFloat<decltype(di32)> df32; 10956 const RebindToUnsigned<decltype(di32)> du32; 10957 const Repartition<uint8_t, decltype(du32)> du32_as_du8; 10958 10959 const auto non_neg_v = ZeroIfNegative(v); 10960 10961 const auto exponent_adj = BitCast( 10962 du32, Min(SaturatedSub(BitCast(du32_as_du8, 10963 ShiftRight<23>(BitCast(du32, non_neg_v))), 10964 BitCast(du32_as_du8, Set(du32, uint32_t{157}))), 10965 BitCast(du32_as_du8, Set(du32, uint32_t{33})))); 10966 10967 const auto adj_v = 10968 BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj)); 10969 const auto f32_to_i32_result = ConvertInRangeTo(di32, adj_v); 10970 10971 const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result); 10972 const auto overflow_result = 10973 detail::PromoteF32ToU64OverflowMaskToU64(du64, i32_overflow_mask); 10974 10975 return Or(PromoteTo(du64, BitCast(du32, f32_to_i32_result)) 10976 << PromoteTo(du64, exponent_adj), 10977 overflow_result); 10978 } 10979 #endif // HWY_TARGET <= HWY_AVX3 10980 10981 // ------------------------------ MulFixedPoint15 10982 10983 #if HWY_TARGET == HWY_SSE2 10984 HWY_API Vec128<int16_t> MulFixedPoint15(const Vec128<int16_t> a, 10985 const Vec128<int16_t> b) { 10986 const DFromV<decltype(a)> d; 10987 const Repartition<int32_t, decltype(d)> di32; 10988 10989 auto lo_product = a * b; 10990 auto hi_product = MulHigh(a, b); 10991 10992 const VFromD<decltype(di32)> i32_product_lo{ 10993 _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; 10994 const VFromD<decltype(di32)> i32_product_hi{ 10995 _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)}; 10996 10997 const auto round_up_incr = Set(di32, 0x4000); 10998 return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr), 10999 ShiftRight<15>(i32_product_hi + round_up_incr)); 11000 } 11001 11002 template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> 11003 HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a, 11004 const Vec128<int16_t, N> b) { 11005 const DFromV<decltype(a)> d; 11006 const Rebind<int32_t, decltype(d)> di32; 11007 11008 const auto lo_product = a * b; 11009 const auto hi_product = MulHigh(a, b); 11010 const VFromD<decltype(di32)> i32_product{ 11011 _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; 11012 11013 return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000))); 11014 } 11015 #else 11016 template <size_t N> 11017 HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a, 11018 const Vec128<int16_t, N> b) { 11019 return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)}; 11020 } 11021 #endif 11022 11023 // ------------------------------ Truncations 11024 11025 template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)> 11026 HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) { 11027 // BitCast requires the same size; DTo might be u8x1 and v u16x1. 11028 const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto; 11029 return VFromD<DTo>{BitCast(dto, v).raw}; 11030 } 11031 11032 template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)> 11033 HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) { 11034 #if HWY_TARGET == HWY_SSE2 11035 const Vec128<uint8_t, 1> lo{v.raw}; 11036 const Vec128<uint8_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)}; 11037 return Combine(d, hi, lo); 11038 #else 11039 const Repartition<uint8_t, DFromV<decltype(v)>> d8; 11040 (void)d; 11041 alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8, 11042 0, 8, 0, 8, 0, 8, 0, 8}; 11043 const Vec128<uint8_t> v8 = TableLookupBytes(v, Load(d8, kIdx)); 11044 return LowerHalf(LowerHalf(LowerHalf(v8))); 11045 #endif 11046 } 11047 11048 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)> 11049 HWY_API VFromD<D> TruncateTo(D d, Vec128<uint64_t> v) { 11050 #if HWY_TARGET == HWY_SSE2 11051 const Vec128<uint16_t, 1> lo{v.raw}; 11052 const Vec128<uint16_t, 1> hi{_mm_unpackhi_epi64(v.raw, v.raw)}; 11053 return Combine(d, hi, lo); 11054 #else 11055 (void)d; 11056 const Repartition<uint16_t, DFromV<decltype(v)>> d16; 11057 alignas(16) static constexpr uint16_t kIdx[8] = { 11058 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u}; 11059 const Vec128<uint16_t> v16 = TableLookupBytes(v, Load(d16, kIdx)); 11060 return LowerHalf(LowerHalf(v16)); 11061 #endif 11062 } 11063 11064 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)> 11065 HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t> v) { 11066 return VFromD<D>{_mm_shuffle_epi32(v.raw, 0x88)}; 11067 } 11068 11069 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> 11070 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 11071 const DFromV<decltype(v)> du32; 11072 #if HWY_TARGET == HWY_SSE2 11073 const RebindToSigned<decltype(du32)> di32; 11074 const Rebind<uint8_t, decltype(di32)> du8; 11075 return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v)))); 11076 #else 11077 const Repartition<uint8_t, decltype(du32)> d; 11078 alignas(16) static constexpr uint8_t kIdx[16] = { 11079 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu, 11080 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu}; 11081 return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx)))); 11082 #endif 11083 } 11084 11085 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 11086 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 11087 const DFromV<decltype(v)> du32; 11088 #if HWY_TARGET == HWY_SSE2 11089 const RebindToSigned<decltype(du32)> di32; 11090 const Rebind<uint16_t, decltype(di32)> du16; 11091 const RebindToSigned<decltype(du16)> di16; 11092 return BitCast( 11093 du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v))))); 11094 #else 11095 const Repartition<uint16_t, decltype(du32)> d; 11096 return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); 11097 #endif 11098 } 11099 11100 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 11101 HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { 11102 const DFromV<decltype(v)> du16; 11103 #if HWY_TARGET == HWY_SSE2 11104 const RebindToSigned<decltype(du16)> di16; 11105 const Rebind<uint8_t, decltype(di16)> du8; 11106 const RebindToSigned<decltype(du8)> di8; 11107 return BitCast(du8, 11108 DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v))))); 11109 #else 11110 const Repartition<uint8_t, decltype(du16)> d; 11111 return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); 11112 #endif 11113 } 11114 11115 // ------------------------------ Demotions to/from i64 11116 11117 #if HWY_TARGET <= HWY_AVX3 11118 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> 11119 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 11120 return VFromD<D>{_mm_cvtsepi64_epi32(v.raw)}; 11121 } 11122 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)> 11123 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 11124 return VFromD<D>{_mm_cvtsepi64_epi16(v.raw)}; 11125 } 11126 template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)> 11127 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 11128 return VFromD<D>{_mm_cvtsepi64_epi8(v.raw)}; 11129 } 11130 11131 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> 11132 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 11133 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; 11134 return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; 11135 } 11136 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> 11137 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 11138 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; 11139 return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; 11140 } 11141 template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> 11142 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) { 11143 const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; 11144 return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; 11145 } 11146 11147 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> 11148 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { 11149 return VFromD<D>{_mm_cvtusepi64_epi32(v.raw)}; 11150 } 11151 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)> 11152 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { 11153 return VFromD<D>{_mm_cvtusepi64_epi16(v.raw)}; 11154 } 11155 template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)> 11156 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint64_t, D>> v) { 11157 return VFromD<D>{_mm_cvtusepi64_epi8(v.raw)}; 11158 } 11159 #else // AVX2 or below 11160 11161 // Disable the default unsigned to signed DemoteTo/ReorderDemote2To 11162 // implementations in generic_ops-inl.h for U64->I8/I16/I32 demotions on 11163 // SSE2/SSSE3/SSE4/AVX2 as U64->I8/I16/I32 DemoteTo/ReorderDemote2To for 11164 // SSE2/SSSE3/SSE4/AVX2 is implemented in x86_128-inl.h 11165 11166 // The default unsigned to signed DemoteTo/ReorderDemote2To 11167 // implementations in generic_ops-inl.h are still used for U32->I8/I16 and 11168 // U16->I8 demotions on SSE2/SSSE3/SSE4/AVX2 11169 11170 #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V 11171 #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) HWY_IF_NOT_T_SIZE_V(V, 8) 11172 11173 namespace detail { 11174 template <class D, HWY_IF_UNSIGNED_D(D)> 11175 HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( 11176 D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { 11177 return v; 11178 } 11179 11180 template <class D, HWY_IF_SIGNED_D(D)> 11181 HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult( 11182 D /*dn*/, VFromD<Rebind<uint64_t, D>> v) { 11183 const DFromV<decltype(v)> du64; 11184 return And(v, 11185 Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>()))); 11186 } 11187 11188 template <class D> 11189 HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate( 11190 D dn, VFromD<Rebind<uint64_t, D>> v) { 11191 const Rebind<uint64_t, D> du64; 11192 const RebindToSigned<decltype(du64)> di64; 11193 constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) - 11194 static_cast<int>(hwy::IsSigned<TFromD<D>>()); 11195 11196 const auto too_big = BitCast( 11197 du64, VecFromMask( 11198 di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64)))); 11199 return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); 11200 } 11201 11202 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class V> 11203 HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) { 11204 return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); 11205 } 11206 11207 } // namespace detail 11208 11209 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), 11210 HWY_IF_SIGNED_D(D)> 11211 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { 11212 const DFromV<decltype(v)> di64; 11213 const RebindToUnsigned<decltype(di64)> du64; 11214 const RebindToUnsigned<decltype(dn)> dn_u; 11215 11216 // Negative values are saturated by first saturating their bitwise inverse 11217 // and then inverting the saturation result 11218 const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); 11219 const auto saturated_vals = Xor( 11220 invert_mask, 11221 detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); 11222 return BitCast(dn, TruncateTo(dn_u, saturated_vals)); 11223 } 11224 11225 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), 11226 HWY_IF_UNSIGNED_D(D)> 11227 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) { 11228 const DFromV<decltype(v)> di64; 11229 const RebindToUnsigned<decltype(di64)> du64; 11230 11231 const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); 11232 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); 11233 } 11234 11235 template <class D, 11236 HWY_IF_T_SIZE_ONE_OF_D( 11237 D, ((HWY_TARGET != HWY_SSE2) ? ((1 << 1) | (1 << 2)) : 0) | 11238 (1 << 4)), 11239 HWY_IF_SIGNED_D(D)> 11240 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { 11241 const RebindToUnsigned<decltype(dn)> dn_u; 11242 return BitCast(dn, TruncateTo(dn_u, detail::DemoteFromU64Saturate(dn, v))); 11243 } 11244 11245 #if HWY_TARGET == HWY_SSE2 11246 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)), 11247 HWY_IF_SIGNED_D(D)> 11248 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { 11249 const Rebind<int32_t, decltype(dn)> di32; 11250 return DemoteTo(dn, DemoteTo(di32, v)); 11251 } 11252 #endif // HWY_TARGET == HWY_SSE2 11253 11254 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), 11255 HWY_IF_UNSIGNED_D(D)> 11256 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) { 11257 return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); 11258 } 11259 #endif // HWY_TARGET <= HWY_AVX3 11260 11261 template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), 11262 HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)> 11263 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a, 11264 VFromD<Repartition<int64_t, D>> b) { 11265 const DFromV<decltype(a)> d; 11266 const Twice<decltype(d)> dt; 11267 return DemoteTo(dn, Combine(dt, b, a)); 11268 } 11269 11270 template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_U32_D(D)> 11271 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a, 11272 VFromD<Repartition<uint64_t, D>> b) { 11273 const DFromV<decltype(a)> d; 11274 const Twice<decltype(d)> dt; 11275 return DemoteTo(dn, Combine(dt, b, a)); 11276 } 11277 11278 #if HWY_TARGET > HWY_AVX3 11279 template <class D, HWY_IF_V_SIZE_LE_D(D, HWY_MAX_BYTES / 2), HWY_IF_I32_D(D)> 11280 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a, 11281 VFromD<Repartition<uint64_t, D>> b) { 11282 const DFromV<decltype(a)> d; 11283 const Twice<decltype(d)> dt; 11284 return DemoteTo(dn, Combine(dt, b, a)); 11285 } 11286 #endif 11287 11288 #if HWY_TARGET > HWY_AVX2 11289 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> 11290 HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, 11291 Vec128<int64_t> b) { 11292 const DFromV<decltype(a)> di64; 11293 const RebindToUnsigned<decltype(di64)> du64; 11294 const Half<decltype(dn)> dnh; 11295 11296 // Negative values are saturated by first saturating their bitwise inverse 11297 // and then inverting the saturation result 11298 const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); 11299 const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); 11300 const auto saturated_a = Xor( 11301 invert_mask_a, 11302 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); 11303 const auto saturated_b = Xor( 11304 invert_mask_b, 11305 detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); 11306 11307 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); 11308 } 11309 11310 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)> 11311 HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a, 11312 Vec128<int64_t> b) { 11313 const DFromV<decltype(a)> di64; 11314 const RebindToUnsigned<decltype(di64)> du64; 11315 const Half<decltype(dn)> dnh; 11316 11317 const auto saturated_a = detail::DemoteFromU64Saturate( 11318 dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); 11319 const auto saturated_b = detail::DemoteFromU64Saturate( 11320 dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); 11321 11322 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); 11323 } 11324 11325 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)> 11326 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint64_t> a, 11327 Vec128<uint64_t> b) { 11328 const Half<decltype(dn)> dnh; 11329 11330 const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); 11331 const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); 11332 11333 return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); 11334 } 11335 #endif // HWY_TARGET > HWY_AVX2 11336 11337 // ------------------------------ Integer <=> fp (ShiftRight, OddEven) 11338 11339 #if HWY_HAVE_FLOAT16 11340 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> 11341 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { 11342 return VFromD<D>{_mm_cvtepu16_ph(v.raw)}; 11343 } 11344 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)> 11345 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 11346 return VFromD<D>{_mm_cvtepi16_ph(v.raw)}; 11347 } 11348 #endif // HWY_HAVE_FLOAT16 11349 11350 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 11351 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 11352 return VFromD<D>{_mm_cvtepi32_ps(v.raw)}; 11353 } 11354 11355 #if HWY_TARGET <= HWY_AVX3 11356 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 11357 HWY_API VFromD<D> ConvertTo(D /*df*/, VFromD<Rebind<uint32_t, D>> v) { 11358 return VFromD<D>{_mm_cvtepu32_ps(v.raw)}; 11359 } 11360 11361 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 11362 HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<int64_t, D>> v) { 11363 return VFromD<D>{_mm_cvtepi64_pd(v.raw)}; 11364 } 11365 11366 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 11367 HWY_API VFromD<D> ConvertTo(D /*dd*/, VFromD<Rebind<uint64_t, D>> v) { 11368 return VFromD<D>{_mm_cvtepu64_pd(v.raw)}; 11369 } 11370 #else // AVX2 or below 11371 // Generic for all vector lengths. 11372 template <class D, HWY_IF_F32_D(D)> 11373 HWY_API VFromD<D> ConvertTo(D df, VFromD<Rebind<uint32_t, D>> v) { 11374 // Based on wim's approach (https://stackoverflow.com/questions/34066228/) 11375 const RebindToUnsigned<decltype(df)> du32; 11376 const RebindToSigned<decltype(df)> d32; 11377 11378 const auto msk_lo = Set(du32, 0xFFFF); 11379 const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16 11380 11381 // Extract the 16 lowest/highest significant bits of v and cast to signed int 11382 const auto v_lo = BitCast(d32, And(v, msk_lo)); 11383 const auto v_hi = BitCast(d32, ShiftRight<16>(v)); 11384 return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo)); 11385 } 11386 11387 // Generic for all vector lengths. 11388 template <class D, HWY_IF_F64_D(D)> 11389 HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) { 11390 // Based on wim's approach (https://stackoverflow.com/questions/41144668/) 11391 const Repartition<uint32_t, decltype(dd)> d32; 11392 const Repartition<uint64_t, decltype(dd)> d64; 11393 11394 // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 11395 const auto k84_63 = Set(d64, 0x4530000080000000ULL); 11396 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); 11397 11398 // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) 11399 const auto k52 = Set(d32, 0x43300000); 11400 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); 11401 11402 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); 11403 return (v_upper - k84_63_52) + v_lower; // order matters! 11404 } 11405 11406 namespace detail { 11407 template <class VW> 11408 HWY_INLINE VFromD<Rebind<double, DFromV<VW>>> U64ToF64VecFast(VW w) { 11409 const DFromV<decltype(w)> d64; 11410 const RebindToFloat<decltype(d64)> dd; 11411 const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 11412 return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; 11413 } 11414 } // namespace detail 11415 11416 // Generic for all vector lengths. 11417 template <class D, HWY_IF_F64_D(D)> 11418 HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) { 11419 // Based on wim's approach (https://stackoverflow.com/questions/41144668/) 11420 const RebindToUnsigned<decltype(dd)> d64; 11421 using VU = VFromD<decltype(d64)>; 11422 11423 const VU msk_lo = Set(d64, 0xFFFFFFFF); 11424 const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 11425 11426 // Extract the 32 lowest/highest significant bits of v 11427 const VU v_lo = And(v, msk_lo); 11428 const VU v_hi = ShiftRight<32>(v); 11429 11430 const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); 11431 return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); 11432 } 11433 #endif // HWY_TARGET <= HWY_AVX3 11434 11435 // Truncates (rounds toward zero). 11436 11437 #ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO 11438 #undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO 11439 #else 11440 #define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO 11441 #endif 11442 11443 #if HWY_HAVE_FLOAT16 11444 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> 11445 HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) { 11446 #if HWY_COMPILER_GCC_ACTUAL 11447 // Workaround for undefined behavior in _mm_cvttph_epi16 if any values of v[i] 11448 // are not within the range of an int16_t 11449 11450 #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \ 11451 HWY_HAVE_SCALAR_F16_TYPE 11452 if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) { 11453 typedef hwy::float16_t::Native GccF16RawVectType 11454 __attribute__((__vector_size__(16))); 11455 const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw); 11456 return Dup128VecFromValues( 11457 D(), detail::X86ConvertScalarFromFloat<int16_t>(raw_v[0]), 11458 detail::X86ConvertScalarFromFloat<int16_t>(raw_v[1]), 11459 detail::X86ConvertScalarFromFloat<int16_t>(raw_v[2]), 11460 detail::X86ConvertScalarFromFloat<int16_t>(raw_v[3]), 11461 detail::X86ConvertScalarFromFloat<int16_t>(raw_v[4]), 11462 detail::X86ConvertScalarFromFloat<int16_t>(raw_v[5]), 11463 detail::X86ConvertScalarFromFloat<int16_t>(raw_v[6]), 11464 detail::X86ConvertScalarFromFloat<int16_t>(raw_v[7])); 11465 } 11466 #endif 11467 11468 __m128i raw_result; 11469 __asm__("vcvttph2w {%1, %0|%0, %1}" 11470 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 11471 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11472 :); 11473 return VFromD<D>{raw_result}; 11474 #else // !HWY_COMPILER_GCC_ACTUAL 11475 return VFromD<D>{_mm_cvttph_epi16(v.raw)}; 11476 #endif 11477 } 11478 11479 // F16 to I16 ConvertTo is generic for all vector lengths 11480 template <class D, HWY_IF_I16_D(D)> 11481 HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) { 11482 const RebindToFloat<decltype(di)> df; 11483 // See comment at the first occurrence of "IfThenElse(overflow,". 11484 const MFromD<D> overflow = 11485 RebindMask(di, Ge(v, Set(df, ConvertScalarTo<hwy::float16_t>(32768.0f)))); 11486 return IfThenElse(overflow, Set(di, LimitsMax<int16_t>()), 11487 ConvertInRangeTo(di, v)); 11488 } 11489 11490 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> 11491 HWY_API VFromD<D> ConvertInRangeTo(D /* tag */, VFromD<RebindToFloat<D>> v) { 11492 #if HWY_COMPILER_GCC_ACTUAL 11493 // Workaround for undefined behavior in _mm_cvttph_epu16 if any values of v[i] 11494 // are not within the range of an uint16_t 11495 11496 #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \ 11497 HWY_HAVE_SCALAR_F16_TYPE 11498 if (detail::IsConstantX86VecForF2IConv<uint16_t>(v)) { 11499 typedef hwy::float16_t::Native GccF16RawVectType 11500 __attribute__((__vector_size__(16))); 11501 const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw); 11502 return Dup128VecFromValues( 11503 D(), detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[0]), 11504 detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[1]), 11505 detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[2]), 11506 detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[3]), 11507 detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[4]), 11508 detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[5]), 11509 detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[6]), 11510 detail::X86ConvertScalarFromFloat<uint16_t>(raw_v[7])); 11511 } 11512 #endif 11513 11514 __m128i raw_result; 11515 __asm__("vcvttph2uw {%1, %0|%0, %1}" 11516 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 11517 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11518 :); 11519 return VFromD<D>{raw_result}; 11520 #else // !HWY_COMPILER_GCC_ACTUAL 11521 return VFromD<D>{_mm_cvttph_epu16(v.raw)}; 11522 #endif 11523 } 11524 11525 // F16->U16 ConvertTo is generic for all vector lengths 11526 template <class D, HWY_IF_U16_D(D)> 11527 HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) { 11528 return ConvertInRangeTo(D(), ZeroIfNegative(v)); 11529 } 11530 #endif // HWY_HAVE_FLOAT16 11531 11532 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> 11533 HWY_API VFromD<D> ConvertInRangeTo(D /*di*/, VFromD<RebindToFloat<D>> v) { 11534 #if HWY_X86_HAVE_AVX10_2_OPS 11535 return VFromD<D>{_mm_cvtts_ps_epi32(v.raw)}; 11536 #elif HWY_COMPILER_GCC_ACTUAL 11537 // Workaround for undefined behavior in _mm_cvttps_epi32 with GCC if any 11538 // values of v[i] are not within the range of an int32_t 11539 11540 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11541 if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { 11542 typedef float GccF32RawVectType __attribute__((__vector_size__(16))); 11543 const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); 11544 return Dup128VecFromValues( 11545 D(), detail::X86ConvertScalarFromFloat<int32_t>(raw_v[0]), 11546 detail::X86ConvertScalarFromFloat<int32_t>(raw_v[1]), 11547 detail::X86ConvertScalarFromFloat<int32_t>(raw_v[2]), 11548 detail::X86ConvertScalarFromFloat<int32_t>(raw_v[3])); 11549 } 11550 #endif 11551 11552 __m128i raw_result; 11553 __asm__("%vcvttps2dq {%1, %0|%0, %1}" 11554 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 11555 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11556 :); 11557 return VFromD<D>{raw_result}; 11558 #else // !HWY_COMPILER_GCC_ACTUAL 11559 return VFromD<D>{_mm_cvttps_epi32(v.raw)}; 11560 #endif 11561 } 11562 11563 // F32 to I32 ConvertTo is generic for all vector lengths 11564 template <class D, HWY_IF_I32_D(D)> 11565 HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) { 11566 #if HWY_X86_HAVE_AVX10_2_OPS 11567 return ConvertInRangeTo(di, v); 11568 #else 11569 const RebindToFloat<decltype(di)> df; 11570 // See comment at the first occurrence of "IfThenElse(overflow,". 11571 const MFromD<D> overflow = RebindMask(di, Ge(v, Set(df, 2147483648.0f))); 11572 return IfThenElse(overflow, Set(di, LimitsMax<int32_t>()), 11573 ConvertInRangeTo(di, v)); 11574 #endif 11575 } 11576 11577 #if HWY_TARGET <= HWY_AVX3 11578 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)> 11579 HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) { 11580 #if HWY_X86_HAVE_AVX10_2_OPS 11581 return VFromD<DI>{_mm_cvtts_pd_epi64(v.raw)}; 11582 #elif HWY_COMPILER_GCC_ACTUAL 11583 // Workaround for undefined behavior in _mm_cvttpd_epi64 with GCC if any 11584 // values of v[i] are not within the range of an int64_t 11585 11586 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11587 if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) { 11588 typedef double GccF64RawVectType __attribute__((__vector_size__(16))); 11589 const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); 11590 return Dup128VecFromValues( 11591 DI(), detail::X86ConvertScalarFromFloat<int64_t>(raw_v[0]), 11592 detail::X86ConvertScalarFromFloat<int64_t>(raw_v[1])); 11593 } 11594 #endif 11595 11596 __m128i raw_result; 11597 __asm__("vcvttpd2qq {%1, %0|%0, %1}" 11598 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 11599 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11600 :); 11601 return VFromD<DI>{raw_result}; 11602 #else // !HWY_COMPILER_GCC_ACTUAL 11603 return VFromD<DI>{_mm_cvttpd_epi64(v.raw)}; 11604 #endif 11605 } 11606 11607 // F64 to I64 ConvertTo is generic for all vector lengths on AVX3 11608 template <class DI, HWY_IF_I64_D(DI)> 11609 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) { 11610 #if HWY_X86_HAVE_AVX10_2_OPS 11611 return ConvertInRangeTo(di, v); 11612 #else 11613 const RebindToFloat<decltype(di)> df; 11614 // See comment at the first occurrence of "IfThenElse(overflow,". 11615 const MFromD<DI> overflow = 11616 RebindMask(di, Ge(v, Set(df, 9.223372036854776e18))); 11617 return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()), 11618 ConvertInRangeTo(di, v)); 11619 #endif 11620 } 11621 11622 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)> 11623 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) { 11624 #if HWY_X86_HAVE_AVX10_2_OPS 11625 return VFromD<DU>{_mm_cvtts_ps_epu32(v.raw)}; 11626 #elif HWY_COMPILER_GCC_ACTUAL 11627 // Workaround for undefined behavior in _mm_cvttps_epu32 with GCC if any 11628 // values of v[i] are not within the range of an uint32_t 11629 11630 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11631 if (detail::IsConstantX86VecForF2IConv<uint32_t>(v)) { 11632 typedef float GccF32RawVectType __attribute__((__vector_size__(16))); 11633 const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); 11634 return Dup128VecFromValues( 11635 DU(), detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[0]), 11636 detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[1]), 11637 detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[2]), 11638 detail::X86ConvertScalarFromFloat<uint32_t>(raw_v[3])); 11639 } 11640 #endif 11641 11642 __m128i raw_result; 11643 __asm__("vcvttps2udq {%1, %0|%0, %1}" 11644 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 11645 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11646 :); 11647 return VFromD<DU>{raw_result}; 11648 #else // !HWY_COMPILER_GCC_ACTUAL 11649 return VFromD<DU>{_mm_cvttps_epu32(v.raw)}; 11650 #endif 11651 } 11652 11653 // F32->U32 ConvertTo is generic for all vector lengths 11654 template <class DU, HWY_IF_U32_D(DU)> 11655 HWY_API VFromD<DU> ConvertTo(DU du32, VFromD<RebindToFloat<DU>> v) { 11656 #if HWY_X86_HAVE_AVX10_2_OPS 11657 return ConvertInRangeTo(du32, v); 11658 #else 11659 return ConvertInRangeTo(du32, ZeroIfNegative(v)); 11660 #endif 11661 } 11662 11663 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)> 11664 HWY_API VFromD<DU> ConvertInRangeTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) { 11665 #if HWY_X86_HAVE_AVX10_2_OPS 11666 return VFromD<DU>{_mm_cvtts_pd_epu64(v.raw)}; 11667 #elif HWY_COMPILER_GCC_ACTUAL 11668 // Workaround for undefined behavior in _mm_cvttpd_epu64 with GCC if any 11669 // values of v[i] are not within the range of an uint64_t 11670 11671 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11672 if (detail::IsConstantX86VecForF2IConv<uint64_t>(v)) { 11673 typedef double GccF64RawVectType __attribute__((__vector_size__(16))); 11674 const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); 11675 return Dup128VecFromValues( 11676 DU(), detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[0]), 11677 detail::X86ConvertScalarFromFloat<uint64_t>(raw_v[1])); 11678 } 11679 #endif 11680 11681 __m128i raw_result; 11682 __asm__("vcvttpd2uqq {%1, %0|%0, %1}" 11683 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 11684 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11685 :); 11686 return VFromD<DU>{raw_result}; 11687 #else // !HWY_COMPILER_GCC_ACTUAL 11688 return VFromD<DU>{_mm_cvttpd_epu64(v.raw)}; 11689 #endif 11690 } 11691 11692 // F64->U64 ConvertTo is generic for all vector lengths 11693 template <class DU, HWY_IF_U64_D(DU)> 11694 HWY_API VFromD<DU> ConvertTo(DU du64, VFromD<RebindToFloat<DU>> v) { 11695 #if HWY_X86_HAVE_AVX10_2_OPS 11696 return ConvertInRangeTo(du64, v); 11697 #else 11698 return ConvertInRangeTo(du64, ZeroIfNegative(v)); 11699 #endif 11700 } 11701 11702 #else // AVX2 or below 11703 11704 namespace detail { 11705 11706 template <class DU32, HWY_IF_U32_D(DU32)> 11707 static HWY_INLINE VFromD<DU32> ConvInRangeF32ToU32( 11708 DU32 du32, VFromD<RebindToFloat<DU32>> v, VFromD<DU32>& exp_diff) { 11709 const RebindToSigned<decltype(du32)> di32; 11710 const RebindToFloat<decltype(du32)> df32; 11711 11712 exp_diff = Set(du32, uint32_t{158}) - ShiftRight<23>(BitCast(du32, v)); 11713 const auto scale_down_f32_val_mask = 11714 VecFromMask(du32, Eq(exp_diff, Zero(du32))); 11715 11716 const auto v_scaled = 11717 BitCast(df32, BitCast(du32, v) + ShiftLeft<23>(scale_down_f32_val_mask)); 11718 const auto f32_to_u32_result = 11719 BitCast(du32, ConvertInRangeTo(di32, v_scaled)); 11720 11721 return f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask); 11722 } 11723 11724 } // namespace detail 11725 11726 // F32 to U32 ConvertInRangeTo is generic for all vector lengths on 11727 // SSE2/SSSE3/SSE4/AVX2 11728 template <class DU32, HWY_IF_U32_D(DU32)> 11729 HWY_API VFromD<DU32> ConvertInRangeTo(DU32 du32, 11730 VFromD<RebindToFloat<DU32>> v) { 11731 VFromD<DU32> exp_diff; 11732 const auto f32_to_u32_result = detail::ConvInRangeF32ToU32(du32, v, exp_diff); 11733 return f32_to_u32_result; 11734 } 11735 11736 // F32 to U32 ConvertTo is generic for all vector lengths on 11737 // SSE2/SSSE3/SSE4/AVX2 11738 template <class DU32, HWY_IF_U32_D(DU32)> 11739 HWY_API VFromD<DU32> ConvertTo(DU32 du32, VFromD<RebindToFloat<DU32>> v) { 11740 const RebindToSigned<decltype(du32)> di32; 11741 11742 const auto non_neg_v = ZeroIfNegative(v); 11743 VFromD<DU32> exp_diff; 11744 const auto f32_to_u32_result = 11745 detail::ConvInRangeF32ToU32(du32, non_neg_v, exp_diff); 11746 11747 return Or(f32_to_u32_result, 11748 BitCast(du32, BroadcastSignBit(BitCast(di32, exp_diff)))); 11749 } 11750 11751 namespace detail { 11752 11753 template <class D64, HWY_IF_UI64_D(D64)> 11754 HWY_API VFromD<D64> ConvAbsInRangeF64ToUI64(D64 d64, 11755 VFromD<Rebind<double, D64>> v, 11756 VFromD<D64>& biased_exp) { 11757 const RebindToSigned<decltype(d64)> di64; 11758 const RebindToUnsigned<decltype(d64)> du64; 11759 using VU64 = VFromD<decltype(du64)>; 11760 const Repartition<uint16_t, decltype(di64)> du16; 11761 const VU64 k1075 = Set(du64, 1075); /* biased exponent of 2^52 */ 11762 11763 // Exponent indicates whether the number can be represented as int64_t. 11764 biased_exp = BitCast(d64, ShiftRight<52>(BitCast(du64, v))); 11765 HWY_IF_CONSTEXPR(IsSigned<TFromD<D64>>()) { 11766 biased_exp = And(biased_exp, Set(d64, TFromD<D64>{0x7FF})); 11767 } 11768 11769 // If we were to cap the exponent at 51 and add 2^52, the number would be in 11770 // [2^52, 2^53) and mantissa bits could be read out directly. We need to 11771 // round-to-0 (truncate), but changing rounding mode in MXCSR hits a 11772 // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead 11773 // manually shift the mantissa into place (we already have many of the 11774 // inputs anyway). 11775 11776 // Use 16-bit saturated unsigned subtraction to compute shift_mnt and 11777 // shift_int since biased_exp[i] is a non-negative integer that is less than 11778 // or equal to 2047. 11779 11780 // 16-bit saturated unsigned subtraction is also more efficient than a 11781 // 64-bit subtraction followed by a 64-bit signed Max operation on 11782 // SSE2/SSSE3/SSE4/AVX2. 11783 11784 // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be 11785 // zero as the upper 48 bits of both k1075 and biased_exp are zero. 11786 11787 const VU64 shift_mnt = BitCast( 11788 du64, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); 11789 const VU64 shift_int = BitCast( 11790 du64, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); 11791 const VU64 mantissa = BitCast(du64, v) & Set(du64, (1ULL << 52) - 1); 11792 // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86 11793 // returning zero in that case. 11794 const VU64 int53 = (mantissa | Set(du64, 1ULL << 52)) >> shift_mnt; 11795 11796 // For inputs larger than 2^53 - 1, insert zeros at the bottom. 11797 11798 // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be 11799 // shifted out of the left shift result below as shift_int[i] <= 11 is true 11800 // for any inputs that are less than 2^64. 11801 11802 return BitCast(d64, int53 << shift_int); 11803 } 11804 11805 } // namespace detail 11806 11807 #if HWY_ARCH_X86_64 11808 11809 namespace detail { 11810 11811 template <size_t N> 11812 static HWY_INLINE int64_t SSE2ConvFirstF64LaneToI64(Vec128<double, N> v) { 11813 #if HWY_COMPILER_GCC_ACTUAL 11814 // Workaround for undefined behavior in _mm_cvttsd_si64 with GCC if v[0] is 11815 // not within the range of an int64_t 11816 11817 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11818 if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) { 11819 typedef double GccF64RawVectType __attribute__((__vector_size__(16))); 11820 const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); 11821 return X86ConvertScalarFromFloat<int64_t>(raw_v[0]); 11822 } 11823 #endif 11824 11825 int64_t result; 11826 __asm__("%vcvttsd2si {%1, %0|%0, %1}" 11827 : "=r"(result) 11828 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11829 :); 11830 return result; 11831 #else 11832 return _mm_cvttsd_si64(v.raw); 11833 #endif 11834 } 11835 11836 } // namespace detail 11837 11838 template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)> 11839 HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec64<double> v) { 11840 return VFromD<DI>{_mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v))}; 11841 } 11842 template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)> 11843 HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, Vec128<double> v) { 11844 const __m128i i0 = _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(v)); 11845 const Full64<double> dd2; 11846 const __m128i i1 = 11847 _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToI64(UpperHalf(dd2, v))); 11848 return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)}; 11849 } 11850 11851 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)> 11852 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) { 11853 const RebindToFloat<decltype(di)> df; 11854 // See comment at the first occurrence of "IfThenElse(overflow,". 11855 const MFromD<DI> overflow = 11856 RebindMask(di, Ge(v, Set(df, 9.223372036854776e18))); 11857 return IfThenElse(overflow, Set(di, LimitsMax<int64_t>()), 11858 ConvertInRangeTo(di, v)); 11859 } 11860 #endif // HWY_ARCH_X86_64 11861 11862 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 11863 template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)), 11864 HWY_IF_I64_D(DI)> 11865 HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<Rebind<double, DI>> v) { 11866 using VI = VFromD<DI>; 11867 11868 VI biased_exp; 11869 const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp); 11870 const VI sign_mask = BroadcastSignBit(BitCast(di, v)); 11871 11872 // If the input was negative, negate the integer (two's complement). 11873 return (shifted ^ sign_mask) - sign_mask; 11874 } 11875 11876 template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)), 11877 HWY_IF_I64_D(DI)> 11878 HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) { 11879 using VI = VFromD<DI>; 11880 11881 VI biased_exp; 11882 const VI shifted = detail::ConvAbsInRangeF64ToUI64(di, v, biased_exp); 11883 11884 #if HWY_TARGET <= HWY_SSE4 11885 const auto in_range = biased_exp < Set(di, 1086); 11886 #else 11887 const Repartition<int32_t, decltype(di)> di32; 11888 const auto in_range = MaskFromVec(BitCast( 11889 di, 11890 VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086)))); 11891 #endif 11892 11893 // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. 11894 const VI sign_mask = BroadcastSignBit(BitCast(di, v)); 11895 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask; 11896 const VI magnitude = IfThenElse(in_range, shifted, limit); 11897 11898 // If the input was negative, negate the integer (two's complement). 11899 return (magnitude ^ sign_mask) - sign_mask; 11900 } 11901 #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 11902 11903 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 11904 template <class DU, HWY_IF_U64_D(DU)> 11905 HWY_API VFromD<DU> ConvertInRangeTo(DU du, VFromD<Rebind<double, DU>> v) { 11906 VFromD<DU> biased_exp; 11907 const auto shifted = detail::ConvAbsInRangeF64ToUI64(du, v, biased_exp); 11908 return shifted; 11909 } 11910 11911 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 11912 template <class DU, HWY_IF_U64_D(DU)> 11913 HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) { 11914 const RebindToSigned<DU> di; 11915 using VU = VFromD<DU>; 11916 11917 VU biased_exp; 11918 const VU shifted = 11919 detail::ConvAbsInRangeF64ToUI64(du, ZeroIfNegative(v), biased_exp); 11920 11921 // Exponent indicates whether the number can be represented as uint64_t. 11922 #if HWY_TARGET <= HWY_SSE4 11923 const VU out_of_range = 11924 BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086))); 11925 #else 11926 const Repartition<int32_t, decltype(di)> di32; 11927 const VU out_of_range = BitCast( 11928 du, 11929 VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086))); 11930 #endif 11931 11932 return (shifted | out_of_range); 11933 } 11934 #endif // HWY_TARGET <= HWY_AVX3 11935 11936 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11937 namespace detail { 11938 11939 template <class TTo, class TF, HWY_IF_SIGNED(TTo)> 11940 static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CXX14_CONSTEXPR TTo 11941 X86ScalarNearestInt(TF flt_val) { 11942 #if HWY_HAVE_SCALAR_F16_TYPE && HWY_HAVE_SCALAR_F16_OPERATORS 11943 using TFArith = If<hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>(), float, 11944 RemoveCvRef<TF>>; 11945 #else 11946 using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; 11947 #endif 11948 11949 const TTo trunc_int_val = X86ConvertScalarFromFloat<TTo>(flt_val); 11950 const TFArith abs_val_diff = ScalarAbs( 11951 ConvertScalarTo<TFArith>(ConvertScalarTo<TFArith>(flt_val) - 11952 ConvertScalarTo<TFArith>(trunc_int_val))); 11953 constexpr TFArith kHalf = ConvertScalarTo<TFArith>(0.5); 11954 11955 const bool round_result_up = 11956 ((trunc_int_val ^ ScalarShr(trunc_int_val, sizeof(TTo) * 8 - 1)) != 11957 LimitsMax<TTo>()) && 11958 (abs_val_diff > kHalf || 11959 (abs_val_diff == kHalf && (trunc_int_val & 1) != 0)); 11960 return static_cast<TTo>( 11961 trunc_int_val + 11962 (round_result_up ? (ScalarSignBit(flt_val) ? (-1) : 1) : 0)); 11963 } 11964 11965 } // namespace detail 11966 #endif // HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11967 11968 // If these are in namespace detail, the x86_256/512 templates are not found. 11969 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I32_D(DI)> 11970 static HWY_INLINE VFromD<DI> NearestIntInRange(DI, 11971 VFromD<RebindToFloat<DI>> v) { 11972 #if HWY_COMPILER_GCC_ACTUAL 11973 // Workaround for undefined behavior in _mm_cvtps_epi32 with GCC if any values 11974 // of v[i] are not within the range of an int32_t 11975 11976 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 11977 if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { 11978 typedef float GccF32RawVectType __attribute__((__vector_size__(16))); 11979 const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); 11980 return Dup128VecFromValues(DI(), 11981 detail::X86ScalarNearestInt<int32_t>(raw_v[0]), 11982 detail::X86ScalarNearestInt<int32_t>(raw_v[1]), 11983 detail::X86ScalarNearestInt<int32_t>(raw_v[2]), 11984 detail::X86ScalarNearestInt<int32_t>(raw_v[3])); 11985 } 11986 #endif 11987 11988 __m128i raw_result; 11989 __asm__("%vcvtps2dq {%1, %0|%0, %1}" 11990 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 11991 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 11992 :); 11993 return VFromD<DI>{raw_result}; 11994 #else // !HWY_COMPILER_GCC_ACTUAL 11995 return VFromD<DI>{_mm_cvtps_epi32(v.raw)}; 11996 #endif 11997 } 11998 11999 #if HWY_HAVE_FLOAT16 12000 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I16_D(DI)> 12001 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, 12002 VFromD<RebindToFloat<DI>> v) { 12003 #if HWY_COMPILER_GCC_ACTUAL 12004 // Workaround for undefined behavior in _mm_cvtph_epi16 if any values of v[i] 12005 // are not within the range of an int16_t 12006 12007 #if HWY_COMPILER_GCC_ACTUAL >= 1200 && !HWY_IS_DEBUG_BUILD && \ 12008 HWY_HAVE_SCALAR_F16_TYPE 12009 if (detail::IsConstantX86VecForF2IConv<int16_t>(v)) { 12010 typedef hwy::float16_t::Native GccF16RawVectType 12011 __attribute__((__vector_size__(16))); 12012 const auto raw_v = reinterpret_cast<GccF16RawVectType>(v.raw); 12013 return Dup128VecFromValues(DI(), 12014 detail::X86ScalarNearestInt<int16_t>(raw_v[0]), 12015 detail::X86ScalarNearestInt<int16_t>(raw_v[1]), 12016 detail::X86ScalarNearestInt<int16_t>(raw_v[2]), 12017 detail::X86ScalarNearestInt<int16_t>(raw_v[3]), 12018 detail::X86ScalarNearestInt<int16_t>(raw_v[4]), 12019 detail::X86ScalarNearestInt<int16_t>(raw_v[5]), 12020 detail::X86ScalarNearestInt<int16_t>(raw_v[6]), 12021 detail::X86ScalarNearestInt<int16_t>(raw_v[7])); 12022 } 12023 #endif 12024 12025 __m128i raw_result; 12026 __asm__("vcvtph2w {%1, %0|%0, %1}" 12027 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 12028 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 12029 :); 12030 return VFromD<DI>{raw_result}; 12031 #else // !HWY_COMPILER_GCC_ACTUAL 12032 return VFromD<DI>{_mm_cvtph_epi16(v.raw)}; 12033 #endif 12034 } 12035 #endif // HWY_HAVE_FLOAT16 12036 12037 #if HWY_TARGET <= HWY_AVX3 12038 12039 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 16), HWY_IF_I64_D(DI)> 12040 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, 12041 VFromD<RebindToFloat<DI>> v) { 12042 #if HWY_COMPILER_GCC_ACTUAL 12043 // Workaround for undefined behavior in _mm_cvtpd_epi64 with GCC if any 12044 // values of v[i] are not within the range of an int64_t 12045 12046 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 12047 if (detail::IsConstantX86VecForF2IConv<int64_t>(v)) { 12048 typedef double GccF64RawVectType __attribute__((__vector_size__(16))); 12049 const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); 12050 return Dup128VecFromValues(DI(), 12051 detail::X86ScalarNearestInt<int64_t>(raw_v[0]), 12052 detail::X86ScalarNearestInt<int64_t>(raw_v[1])); 12053 } 12054 #endif 12055 12056 __m128i raw_result; 12057 __asm__("vcvtpd2qq {%1, %0|%0, %1}" 12058 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 12059 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 12060 :); 12061 return VFromD<DI>{raw_result}; 12062 #else // !HWY_COMPILER_GCC_ACTUAL 12063 return VFromD<DI>{_mm_cvtpd_epi64(v.raw)}; 12064 #endif 12065 } 12066 12067 #else // HWY_TARGET > HWY_AVX3 12068 12069 namespace detail { 12070 12071 #if HWY_ARCH_X86_64 12072 template <size_t N> 12073 static HWY_INLINE int64_t 12074 SSE2ConvFirstF64LaneToNearestI64(Vec128<double, N> v) { 12075 #if HWY_COMPILER_GCC_ACTUAL 12076 // Workaround for undefined behavior in _mm_cvtsd_si64 with GCC if v[0] is 12077 // not within the range of an int64_t 12078 12079 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 12080 if (IsConstantX86Vec(hwy::SizeTag<1>(), v)) { 12081 typedef double GccF64RawVectType __attribute__((__vector_size__(16))); 12082 const auto raw_v = reinterpret_cast<GccF64RawVectType>(v.raw); 12083 return X86ScalarNearestInt<int64_t>(raw_v[0]); 12084 } 12085 #endif 12086 12087 int64_t result; 12088 __asm__("%vcvtsd2si {%1, %0|%0, %1}" 12089 : "=r"(result) 12090 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 12091 :); 12092 return result; 12093 #else 12094 return _mm_cvtsd_si64(v.raw); 12095 #endif 12096 } 12097 #endif // HWY_ARCH_X86_64 12098 12099 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 12100 template <class DI64, HWY_IF_I64_D(DI64)> 12101 static HWY_INLINE VFromD<DI64> SSE2NearestI64InRange( 12102 DI64 di64, VFromD<RebindToFloat<DI64>> v) { 12103 const RebindToFloat<DI64> df64; 12104 const RebindToUnsigned<DI64> du64; 12105 using VI64 = VFromD<decltype(di64)>; 12106 12107 const auto mant_end = Set(df64, MantissaEnd<double>()); 12108 const auto is_small = Lt(Abs(v), mant_end); 12109 12110 const auto adj_v = Max(v, Set(df64, -9223372036854775808.0)) + 12111 IfThenElseZero(is_small, CopySignToAbs(mant_end, v)); 12112 const auto adj_v_biased_exp = 12113 And(BitCast(di64, ShiftRight<52>(BitCast(du64, adj_v))), 12114 Set(di64, int64_t{0x7FF})); 12115 12116 // We can simply subtract 1075 from adj_v_biased_exp[i] to get shift_int since 12117 // adj_v_biased_exp[i] is at least 1075 12118 const VI64 shift_int = adj_v_biased_exp + Set(di64, int64_t{-1075}); 12119 12120 const VI64 mantissa = BitCast(di64, adj_v) & Set(di64, (1LL << 52) - 1); 12121 // Include implicit 1-bit if is_small[i] is 0. NOTE: the shift count may 12122 // exceed 63; we rely on x86 returning zero in that case. 12123 const VI64 int53 = mantissa | IfThenZeroElse(RebindMask(di64, is_small), 12124 Set(di64, 1LL << 52)); 12125 12126 const VI64 sign_mask = BroadcastSignBit(BitCast(di64, v)); 12127 // If the input was negative, negate the integer (two's complement). 12128 return ((int53 << shift_int) ^ sign_mask) - sign_mask; 12129 } 12130 #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 12131 12132 } // namespace detail 12133 12134 #if HWY_ARCH_X86_64 12135 template <class DI, HWY_IF_V_SIZE_D(DI, 8), HWY_IF_I64_D(DI)> 12136 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec64<double> v) { 12137 return VFromD<DI>{ 12138 _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v))}; 12139 } 12140 template <class DI, HWY_IF_V_SIZE_D(DI, 16), HWY_IF_I64_D(DI)> 12141 static HWY_INLINE VFromD<DI> NearestIntInRange(DI /*di*/, Vec128<double> v) { 12142 const __m128i i0 = 12143 _mm_cvtsi64_si128(detail::SSE2ConvFirstF64LaneToNearestI64(v)); 12144 const Full64<double> dd2; 12145 const __m128i i1 = _mm_cvtsi64_si128( 12146 detail::SSE2ConvFirstF64LaneToNearestI64(UpperHalf(dd2, v))); 12147 return VFromD<DI>{_mm_unpacklo_epi64(i0, i1)}; 12148 } 12149 #endif // HWY_ARCH_X86_64 12150 12151 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 12152 template <class DI, HWY_IF_V_SIZE_GT_D(DI, (HWY_ARCH_X86_64 ? 16 : 0)), 12153 HWY_IF_I64_D(DI)> 12154 static HWY_INLINE VFromD<DI> NearestIntInRange(DI di, 12155 VFromD<RebindToFloat<DI>> v) { 12156 return detail::SSE2NearestI64InRange(di, v); 12157 } 12158 #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 12159 12160 #endif // HWY_TARGET <= HWY_AVX3 12161 12162 template <class DI, HWY_IF_V_SIZE_LE_D(DI, 8), HWY_IF_I32_D(DI)> 12163 static HWY_INLINE VFromD<DI> DemoteToNearestIntInRange( 12164 DI, VFromD<Rebind<double, DI>> v) { 12165 #if HWY_COMPILER_GCC_ACTUAL 12166 // Workaround for undefined behavior in _mm_cvtpd_epi32 with GCC if any values 12167 // of v[i] are not within the range of an int32_t 12168 12169 #if HWY_COMPILER_GCC_ACTUAL >= 700 && !HWY_IS_DEBUG_BUILD 12170 if (detail::IsConstantX86VecForF2IConv<int32_t>(v)) { 12171 typedef double GccF32RawVectType __attribute__((__vector_size__(16))); 12172 const auto raw_v = reinterpret_cast<GccF32RawVectType>(v.raw); 12173 return Dup128VecFromValues( 12174 DI(), detail::X86ScalarNearestInt<int32_t>(raw_v[0]), 12175 detail::X86ScalarNearestInt<int32_t>(raw_v[1]), int32_t{0}, int32_t{0}); 12176 } 12177 #endif 12178 12179 __m128i raw_result; 12180 __asm__("%vcvtpd2dq {%1, %0|%0, %1}" 12181 : "=" HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(raw_result) 12182 : HWY_X86_GCC_INLINE_ASM_VEC_CONSTRAINT(v.raw) 12183 :); 12184 return VFromD<DI>{raw_result}; 12185 #else // !HWY_COMPILER_GCC_ACTUAL 12186 return VFromD<DI>{_mm_cvtpd_epi32(v.raw)}; 12187 #endif 12188 } 12189 12190 // F16/F32/F64 NearestInt is generic for all vector lengths 12191 template <class VF, class DF = DFromV<VF>, class DI = RebindToSigned<DF>, 12192 HWY_IF_FLOAT_D(DF), 12193 HWY_IF_T_SIZE_ONE_OF_D(DF, (1 << 4) | (1 << 8) | 12194 (HWY_HAVE_FLOAT16 ? (1 << 2) : 0))> 12195 HWY_API VFromD<DI> NearestInt(const VF v) { 12196 const DI di; 12197 using TI = TFromD<DI>; 12198 using TF = TFromD<DF>; 12199 using TFArith = If<sizeof(TF) <= sizeof(float), float, RemoveCvRef<TF>>; 12200 12201 constexpr TFArith kMinOutOfRangePosVal = 12202 static_cast<TFArith>(-static_cast<TFArith>(LimitsMin<TI>())); 12203 static_assert(kMinOutOfRangePosVal > static_cast<TFArith>(0.0), 12204 "kMinOutOfRangePosVal > 0.0 must be true"); 12205 12206 // See comment at the first occurrence of "IfThenElse(overflow,". 12207 // Here we are rounding, whereas previous occurrences truncate, but there is 12208 // no difference because the previous float value is well below the max i32. 12209 const auto overflow = RebindMask( 12210 di, Ge(v, Set(DF(), ConvertScalarTo<TF>(kMinOutOfRangePosVal)))); 12211 auto result = 12212 IfThenElse(overflow, Set(di, LimitsMax<TI>()), NearestIntInRange(di, v)); 12213 12214 return result; 12215 } 12216 12217 template <class DI, HWY_IF_I32_D(DI)> 12218 HWY_API VFromD<DI> DemoteToNearestInt(DI, VFromD<Rebind<double, DI>> v) { 12219 const DI di; 12220 const Rebind<double, DI> df64; 12221 return DemoteToNearestIntInRange(di, Min(v, Set(df64, 2147483647.0))); 12222 } 12223 12224 // ------------------------------ Floating-point rounding (ConvertTo) 12225 12226 #if HWY_TARGET >= HWY_SSSE3 12227 12228 // Toward nearest integer, ties to even 12229 template <typename T, size_t N> 12230 HWY_API Vec128<T, N> Round(const Vec128<T, N> v) { 12231 static_assert(IsFloat<T>(), "Only for float"); 12232 // Rely on rounding after addition with a large value such that no mantissa 12233 // bits remain (assuming the current mode is nearest-even). We may need a 12234 // compiler flag for precise floating-point to prevent "optimizing" this out. 12235 const DFromV<decltype(v)> df; 12236 const auto max = Set(df, MantissaEnd<T>()); 12237 const auto large = CopySignToAbs(max, v); 12238 const auto added = large + v; 12239 const auto rounded = added - large; 12240 // Keep original if NaN or the magnitude is large (already an int). 12241 return IfThenElse(Abs(v) < max, rounded, v); 12242 } 12243 12244 namespace detail { 12245 12246 // Truncating to integer and converting back to float is correct except when the 12247 // input magnitude is large, in which case the input was already an integer 12248 // (because mantissa >> exponent is zero). 12249 template <typename T, size_t N> 12250 HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) { 12251 static_assert(IsFloat<T>(), "Only for float"); 12252 const DFromV<decltype(v)> d; 12253 return Abs(v) < Set(d, MantissaEnd<T>()); 12254 } 12255 12256 } // namespace detail 12257 12258 // Toward zero, aka truncate 12259 template <typename T, size_t N> 12260 HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) { 12261 static_assert(IsFloat<T>(), "Only for float"); 12262 const DFromV<decltype(v)> df; 12263 const RebindToSigned<decltype(df)> di; 12264 12265 const auto integer = ConvertInRangeTo(di, v); // round toward 0 12266 const auto int_f = ConvertTo(df, integer); 12267 12268 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); 12269 } 12270 12271 // Toward +infinity, aka ceiling 12272 template <typename T, size_t N> 12273 HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) { 12274 static_assert(IsFloat<T>(), "Only for float"); 12275 const DFromV<decltype(v)> df; 12276 const RebindToSigned<decltype(df)> di; 12277 12278 const auto integer = ConvertInRangeTo(di, v); // round toward 0 12279 const auto int_f = ConvertTo(df, integer); 12280 12281 // Truncating a positive non-integer ends up smaller; if so, add 1. 12282 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); 12283 12284 return IfThenElse(detail::UseInt(v), int_f - neg1, v); 12285 } 12286 12287 #ifdef HWY_NATIVE_CEIL_FLOOR_INT 12288 #undef HWY_NATIVE_CEIL_FLOOR_INT 12289 #else 12290 #define HWY_NATIVE_CEIL_FLOOR_INT 12291 #endif 12292 12293 template <class V, HWY_IF_FLOAT_V(V)> 12294 HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) { 12295 const DFromV<decltype(v)> df; 12296 const RebindToSigned<decltype(df)> di; 12297 12298 const auto integer = ConvertTo(di, v); // round toward 0 12299 const auto int_f = ConvertTo(df, integer); 12300 12301 // Truncating a positive non-integer ends up smaller; if so, add 1. 12302 return integer - 12303 VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f < v))); 12304 } 12305 12306 // Toward -infinity, aka floor 12307 template <typename T, size_t N> 12308 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) { 12309 static_assert(IsFloat<T>(), "Only for float"); 12310 const DFromV<decltype(v)> df; 12311 const RebindToSigned<decltype(df)> di; 12312 12313 const auto integer = ConvertInRangeTo(di, v); // round toward 0 12314 const auto int_f = ConvertTo(df, integer); 12315 12316 // Truncating a negative non-integer ends up larger; if so, subtract 1. 12317 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); 12318 12319 return IfThenElse(detail::UseInt(v), int_f + neg1, v); 12320 } 12321 12322 template <class V, HWY_IF_FLOAT_V(V)> 12323 HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) { 12324 const DFromV<decltype(v)> df; 12325 const RebindToSigned<decltype(df)> di; 12326 12327 const auto integer = ConvertTo(di, v); // round toward 0 12328 const auto int_f = ConvertTo(df, integer); 12329 12330 // Truncating a negative non-integer ends up larger; if so, subtract 1. 12331 return integer + 12332 VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f > v))); 12333 } 12334 12335 #else 12336 12337 // Toward nearest integer, ties to even 12338 #if HWY_HAVE_FLOAT16 12339 template <size_t N> 12340 HWY_API Vec128<float16_t, N> Round(const Vec128<float16_t, N> v) { 12341 return Vec128<float16_t, N>{ 12342 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; 12343 } 12344 #endif // HWY_HAVE_FLOAT16 12345 template <size_t N> 12346 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) { 12347 return Vec128<float, N>{ 12348 _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; 12349 } 12350 template <size_t N> 12351 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) { 12352 return Vec128<double, N>{ 12353 _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; 12354 } 12355 12356 // Toward zero, aka truncate 12357 #if HWY_HAVE_FLOAT16 12358 template <size_t N> 12359 HWY_API Vec128<float16_t, N> Trunc(const Vec128<float16_t, N> v) { 12360 return Vec128<float16_t, N>{ 12361 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; 12362 } 12363 #endif // HWY_HAVE_FLOAT16 12364 template <size_t N> 12365 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) { 12366 return Vec128<float, N>{ 12367 _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; 12368 } 12369 template <size_t N> 12370 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) { 12371 return Vec128<double, N>{ 12372 _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; 12373 } 12374 12375 // Toward +infinity, aka ceiling 12376 #if HWY_HAVE_FLOAT16 12377 template <size_t N> 12378 HWY_API Vec128<float16_t, N> Ceil(const Vec128<float16_t, N> v) { 12379 return Vec128<float16_t, N>{ 12380 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; 12381 } 12382 #endif // HWY_HAVE_FLOAT16 12383 template <size_t N> 12384 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) { 12385 return Vec128<float, N>{ 12386 _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; 12387 } 12388 template <size_t N> 12389 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) { 12390 return Vec128<double, N>{ 12391 _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; 12392 } 12393 12394 // Toward -infinity, aka floor 12395 #if HWY_HAVE_FLOAT16 12396 template <size_t N> 12397 HWY_API Vec128<float16_t, N> Floor(const Vec128<float16_t, N> v) { 12398 return Vec128<float16_t, N>{ 12399 _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; 12400 } 12401 #endif // HWY_HAVE_FLOAT16 12402 template <size_t N> 12403 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) { 12404 return Vec128<float, N>{ 12405 _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; 12406 } 12407 template <size_t N> 12408 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) { 12409 return Vec128<double, N>{ 12410 _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; 12411 } 12412 12413 #endif // !HWY_SSSE3 12414 12415 // ------------------------------ Floating-point classification 12416 12417 #define HWY_X86_FPCLASS_QNAN 0x01 12418 #define HWY_X86_FPCLASS_POS0 0x02 12419 #define HWY_X86_FPCLASS_NEG0 0x04 12420 #define HWY_X86_FPCLASS_POS_INF 0x08 12421 #define HWY_X86_FPCLASS_NEG_INF 0x10 12422 #define HWY_X86_FPCLASS_SUBNORMAL 0x20 12423 #define HWY_X86_FPCLASS_NEG 0x40 12424 #define HWY_X86_FPCLASS_SNAN 0x80 12425 12426 #if HWY_HAVE_FLOAT16 || HWY_IDE 12427 12428 template <size_t N> 12429 HWY_API Mask128<float16_t, N> IsNaN(const Vec128<float16_t, N> v) { 12430 return Mask128<float16_t, N>{ 12431 _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; 12432 } 12433 12434 template <size_t N> 12435 HWY_API Mask128<float16_t, N> IsEitherNaN(Vec128<float16_t, N> a, 12436 Vec128<float16_t, N> b) { 12437 // Work around warnings in the intrinsic definitions (passing -1 as a mask). 12438 HWY_DIAGNOSTICS(push) 12439 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 12440 return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_UNORD_Q)}; 12441 HWY_DIAGNOSTICS(pop) 12442 } 12443 12444 template <size_t N> 12445 HWY_API Mask128<float16_t, N> IsInf(const Vec128<float16_t, N> v) { 12446 return Mask128<float16_t, N>{_mm_fpclass_ph_mask( 12447 v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; 12448 } 12449 12450 template <size_t N> 12451 HWY_API Mask128<float16_t, N> IsFinite(const Vec128<float16_t, N> v) { 12452 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN 12453 // and negate the mask. 12454 return Not(Mask128<float16_t, N>{_mm_fpclass_ph_mask( 12455 v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | 12456 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); 12457 } 12458 12459 #endif // HWY_HAVE_FLOAT16 12460 12461 template <size_t N> 12462 HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) { 12463 #if HWY_TARGET <= HWY_AVX3 12464 return Mask128<float, N>{ 12465 _mm_fpclass_ps_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; 12466 #else 12467 return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)}; 12468 #endif 12469 } 12470 template <size_t N> 12471 HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) { 12472 #if HWY_TARGET <= HWY_AVX3 12473 return Mask128<double, N>{ 12474 _mm_fpclass_pd_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; 12475 #else 12476 return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)}; 12477 #endif 12478 } 12479 12480 #ifdef HWY_NATIVE_IS_EITHER_NAN 12481 #undef HWY_NATIVE_IS_EITHER_NAN 12482 #else 12483 #define HWY_NATIVE_IS_EITHER_NAN 12484 #endif 12485 12486 template <size_t N> 12487 HWY_API Mask128<float, N> IsEitherNaN(Vec128<float, N> a, Vec128<float, N> b) { 12488 #if HWY_TARGET <= HWY_AVX3 12489 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_UNORD_Q)}; 12490 #else 12491 return Mask128<float, N>{_mm_cmpunord_ps(a.raw, b.raw)}; 12492 #endif 12493 } 12494 12495 template <size_t N> 12496 HWY_API Mask128<double, N> IsEitherNaN(Vec128<double, N> a, 12497 Vec128<double, N> b) { 12498 #if HWY_TARGET <= HWY_AVX3 12499 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_UNORD_Q)}; 12500 #else 12501 return Mask128<double, N>{_mm_cmpunord_pd(a.raw, b.raw)}; 12502 #endif 12503 } 12504 12505 #if HWY_TARGET <= HWY_AVX3 12506 12507 // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite. 12508 #ifdef HWY_NATIVE_ISINF 12509 #undef HWY_NATIVE_ISINF 12510 #else 12511 #define HWY_NATIVE_ISINF 12512 #endif 12513 12514 template <size_t N> 12515 HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) { 12516 return Mask128<float, N>{_mm_fpclass_ps_mask( 12517 v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; 12518 } 12519 template <size_t N> 12520 HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) { 12521 return Mask128<double, N>{_mm_fpclass_pd_mask( 12522 v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; 12523 } 12524 12525 // Returns whether normal/subnormal/zero. 12526 template <size_t N> 12527 HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) { 12528 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN 12529 // and negate the mask. 12530 return Not(Mask128<float, N>{_mm_fpclass_ps_mask( 12531 v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | 12532 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); 12533 } 12534 template <size_t N> 12535 HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) { 12536 return Not(Mask128<double, N>{_mm_fpclass_pd_mask( 12537 v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | 12538 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); 12539 } 12540 12541 #endif // HWY_TARGET <= HWY_AVX3 12542 12543 // ================================================== CRYPTO 12544 12545 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 12546 12547 // Per-target flag to prevent generic_ops-inl.h from defining AESRound. 12548 #ifdef HWY_NATIVE_AES 12549 #undef HWY_NATIVE_AES 12550 #else 12551 #define HWY_NATIVE_AES 12552 #endif 12553 12554 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state, 12555 Vec128<uint8_t> round_key) { 12556 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)}; 12557 } 12558 12559 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state, 12560 Vec128<uint8_t> round_key) { 12561 return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)}; 12562 } 12563 12564 HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) { 12565 return Vec128<uint8_t>{_mm_aesimc_si128(state.raw)}; 12566 } 12567 12568 HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state, 12569 Vec128<uint8_t> round_key) { 12570 return Vec128<uint8_t>{_mm_aesdec_si128(state.raw, round_key.raw)}; 12571 } 12572 12573 HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state, 12574 Vec128<uint8_t> round_key) { 12575 return Vec128<uint8_t>{_mm_aesdeclast_si128(state.raw, round_key.raw)}; 12576 } 12577 12578 template <uint8_t kRcon> 12579 HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) { 12580 return Vec128<uint8_t>{_mm_aeskeygenassist_si128(v.raw, kRcon)}; 12581 } 12582 12583 template <size_t N> 12584 HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a, 12585 Vec128<uint64_t, N> b) { 12586 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)}; 12587 } 12588 12589 template <size_t N> 12590 HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a, 12591 Vec128<uint64_t, N> b) { 12592 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)}; 12593 } 12594 12595 #endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 12596 12597 // ================================================== MISC 12598 12599 // ------------------------------ LoadMaskBits (TestBit) 12600 12601 #if HWY_TARGET > HWY_AVX3 12602 namespace detail { 12603 12604 template <class D, HWY_IF_T_SIZE_D(D, 1)> 12605 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { 12606 const RebindToUnsigned<decltype(d)> du; 12607 // Easier than Set(), which would require an >8-bit type, which would not 12608 // compile for T=uint8_t, kN=1. 12609 const VFromD<D> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))}; 12610 12611 #if HWY_TARGET == HWY_SSE2 12612 // {b0, b1, ...} ===> {b0, b0, b1, b1, ...} 12613 __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw); 12614 // {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...} 12615 unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits); 12616 // {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==> 12617 // {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1} 12618 const VFromD<decltype(du)> rep8{ 12619 _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)}; 12620 #else 12621 // Replicate bytes 8x such that each byte contains the bit that governs it. 12622 alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 12623 1, 1, 1, 1, 1, 1, 1, 1}; 12624 const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); 12625 #endif 12626 const VFromD<decltype(du)> bit = Dup128VecFromValues( 12627 du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); 12628 return RebindMask(d, TestBit(rep8, bit)); 12629 } 12630 12631 template <class D, HWY_IF_T_SIZE_D(D, 2)> 12632 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { 12633 const RebindToUnsigned<decltype(d)> du; 12634 alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; 12635 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits)); 12636 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); 12637 } 12638 12639 template <class D, HWY_IF_T_SIZE_D(D, 4)> 12640 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { 12641 const RebindToUnsigned<decltype(d)> du; 12642 alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; 12643 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits)); 12644 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); 12645 } 12646 12647 template <class D, HWY_IF_T_SIZE_D(D, 8)> 12648 HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) { 12649 const RebindToUnsigned<decltype(d)> du; 12650 alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; 12651 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); 12652 } 12653 12654 } // namespace detail 12655 #endif // HWY_TARGET > HWY_AVX3 12656 12657 // `p` points to at least 8 readable bytes, not all of which need be valid. 12658 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12659 HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { 12660 constexpr size_t kN = MaxLanes(d); 12661 #if HWY_TARGET <= HWY_AVX3 12662 (void)d; 12663 uint64_t mask_bits = 0; 12664 constexpr size_t kNumBytes = (kN + 7) / 8; 12665 CopyBytes<kNumBytes>(bits, &mask_bits); 12666 if (kN < 8) { 12667 mask_bits &= (1ull << kN) - 1; 12668 } 12669 12670 return MFromD<D>::FromBits(mask_bits); 12671 #else 12672 uint64_t mask_bits = 0; 12673 constexpr size_t kNumBytes = (kN + 7) / 8; 12674 CopyBytes<kNumBytes>(bits, &mask_bits); 12675 if (kN < 8) { 12676 mask_bits &= (1ull << kN) - 1; 12677 } 12678 12679 return detail::LoadMaskBits128(d, mask_bits); 12680 #endif 12681 } 12682 12683 // ------------------------------ Dup128MaskFromMaskBits 12684 12685 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12686 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { 12687 constexpr size_t kN = MaxLanes(d); 12688 if (kN < 8) mask_bits &= (1u << kN) - 1; 12689 12690 #if HWY_TARGET <= HWY_AVX3 12691 return MFromD<D>::FromBits(mask_bits); 12692 #else 12693 return detail::LoadMaskBits128(d, mask_bits); 12694 #endif 12695 } 12696 12697 template <typename T> 12698 struct CompressIsPartition { 12699 #if HWY_TARGET <= HWY_AVX3 12700 // AVX3 supports native compress, but a table-based approach allows 12701 // 'partitioning' (also moving mask=false lanes to the top), which helps 12702 // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8 12703 // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3 12704 // u32x8 etc.). 12705 enum { value = (sizeof(T) == 8) }; 12706 #else 12707 // generic_ops-inl does not guarantee IsPartition for 8-bit. 12708 enum { value = (sizeof(T) != 1) }; 12709 #endif 12710 }; 12711 12712 namespace detail { 12713 12714 // Returns `mask_bits` (from movemask) with the upper bits cleared, if there 12715 // are 8 or fewer valid bits. 12716 template <class D> 12717 constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) { 12718 return (d.MaxBytes() >= 16) ? mask_bits 12719 : mask_bits & ((1ull << d.MaxLanes()) - 1); 12720 } 12721 12722 } // namespace detail 12723 12724 #if HWY_TARGET <= HWY_AVX3 12725 12726 // ------------------------------ BitsFromMask (MFromD, OnlyActive) 12727 // Generic for all vector lengths. 12728 template <class D> 12729 HWY_INLINE uint64_t BitsFromMask(D d, MFromD<D> mask) { 12730 return detail::OnlyActive(d, mask.raw); 12731 } 12732 12733 // ------------------------------ StoreMaskBits 12734 12735 // `p` points to at least 8 writable bytes. 12736 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12737 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { 12738 constexpr size_t kN = MaxLanes(d); 12739 constexpr size_t kNumBytes = (kN + 7) / 8; 12740 CopyBytes<kNumBytes>(&mask.raw, bits); 12741 12742 // Non-full byte, need to clear the undefined upper bits. 12743 if (kN < 8) { 12744 const int mask_bits = (1 << kN) - 1; 12745 bits[0] = static_cast<uint8_t>(bits[0] & mask_bits); 12746 } 12747 12748 return kNumBytes; 12749 } 12750 12751 // ------------------------------ Mask testing 12752 12753 // Beware: the suffix indicates the number of mask bits, not lane size! 12754 12755 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12756 HWY_API size_t CountTrue(D d, MFromD<D> mask) { 12757 constexpr size_t kN = MaxLanes(d); 12758 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); 12759 return PopCount(mask_bits); 12760 } 12761 12762 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12763 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { 12764 constexpr size_t kN = MaxLanes(d); 12765 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); 12766 return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); 12767 } 12768 12769 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12770 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { 12771 constexpr size_t kN = MaxLanes(d); 12772 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); 12773 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; 12774 } 12775 12776 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12777 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { 12778 constexpr size_t kN = MaxLanes(d); 12779 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); 12780 return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); 12781 } 12782 12783 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12784 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { 12785 constexpr size_t kN = MaxLanes(d); 12786 const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); 12787 return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) 12788 : -1; 12789 } 12790 12791 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12792 HWY_API bool AllFalse(D d, MFromD<D> mask) { 12793 constexpr size_t kN = MaxLanes(d); 12794 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); 12795 return mask_bits == 0; 12796 } 12797 12798 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12799 HWY_API bool AllTrue(D d, MFromD<D> mask) { 12800 constexpr size_t kN = MaxLanes(d); 12801 const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); 12802 // Cannot use _kortestc because we may have less than 8 mask bits. 12803 return mask_bits == (1ull << kN) - 1; 12804 } 12805 12806 // ------------------------------ Compress 12807 12808 // 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512. 12809 12810 // Single lane: no-op 12811 template <typename T> 12812 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 12813 return v; 12814 } 12815 12816 template <size_t N, HWY_IF_V_SIZE_GT(float, N, 4)> 12817 HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) { 12818 return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)}; 12819 } 12820 12821 template <typename T, HWY_IF_T_SIZE(T, 8)> 12822 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { 12823 HWY_DASSERT(mask.raw < 4); 12824 12825 // There are only 2 lanes, so we can afford to load the index vector directly. 12826 alignas(16) static constexpr uint8_t u8_indices[64] = { 12827 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12828 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12829 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 12830 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 12831 12832 const DFromV<decltype(v)> d; 12833 const Repartition<uint8_t, decltype(d)> d8; 12834 const auto index = Load(d8, u8_indices + 16 * mask.raw); 12835 return BitCast(d, TableLookupBytes(BitCast(d8, v), index)); 12836 } 12837 12838 // ------------------------------ CompressNot (Compress) 12839 12840 // Single lane: no-op 12841 template <typename T> 12842 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 12843 return v; 12844 } 12845 12846 template <typename T, HWY_IF_T_SIZE(T, 8)> 12847 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { 12848 // See CompressIsPartition, PrintCompressNot64x2NibbleTables 12849 alignas(16) static constexpr uint64_t packed_array[16] = { 12850 0x00000010, 0x00000001, 0x00000010, 0x00000010}; 12851 12852 // For lane i, shift the i-th 4-bit index down to bits [0, 2). 12853 const DFromV<decltype(v)> d; 12854 const RebindToUnsigned<decltype(d)> du64; 12855 const auto packed = Set(du64, packed_array[mask.raw]); 12856 alignas(16) static constexpr uint64_t kShifts[2] = {0, 4}; 12857 Vec128<uint64_t> indices = packed >> Load(du64, kShifts); 12858 // _mm_permutevar_pd will ignore the upper bits, but TableLookupLanes uses 12859 // a fallback in MSAN builds, so mask there. 12860 HWY_IF_CONSTEXPR(HWY_IS_MSAN) indices &= Set(du64, 1); 12861 return TableLookupLanes(v, Indices128<T>{indices.raw}); 12862 } 12863 12864 // ------------------------------ CompressBlocksNot 12865 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, 12866 Mask128<uint64_t> /* m */) { 12867 return v; 12868 } 12869 12870 // ------------------------------ CompressStore (defined in x86_512) 12871 12872 // ------------------------------ CompressBlendedStore (defined in x86_avx3) 12873 12874 // ------------------------------ CompressBitsStore (defined in x86_512) 12875 12876 #else // AVX2 or below 12877 12878 // ------------------------------ BitsFromMask 12879 12880 namespace detail { 12881 12882 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { 12883 return static_cast<uint64_t>(static_cast<unsigned>(mask_bits)); 12884 } 12885 12886 } // namespace detail 12887 12888 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)> 12889 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 12890 const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; 12891 return detail::OnlyActive(d, 12892 detail::U64FromInt(_mm_movemask_epi8(sign_bits))); 12893 } 12894 12895 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)> 12896 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 12897 // Remove useless lower half of each u16 while preserving the sign bit. 12898 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); 12899 return detail::OnlyActive(d, 12900 detail::U64FromInt(_mm_movemask_epi8(sign_bits))); 12901 } 12902 12903 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)> 12904 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 12905 const RebindToFloat<decltype(d)> df; 12906 const auto sign_bits = BitCast(df, VecFromMask(d, mask)); 12907 return detail::OnlyActive(d, 12908 detail::U64FromInt(_mm_movemask_ps(sign_bits.raw))); 12909 } 12910 12911 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)> 12912 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 12913 const RebindToFloat<D> df; 12914 const auto sign_bits = BitCast(df, VecFromMask(d, mask)); 12915 return detail::OnlyActive(d, 12916 detail::U64FromInt(_mm_movemask_pd(sign_bits.raw))); 12917 } 12918 12919 // ------------------------------ StoreMaskBits 12920 // `p` points to at least 8 writable bytes. 12921 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12922 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) { 12923 constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; 12924 const uint64_t mask_bits = BitsFromMask(d, mask); 12925 CopyBytes<kNumBytes>(&mask_bits, bits); 12926 return kNumBytes; 12927 } 12928 12929 // ------------------------------ Mask testing 12930 12931 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12932 HWY_API bool AllFalse(D d, MFromD<D> mask) { 12933 // Cheaper than PTEST, which is 2 uop / 3L. 12934 return BitsFromMask(d, mask) == 0; 12935 } 12936 12937 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12938 HWY_API bool AllTrue(D d, MFromD<D> mask) { 12939 constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; 12940 return BitsFromMask(d, mask) == kAllBits; 12941 } 12942 12943 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12944 HWY_API size_t CountTrue(D d, MFromD<D> mask) { 12945 return PopCount(BitsFromMask(d, mask)); 12946 } 12947 12948 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12949 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { 12950 return Num0BitsBelowLS1Bit_Nonzero32( 12951 static_cast<uint32_t>(BitsFromMask(d, mask))); 12952 } 12953 12954 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12955 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { 12956 const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask)); 12957 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; 12958 } 12959 12960 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12961 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { 12962 return 31 - Num0BitsAboveMS1Bit_Nonzero32( 12963 static_cast<uint32_t>(BitsFromMask(d, mask))); 12964 } 12965 12966 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 12967 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { 12968 const uint32_t mask_bits = static_cast<uint32_t>(BitsFromMask(d, mask)); 12969 return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) 12970 : -1; 12971 } 12972 12973 // ------------------------------ Compress, CompressBits 12974 12975 namespace detail { 12976 12977 // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. 12978 template <class D, HWY_IF_T_SIZE_D(D, 2)> 12979 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { 12980 HWY_DASSERT(mask_bits < 256); 12981 const Rebind<uint8_t, decltype(d)> d8; 12982 const Twice<decltype(d8)> d8t; 12983 const RebindToUnsigned<decltype(d)> du; 12984 12985 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need 12986 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of 12987 // 8 mask bits). Loading them directly would require 4 KiB. We can instead 12988 // store lane indices and convert to byte indices (2*lane + 0..1), with the 12989 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane 12990 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. 12991 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles 12992 // is likely more costly than the higher cache footprint from storing bytes. 12993 alignas(16) static constexpr uint8_t table[2048] = { 12994 // PrintCompress16x8Tables 12995 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12996 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12997 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 12998 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12999 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 13000 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 13001 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 13002 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 13003 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 13004 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 13005 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 13006 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 13007 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 13008 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 13009 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 13010 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 13011 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 13012 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 13013 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 13014 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 13015 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 13016 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 13017 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 13018 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 13019 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 13020 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 13021 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 13022 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 13023 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 13024 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 13025 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 13026 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 13027 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 13028 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 13029 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 13030 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 13031 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 13032 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 13033 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 13034 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 13035 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 13036 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 13037 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 13038 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 13039 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 13040 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 13041 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 13042 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 13043 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 13044 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 13045 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 13046 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 13047 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 13048 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 13049 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 13050 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 13051 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 13052 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 13053 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 13054 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 13055 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 13056 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 13057 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 13058 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 13059 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 13060 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 13061 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 13062 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 13063 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 13064 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 13065 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 13066 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 13067 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 13068 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 13069 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 13070 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 13071 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 13072 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 13073 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 13074 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 13075 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 13076 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 13077 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 13078 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 13079 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 13080 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 13081 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 13082 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 13083 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 13084 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 13085 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 13086 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 13087 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 13088 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 13089 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 13090 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 13091 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 13092 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 13093 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 13094 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 13095 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 13096 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 13097 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 13098 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 13099 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 13100 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 13101 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 13102 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 13103 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 13104 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 13105 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 13106 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 13107 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 13108 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 13109 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 13110 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 13111 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 13112 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 13113 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 13114 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 13115 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 13116 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 13117 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 13118 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 13119 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 13120 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 13121 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 13122 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 13123 13124 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; 13125 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); 13126 return BitCast(d, pairs + Set(du, 0x0100)); 13127 } 13128 13129 template <class D, HWY_IF_T_SIZE_D(D, 2)> 13130 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { 13131 HWY_DASSERT(mask_bits < 256); 13132 const Rebind<uint8_t, decltype(d)> d8; 13133 const Twice<decltype(d8)> d8t; 13134 const RebindToUnsigned<decltype(d)> du; 13135 13136 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need 13137 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of 13138 // 8 mask bits). Loading them directly would require 4 KiB. We can instead 13139 // store lane indices and convert to byte indices (2*lane + 0..1), with the 13140 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane 13141 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. 13142 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles 13143 // is likely more costly than the higher cache footprint from storing bytes. 13144 alignas(16) static constexpr uint8_t table[2048] = { 13145 // PrintCompressNot16x8Tables 13146 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 13147 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 13148 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 13149 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 13150 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 13151 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 13152 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 13153 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 13154 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 13155 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 13156 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 13157 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 13158 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 13159 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 13160 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 13161 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 13162 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 13163 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 13164 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 13165 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 13166 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 13167 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 13168 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 13169 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 13170 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 13171 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 13172 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 13173 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 13174 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 13175 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 13176 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 13177 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 13178 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 13179 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 13180 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 13181 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 13182 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 13183 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 13184 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 13185 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 13186 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 13187 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 13188 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 13189 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 13190 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 13191 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 13192 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 13193 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 13194 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 13195 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 13196 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 13197 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 13198 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 13199 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 13200 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 13201 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 13202 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 13203 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 13204 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 13205 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 13206 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 13207 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 13208 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 13209 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 13210 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 13211 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 13212 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 13213 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 13214 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 13215 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 13216 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 13217 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 13218 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 13219 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 13220 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 13221 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 13222 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 13223 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 13224 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 13225 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 13226 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 13227 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 13228 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 13229 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 13230 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 13231 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 13232 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 13233 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 13234 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 13235 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 13236 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 13237 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 13238 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 13239 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 13240 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 13241 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 13242 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 13243 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 13244 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 13245 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 13246 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 13247 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 13248 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 13249 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 13250 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 13251 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 13252 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 13253 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 13254 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 13255 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 13256 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 13257 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 13258 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 13259 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 13260 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 13261 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 13262 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 13263 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 13264 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 13265 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 13266 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 13267 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 13268 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 13269 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 13270 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 13271 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 13272 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 13273 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 13274 13275 const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw}; 13276 const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx); 13277 return BitCast(d, pairs + Set(du, 0x0100)); 13278 } 13279 13280 template <class D, HWY_IF_T_SIZE_D(D, 4)> 13281 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { 13282 HWY_DASSERT(mask_bits < 16); 13283 13284 // There are only 4 lanes, so we can afford to load the index vector directly. 13285 alignas(16) static constexpr uint8_t u8_indices[256] = { 13286 // PrintCompress32x4Tables 13287 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 13288 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 13289 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 13290 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 13291 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 13292 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 13293 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 13294 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 13295 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 13296 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 13297 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 13298 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 13299 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 13300 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 13301 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 13302 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 13303 13304 const Repartition<uint8_t, decltype(d)> d8; 13305 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 13306 } 13307 13308 template <class D, HWY_IF_T_SIZE_D(D, 4)> 13309 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { 13310 HWY_DASSERT(mask_bits < 16); 13311 13312 // There are only 4 lanes, so we can afford to load the index vector directly. 13313 alignas(16) static constexpr uint8_t u8_indices[256] = { 13314 // PrintCompressNot32x4Tables 13315 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 13316 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 13317 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13318 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 13319 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 13320 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 13321 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13322 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13323 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 13324 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 13325 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 13326 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 13327 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13328 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13329 12, 13, 14, 15}; 13330 13331 const Repartition<uint8_t, decltype(d)> d8; 13332 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 13333 } 13334 13335 template <class D, HWY_IF_T_SIZE_D(D, 8)> 13336 HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) { 13337 HWY_DASSERT(mask_bits < 4); 13338 13339 // There are only 2 lanes, so we can afford to load the index vector directly. 13340 alignas(16) static constexpr uint8_t u8_indices[64] = { 13341 // PrintCompress64x2Tables 13342 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13343 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13344 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 13345 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 13346 13347 const Repartition<uint8_t, decltype(d)> d8; 13348 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 13349 } 13350 13351 template <class D, HWY_IF_T_SIZE_D(D, 8)> 13352 HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) { 13353 HWY_DASSERT(mask_bits < 4); 13354 13355 // There are only 2 lanes, so we can afford to load the index vector directly. 13356 alignas(16) static constexpr uint8_t u8_indices[64] = { 13357 // PrintCompressNot64x2Tables 13358 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13359 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 13360 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13361 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 13362 13363 const Repartition<uint8_t, decltype(d)> d8; 13364 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 13365 } 13366 13367 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 13368 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) { 13369 const DFromV<decltype(v)> d; 13370 const RebindToUnsigned<decltype(d)> du; 13371 13372 HWY_DASSERT(mask_bits < (1ull << N)); 13373 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 13374 return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 13375 } 13376 13377 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 13378 HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) { 13379 const DFromV<decltype(v)> d; 13380 const RebindToUnsigned<decltype(d)> du; 13381 13382 HWY_DASSERT(mask_bits < (1ull << N)); 13383 const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); 13384 return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 13385 } 13386 13387 } // namespace detail 13388 13389 // Single lane: no-op 13390 template <typename T> 13391 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 13392 return v; 13393 } 13394 13395 // Two lanes: conditional swap 13396 template <typename T, HWY_IF_T_SIZE(T, 8)> 13397 HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) { 13398 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 13399 const DFromV<decltype(v)> d; 13400 const Vec128<T> m = VecFromMask(d, mask); 13401 const Vec128<T> maskL = DupEven(m); 13402 const Vec128<T> maskH = DupOdd(m); 13403 const Vec128<T> swap = AndNot(maskL, maskH); 13404 return IfVecThenElse(swap, Shuffle01(v), v); 13405 } 13406 13407 // General case, 2 or 4 bytes 13408 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> 13409 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { 13410 const DFromV<decltype(v)> d; 13411 return detail::CompressBits(v, BitsFromMask(d, mask)); 13412 } 13413 13414 // ------------------------------ CompressNot 13415 13416 // Single lane: no-op 13417 template <typename T> 13418 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 13419 return v; 13420 } 13421 13422 // Two lanes: conditional swap 13423 template <typename T, HWY_IF_T_SIZE(T, 8)> 13424 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { 13425 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. 13426 const DFromV<decltype(v)> d; 13427 const Vec128<T> m = VecFromMask(d, mask); 13428 const Vec128<T> maskL = DupEven(m); 13429 const Vec128<T> maskH = DupOdd(m); 13430 const Vec128<T> swap = AndNot(maskH, maskL); 13431 return IfVecThenElse(swap, Shuffle01(v), v); 13432 } 13433 13434 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> 13435 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { 13436 const DFromV<decltype(v)> d; 13437 // For partial vectors, we cannot pull the Not() into the table because 13438 // BitsFromMask clears the upper bits. 13439 if (N < 16 / sizeof(T)) { 13440 return detail::CompressBits(v, BitsFromMask(d, Not(mask))); 13441 } 13442 return detail::CompressNotBits(v, BitsFromMask(d, mask)); 13443 } 13444 13445 // ------------------------------ CompressBlocksNot 13446 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, 13447 Mask128<uint64_t> /* m */) { 13448 return v; 13449 } 13450 13451 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 13452 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, 13453 const uint8_t* HWY_RESTRICT bits) { 13454 uint64_t mask_bits = 0; 13455 constexpr size_t kNumBytes = (N + 7) / 8; 13456 CopyBytes<kNumBytes>(bits, &mask_bits); 13457 if (N < 8) { 13458 mask_bits &= (1ull << N) - 1; 13459 } 13460 13461 return detail::CompressBits(v, mask_bits); 13462 } 13463 13464 // ------------------------------ CompressStore, CompressBitsStore 13465 13466 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> 13467 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d, 13468 TFromD<D>* HWY_RESTRICT unaligned) { 13469 const RebindToUnsigned<decltype(d)> du; 13470 13471 const uint64_t mask_bits = BitsFromMask(d, m); 13472 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); 13473 const size_t count = PopCount(mask_bits); 13474 13475 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 13476 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 13477 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 13478 StoreU(compressed, d, unaligned); 13479 detail::MaybeUnpoison(unaligned, count); 13480 return count; 13481 } 13482 13483 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> 13484 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, 13485 TFromD<D>* HWY_RESTRICT unaligned) { 13486 const RebindToUnsigned<decltype(d)> du; 13487 13488 const uint64_t mask_bits = BitsFromMask(d, m); 13489 HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); 13490 const size_t count = PopCount(mask_bits); 13491 13492 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 13493 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 13494 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 13495 BlendedStore(compressed, FirstN(d, count), d, unaligned); 13496 detail::MaybeUnpoison(unaligned, count); 13497 return count; 13498 } 13499 13500 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 1)> 13501 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, 13502 D d, TFromD<D>* HWY_RESTRICT unaligned) { 13503 const RebindToUnsigned<decltype(d)> du; 13504 13505 uint64_t mask_bits = 0; 13506 constexpr size_t kN = MaxLanes(d); 13507 constexpr size_t kNumBytes = (kN + 7) / 8; 13508 CopyBytes<kNumBytes>(bits, &mask_bits); 13509 if (kN < 8) { 13510 mask_bits &= (1ull << kN) - 1; 13511 } 13512 const size_t count = PopCount(mask_bits); 13513 13514 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 13515 const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); 13516 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); 13517 StoreU(compressed, d, unaligned); 13518 13519 detail::MaybeUnpoison(unaligned, count); 13520 return count; 13521 } 13522 13523 #endif // HWY_TARGET <= HWY_AVX3 13524 13525 // ------------------------------ Expand 13526 13527 // Otherwise, use the generic_ops-inl.h fallback. 13528 #if HWY_TARGET <= HWY_AVX3 || HWY_IDE 13529 13530 // The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL), 13531 // but we still want to override generic_ops-inl's table-based implementation 13532 // whenever we have the 32-bit expand provided by AVX3. 13533 #ifdef HWY_NATIVE_EXPAND 13534 #undef HWY_NATIVE_EXPAND 13535 #else 13536 #define HWY_NATIVE_EXPAND 13537 #endif 13538 13539 namespace detail { 13540 13541 #if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 13542 13543 template <size_t N> 13544 HWY_INLINE Vec128<uint8_t, N> NativeExpand(Vec128<uint8_t, N> v, 13545 Mask128<uint8_t, N> mask) { 13546 return Vec128<uint8_t, N>{_mm_maskz_expand_epi8(mask.raw, v.raw)}; 13547 } 13548 13549 template <size_t N> 13550 HWY_INLINE Vec128<uint16_t, N> NativeExpand(Vec128<uint16_t, N> v, 13551 Mask128<uint16_t, N> mask) { 13552 return Vec128<uint16_t, N>{_mm_maskz_expand_epi16(mask.raw, v.raw)}; 13553 } 13554 13555 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)> 13556 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, 13557 const uint8_t* HWY_RESTRICT unaligned) { 13558 return VFromD<D>{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)}; 13559 } 13560 13561 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> 13562 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, 13563 const uint16_t* HWY_RESTRICT unaligned) { 13564 return VFromD<D>{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)}; 13565 } 13566 13567 #endif // HWY_TARGET <= HWY_AVX3_DL 13568 13569 template <size_t N> 13570 HWY_INLINE Vec128<uint32_t, N> NativeExpand(Vec128<uint32_t, N> v, 13571 Mask128<uint32_t, N> mask) { 13572 return Vec128<uint32_t, N>{_mm_maskz_expand_epi32(mask.raw, v.raw)}; 13573 } 13574 13575 template <size_t N> 13576 HWY_INLINE Vec128<uint64_t, N> NativeExpand(Vec128<uint64_t, N> v, 13577 Mask128<uint64_t, N> mask) { 13578 return Vec128<uint64_t, N>{_mm_maskz_expand_epi64(mask.raw, v.raw)}; 13579 } 13580 13581 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> 13582 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, 13583 const uint32_t* HWY_RESTRICT unaligned) { 13584 return VFromD<D>{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)}; 13585 } 13586 13587 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 13588 HWY_INLINE VFromD<D> NativeLoadExpand(MFromD<D> mask, D /* d */, 13589 const uint64_t* HWY_RESTRICT unaligned) { 13590 return VFromD<D>{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)}; 13591 } 13592 13593 } // namespace detail 13594 13595 // Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo. 13596 #if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 13597 13598 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> 13599 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { 13600 const DFromV<decltype(v)> d; 13601 const RebindToUnsigned<decltype(d)> du; 13602 const MFromD<decltype(du)> mu = RebindMask(du, mask); 13603 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); 13604 } 13605 13606 #endif // HWY_TARGET <= HWY_AVX3_DL 13607 13608 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))> 13609 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { 13610 const DFromV<decltype(v)> d; 13611 const RebindToUnsigned<decltype(d)> du; 13612 const MFromD<decltype(du)> mu = RebindMask(du, mask); 13613 return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); 13614 } 13615 13616 // ------------------------------ LoadExpand 13617 13618 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), 13619 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> 13620 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, 13621 const TFromD<D>* HWY_RESTRICT unaligned) { 13622 #if HWY_TARGET <= HWY_AVX3_DL // VBMI2 13623 const RebindToUnsigned<decltype(d)> du; 13624 using TU = TFromD<decltype(du)>; 13625 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); 13626 const MFromD<decltype(du)> mu = RebindMask(du, mask); 13627 return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); 13628 #else 13629 return Expand(LoadU(d, unaligned), mask); 13630 #endif 13631 } 13632 13633 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), 13634 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> 13635 HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d, 13636 const TFromD<D>* HWY_RESTRICT unaligned) { 13637 #if HWY_TARGET <= HWY_AVX3 13638 const RebindToUnsigned<decltype(d)> du; 13639 using TU = TFromD<decltype(du)>; 13640 const TU* HWY_RESTRICT pu = reinterpret_cast<const TU*>(unaligned); 13641 const MFromD<decltype(du)> mu = RebindMask(du, mask); 13642 return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); 13643 #else 13644 return Expand(LoadU(d, unaligned), mask); 13645 #endif 13646 } 13647 13648 #endif // HWY_TARGET <= HWY_AVX3 13649 13650 // ------------------------------ StoreInterleaved2/3/4 13651 13652 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in 13653 // generic_ops-inl.h. 13654 13655 // ------------------------------ Additional mask logical operations 13656 13657 #if HWY_TARGET <= HWY_AVX3 13658 namespace detail { 13659 13660 template <class T, HWY_IF_LANES_LE(sizeof(T), 4)> 13661 static HWY_INLINE uint32_t AVX3Blsi(T x) { 13662 using TU = MakeUnsigned<T>; 13663 const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x)); 13664 #if HWY_COMPILER_CLANGCL 13665 return static_cast<uint32_t>(u32_val & (0u - u32_val)); 13666 #else 13667 return static_cast<uint32_t>(_blsi_u32(u32_val)); 13668 #endif 13669 } 13670 template <class T, HWY_IF_T_SIZE(T, 8)> 13671 static HWY_INLINE uint64_t AVX3Blsi(T x) { 13672 const auto u64_val = static_cast<uint64_t>(x); 13673 #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 13674 return static_cast<uint64_t>(u64_val & (0ULL - u64_val)); 13675 #else 13676 return static_cast<uint64_t>(_blsi_u64(u64_val)); 13677 #endif 13678 } 13679 13680 template <class T, HWY_IF_LANES_LE(sizeof(T), 4)> 13681 static HWY_INLINE uint32_t AVX3Blsmsk(T x) { 13682 using TU = MakeUnsigned<T>; 13683 const auto u32_val = static_cast<uint32_t>(static_cast<TU>(x)); 13684 #if HWY_COMPILER_CLANGCL 13685 return static_cast<uint32_t>(u32_val ^ (u32_val - 1u)); 13686 #else 13687 return static_cast<uint32_t>(_blsmsk_u32(u32_val)); 13688 #endif 13689 } 13690 template <class T, HWY_IF_T_SIZE(T, 8)> 13691 static HWY_INLINE uint64_t AVX3Blsmsk(T x) { 13692 const auto u64_val = static_cast<uint64_t>(x); 13693 #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 13694 return static_cast<uint64_t>(u64_val ^ (u64_val - 1ULL)); 13695 #else 13696 return static_cast<uint64_t>(_blsmsk_u64(u64_val)); 13697 #endif 13698 } 13699 13700 } // namespace detail 13701 13702 template <class T, size_t N> 13703 HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) { 13704 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; 13705 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>( 13706 (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)}; 13707 } 13708 template <class T, size_t N> 13709 HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) { 13710 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; 13711 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>( 13712 (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)}; 13713 } 13714 template <class T, size_t N> 13715 HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) { 13716 constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; 13717 return Mask128<T, N>{static_cast<typename Mask128<T, N>::Raw>( 13718 detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)}; 13719 } 13720 template <class T, size_t N> 13721 HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) { 13722 return Mask128<T, N>{ 13723 static_cast<typename Mask128<T, N>::Raw>(detail::AVX3Blsi(mask.raw))}; 13724 } 13725 #else // AVX2 or below 13726 template <class T> 13727 HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) { 13728 return mask; 13729 } 13730 template <class T> 13731 HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) { 13732 const FixedTag<T, 2> d; 13733 const auto vmask = VecFromMask(d, mask); 13734 return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); 13735 } 13736 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> 13737 HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) { 13738 const Simd<T, N, 0> d; 13739 const auto vmask = VecFromMask(d, mask); 13740 const auto neg_vmask = 13741 ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask))); 13742 return MaskFromVec(Or(vmask, neg_vmask)); 13743 } 13744 template <class T, HWY_IF_NOT_T_SIZE(T, 8)> 13745 HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) { 13746 const Full128<T> d; 13747 const Repartition<int64_t, decltype(d)> di64; 13748 const Repartition<float, decltype(d)> df32; 13749 const Repartition<int32_t, decltype(d)> di32; 13750 using VF = VFromD<decltype(df32)>; 13751 13752 auto vmask = BitCast(di64, VecFromMask(d, mask)); 13753 vmask = Or(vmask, Neg(vmask)); 13754 13755 // Copy the sign bit of the first int64_t lane to the second int64_t lane 13756 const auto vmask2 = BroadcastSignBit( 13757 BitCast(di32, VF{_mm_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw, 13758 _MM_SHUFFLE(1, 1, 0, 0))})); 13759 return MaskFromVec(BitCast(d, Or(vmask, BitCast(di64, vmask2)))); 13760 } 13761 13762 template <class T, size_t N> 13763 HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) { 13764 return Not(SetAtOrAfterFirst(mask)); 13765 } 13766 13767 template <class T> 13768 HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) { 13769 return mask; 13770 } 13771 template <class T> 13772 HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) { 13773 const FixedTag<T, 2> d; 13774 const RebindToSigned<decltype(d)> di; 13775 13776 const auto vmask = BitCast(di, VecFromMask(d, mask)); 13777 const auto zero = Zero(di); 13778 const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); 13779 return MaskFromVec(BitCast(d, And(vmask, vmask2))); 13780 } 13781 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> 13782 HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) { 13783 const Simd<T, N, 0> d; 13784 const RebindToSigned<decltype(d)> di; 13785 13786 const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask)); 13787 const auto only_first_vmask = 13788 BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); 13789 return MaskFromVec(only_first_vmask); 13790 } 13791 template <class T, HWY_IF_NOT_T_SIZE(T, 8)> 13792 HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) { 13793 const Full128<T> d; 13794 const RebindToSigned<decltype(d)> di; 13795 const Repartition<int64_t, decltype(d)> di64; 13796 13797 const auto zero = Zero(di64); 13798 const auto vmask = BitCast(di64, VecFromMask(d, mask)); 13799 const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); 13800 const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); 13801 return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); 13802 } 13803 13804 template <class T> 13805 HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) { 13806 const FixedTag<T, 1> d; 13807 const RebindToSigned<decltype(d)> di; 13808 using TI = MakeSigned<T>; 13809 13810 return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); 13811 } 13812 template <class T, size_t N, HWY_IF_LANES_GT(N, 1)> 13813 HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) { 13814 const Simd<T, N, 0> d; 13815 return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); 13816 } 13817 #endif // HWY_TARGET <= HWY_AVX3 13818 13819 // ------------------------------ Reductions 13820 13821 // Nothing fully native, generic_ops-inl defines SumOfLanes and ReduceSum. 13822 13823 // We provide specializations of u8x8 and u8x16, so exclude those. 13824 #undef HWY_IF_SUM_OF_LANES_D 13825 #define HWY_IF_SUM_OF_LANES_D(D) \ 13826 HWY_IF_LANES_GT_D(D, 1), \ 13827 hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() || \ 13828 (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \ 13829 nullptr 13830 13831 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)> 13832 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { 13833 return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF)); 13834 } 13835 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)> 13836 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { 13837 const Repartition<uint64_t, decltype(d)> d64; 13838 VFromD<decltype(d64)> sums = SumsOf8(v); 13839 sums = SumOfLanes(d64, sums); 13840 return Broadcast<0>(BitCast(d, sums)); 13841 } 13842 13843 #if HWY_TARGET <= HWY_SSE4 13844 // We provide specializations of u8x8, u8x16, and u16x8, so exclude those. 13845 #undef HWY_IF_MINMAX_OF_LANES_D 13846 #define HWY_IF_MINMAX_OF_LANES_D(D) \ 13847 HWY_IF_LANES_GT_D(D, 1), \ 13848 hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() || \ 13849 ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \ 13850 (!hwy::IsSame<TFromD<D>, uint16_t>() || \ 13851 (HWY_V_SIZE_D(D) != 16))>* = nullptr 13852 13853 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> 13854 HWY_API Vec128<uint16_t> MinOfLanes(D /* tag */, Vec128<uint16_t> v) { 13855 return Broadcast<0>(Vec128<uint16_t>{_mm_minpos_epu16(v.raw)}); 13856 } 13857 13858 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)> 13859 HWY_API Vec128<uint16_t> MaxOfLanes(D d, Vec128<uint16_t> v) { 13860 const Vec128<uint16_t> max = Set(d, LimitsMax<uint16_t>()); 13861 return max - MinOfLanes(d, max - v); 13862 } 13863 13864 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> 13865 HWY_API Vec64<uint8_t> MinOfLanes(D d, Vec64<uint8_t> v) { 13866 const Rebind<uint16_t, decltype(d)> d16; 13867 return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v))); 13868 } 13869 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> 13870 HWY_API Vec128<uint8_t> MinOfLanes(D d, Vec128<uint8_t> v) { 13871 const Half<decltype(d)> dh; 13872 Vec64<uint8_t> result = 13873 Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v))); 13874 return Combine(d, result, result); 13875 } 13876 13877 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)> 13878 HWY_API Vec64<uint8_t> MaxOfLanes(D d, Vec64<uint8_t> v) { 13879 const Vec64<uint8_t> m(Set(d, LimitsMax<uint8_t>())); 13880 return m - MinOfLanes(d, m - v); 13881 } 13882 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)> 13883 HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) { 13884 const Vec128<uint8_t> m(Set(d, LimitsMax<uint8_t>())); 13885 return m - MinOfLanes(d, m - v); 13886 } 13887 13888 #endif // HWY_TARGET <= HWY_SSE4 13889 13890 // ------------------------------ BitShuffle 13891 #if HWY_TARGET <= HWY_AVX3_DL 13892 13893 #ifdef HWY_NATIVE_BITSHUFFLE 13894 #undef HWY_NATIVE_BITSHUFFLE 13895 #else 13896 #define HWY_NATIVE_BITSHUFFLE 13897 #endif 13898 13899 template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>), 13900 HWY_IF_V_SIZE_LE_V(V, 16), 13901 HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)> 13902 HWY_API V BitShuffle(V v, VI idx) { 13903 const DFromV<decltype(v)> d64; 13904 const RebindToUnsigned<decltype(d64)> du64; 13905 const Rebind<uint8_t, decltype(d64)> du8; 13906 13907 int32_t i32_bit_shuf_result = static_cast<int32_t>( 13908 static_cast<uint16_t>(_mm_bitshuffle_epi64_mask(v.raw, idx.raw))); 13909 13910 return BitCast(d64, PromoteTo(du64, VFromD<decltype(du8)>{_mm_cvtsi32_si128( 13911 i32_bit_shuf_result)})); 13912 } 13913 #endif // HWY_TARGET <= HWY_AVX3_DL 13914 13915 // ------------------------------ MultiRotateRight 13916 13917 #if HWY_TARGET <= HWY_AVX3_DL 13918 13919 #ifdef HWY_NATIVE_MULTIROTATERIGHT 13920 #undef HWY_NATIVE_MULTIROTATERIGHT 13921 #else 13922 #define HWY_NATIVE_MULTIROTATERIGHT 13923 #endif 13924 13925 template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>), 13926 HWY_IF_V_SIZE_LE_V(V, 16), 13927 HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)> 13928 HWY_API V MultiRotateRight(V v, VI idx) { 13929 return V{_mm_multishift_epi64_epi8(idx.raw, v.raw)}; 13930 } 13931 13932 #endif 13933 13934 // ------------------------------ Lt128 13935 13936 namespace detail { 13937 13938 // Returns vector-mask for Lt128. Generic for all vector lengths. 13939 template <class D, HWY_IF_U64_D(D)> 13940 HWY_INLINE VFromD<D> Lt128Vec(const D d, VFromD<D> a, VFromD<D> b) { 13941 // Truth table of Eq and Lt for Hi and Lo u64. 13942 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) 13943 // =H =L cH cL | out = cH | (=H & cL) 13944 // 0 0 0 0 | 0 13945 // 0 0 0 1 | 0 13946 // 0 0 1 0 | 1 13947 // 0 0 1 1 | 1 13948 // 0 1 0 0 | 0 13949 // 0 1 0 1 | 0 13950 // 0 1 1 0 | 1 13951 // 1 0 0 0 | 0 13952 // 1 0 0 1 | 1 13953 // 1 1 0 0 | 0 13954 const auto eqHL = Eq(a, b); 13955 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); 13956 const VFromD<D> ltLX = ShiftLeftLanes<1>(ltHL); 13957 const VFromD<D> vecHx = IfThenElse(eqHL, ltLX, ltHL); 13958 return InterleaveUpper(d, vecHx, vecHx); 13959 } 13960 13961 // Returns vector-mask for Eq128. Generic for all vector lengths. 13962 template <class D, HWY_IF_U64_D(D)> 13963 HWY_INLINE VFromD<D> Eq128Vec(D d, VFromD<D> a, VFromD<D> b) { 13964 const auto eqHL = VecFromMask(d, Eq(a, b)); 13965 const auto eqLH = Reverse2(d, eqHL); 13966 return And(eqHL, eqLH); 13967 } 13968 13969 template <class D, HWY_IF_U64_D(D)> 13970 HWY_INLINE VFromD<D> Ne128Vec(D d, VFromD<D> a, VFromD<D> b) { 13971 const auto neHL = VecFromMask(d, Ne(a, b)); 13972 const auto neLH = Reverse2(d, neHL); 13973 return Or(neHL, neLH); 13974 } 13975 13976 template <class D, HWY_IF_U64_D(D)> 13977 HWY_INLINE VFromD<D> Lt128UpperVec(D d, VFromD<D> a, VFromD<D> b) { 13978 // No specialization required for AVX-512: Mask <-> Vec is fast, and 13979 // copying mask bits to their neighbor seems infeasible. 13980 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); 13981 return InterleaveUpper(d, ltHL, ltHL); 13982 } 13983 13984 template <class D, HWY_IF_U64_D(D)> 13985 HWY_INLINE VFromD<D> Eq128UpperVec(D d, VFromD<D> a, VFromD<D> b) { 13986 // No specialization required for AVX-512: Mask <-> Vec is fast, and 13987 // copying mask bits to their neighbor seems infeasible. 13988 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); 13989 return InterleaveUpper(d, eqHL, eqHL); 13990 } 13991 13992 template <class D, HWY_IF_U64_D(D)> 13993 HWY_INLINE VFromD<D> Ne128UpperVec(D d, VFromD<D> a, VFromD<D> b) { 13994 // No specialization required for AVX-512: Mask <-> Vec is fast, and 13995 // copying mask bits to their neighbor seems infeasible. 13996 const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); 13997 return InterleaveUpper(d, neHL, neHL); 13998 } 13999 14000 } // namespace detail 14001 14002 template <class D, HWY_IF_U64_D(D)> 14003 HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) { 14004 return MaskFromVec(detail::Lt128Vec(d, a, b)); 14005 } 14006 14007 template <class D, HWY_IF_U64_D(D)> 14008 HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) { 14009 return MaskFromVec(detail::Eq128Vec(d, a, b)); 14010 } 14011 14012 template <class D, HWY_IF_U64_D(D)> 14013 HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) { 14014 return MaskFromVec(detail::Ne128Vec(d, a, b)); 14015 } 14016 14017 template <class D, HWY_IF_U64_D(D)> 14018 HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) { 14019 return MaskFromVec(detail::Lt128UpperVec(d, a, b)); 14020 } 14021 14022 template <class D, HWY_IF_U64_D(D)> 14023 HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) { 14024 return MaskFromVec(detail::Eq128UpperVec(d, a, b)); 14025 } 14026 14027 template <class D, HWY_IF_U64_D(D)> 14028 HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) { 14029 return MaskFromVec(detail::Ne128UpperVec(d, a, b)); 14030 } 14031 14032 // ------------------------------ Min128, Max128 (Lt128) 14033 14034 // Avoids the extra MaskFromVec in Lt128. 14035 template <class D, HWY_IF_U64_D(D)> 14036 HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) { 14037 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); 14038 } 14039 14040 template <class D, HWY_IF_U64_D(D)> 14041 HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) { 14042 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); 14043 } 14044 14045 template <class D, HWY_IF_U64_D(D)> 14046 HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) { 14047 return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b); 14048 } 14049 14050 template <class D, HWY_IF_U64_D(D)> 14051 HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) { 14052 return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b); 14053 } 14054 14055 // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex 14056 14057 #if HWY_TARGET <= HWY_AVX3 14058 14059 #ifdef HWY_NATIVE_LEADING_ZERO_COUNT 14060 #undef HWY_NATIVE_LEADING_ZERO_COUNT 14061 #else 14062 #define HWY_NATIVE_LEADING_ZERO_COUNT 14063 #endif 14064 14065 template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)> 14066 HWY_API V LeadingZeroCount(V v) { 14067 return V{_mm_lzcnt_epi32(v.raw)}; 14068 } 14069 14070 template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)> 14071 HWY_API V LeadingZeroCount(V v) { 14072 return V{_mm_lzcnt_epi64(v.raw)}; 14073 } 14074 14075 // HighestSetBitIndex and TrailingZeroCount is implemented in x86_512-inl.h 14076 // for AVX3 targets 14077 14078 #endif // HWY_TARGET <= HWY_AVX3 14079 14080 // NOLINTNEXTLINE(google-readability-namespace-comments) 14081 } // namespace HWY_NAMESPACE 14082 } // namespace hwy 14083 HWY_AFTER_NAMESPACE(); 14084 14085 #undef HWY_X86_IF_EMULATED_D 14086 14087 // Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - 14088 // the warning seems to be issued at the call site of intrinsics, i.e. our code. 14089 HWY_DIAGNOSTICS(pop)