x86_avx3-inl.h

// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// External include guard in highway.h - see comment there.

// For AVX3/AVX10 targets that support 512-bit vectors. Already includes base.h
// and shared-inl.h.
#include "hwy/ops/x86_512-inl.h"

// AVX3/AVX10 ops that depend on ops defined in x86_512-inl.h when
// HWY_MAX_BYTES >= 64 are defined below.

// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
// https://github.com/google/highway/issues/710
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
                    ignored "-Wmaybe-uninitialized")
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET <= HWY_AVX3_DL

// ------------------------------ ShiftLeft

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_API V ShiftLeft(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  if (kBits == 1) return v + v;
  constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
                               (0x0101010101010101ULL * (0xFF >> kBits));
  return detail::GaloisAffine(v, Set(du64, kMatrix));
}

// ------------------------------ ShiftRight

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)>
HWY_API V ShiftRight(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  constexpr uint64_t kMatrix =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  return detail::GaloisAffine(v, Set(du64, kMatrix));
}

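// Explanatory note (not part of the original sources): GF2P8AFFINEQB
// multiplies each byte, viewed as a vector of 8 bits, by the 8x8 bit matrix
// held in a u64 lane (one byte per matrix row). Under the instruction's bit
// ordering, 0x0102040810204080 acts as the identity matrix, and shifting this
// constant shifts every data byte in the opposite direction. Multiplying
// 0x0101010101010101 by an 8-bit mask replicates that mask into all eight
// rows, clearing matrix entries that would otherwise pull in bits from a
// neighboring byte. Worked example for ShiftLeft<2>:
//   (0x0102040810204080 >> 2) & (0x0101010101010101 * (0xFF >> 2))
//   = 0x0040810204081020 & 0x3F3F3F3F3F3F3F3F = 0x0000010204081020.
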
// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)>
HWY_API V ShiftRight(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  constexpr uint64_t kShift =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  constexpr uint64_t kSign =
      kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
  return detail::GaloisAffine(v, Set(du64, kShift | kSign));
}

// ------------------------------ RotateRight

// U8 RotateRight is generic for all vector lengths on AVX3_DL.
template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
HWY_API V RotateRight(V v) {
  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");

  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;

  constexpr uint64_t kShrMatrix =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  constexpr int kShlBits = (-kBits) & 7;
  constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
                                  (0x0101010101010101ULL * (0xFF >> kShlBits));
  constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;

  return detail::GaloisAffine(v, Set(du64, kMatrix));
}

#endif  // HWY_TARGET <= HWY_AVX3_DL

// ------------------------------ Compress

#pragma push_macro("HWY_X86_SLOW_COMPRESS_STORE")

#ifndef HWY_X86_SLOW_COMPRESS_STORE  // allow override
// Slow on Zen4 and SPR, faster if we emulate via Compress().
#if HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
#define HWY_X86_SLOW_COMPRESS_STORE 1
#else
#define HWY_X86_SLOW_COMPRESS_STORE 0
#endif
#endif  // HWY_X86_SLOW_COMPRESS_STORE

// Always implement 8-bit here even if we lack VBMI2 because we can do better
// than generic_ops (8 at a time) via the native 32-bit compress (16 at a
// time).
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

namespace detail {

#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2

template <size_t N>
HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v,
                                             const Mask128<uint8_t, N> mask) {
  return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v,
                                          const Mask256<uint8_t> mask) {
  return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v,
                                          const Mask512<uint8_t> mask) {
  return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
}
#endif

template <size_t N>
HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v,
                                              const Mask128<uint16_t, N> mask) {
  return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v,
                                           const Mask256<uint16_t> mask) {
  return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v,
                                           const Mask512<uint16_t> mask) {
  return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
}
#endif

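// Note: the maskz_compress_epi8/epi16 intrinsics above compile to the VBMI2
// instructions VPCOMPRESSB/VPCOMPRESSW, which left-pack the lanes selected by
// the mask and zero the remaining lanes.
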
// Do not even define these when HWY_X86_SLOW_COMPRESS_STORE, to prevent
// accidental usage.
#if !HWY_X86_SLOW_COMPRESS_STORE

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v,
                                    Mask128<uint8_t, N> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
                                    Mask128<uint16_t, N> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
#endif  // HWY_MAX_BYTES >= 64

#endif  // !HWY_X86_SLOW_COMPRESS_STORE

#endif  // HWY_TARGET <= HWY_AVX3_DL

template <size_t N>
HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v,
                                              Mask128<uint32_t, N> mask) {
  return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v,
                                           Mask256<uint32_t> mask) {
  return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
}

#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v,
                                           Mask512<uint32_t> mask) {
  return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
}
#endif

// We use table-based compress for 64-bit lanes, see CompressIsPartition.

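// Explanatory note: with 64-bit lanes there are at most 8 mask bits, so
// Compress can be a table lookup of lane indices keyed by the mask (at most
// 256 entries) instead of VPCOMPRESSQ. Per the CompressIsPartition
// documentation, this variant also keeps the unselected lanes, moved behind
// the selected ones, which some callers rely on.
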
// Do not even define these when HWY_X86_SLOW_COMPRESS_STORE, to prevent
// accidental usage.
#if !HWY_X86_SLOW_COMPRESS_STORE

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v,
                                    Mask128<uint32_t, N> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
                                    Mask128<uint64_t, N> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
                                    Mask128<double, N> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
#endif

#endif  // !HWY_X86_SLOW_COMPRESS_STORE

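// Overview of the emulation below (explanatory note): without VBMI2 there is
// no native compress for 8/16-bit lanes, so EmuCompress widens to 32-bit
// lanes, uses the native 32-bit compress, then narrows again. For example,
// u8x16 widens to u32x16 (a single 512-bit vector), which VPCOMPRESSD handles
// in one step.
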
// For u8x16 and <= u16x16 we can avoid store+load for Compress because there
// is only a single compressed vector (u32x16). Other EmuCompress are
// implemented after the EmuCompressStore they build upon.
template <class V, HWY_IF_U8(TFromV<V>),
          HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  const DFromV<decltype(v)> d;
  const Rebind<uint32_t, decltype(d)> d32;
  const VFromD<decltype(d32)> v0 = PromoteTo(d32, v);

  using M32 = MFromD<decltype(d32)>;
  const M32 m0 = PromoteMaskTo(d32, d, mask);
  return TruncateTo(d, Compress(v0, m0));
}

template <class V, HWY_IF_U16(TFromV<V>),
          HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  const DFromV<decltype(v)> d;
  const Rebind<int32_t, decltype(d)> di32;
  const RebindToUnsigned<decltype(di32)> du32;

  const MFromD<decltype(du32)> mask32 = PromoteMaskTo(du32, d, mask);
  // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
  // Only i32 -> u16 is supported, whereas NativeCompress expects u32.
  const VFromD<decltype(du32)> v32 = PromoteTo(du32, v);
  return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
}

// See above - small-vector EmuCompressStore are implemented via EmuCompress.
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
    VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(EmuCompress(v, mask), d, unaligned);
}

// Main emulation logic for wider vectors, starting with EmuCompressStore
// because it is most convenient to merge pieces via memory (concatenating
// vectors at byte offsets is difficult).
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
    VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;

  const MFromD<decltype(dh)> m0 = LowerHalfOfMask(dh, mask);
  const MFromD<decltype(dh)> m1 = UpperHalfOfMask(dh, mask);

  const VFromD<decltype(dh)> v0 = LowerHalf(dh, v);
  const VFromD<decltype(dh)> v1 = UpperHalf(dh, v);

  EmuCompressStore(v0, m0, dh, unaligned);
  EmuCompressStore(v1, m1, dh, unaligned + CountTrue(dh, m0));
}

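// Explanatory note: the recursion above halves vectors until they reach the
// HWY_MAX_BYTES / 4 threshold, where the single-vector EmuCompress overloads
// take over. Storing the upper half at offset CountTrue(dh, m0) concatenates
// the two packed runs, so all selected lanes end up contiguous in memory.
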
// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_GT_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  using D = DFromV<decltype(v)>;
  using T = TFromD<D>;
  const D d;

  alignas(HWY_MAX_LANES_D(D) * sizeof(T)) T buf[2 * HWY_MAX_LANES_D(D)];
  EmuCompressStore(v, mask, d, buf);
  return Load(d, buf);
}

}  // namespace detail

template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API V Compress(V v, const M mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
#else
  return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
#endif
}

template <class V, class M, HWY_IF_T_SIZE_V(V, 4)>
HWY_API V Compress(V v, const M mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
}

// ------------------------------ CompressNot

template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
HWY_API V CompressNot(V v, const M mask) {
  return Compress(v, Not(mask));
}

// uint64_t lanes. Only implement for 256- and 512-bit vectors because this is
// a no-op for 128-bit.
template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
HWY_API V CompressBlocksNot(V v, M mask) {
  return CompressNot(v, mask);
}

// ------------------------------ CompressBits
template <class V>
HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
}

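// Usage sketch (illustrative only; `in` and `out` are caller-provided
// arrays): pack lanes satisfying a predicate to the front of the output.
//   const ScalableTag<int32_t> d;
//   const auto v = LoadU(d, in);
//   const auto m = Gt(v, Zero(d));    // keep positive lanes
//   StoreU(Compress(v, m), d, out);   // first CountTrue(d, m) lanes are valid
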
// ------------------------------ CompressStore

// Generic for all vector lengths.

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);

#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#else
  detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
#endif
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = CountTrue(d, mask);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

template <class D, HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  using TU = TFromD<decltype(du)>;
  TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = CountTrue(d, mask);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// Additional overloads to avoid casting to uint32_t (possible bypass delay).
template <class D, HWY_IF_FLOAT3264_D(D)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  (void)d;
  detail::NativeCompressStore(v, mask, unaligned);
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = PopCount(uint64_t{mask.raw});
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// ------------------------------ CompressBlendedStore
template <class D>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  // Native CompressStore already does the blending at no extra cost (latency
  // 11, rthroughput 2 - same as compress plus store).

  HWY_IF_CONSTEXPR(HWY_MAX_LANES_D(D) < (16 / sizeof(TFromD<D>))) {
    m = And(m, FirstN(d, HWY_MAX_LANES_D(D)));
  }

  HWY_IF_CONSTEXPR(!HWY_X86_SLOW_COMPRESS_STORE &&
                   (HWY_TARGET <= HWY_AVX3_DL || sizeof(TFromD<D>) > 2)) {
    return CompressStore(v, m, d, unaligned);
  }
  else {
    const size_t count = CountTrue(d, m);
    StoreN(Compress(v, m), d, unaligned, count);
    detail::MaybeUnpoison(unaligned, count);
    return count;
  }
}

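// Explanatory note: unlike CompressStore, which is allowed to overwrite a
// full vector's worth of memory, CompressBlendedStore must write only `count`
// lanes. The HWY_IF_CONSTEXPR above therefore forwards to CompressStore only
// when the native masked compress-store is available and fast; otherwise it
// compresses in registers and uses StoreN, which never writes past count.
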
// ------------------------------ CompressBitsStore
// Generic for all vector lengths.
template <class D>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}

#pragma pop_macro("HWY_X86_SLOW_COMPRESS_STORE")

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h
// headers - the warning seems to be issued at the call site of intrinsics,
// i.e. our code.
HWY_DIAGNOSTICS(pop)