tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

x86_avx3-inl.h (19232B)


// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// External include guard in highway.h - see comment there.

// For AVX3/AVX10 targets that support 512-bit vectors. Already includes base.h
// and shared-inl.h.
#include "hwy/ops/x86_512-inl.h"

// AVX3/AVX10 ops that depend on ops defined in x86_512-inl.h (available only
// if HWY_MAX_BYTES >= 64) are defined below.

// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
// https://github.com/google/highway/issues/710
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
                    ignored "-Wmaybe-uninitialized")
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET <= HWY_AVX3_DL

// ------------------------------ ShiftLeft

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_API V ShiftLeft(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  if (kBits == 1) return v + v;
  constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
                               (0x0101010101010101ULL * (0xFF >> kBits));
  return detail::GaloisAffine(v, Set(du64, kMatrix));
}
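
// Illustration (not part of the original source, assuming the usual
// GF2P8AFFINEQB convention in which 0x0102040810204080 is the identity
// matrix): shifting the identity selects shifted diagonals, and the
// multiply-mask clears bits that crossed byte boundaries in the 64-bit
// shift. E.g. for kBits = 2:
//   kMatrix = (0x0102040810204080 >> 2) & (0x0101010101010101 * 0x3F)
//           = 0x0040810204081020 & 0x3F3F3F3F3F3F3F3F = 0x0000010204081020.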

// ------------------------------ ShiftRight

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)>
HWY_API V ShiftRight(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  constexpr uint64_t kMatrix =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  return detail::GaloisAffine(v, Set(du64, kMatrix));
}

// Generic for all vector lengths. Must be defined after all GaloisAffine.
template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)>
HWY_API V ShiftRight(const V v) {
  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;
  constexpr uint64_t kShift =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  constexpr uint64_t kSign =
      kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
  return detail::GaloisAffine(v, Set(du64, kShift | kSign));
}
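
// Illustration (not part of the original source): kShift is the logical
// shift matrix; kSign ORs in rows that replicate bit 7 into the vacated
// upper bits, making the shift arithmetic. E.g. for kBits = 1,
// kSign = 0x8080808080808080 >> 56 = 0x80, so ShiftRight<1> maps the i8
// value -2 (0xFE) to -1 (0xFF) instead of the logical result 0x7F.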

// ------------------------------ RotateRight

// U8 RotateRight is generic for all vector lengths on AVX3_DL
template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
HWY_API V RotateRight(V v) {
  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");

  const Repartition<uint64_t, DFromV<V>> du64;
  if (kBits == 0) return v;

  constexpr uint64_t kShrMatrix =
      (0x0102040810204080ULL << kBits) &
      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
  constexpr int kShlBits = (-kBits) & 7;
  constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
                                  (0x0101010101010101ULL * (0xFF >> kShlBits));
  constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;

  return detail::GaloisAffine(v, Set(du64, kMatrix));
}
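
// Illustration (not part of the original source): a rotate right by kBits is
// the OR of a right shift by kBits and a left shift by 8 - kBits (kShlBits),
// so the two affine matrices can simply be ORed. For a single byte, e.g.
//   RotateRight<3>(0xB4) = (0xB4 >> 3) | ((0xB4 << 5) & 0xFF)
//                        = 0x16 | 0x80 = 0x96.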

#endif  // HWY_TARGET <= HWY_AVX3_DL

// ------------------------------ Compress

#pragma push_macro("HWY_X86_SLOW_COMPRESS_STORE")

#ifndef HWY_X86_SLOW_COMPRESS_STORE  // allow override
// Slow on Zen4 and SPR, faster if we emulate via Compress().
#if HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
#define HWY_X86_SLOW_COMPRESS_STORE 1
#else
#define HWY_X86_SLOW_COMPRESS_STORE 0
#endif
#endif  // HWY_X86_SLOW_COMPRESS_STORE
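
// Note (not part of the original source): because of the #ifndef above, a
// build can force either code path, e.g. by defining
// HWY_X86_SLOW_COMPRESS_STORE=1 on the compiler command line ("allow
// override").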

// Always implement 8-bit here even if we lack VBMI2 because we can do better
// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time).
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

namespace detail {

#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v,
                                             const Mask128<uint8_t, N> mask) {
  return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v,
                                          const Mask256<uint8_t> mask) {
  return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v,
                                          const Mask512<uint8_t> mask) {
  return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
}
#endif

template <size_t N>
HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v,
                                              const Mask128<uint16_t, N> mask) {
  return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v,
                                           const Mask256<uint16_t> mask) {
  return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v,
                                           const Mask512<uint16_t> mask) {
  return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
}
#endif

// If compress-store is slow, do not even define these, to prevent accidental
// usage.
#if !HWY_X86_SLOW_COMPRESS_STORE

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v,
                                    Mask128<uint8_t, N> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
                                    uint8_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
                                    Mask128<uint16_t, N> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
                                    uint16_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
}
#endif  // HWY_MAX_BYTES >= 64

#endif  // HWY_X86_SLOW_COMPRESS_STORE

#endif  // HWY_TARGET <= HWY_AVX3_DL

template <size_t N>
HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v,
                                              Mask128<uint32_t, N> mask) {
  return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
}
HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v,
                                           Mask256<uint32_t> mask) {
  return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
}

#if HWY_MAX_BYTES >= 64
HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v,
                                           Mask512<uint32_t> mask) {
  return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
}
#endif
// We use table-based compress for 64-bit lanes, see CompressIsPartition.

// If compress-store is slow, do not even define these, to prevent accidental
// usage.
#if !HWY_X86_SLOW_COMPRESS_STORE

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v,
                                    Mask128<uint32_t, N> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
                                    uint32_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
                                    Mask128<uint64_t, N> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
                                    uint64_t* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
                                    float* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
}
#endif

template <size_t N>
HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
                                    Mask128<double, N> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
#if HWY_MAX_BYTES >= 64
HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
                                    double* HWY_RESTRICT unaligned) {
  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
}
#endif

#endif  // HWY_X86_SLOW_COMPRESS_STORE

// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is
// only a single compressed vector (u32x16). Other EmuCompress are implemented
// after the EmuCompressStore they build upon.
template <class V, HWY_IF_U8(TFromV<V>),
          HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  const DFromV<decltype(v)> d;
  const Rebind<uint32_t, decltype(d)> d32;
  const VFromD<decltype(d32)> v0 = PromoteTo(d32, v);

  using M32 = MFromD<decltype(d32)>;
  const M32 m0 = PromoteMaskTo(d32, d, mask);
  return TruncateTo(d, Compress(v0, m0));
}
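
// Illustration (not part of the original source): a u8x16 input is widened
// to u32x16 (a single vector when HWY_MAX_BYTES >= 64), the mask is widened
// alongside it, the native 32-bit compress packs the selected lanes toward
// lane 0, and TruncateTo narrows the result back to u8 - avoiding the
// store+load round trip used for wider vectors below.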

template <class V, HWY_IF_U16(TFromV<V>),
          HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  const DFromV<decltype(v)> d;
  const Rebind<int32_t, decltype(d)> di32;
  const RebindToUnsigned<decltype(di32)> du32;

  const MFromD<decltype(du32)> mask32 = PromoteMaskTo(du32, d, mask);
  // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
  // Only i32 -> u16 is supported, whereas NativeCompress expects u32.
  const VFromD<decltype(du32)> v32 = PromoteTo(du32, v);
  return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
}

// See above - small-vector EmuCompressStore are implemented via EmuCompress.
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
    VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(EmuCompress(v, mask), d, unaligned);
}

// Main emulation logic for wider vectors, starting with EmuCompressStore
// because it is most convenient to merge pieces using memory (concatenating
// vectors at byte offsets is difficult).
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
    VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;

  const MFromD<decltype(dh)> m0 = LowerHalfOfMask(dh, mask);
  const MFromD<decltype(dh)> m1 = UpperHalfOfMask(dh, mask);

  const VFromD<decltype(dh)> v0 = LowerHalf(dh, v);
  const VFromD<decltype(dh)> v1 = UpperHalf(dh, v);

  EmuCompressStore(v0, m0, dh, unaligned);
  EmuCompressStore(v1, m1, dh, unaligned + CountTrue(dh, m0));
}
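
// Illustration (not part of the original source): the recursion splits the
// vector in half until the small-vector overload applies, then writes the
// pieces back-to-back in memory: if the lower half has, say, 5 selected
// lanes, the upper half's output begins at unaligned + 5 (CountTrue(dh, m0)).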

// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_LANES_GT_D(DFromV<V>, HWY_MAX_BYTES / 4)>
static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
  using D = DFromV<decltype(v)>;
  using T = TFromD<D>;
  const D d;

  alignas(HWY_MAX_LANES_D(D) * sizeof(T)) T buf[2 * HWY_MAX_LANES_D(D)];
  EmuCompressStore(v, mask, d, buf);
  return Load(d, buf);
}

}  // namespace detail

template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API V Compress(V v, const M mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
#else
  return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
#endif
}

template <class V, class M, HWY_IF_T_SIZE_V(V, 4)>
HWY_API V Compress(V v, const M mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
}
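
// Usage sketch (not part of the original source): Compress packs the lanes
// whose mask bit is set toward lane 0, preserving their order. E.g. with u32
// lanes {10, 20, 30, 40} and a mask selecting lanes 0, 2 and 3, the first
// three output lanes are {10, 30, 40}; the contents of the remaining lanes
// should not be relied upon.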

// ------------------------------ CompressNot

template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
HWY_API V CompressNot(V v, const M mask) {
  return Compress(v, Not(mask));
}

// uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a
// no-op for 128-bit.
template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
HWY_API V CompressBlocksNot(V v, M mask) {
  return CompressNot(v, mask);
}

// ------------------------------ CompressBits
template <class V>
HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
}

// ------------------------------ CompressStore

// Generic for all vector lengths.

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);

#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#else
  detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
#endif
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = CountTrue(d, mask);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}
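
// Usage sketch (not part of the original source):
//   const size_t n = CompressStore(v, mask, d, out);  // out[0, n) valid
// Note that the HWY_X86_SLOW_COMPRESS_STORE path above stores a full vector,
// so up to Lanes(d) elements may be written; use CompressBlendedStore below
// when memory past the returned count must not be touched.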

template <class D, HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  const RebindToUnsigned<decltype(d)> du;
  const auto mu = RebindMask(du, mask);
  using TU = TFromD<decltype(du)>;
  TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
  detail::NativeCompressStore(BitCast(du, v), mu, pu);
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = CountTrue(d, mask);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// Additional overloads to avoid casting to uint32_t (delay?).
template <class D, HWY_IF_FLOAT3264_D(D)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
#if HWY_X86_SLOW_COMPRESS_STORE
  StoreU(Compress(v, mask), d, unaligned);
#else
  (void)d;
  detail::NativeCompressStore(v, mask, unaligned);
#endif  // HWY_X86_SLOW_COMPRESS_STORE
  const size_t count = PopCount(uint64_t{mask.raw});
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// ------------------------------ CompressBlendedStore
template <class D>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  // Native CompressStore already does the blending at no extra cost (latency
  // 11, rthroughput 2 - same as compress plus store).

  HWY_IF_CONSTEXPR(HWY_MAX_LANES_D(D) < (16 / sizeof(TFromD<D>))) {
    m = And(m, FirstN(d, HWY_MAX_LANES_D(D)));
  }

  HWY_IF_CONSTEXPR(!HWY_X86_SLOW_COMPRESS_STORE &&
                   (HWY_TARGET <= HWY_AVX3_DL || sizeof(TFromD<D>) > 2)) {
    return CompressStore(v, m, d, unaligned);
  }
  else {
    const size_t count = CountTrue(d, m);
    StoreN(Compress(v, m), d, unaligned, count);
    detail::MaybeUnpoison(unaligned, count);
    return count;
  }
}
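
// Note (not part of the original source): unlike CompressStore,
// CompressBlendedStore writes only the `count` selected lanes - either via
// the native compress-store, which blends at no extra cost (see above), or
// via StoreN, which limits the store to `count` lanes.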

// ------------------------------ CompressBitsStore
// Generic for all vector lengths.
template <class D>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}

#pragma pop_macro("HWY_X86_SLOW_COMPRESS_STORE")

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
// the warning seems to be issued at the call site of intrinsics, i.e. our code.
HWY_DIAGNOSTICS(pop)