tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

crc_x86_arm_combined.cc (28689B)


      1 // Copyright 2022 The Abseil Authors.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // Hardware accelerated CRC32 computation on Intel and ARM architecture.
     16 
     17 #include <cstddef>
     18 #include <cstdint>
     19 #include <memory>
     20 #include <vector>
     21 
     22 #include "absl/base/attributes.h"
     23 #include "absl/base/config.h"
     24 #include "absl/base/internal/endian.h"
     25 #include "absl/base/prefetch.h"
     26 #include "absl/crc/internal/cpu_detect.h"
     27 #include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
     28 #include "absl/crc/internal/crc_internal.h"
     29 #include "absl/memory/memory.h"
     30 #include "absl/numeric/bits.h"
     31 
     32 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
     33    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
     34 #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
     35 #endif
     36 
     37 namespace absl {
     38 ABSL_NAMESPACE_BEGIN
     39 namespace crc_internal {
     40 
     41 #if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)
     42 
     43 // Implementation details not exported outside of file
     44 namespace {
     45 
     46 // Some machines have CRC acceleration hardware.
     47 // We can do a faster version of Extend() on such machines.
     48 class CRC32AcceleratedX86ARMCombined : public CRC32 {
     49 public:
     50  CRC32AcceleratedX86ARMCombined() {}
     51  ~CRC32AcceleratedX86ARMCombined() override {}
     52  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
     53  uint32_t ComputeZeroConstant(size_t length) const;
     54 
     55 private:
     56  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
     57      delete;
     58  CRC32AcceleratedX86ARMCombined& operator=(
     59      const CRC32AcceleratedX86ARMCombined&) = delete;
     60 };
     61 
// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
// Below kSmallCutoff bytes Extend() runs a plain unrolled CRC loop; between
// the two cutoffs it runs the 3-stream "medium" path; at or above
// kMediumCutoff it runs the full multi-stream CRC + PCLMULQDQ path.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;
     66 
// Helper macros that consume 1/2/4/8 bytes at `p` (advancing `p` past them)
// and fold the bytes into `crc` with the dedicated CRC32 instruction of the
// matching width.  Multi-byte loads are little-endian, as required by the
// CRC32C instructions.  (Comments cannot go inside the macros themselves:
// a `//` would swallow the continuation backslash.)
#define ABSL_INTERNAL_STEP1(crc, p)                   \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc, p)                                            \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc, p)                                            \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, p)                                            \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p)); \
    p += 8;                                                                    \
  } while (0)
// Advance 2 (respectively 3) independent CRC streams by 8 bytes each.
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)
    100 
    101 namespace {
    102 
    103 uint32_t multiply(uint32_t a, uint32_t b) {
    104  V128 power = V128_From64WithZeroFill(a);
    105  V128 crc = V128_From64WithZeroFill(b);
    106  V128 res = V128_PMulLow(power, crc);
    107 
    108  // Combine crc values.
    109  //
    110  // Adding res to itself is equivalent to multiplying by 2,
    111  // or shifting left by 1. Addition is used as not all compilers
    112  // are able to generate optimal code without this hint.
    113  // https://godbolt.org/z/rr3fMnf39
    114  res = V128_Add64(res, res);
    115  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
    116         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
    117 }
    118 
// Powers of crc32c polynomial, for faster ExtendByZeros.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
// Entry i is the multiply() constant used by ComputeZeroConstant when bit i
// of (length >> 2) is set, i.e. it extends a CRC by 2**i four-byte blocks of
// zeros.  The table holds two repetitions of the 31-entry cycle so every
// index countr_zero() can produce on a 64-bit length stays in range.
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};
    135 
    136 }  // namespace
    137 
    138 // Compute a magic constant, so that multiplying by it is the same as
    139 // extending crc by length zeros.
    140 uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    141    size_t length) const {
    142  // Lowest 2 bits are handled separately in ExtendByZeroes
    143  length >>= 2;
    144 
    145  int index = absl::countr_zero(length);
    146  uint32_t prev = kCRC32CPowers[index];
    147  length &= length - 1;
    148 
    149  while (length) {
    150    // For each bit of length, extend by 2**n zeros.
    151    index = absl::countr_zero(length);
    152    prev = multiply(prev, kCRC32CPowers[index]);
    153    length &= length - 1;
    154  }
    155  return prev;
    156 }
    157 
    158 void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
    159                                                    size_t length) const {
    160  uint32_t val = *crc;
    161  // Don't bother with multiplication for small length.
    162  switch (length & 3) {
    163    case 0:
    164      break;
    165    case 1:
    166      val = CRC32_u8(val, 0);
    167      break;
    168    case 2:
    169      val = CRC32_u16(val, 0);
    170      break;
    171    case 3:
    172      val = CRC32_u8(val, 0);
    173      val = CRC32_u16(val, 0);
    174      break;
    175  }
    176  if (length > 3) {
    177    val = multiply(val, ComputeZeroConstant(length));
    178  }
    179  *crc = val;
    180 }
    181 
// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll loop by 4.
// Consumed as 128-bit pairs: the Fold3 path in Extend() reads entry
// (bs - 1), where bs is the per-stream count of 32-byte blocks, to combine
// the three interleaved CRC streams with carryless multiplies.
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};
    201 
// Strategy used for "medium"-sized inputs (between kSmallCutoff and
// kMediumCutoff bytes) in Extend().
enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};
    208 
// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
  // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
  // high latency, so we run 4 128-bit partial checksums that can be reduced to
  // a single value by FinalizePclmulStream later. Computing crc for arbitrary
  // polynomials with PCLMULQDQ is described in Intel paper "Fast CRC
  // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to CRC32C polynomial.
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    // Multiplicands that fold a 128-bit partial CRC forward across 512 bits
    // (the distance between consecutive 64-byte blocks of one stream).
    V128 loopMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(kFoldAcross512Bits));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Carryless-multiply the high and low 64-bit halves of each partial CRC
    // by the folding constants, XOR the halves back together, then XOR in
    // the next 64 bytes of input.  The four lanes are kept independent so
    // the PCLMULQDQ latency of one does not stall the others.
    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce partialCRC produced by Process64BytesPclmul into a single value,
  // that represents crc checksum of all the processed bytes.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    // First fold lanes 1 and 2 forward by 256 bits onto lanes 3 and 4.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(kFoldAcross256Bits));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC3);

    low = V128_PMulLow(reductionMultiplicands, partialCRC2);
    high = V128_PMulHi(reductionMultiplicands, partialCRC2);

    partialCRC2 = V128_Xor(low, high);
    partialCRC2 = V128_Xor(partialCRC2, partialCRC4);

    // Then fold the two remaining lanes together across 128 bits.
    reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(kFoldAcross128Bits));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC2);

    // Reduce fullCRC into scalar value.
    uint32_t crc = 0;
    crc = CRC32_u64(crc, V128_Extract64<0>(fullCRC));
    crc = CRC32_u64(crc, V128_Extract64<1>(fullCRC));
    return crc;
  }

  // Update crc with 64 bytes of data from p.  Eight dependent CRC32
  // instructions over little-endian 8-byte words; `p` itself is advanced
  // only locally.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Constants generated by './scripts/gen-crc-consts.py x86_pclmul
  // crc32_lsb_0x82f63b78' from the Linux kernel.
  // Each pair is the {low, high} folding multiplicand for shifting a 128-bit
  // partial CRC across 512 / 256 / 128 bits respectively.
  alignas(16) static constexpr uint64_t kFoldAcross512Bits[2] = {
      // (x^543 mod G) * x^32
      0x00000000740eef02,
      // (x^479 mod G) * x^32
      0x000000009e4addf8};
  alignas(16) static constexpr uint64_t kFoldAcross256Bits[2] = {
      // (x^287 mod G) * x^32
      0x000000003da6d0cb,
      // (x^223 mod G) * x^32
      0x00000000ba4fc28e};
  alignas(16) static constexpr uint64_t kFoldAcross128Bits[2] = {
      // (x^159 mod G) * x^32
      0x00000000f20c0dfe,
      // (x^95 mod G) * x^32
      0x00000000493c7d27};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
  // size. Each group is CRCed in parallel then combined at the end of the
  // block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined in the end.
  static constexpr size_t kMaxStreams = 3;
};
    336 
// Computes CRC32C with up to `num_crc_streams` parallel CRC-instruction
// streams and `num_pclmul_streams` parallel PCLMULQDQ streams for large
// inputs; `strategy` selects the medium-size algorithm.
template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;

    // We have dedicated instruction for 1,2,4 and 8 bytes.  Consume the low
    // bits of length first so the remainder is a multiple of 16.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l, p);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l, p);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l, p);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now multiple of 16.

    // For small blocks just run simple loop, because cost of combining multiple
    // streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // Intel paper above. Running 4th stream doesn't help, because crc
    // instruction has latency 3 and throughput 1.
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        // Split the remaining input into kGroupsSmall adjacent runs of `bs`
        // 32-byte blocks, CRCed independently as l64 / l641 / l642.
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on last 8 bytes.  The last 8 bytes of the third run
        // are folded in below, after the first two streams have been shifted
        // past them by the carryless multiplies.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);

        // Combine the three stream CRCs using the precomputed shift
        // constants for this block count (see kClmulConstants).
        V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From64WithZeroFill(l64);

        V128 res1 = V128_PMulLow(tmp, magic);

        tmp = V128_From64WithZeroFill(l641);

        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data, we can ignore combine costs and run all
      // requested streams (num_crc_streams + num_pclmul_streams),
      // using prefetch. CRC and PCLMULQDQ use different cpu execution units,
      // so on some cpus it makes sense to execute both of them for different
      // streams.

      // Point x at first 8-byte aligned byte in string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      // (At least kMediumCutoff bytes remain on this branch, so this cannot
      // run off the end of the buffer.)
      while (p != x) {
        ABSL_INTERNAL_STEP1(l, p);
      }

      // Each stream covers `bs` consecutive 64-byte blocks: CRC streams
      // first, then PCLMULQDQ streams.
      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }

      // Per stream crc sums.  Stream 0 continues from the running CRC `l`;
      // the others start at 0 and are merged in at the end.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel first iteration, because PCLMULQDQ stream, needs setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      // The first 64 bytes of each PCLMULQDQ stream simply seed its four
      // 128-bit partial CRC accumulators.
      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                             kPrefetchHorizon));
        }

        // We process each stream in 64 byte blocks. This can be written as
        // for (int i = 0; i < num_pclmul_streams; i++) {
        //   Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //   pclmul_streams[i] += 16 * 4;
        // }
        // for (int i = 0; i < num_crc_streams; i++) {
        //   l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //   crc_streams[i] += 16*4;
        // }
        // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
        // gives ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ based streams require special final step;
      // CRC based don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into single result.  Every stream covered
      // bs * 64 bytes, so one multiply by `magic` shifts the accumulated CRC
      // past exactly one stream's span.
      uint32_t magic = ComputeZeroConstant(bs * 64);
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    // Process whatever tail the stream split left over, 16 bytes at a time.
    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes
    while (p != e) {
      ABSL_INTERNAL_STEP1(l, p);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};
    584 
    585 }  // namespace
    586 
    587 // Intel processors with SSE4.2 have an instruction for one particular
    588 // 32-bit CRC polynomial:  crc32c
    589 CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
    590  CpuType type = GetCpuType();
    591  switch (type) {
    592    case CpuType::kIntelHaswell:
    593    case CpuType::kAmdRome:
    594    case CpuType::kAmdNaples:
    595    case CpuType::kAmdMilan:
    596      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    597          3, 1, CutoffStrategy::Fold3>();
    598    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    599    case CpuType::kIntelCascadelakeXeon:
    600    case CpuType::kIntelSkylakeXeon:
    601    case CpuType::kIntelBroadwell:
    602    case CpuType::kIntelSkylake:
    603      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    604          3, 2, CutoffStrategy::Fold3>();
    605    // PCLMULQDQ is slow, don't use it.
    606    case CpuType::kIntelIvybridge:
    607    case CpuType::kIntelSandybridge:
    608    case CpuType::kIntelWestmere:
    609      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    610          3, 0, CutoffStrategy::Fold3>();
    611    case CpuType::kArmNeoverseN1:
    612    case CpuType::kArmNeoverseN2:
    613    case CpuType::kArmNeoverseV1:
    614      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    615          1, 1, CutoffStrategy::Unroll64CRC>();
    616    case CpuType::kAmpereSiryn:
    617      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    618          3, 2, CutoffStrategy::Fold3>();
    619    case CpuType::kArmNeoverseV2:
    620      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    621          1, 2, CutoffStrategy::Unroll64CRC>();
    622 #if defined(__aarch64__)
    623    default:
    624      // Not all ARM processors support the needed instructions, so check here
    625      // before trying to use an accelerated implementation.
    626      if (SupportsArmCRC32PMULL()) {
    627        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    628            1, 1, CutoffStrategy::Unroll64CRC>();
    629      } else {
    630        return nullptr;
    631      }
    632 #else
    633    default:
    634      // Something else, play it safe and assume slow PCLMULQDQ.
    635      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
    636          3, 0, CutoffStrategy::Fold3>();
    637 #endif
    638  }
    639 }
    640 
    641 std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
    642  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
    643  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    644                    1, 0, CutoffStrategy::Fold3>>());
    645  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    646                    1, 1, CutoffStrategy::Fold3>>());
    647  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    648                    1, 2, CutoffStrategy::Fold3>>());
    649  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    650                    1, 3, CutoffStrategy::Fold3>>());
    651  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    652                    2, 0, CutoffStrategy::Fold3>>());
    653  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    654                    2, 1, CutoffStrategy::Fold3>>());
    655  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    656                    2, 2, CutoffStrategy::Fold3>>());
    657  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    658                    2, 3, CutoffStrategy::Fold3>>());
    659  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    660                    3, 0, CutoffStrategy::Fold3>>());
    661  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    662                    3, 1, CutoffStrategy::Fold3>>());
    663  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    664                    3, 2, CutoffStrategy::Fold3>>());
    665  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    666                    3, 3, CutoffStrategy::Fold3>>());
    667  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    668                    1, 0, CutoffStrategy::Unroll64CRC>>());
    669  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    670                    1, 1, CutoffStrategy::Unroll64CRC>>());
    671  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    672                    1, 2, CutoffStrategy::Unroll64CRC>>());
    673  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    674                    1, 3, CutoffStrategy::Unroll64CRC>>());
    675  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    676                    2, 0, CutoffStrategy::Unroll64CRC>>());
    677  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    678                    2, 1, CutoffStrategy::Unroll64CRC>>());
    679  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    680                    2, 2, CutoffStrategy::Unroll64CRC>>());
    681  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    682                    2, 3, CutoffStrategy::Unroll64CRC>>());
    683  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    684                    3, 0, CutoffStrategy::Unroll64CRC>>());
    685  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    686                    3, 1, CutoffStrategy::Unroll64CRC>>());
    687  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    688                    3, 2, CutoffStrategy::Unroll64CRC>>());
    689  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
    690                    3, 3, CutoffStrategy::Unroll64CRC>>());
    691 
    692  return ret;
    693 }
    694 
    695 #else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
    696 
    697 std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
    698  return std::vector<std::unique_ptr<CRCImpl>>();
    699 }
    700 
// No hardware acceleration available in this build configuration; callers
// fall back to a software implementation.
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }
    703 
    704 #endif
    705 
    706 }  // namespace crc_internal
    707 ABSL_NAMESPACE_END
    708 }  // namespace absl