crc_x86_arm_combined.cc (28689B)
1 // Copyright 2022 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Hardware accelerated CRC32 computation on Intel and ARM architecture. 16 17 #include <cstddef> 18 #include <cstdint> 19 #include <memory> 20 #include <vector> 21 22 #include "absl/base/attributes.h" 23 #include "absl/base/config.h" 24 #include "absl/base/internal/endian.h" 25 #include "absl/base/prefetch.h" 26 #include "absl/crc/internal/cpu_detect.h" 27 #include "absl/crc/internal/crc32_x86_arm_combined_simd.h" 28 #include "absl/crc/internal/crc_internal.h" 29 #include "absl/memory/memory.h" 30 #include "absl/numeric/bits.h" 31 32 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \ 33 defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) 34 #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C 35 #endif 36 37 namespace absl { 38 ABSL_NAMESPACE_BEGIN 39 namespace crc_internal { 40 41 #if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C) 42 43 // Implementation details not exported outside of file 44 namespace { 45 46 // Some machines have CRC acceleration hardware. 47 // We can do a faster version of Extend() on such machines. 
// CRC32C implementation backed by the CPU's dedicated CRC32C instructions
// (SSE4.2 crc32 on x86, the CRC32C* instructions on ARMv8). Adds a fast
// ExtendByZeroes on top of the base CRC32 implementation.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
 public:
  CRC32AcceleratedX86ARMCombined() {}
  ~CRC32AcceleratedX86ARMCombined() override {}
  // Updates *crc as if `length` zero bytes had been appended to the input.
  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
  // Returns a multiplier (see multiply() below) equivalent to appending
  // `length` zero bytes. The lowest 2 bits of `length` are ignored here;
  // ExtendByZeroes handles them with individual CRC instructions.
  uint32_t ComputeZeroConstant(size_t length) const;

 private:
  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
      delete;
  CRC32AcceleratedX86ARMCombined& operator=(
      const CRC32AcceleratedX86ARMCombined&) = delete;
};

// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;

// Helper macros: fold 1, 2, 4 or 8 bytes from p into crc with the hardware
// CRC32C instruction, advancing p past the consumed bytes. The BY2/BY3
// variants advance multiple independent streams in lockstep so the CRC
// instructions can pipeline.
#define ABSL_INTERNAL_STEP1(crc, p)                   \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc, p)                                            \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc, p)                                            \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, p)                                            \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p)); \
    p += 8;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)

namespace {

// Carry-less multiplication of `a` and `b` modulo the CRC32C polynomial,
// in the bit-reflected domain used by the crc32 instructions. Used to
// combine independent CRC streams.
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 power = V128_From64WithZeroFill(a);
  V128 crc = V128_From64WithZeroFill(b);
  V128 res = V128_PMulLow(power, crc);

  // Combine crc values.
  //
  // Adding res to itself is equivalent to multiplying by 2,
  // or shifting left by 1. Addition is used as not all compilers
  // are able to generate optimal code without this hint.
  // https://godbolt.org/z/rr3fMnf39
  res = V128_Add64(res, res);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

// Powers of crc32c polynomial, for faster ExtendByZeros.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};

}  // namespace

// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
  // Lowest 2 bits are handled separately in ExtendByZeroes
  length >>= 2;

  // Peel the lowest set bit, then fold in one table entry per remaining
  // set bit of length: each entry extends by 2**n zero words.
  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
  length &= length - 1;

  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
    prev = multiply(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
}

void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
  switch (length & 3) {
    case 0:
      break;
    case 1:
      val = CRC32_u8(val, 0);
      break;
    case 2:
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
}

// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll loop by 4.
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};

// Strategy used for medium-sized inputs (see Extend below).
enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};

// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
  // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
  // high latency, so we run 4 128-bit partial checksums that can be reduced to
  // a single value by FinalizePclmulStream later. Computing crc for arbitrary
  // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC
  // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to CRC32C polynomial.
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    // Multiplicands that fold each 128-bit remainder forward across 512 bits
    // (one 64-byte block); see kFoldAcross512Bits below.
    V128 loopMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(kFoldAcross512Bits));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // NOTE: the PMulHi/Load/PMulLow/Xor statements below are deliberately
    // interleaved across all four lanes to hide PCLMULQDQ latency; keep the
    // ordering if editing.
    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    // Fold the shifted remainders and XOR in the new 64 bytes of data.
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce partialCRC produced by Process64BytesPclmul into a single value,
  // that represents crc checksum of all the processed bytes.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    // First fold lanes 1->3 and 2->4 across 256 bits.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(kFoldAcross256Bits));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC3);

    low = V128_PMulLow(reductionMultiplicands, partialCRC2);
    high = V128_PMulHi(reductionMultiplicands, partialCRC2);

    partialCRC2 = V128_Xor(low, high);
    partialCRC2 = V128_Xor(partialCRC2, partialCRC4);

    // Then fold the two remaining 128-bit remainders into one.
    reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(kFoldAcross128Bits));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC2);

    // Reduce fullCRC into scalar value: feed both 64-bit halves through
    // the hardware CRC instruction.
    uint32_t crc = 0;
    crc = CRC32_u64(crc, V128_Extract64<0>(fullCRC));
    crc = CRC32_u64(crc, V128_Extract64<1>(fullCRC));
    return crc;
  }

  // Update crc with 64 bytes of data from p.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    // 8 unrolled CRC32_u64 steps cover one 64-byte block.
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Constants generated by './scripts/gen-crc-consts.py x86_pclmul
  // crc32_lsb_0x82f63b78' from the Linux kernel.
  alignas(16) static constexpr uint64_t kFoldAcross512Bits[2] = {
      // (x^543 mod G) * x^32
      0x00000000740eef02,
      // (x^479 mod G) * x^32
      0x000000009e4addf8};
  alignas(16) static constexpr uint64_t kFoldAcross256Bits[2] = {
      // (x^287 mod G) * x^32
      0x000000003da6d0cb,
      // (x^223 mod G) * x^32
      0x00000000ba4fc28e};
  alignas(16) static constexpr uint64_t kFoldAcross128Bits[2] = {
      // (x^159 mod G) * x^32
      0x00000000f20c0dfe,
      // (x^95 mod G) * x^32
      0x00000000493c7d27};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
  // size. Each group is CRCed in parallel then combined at the end of the
  // block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined in the end.
  static constexpr size_t kMaxStreams = 3;
};

// Full CRC implementation parameterized by how many parallel CRC-instruction
// streams and PCLMULQDQ streams to use for large inputs, and which strategy
// to use for medium inputs.
template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;

    // We have dedicated instruction for 1,2,4 and 8 bytes.
    // Peel off the low bits of length first so the remainder is a multiple
    // of 16.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l, p);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l, p);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l, p);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now multiple of 16.

    // For small blocks just run simple loop, because cost of combining multiple
    // streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // Intel paper above. Running 4th stream doesn't help, because crc
    // instruction has latency 3 and throughput 1.
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        // Split the buffer into kGroupsSmall equal runs of bs 32-byte blocks,
        // CRCed in parallel through p, p1, p2.
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on last 8 bytes.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);

        // Combine the three streams: multiply streams 0 and 1 by the
        // shift-by-(bs blocks) constants from kClmulConstants, XOR with the
        // final 8 bytes of stream 2, and finish with one CRC step.
        V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From64WithZeroFill(l64);

        V128 res1 = V128_PMulLow(tmp, magic);

        tmp = V128_From64WithZeroFill(l641);

        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data, we can ignore combine costs and run all
      // requested streams (num_crc_streams + num_pclmul_streams),
      // using prefetch. CRC and PCLMULQDQ use different cpu execution units,
      // so on some cpus it makes sense to execute both of them for different
      // streams.

      // Point x at first 8-byte aligned byte in string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      while (p != x) {
        ABSL_INTERNAL_STEP1(l, p);
      }

      // bs is the number of 64-byte blocks per stream.
      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }

      // Per stream crc sums.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel first iteration, because PCLMULQDQ stream, needs setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      // Seed each PCLMULQDQ stream's 4 partial remainders with its first
      // 64 bytes of data.
      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                             kPrefetchHorizon));
        }

        // We process each stream in 64 byte blocks. This can be written as
        // for (int i = 0; i < num_pclmul_streams; i++) {
        //   Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //   pclmul_streams[i] += 16 * 4;
        // }
        // for (int i = 0; i < num_crc_streams; i++) {
        //   l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //   crc_streams[i] += 16*4;
        // }
        // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
        // gives ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ based streams require special final step;
      // CRC based don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into single result. Each multiply by `magic`
      // shifts the running crc forward by one stream's worth (bs * 64) of
      // zero bytes before XORing in the next stream.
      uint32_t magic = ComputeZeroConstant(bs * 64);
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    // Process the tail left over after stream division.
    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes
    while (p != e) {
      ABSL_INTERNAL_STEP1(l, p);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};

}  // namespace

// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial: crc32c
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
  CpuType type = GetCpuType();
  // Stream counts per CPU were presumably chosen by benchmarking; see the
  // cutoff comments above.
  switch (type) {
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 1, CutoffStrategy::Fold3>();
      // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelSkylake:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
      // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
    case CpuType::kArmNeoverseN2:
    case CpuType::kArmNeoverseV1:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
    case CpuType::kAmpereSiryn:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseV2:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 2, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
            1, 1, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
#else
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
#endif
  }
}

// Returns one instance for every (crc streams, pclmul streams, strategy)
// combination, so callers (e.g. benchmarks/tests) can compare all variants.
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Unroll64CRC>>());

  return ret;
}

#else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

// Fallbacks when neither ARM nor x86 SIMD CRC support is compiled in.
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  return std::vector<std::unique_ptr<CRCImpl>>();
}

// no hardware acceleration available
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl