crc.cc (15127B)
1 // Copyright 2022 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Implementation of CRCs (aka Rabin Fingerprints). 16 // Treats the input as a polynomial with coefficients in Z(2), 17 // and finds the remainder when divided by an irreducible polynomial 18 // of the appropriate length. 19 // It handles all CRC sizes from 8 to 128 bits. 20 // It's somewhat complicated by having separate implementations optimized for 21 // CRC's <=32 bits, <= 64 bits, and <= 128 bits. 22 // The input string is prefixed with a "1" bit, and has "degree" "0" bits 23 // appended to it before the remainder is found. This ensures that 24 // short strings are scrambled somewhat and that strings consisting 25 // of all nulls have a non-zero CRC. 26 // 27 // Uses the "interleaved word-by-word" method from 28 // "Everything we know about CRC but afraid to forget" by Andrew Kadatch 29 // and Bob Jenkins, 30 // http://crcutil.googlecode.com/files/crc-doc.1.0.pdf 31 // 32 // The idea is to compute kStride CRCs simultaneously, allowing the 33 // processor to more effectively use multiple execution units. Each of 34 // the CRCs is calculated on one word of data followed by kStride - 1 35 // words of zeroes; the CRC starting points are staggered by one word. 36 // Assuming a stride of 4 with data words "ABCDABCDABCD", the first 37 // CRC is over A000A000A, the second over 0B000B000B, and so on. 
// The CRC of the whole data is then calculated by properly aligning the
// CRCs by appending zeroes until the data lengths agree then XORing
// the CRCs.

#include "absl/crc/internal/crc.h"

#include <cstdint>

#include "absl/base/internal/endian.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/crc_internal.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

namespace {

// Constants

// x86 supports unaligned word loads, so Extend() can skip the byte-at-a-time
// alignment prologue there; other architectures align the pointer first.
#if defined(__i386__) || defined(__x86_64__)
constexpr bool kNeedAlignedLoads = false;
#else
constexpr bool kNeedAlignedLoads = true;
#endif

// We express the number of zeroes as a number in base ZEROES_BASE. By
// pre-computing the zero extensions for all possible components of such an
// expression (numbers in a form a*ZEROES_BASE**b), we can calculate the
// resulting extension by multiplying the extensions for individual components
// using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables of
// zero extensions contain (ZEROES_BASE - 1) * (log_{ZEROES_BASE}(64)) entries.
constexpr int ZEROES_BASE_LG = 4;                   // log_2(ZEROES_BASE)
constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG);  // must be a power of 2

// The CRC-32C (Castagnoli) generator polynomial in the bit-reflected
// representation used throughout this file.
constexpr uint32_t kCrc32cPoly = 0x82f63b78;

// Returns `bits` with its 32 bits in reverse order (bit 0 <-> bit 31).
// Swaps adjacent bits, then adjacent 2-bit groups, then nibbles within each
// byte; gbswap_32 then reverses the byte order to complete the reversal.
uint32_t ReverseBits(uint32_t bits) {
  bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1;
  bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2;
  bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4;
  return absl::gbswap_32(bits);
}

// Polynomial long multiplication mod the polynomial of degree 32.
// Multiplies `*val` by `m` in GF(2)[x] and reduces modulo the degree-32
// polynomial `poly` (a carry-less multiply with interleaved reduction).
// For each bit of `*val`, from the high bit down, the current partial
// product `m` is conditionally XORed in, and `m` is advanced by one step
// of polynomial long division.
void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) {
  uint32_t l = *val;
  uint32_t result = 0;
  auto onebit = uint32_t{0x80000000u};
  for (uint32_t one = onebit; one != 0; one >>= 1) {
    // Accumulate (XOR) the partial product for each set bit of `l`.
    if ((l & one) != 0) {
      result ^= m;
    }
    // Multiply `m` by x and take the remainder: one division step.
    if (m & 1) {
      m = (m >> 1) ^ poly;
    } else {
      m >>= 1;
    }
  }
  *val = result;
}
}  // namespace

// Fills `t` with `word_size` 256-entry tables: t[j][b] is the CRC extension
// contributed by byte value `b` at byte position `j` of a data word, so a
// whole word can be folded into a CRC with one lookup per byte. `last` seeds
// the entry for the top bit of the last byte; all other power-of-two entries
// are derived from it, and the rest follow by linearity.
void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size,
                            Uint32By256* t) {
  for (int j = 0; j != word_size; j++) {  // for each byte of extension....
    t[j][0] = 0;                          // a zero has no effect
    for (int i = 128; i != 0; i >>= 1) {  // fill in entries for powers of 2
      if (j == 0 && i == 128) {
        t[j][i] = last;  // top bit in last byte is given
      } else {
        // each successive power of two is derived from the previous
        // one, either in this table, or the last table
        uint32_t pred;
        if (i == 128) {
          pred = t[j - 1][1];
        } else {
          pred = t[j][i << 1];
        }
        // Advance the CRC by one bit (multiply by X, and take remainder
        // through one step of polynomial long division)
        if (pred & 1) {
          t[j][i] = (pred >> 1) ^ poly;
        } else {
          t[j][i] = pred >> 1;
        }
      }
    }
    // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b)
    // so we can make all the tables for non-powers of two by
    // xoring previously created entries.
    for (int i = 2; i != 256; i <<= 1) {
      for (int k = i + 1; k != (i << 1); k++) {
        t[j][k] = t[j][i] ^ t[j][k - i];
      }
    }
  }
}

// Fills t[0] with zero-extension multipliers: for each base-ZEROES_BASE digit
// position and each nonzero digit value `a`, one entry holds the polynomial
// that extends a CRC by a * ZEROES_BASE**b zero bytes. Returns the number of
// entries written (checked to be at most 256).
int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) {
  uint32_t inc = 1;
  inc <<= 31;

  // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0.
  inc >>= 1;

  // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte.
  // (Each PolyMultiply squares the extension, doubling the zero-run length.)
  for (int i = 0; i < 3; ++i) {
    PolyMultiply(&inc, inc, poly);
  }

  int j = 0;
  for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) {
    // Every entry in the table adds an additional inc_len zeroes.
    uint32_t v = inc;
    for (int a = 1; a != ZEROES_BASE; a++) {
      t[0][j] = v;
      PolyMultiply(&v, inc, poly);
      j++;
    }
    inc = v;
  }
  ABSL_RAW_CHECK(j <= 256, "");
  return j;
}

// Internal version of the "constructor".
CRCImpl* CRCImpl::NewInternal() {
  // Find an accelerated implementation first.
  CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined();

  // Fall back to generic implementations if no acceleration is available.
  if (result == nullptr) {
    result = new CRC32();
  }

  result->InitTables();

  return result;
}

// The 32-bit implementation

// Builds all lookup tables used by Extend/ExtendByZeroes/UnextendByZeroes:
// the single-byte table, the 4-byte-word swath tables, the zero-extension
// table, and bit-reversed variants for undoing zero extension.
void CRC32::InitTables() {
  // Compute the table for extending a CRC by one byte.
  // `t` is scratch space for table construction, freed below.
  Uint32By256* t = new Uint32By256[4];
  FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t);
  for (int i = 0; i != 256; i++) {
    this->table0_[i] = t[0][i];
  }

  // Construct a table for updating the CRC by 4 bytes data followed by
  // 12 bytes of zeroes.
  //
  // Note: the data word size could be larger than the CRC size; it might
  // be slightly faster to use a 64-bit data word, but doing so doubles the
  // table size.
  uint32_t last = kCrc32cPoly;
  const size_t size = 12;
  // Extend `last` through 12 zero bytes, one byte at a time.
  for (size_t i = 0; i < size; ++i) {
    last = (last >> 8) ^ this->table0_[last & 0xff];
  }
  FillWordTable(kCrc32cPoly, last, 4, t);
  for (size_t b = 0; b < 4; ++b) {
    for (int i = 0; i < 256; ++i) {
      this->table_[b][i] = t[b][i];
    }
  }

  int j = FillZeroesTable(kCrc32cPoly, t);
  ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->zeroes_)), "");
  for (int i = 0; i < j; i++) {
    this->zeroes_[i] = t[0][i];
  }

  delete[] t;

  // Build up tables for _reversing_ the operation of doing CRC operations on
  // zero bytes.

  // In C++, extending `crc` by a single zero bit is done by the following:
  // (A) bool low_bit_set = (crc & 1);
  //     crc >>= 1;
  //     if (low_bit_set) crc ^= kCrc32cPoly;
  //
  // In particular note that the high bit of `crc` after this operation will be
  // set if and only if the low bit of `crc` was set before it. This means that
  // no information is lost, and the operation can be reversed, as follows:
  // (B) bool high_bit_set = (crc & 0x80000000u);
  //     if (high_bit_set) crc ^= kCrc32cPoly;
  //     crc <<= 1;
  //     if (high_bit_set) crc ^= 1;
  //
  // Or, equivalently:
  // (C) bool high_bit_set = (crc & 0x80000000u);
  //     crc <<= 1;
  //     if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1);
  //
  // The last observation is, if we store our checksums in variable `rcrc`,
  // with order of the bits reversed, the inverse operation becomes:
  // (D) bool low_bit_set = (rcrc & 1);
  //     rcrc >>= 1;
  //     if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1)
  //
  // This is the same algorithm (A) that we started with, only with a different
  // polynomial bit pattern. This means that by building up our tables with
  // this alternate polynomial, we can apply the CRC algorithms to a
  // bit-reversed CRC checksum to perform inverse zero-extension.

  const uint32_t kCrc32cUnextendPoly =
      ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1));
  FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_);

  j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_);
  ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->reverse_zeroes_)),
                 "");
}

// Extends `*crc` by the `length` bytes at `bytes`, using the interleaved
// word-by-word method described in the file header: four staggered CRCs are
// advanced in parallel over 16-byte swaths, then combined at the end.
void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
  const uint8_t* p = static_cast<const uint8_t*>(bytes);
  const uint8_t* e = p + length;
  uint32_t l = *crc;

  // Fold a single input byte into the running CRC via the one-byte table.
  auto step_one_byte = [this, &p, &l]() {
    int c = (l & 0xff) ^ *p++;
    l = this->table0_[c] ^ (l >> 8);
  };

  if (kNeedAlignedLoads) {
    // point x at first 4-byte aligned byte in string. this might be past the
    // end of the string.
    const uint8_t* x = RoundUp<4>(p);
    if (x <= e) {
      // Process bytes until finished or p is 4-byte aligned
      while (p != x) {
        step_one_byte();
      }
    }
  }

  const size_t kSwathSize = 16;
  if (static_cast<size_t>(e - p) >= kSwathSize) {
    // Load one swath of data into the operating buffers.
    uint32_t buf0 = absl::little_endian::Load32(p) ^ l;
    uint32_t buf1 = absl::little_endian::Load32(p + 4);
    uint32_t buf2 = absl::little_endian::Load32(p + 8);
    uint32_t buf3 = absl::little_endian::Load32(p + 12);
    p += kSwathSize;

    // Increment a CRC value by a "swath"; this combines the four bytes
    // starting at `ptr` and twelve zero bytes, so that four CRCs can be
    // built incrementally and combined at the end.
    const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) {
      return absl::little_endian::Load32(ptr) ^
             this->table_[3][crc_in & 0xff] ^
             this->table_[2][(crc_in >> 8) & 0xff] ^
             this->table_[1][(crc_in >> 16) & 0xff] ^
             this->table_[0][crc_in >> 24];
    };

    // Run one CRC calculation step over all swaths in one 16-byte stride
    const auto step_stride = [&]() {
      buf0 = step_swath(buf0, p);
      buf1 = step_swath(buf1, p + 4);
      buf2 = step_swath(buf2, p + 8);
      buf3 = step_swath(buf3, p + 12);
      p += 16;
    };

    // Process kStride interleaved swaths through the data in parallel.
    while ((e - p) > kPrefetchHorizon) {
      PrefetchToLocalCacheNta(
          reinterpret_cast<const void*>(p + kPrefetchHorizon));
      // Process 64 bytes at a time
      step_stride();
      step_stride();
      step_stride();
      step_stride();
    }
    // Drain remaining full 16-byte strides without prefetching.
    while (static_cast<size_t>(e - p) >= kSwathSize) {
      step_stride();
    }

    // Now advance one word at a time as far as possible. This isn't worth
    // doing if we have word-advance tables.
    while (static_cast<size_t>(e - p) >= 4) {
      buf0 = step_swath(buf0, p);
      uint32_t tmp = buf0;
      buf0 = buf1;
      buf1 = buf2;
      buf2 = buf3;
      buf3 = tmp;
      p += 4;
    }

    // Combine the results from the different swaths. This is just a CRC
    // on the data values in the bufX words.
    auto combine_one_word = [this](uint32_t crc_in, uint32_t w) {
      w ^= crc_in;
      for (size_t i = 0; i < 4; ++i) {
        w = (w >> 8) ^ this->table0_[w & 0xff];
      }
      return w;
    };

    l = combine_one_word(0, buf0);
    l = combine_one_word(l, buf1);
    l = combine_one_word(l, buf2);
    l = combine_one_word(l, buf3);
  }

  // Process the last few bytes
  while (p != e) {
    step_one_byte();
  }

  *crc = l;
}

// Extends `*crc` by `length` zero bytes, in O(log(length)) polynomial
// multiplications, using the per-digit extension entries in `zeroes_table`
// and the byte-reduction entries in `poly_table`. Shared by both the forward
// (ExtendByZeroes) and reverse (UnextendByZeroes) directions.
void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length,
                               const uint32_t zeroes_table[256],
                               const uint32_t poly_table[256]) {
  if (length != 0) {
    uint32_t l = *crc;
    // For each ZEROES_BASE_LG bits in length
    // (after the low-order bits have been removed)
    // we lookup the appropriate polynomial in the zeroes_ array
    // and do a polynomial long multiplication (mod the CRC polynomial)
    // to extend the CRC by the appropriate number of bits.
    for (int i = 0; length != 0;
         i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) {
      int c = length & (ZEROES_BASE - 1);  // pick next ZEROES_BASE_LG bits
      if (c != 0) {  // if they are not zero,
                     // multiply by entry in table
        // Build a table to aid in multiplying 2 bits at a time.
        // It takes too long to build tables for more bits.
        uint64_t m = zeroes_table[c + i - 1];
        m <<= 1;
        uint64_t m2 = m << 1;
        uint64_t mtab[4] = {0, m, m2, m2 ^ m};

        // Do the multiply one byte at a time.
        uint64_t result = 0;
        for (int x = 0; x < 32; x += 8) {
          // The carry-less multiply.
          result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^
                    (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6);
          l >>= 8;

          // Reduce modulo the polynomial
          result = (result >> 8) ^ poly_table[result & 0xff];
        }
        l = static_cast<uint32_t>(result);
      }
    }
    *crc = l;
  }
}

void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const {
  return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_);
}

void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const {
  // See the comment in CRC32::InitTables() for an explanation of the algorithm
  // below: running the forward algorithm on the bit-reversed checksum with
  // the bit-reversed "unextend" tables inverts zero extension.
  *crc = ReverseBits(*crc);
  ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_);
  *crc = ReverseBits(*crc);
}

void CRC32::Scramble(uint32_t* crc) const {
  // Rotate by near half the word size plus 1. See the scramble comment in
  // crc_internal.h for an explanation.
  constexpr int scramble_rotate = (32 / 2) + 1;
  *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo),
                               32, scramble_rotate) &
         MaskOfLength<uint32_t>(32);
}

// Inverse of Scramble(): rotate the other way, then subtract kScrambleLo.
void CRC32::Unscramble(uint32_t* crc) const {
  constexpr int scramble_rotate = (32 / 2) + 1;
  uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32,
                                           32 - scramble_rotate);
  *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32);
}

// Constructor and destructor for base class CRC.
CRC::~CRC() {}
CRC::CRC() {}

// The "constructor" for a CRC32C with a standard polynomial.
// The singleton is intentionally never destroyed.
CRC* CRC::Crc32c() {
  static CRC* singleton = CRCImpl::NewInternal();
  return singleton;
}

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl