snappy.cc (100564B)
// Copyright 2005 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "snappy-internal.h"
#include "snappy-sinksource.h"
#include "snappy.h"

#if !defined(SNAPPY_HAVE_BMI2)
// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
// specifically, but it does define __AVX2__ when AVX2 support is available.
// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
//
// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
// case issuing BMI2 instructions results in a compiler error.
#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
#define SNAPPY_HAVE_BMI2 1
#else
#define SNAPPY_HAVE_BMI2 0
#endif
#endif  // !defined(SNAPPY_HAVE_BMI2)

#if !defined(SNAPPY_HAVE_X86_CRC32)
#if defined(__SSE4_2__)
#define SNAPPY_HAVE_X86_CRC32 1
#else
#define SNAPPY_HAVE_X86_CRC32 0
#endif
#endif  // !defined(SNAPPY_HAVE_X86_CRC32)

#if !defined(SNAPPY_HAVE_NEON_CRC32)
#if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)
#define SNAPPY_HAVE_NEON_CRC32 1
#else
#define SNAPPY_HAVE_NEON_CRC32 0
#endif
#endif  // !defined(SNAPPY_HAVE_NEON_CRC32)

#if SNAPPY_HAVE_BMI2 || SNAPPY_HAVE_X86_CRC32
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <immintrin.h>
#elif SNAPPY_HAVE_NEON_CRC32
#include <arm_acle.h>
#endif

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace snappy {

namespace {

// The number of slop bytes that writers rely on for unconditional copies.
constexpr int kSlopBytes = 64;

using internal::char_table;
using internal::COPY_1_BYTE_OFFSET;
using internal::COPY_2_BYTE_OFFSET;
using internal::COPY_4_BYTE_OFFSET;
using internal::kMaximumTagLength;
using internal::LITERAL;
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
using internal::V128;
using internal::V128_Load;
using internal::V128_LoadU;
using internal::V128_Shuffle;
using internal::V128_StoreU;
using internal::V128_DupChar;
#endif

// We translate the information encoded in a tag through a lookup table to a
// format that requires fewer instructions to decode. Effectively we store
// the length minus the tag part of the offset. The least significant byte
// thus stores the length, while the total length minus offset is given by
// entry - ExtractOffset(type). The nice thing is that the subtraction
// immediately sets the flags for the necessary check that offset >= length.
// This folds the cmp with the sub. We engineer the long literals and copy-4
// tags to always fail this check, so their presence doesn't affect the fast
// path. To prevent literals from triggering the guard against offset < length
// (offset does not apply to literals) the table gives them a spurious offset
// of 256.
inline constexpr int16_t MakeEntry(int16_t len, int16_t offset) {
  return len - (offset << 8);
}

inline constexpr int16_t LengthMinusOffset(int data, int type) {
  return type == 3   ? 0xFF                                  // copy-4 (or type == 3)
         : type == 2 ? MakeEntry(data + 1, 0)                // copy-2
         : type == 1 ? MakeEntry((data & 7) + 4, data >> 3)  // copy-1
         : data < 60 ? MakeEntry(data + 1, 1)                // note spurious offset.
                     : 0xFF;                                 // long literal
}

inline constexpr int16_t LengthMinusOffset(uint8_t tag) {
  return LengthMinusOffset(tag >> 2, tag & 3);
}

template <size_t... Ints>
struct index_sequence {};

template <std::size_t N, size_t... Is>
struct make_index_sequence : make_index_sequence<N - 1, N - 1, Is...> {};

template <size_t... Is>
struct make_index_sequence<0, Is...> : index_sequence<Is...> {};

template <size_t... seq>
constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
  return std::array<int16_t, 256>{LengthMinusOffset(seq)...};
}

alignas(64) const std::array<int16_t, 256> kLengthMinusOffset =
    MakeTable(make_index_sequence<256>{});
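
// Worked example of the table encoding above (illustrative only, not used by
// the codec): the copy-1 tag byte 0x15 has type 1, so its entry encodes
// length ((0x15 >> 2) & 7) + 4 == 9 and tag-offset bits (0x15 >> 5) == 0,
// i.e. MakeEntry(9, 0) == 9; the remaining offset bits come from the next
// byte in the stream.
static_assert(LengthMinusOffset(0x15) == MakeEntry(9, 0),
              "tag table example is consistent with the encoders above");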

// Given a table of uint16_t whose size is mask / 2 + 1, return a pointer to the
// relevant entry, if any, for the given bytes. Any hash function will do,
// but a good hash function reduces the number of collisions and thus yields
// better compression for compressible input.
//
// REQUIRES: mask is 2 * (table_size - 1), and table_size is a power of two.
inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) {
  // Our choice is quicker-and-dirtier than the typical hash function;
  // empirically, that seems beneficial. The upper bits of kMagic * bytes are a
  // higher-quality hash than the lower bits, so when using kMagic * bytes we
  // also shift right to get a higher-quality end result. There's no similar
  // issue with a CRC because all of the output bits of a CRC are equally good
  // "hashes." So, a CPU instruction for CRC, if available, tends to be a good
  // choice.
#if SNAPPY_HAVE_NEON_CRC32
  // We use mask as the second arg to the CRC function, as it's about to
  // be used anyway; it'd be equally correct to use 0 or some constant.
  // Mathematically, _mm_crc32_u32 (or similar) is a function of the
  // xor of its arguments.
  const uint32_t hash = __crc32cw(bytes, mask);
#elif SNAPPY_HAVE_X86_CRC32
  const uint32_t hash = _mm_crc32_u32(bytes, mask);
#else
  constexpr uint32_t kMagic = 0x1e35a7bd;
  const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);
#endif
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
                                     (hash & mask));
}

inline uint16_t* TableEntry4ByteMatch(uint16_t* table, uint32_t bytes,
                                      uint32_t mask) {
  constexpr uint32_t kMagic = 2654435761U;
  const uint32_t hash = (kMagic * bytes) >> (32 - kMaxHashTableBits);
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
                                     (hash & mask));
}

inline uint16_t* TableEntry8ByteMatch(uint16_t* table, uint64_t bytes,
                                      uint32_t mask) {
  constexpr uint64_t kMagic = 58295818150454627ULL;
  const uint32_t hash = (kMagic * bytes) >> (64 - kMaxHashTableBits);
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
                                     (hash & mask));
}

}  // namespace

size_t MaxCompressedLength(size_t source_bytes) {
  // Compressed data can be defined as:
  //    compressed := item* literal*
  //    item       := literal* copy
  //
  // The trailing literal sequence has a space blowup of at most 62/60
  // since a literal of length 60 needs one tag byte + one extra byte
  // for length information.
  //
  // Item blowup is trickier to measure. Suppose the "copy" op copies
  // 4 bytes of data. Because of a special check in the encoding code,
  // we produce a 4-byte copy only if the offset is < 65536. Therefore
  // the copy op takes 3 bytes to encode, and this type of item leads
  // to at most the 62/60 blowup for representing literals.
  //
  // Suppose the "copy" op copies 5 bytes of data. If the offset is big
  // enough, it will take 5 bytes to encode the copy op. Therefore the
  // worst case here is a one-byte literal followed by a five-byte copy.
  // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
  //
  // This last factor dominates the blowup, so the final estimate is:
  return 32 + source_bytes + source_bytes / 6;
}
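
// Worked example of the bound above (illustrative only): a pathological input
// that alternates a one-byte literal with a five-byte copy turns every 6 input
// bytes into 7 output bytes, so a 6000-byte input needs at most
// 32 + 6000 + 6000 / 6 == 7032 bytes of output.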

namespace {

void UnalignedCopy64(const void* src, void* dst) {
  char tmp[8];
  std::memcpy(tmp, src, 8);
  std::memcpy(dst, tmp, 8);
}

void UnalignedCopy128(const void* src, void* dst) {
  // std::memcpy() gets vectorized when the appropriate compiler options are
  // used. For example, x86 compilers targeting SSE2+ will optimize to an SSE2
  // load and store.
  char tmp[16];
  std::memcpy(tmp, src, 16);
  std::memcpy(dst, tmp, 16);
}

template <bool use_16bytes_chunk>
inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
  if (use_16bytes_chunk) {
    UnalignedCopy128(src, dst);
  } else {
    UnalignedCopy64(src, dst);
    UnalignedCopy64(src + 8, dst + 8);
  }
}

// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
// for handling COPY operations where the input and output regions may overlap.
// For example, suppose:
//    src       == "ab"
//    op        == src + 2
//    op_limit  == op + 20
// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
// copies of "ab"
//    ababababababababababab
// Note that this does not match the semantics of either std::memcpy() or
// std::memmove().
inline char* IncrementalCopySlow(const char* src, char* op,
                                 char* const op_limit) {
  // TODO: Remove pragma when LLVM is aware this
  // function is only called in cold regions and when cold regions don't get
  // vectorized or unrolled.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
  while (op < op_limit) {
    *op++ = *src++;
  }
  return op_limit;
}

#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

// Computes the bytes for shuffle control mask (please read comments on
// 'pattern_generation_masks' as well) for the given index_offset and
// pattern_size. For example, when the 'offset' is 6, it will generate a
// repeating pattern of size 6. So, the first 16 byte indexes will correspond to
// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the
// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3,
// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
template <size_t... indexes>
inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
    int index_offset, int pattern_size, index_sequence<indexes...>) {
  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
}

// Computes the shuffle control mask bytes array for given pattern-sizes and
// returns an array.
template <size_t... pattern_sizes_minus_one>
inline constexpr std::array<std::array<char, sizeof(V128)>,
                            sizeof...(pattern_sizes_minus_one)>
MakePatternMaskBytesTable(int index_offset,
                          index_sequence<pattern_sizes_minus_one...>) {
  return {
      MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
                           make_index_sequence</*indexes=*/sizeof(V128)>())...};
}

// This is an array of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
                                 16> pattern_generation_masks =
    MakePatternMaskBytesTable(
        /*index_offset=*/0,
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());

// Similar to 'pattern_generation_masks', this table is used to "rotate" the
// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
// Basically, pattern_reshuffle_masks is a continuation of
// pattern_generation_masks. It follows that pattern_reshuffle_masks is the
// same as pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
                                 16> pattern_reshuffle_masks =
    MakePatternMaskBytesTable(
        /*index_offset=*/16,
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
  V128 generation_mask = V128_Load(reinterpret_cast<const V128*>(
      pattern_generation_masks[pattern_size - 1].data()));
  // Uninitialized bytes are masked out by the shuffle mask.
  // TODO: remove annotation and macro defs once MSan is fixed.
  SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
  return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
                      generation_mask);
}

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
  V128 pattern = LoadPattern(src, pattern_size);

  // This mask will generate the next 16 bytes in-place. Doing so enables us to
  // write data by at most 4 V128_StoreU.
  //
  // For example, suppose pattern is:        abcdefabcdefabcd
  // Shuffling with this mask will generate: efabcdefabcdefab
  // Shuffling again will generate:          cdefabcdefabcdef
  V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>(
      pattern_reshuffle_masks[pattern_size - 1].data()));
  return {pattern, reshuffle_mask};
}

#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
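
// Worked example of the two mask tables (illustrative only): for
// pattern_size == 3, pattern_generation_masks[2] is
// {0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0} and pattern_reshuffle_masks[2] is
// {1,2,0,1,2,0,1,2,0,1,2,0,1,2,0,1}, so a load starting at "abc" shuffles to
// "abcabcabcabcabca" and reshuffling that once yields "bcabcabcabcabcab",
// i.e. the next 16 bytes of the repeating pattern.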

// Fallback for when we need to copy while extending the pattern, for example
// copying 10 bytes from 3 positions back abc -> abcabcabcabca.
//
// REQUIRES: [dst - offset, dst + 64) is a valid address range.
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
  if (SNAPPY_PREDICT_TRUE(offset <= 16)) {
    switch (offset) {
      case 0:
        return false;
      case 1: {
        // TODO: Ideally we should memset, move back once the
        // codegen issues are fixed.
        V128 pattern = V128_DupChar(dst[-1]);
        for (int i = 0; i < 4; i++) {
          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
        }
        return true;
      }
      case 2:
      case 4:
      case 8:
      case 16: {
        V128 pattern = LoadPattern(dst - offset, offset);
        for (int i = 0; i < 4; i++) {
          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
        }
        return true;
      }
      default: {
        auto pattern_and_reshuffle_mask =
            LoadPatternAndReshuffleMask(dst - offset, offset);
        V128 pattern = pattern_and_reshuffle_mask.first;
        V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
        for (int i = 0; i < 4; i++) {
          V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
          pattern = V128_Shuffle(pattern, reshuffle_mask);
        }
        return true;
      }
    }
  }
#else
  if (SNAPPY_PREDICT_TRUE(offset < 16)) {
    if (SNAPPY_PREDICT_FALSE(offset == 0)) return false;
    // Extend the pattern to the first 16 bytes.
    // The simpler formulation of `dst[i - offset]` induces undefined behavior.
    for (int i = 0; i < 16; i++) dst[i] = (dst - offset)[i];
    // Find a multiple of pattern >= 16.
    static std::array<uint8_t, 16> pattern_sizes = []() {
      std::array<uint8_t, 16> res;
      for (int i = 1; i < 16; i++) res[i] = (16 / i + 1) * i;
      return res;
    }();
    offset = pattern_sizes[offset];
    for (int i = 1; i < 4; i++) {
      std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
    }
    return true;
  }
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

  // Very rare.
  for (int i = 0; i < 4; i++) {
    std::memcpy(dst + i * 16, dst + i * 16 - offset, 16);
  }
  return true;
}

// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
// IncrementalCopySlow. buf_limit is the address past the end of the writable
// region of the buffer.
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                             char* const buf_limit) {
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
  constexpr int big_pattern_size_lower_bound = 16;
#else
  constexpr int big_pattern_size_lower_bound = 8;
#endif

  // Terminology:
  //
  //   slop = buf_limit - op
  //   pat  = op - src
  //   len  = op_limit - op
  assert(src < op);
  assert(op < op_limit);
  assert(op_limit <= buf_limit);
  // NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
  assert(op_limit - op <= 64);
  // NOTE: In practice the compressor always emits len >= 4, so it is ok to
  // assume that to optimize this function, but this is not guaranteed by the
  // compression format, so we have to also handle len < 4 in case the input
  // does not satisfy these conditions.

  size_t pattern_size = op - src;
  // The cases are split into different branches to allow the branch predictor,
  // FDO, and static prediction hints to work better. For each input we list the
  // ratio of invocations that match each condition.
  //
  // input         slop < 16   pat < 8   len > 16
  // ---------------------------------------------
  // html|html4|cp    0%         1.01%     27.73%
  // urls             0%         0.88%     14.79%
  // jpg              0%        64.29%      7.14%
  // pdf              0%         2.56%     58.06%
  // txt[1-4]         0%         0.23%      0.97%
  // pb               0%         0.96%     13.88%
  // bin              0.01%     22.27%     41.17%
  //
  // It is very rare that we don't have enough slop for doing block copies. It
  // is also rare that we need to expand a pattern. Small patterns are common
  // for incompressible formats and for those we are plenty fast already.
  // Lengths are normally not greater than 16 but they vary depending on the
  // input. In general if we always predict len <= 16 it would be an ok
  // prediction.
  //
  // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
  // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
  // time.

  // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
  // bytes.
  if (pattern_size < big_pattern_size_lower_bound) {
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
    // Load the first eight bytes into a 128-bit XMM register, then use PSHUFB
    // to permute the register's contents in-place into a repeating sequence of
    // the first "pattern_size" bytes.
    // For example, suppose:
    //    src       == "abc"
    //    op        == src + 3
    // After V128_Shuffle(), "pattern" will have five copies of "abc"
    // followed by one byte of slop: abcabcabcabcabca.
    //
    // The non-SSE fallback implementation suffers from store-forwarding stalls
    // because its loads and stores partly overlap. By expanding the pattern
    // in-place, we avoid the penalty.

    // Typically, the op_limit is the gating factor so try to simplify the loop
    // based on that.
    if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
      auto pattern_and_reshuffle_mask =
          LoadPatternAndReshuffleMask(src, pattern_size);
      V128 pattern = pattern_and_reshuffle_mask.first;
      V128 reshuffle_mask = pattern_and_reshuffle_mask.second;

      // There is at least one, and at most four 16-byte blocks. Writing four
      // conditionals instead of a loop allows FDO to lay out the code with
      // respect to the actual probabilities of each length.
      // TODO: Replace with loop with trip count hint.
      V128_StoreU(reinterpret_cast<V128*>(op), pattern);

      if (op + 16 < op_limit) {
        pattern = V128_Shuffle(pattern, reshuffle_mask);
        V128_StoreU(reinterpret_cast<V128*>(op + 16), pattern);
      }
      if (op + 32 < op_limit) {
        pattern = V128_Shuffle(pattern, reshuffle_mask);
        V128_StoreU(reinterpret_cast<V128*>(op + 32), pattern);
      }
      if (op + 48 < op_limit) {
        pattern = V128_Shuffle(pattern, reshuffle_mask);
        V128_StoreU(reinterpret_cast<V128*>(op + 48), pattern);
      }
      return op_limit;
    }
    char* const op_end = buf_limit - 15;
    if (SNAPPY_PREDICT_TRUE(op < op_end)) {
      auto pattern_and_reshuffle_mask =
          LoadPatternAndReshuffleMask(src, pattern_size);
      V128 pattern = pattern_and_reshuffle_mask.first;
      V128 reshuffle_mask = pattern_and_reshuffle_mask.second;

      // This code path is relatively cold however so we save code size
      // by avoiding unrolling and vectorizing.
      //
      // TODO: Remove pragma when cold regions don't get
      // vectorized or unrolled.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
      do {
        V128_StoreU(reinterpret_cast<V128*>(op), pattern);
        pattern = V128_Shuffle(pattern, reshuffle_mask);
        op += 16;
      } while (SNAPPY_PREDICT_TRUE(op < op_end));
    }
    return IncrementalCopySlow(op - pattern_size, op, op_limit);
#else   // !SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
    // If plenty of buffer space remains, expand the pattern to at least 8
    // bytes. The way the following loop is written, we need 8 bytes of buffer
    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
    // bytes if pattern_size is 2. Precisely encoding that is probably not
    // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
    // (because 11 are required in the worst case).
    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
      while (pattern_size < 8) {
        UnalignedCopy64(src, op);
        op += pattern_size;
        pattern_size *= 2;
      }
      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
    } else {
      return IncrementalCopySlow(src, op, op_limit);
    }
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
  }
  assert(pattern_size >= big_pattern_size_lower_bound);
  constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;

  // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can
  // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.
  // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes
  // guarantees that op - src >= 8.
  //
  // Typically, the op_limit is the gating factor so try to simplify the loop
  // based on that.
  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
    // There is at least one, and at most four 16-byte blocks. Writing four
    // conditionals instead of a loop allows FDO to lay out the code with
    // respect to the actual probabilities of each length.
    // TODO: Replace with loop with trip count hint.
    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
    if (op + 16 < op_limit) {
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
    }
    if (op + 32 < op_limit) {
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
    }
    if (op + 48 < op_limit) {
      ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
    }
    return op_limit;
  }

  // Fall back to doing as much as we can with the available slop in the
  // buffer. This code path is relatively cold however so we save code size by
  // avoiding unrolling and vectorizing.
  //
  // TODO: Remove pragma when cold regions don't get vectorized
  // or unrolled.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
  for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
    ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
  }
  if (op >= op_limit) return op_limit;

  // We only take this branch if we didn't have enough slop and we can do a
  // single 8 byte copy.
  if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
    UnalignedCopy64(src, op);
    src += 8;
    op += 8;
  }
  return IncrementalCopySlow(src, op, op_limit);
}

}  // namespace

template <bool allow_fast_path>
static inline char* EmitLiteral(char* op, const char* literal, int len) {
  // The vast majority of copies are below 16 bytes, for which a
  // call to std::memcpy() is overkill. This fast path can sometimes
  // copy up to 15 bytes too much, but that is okay in the
  // main loop, since we have a bit to go on for both sides:
  //
  //   - The input will always have kInputMarginBytes = 15 extra
  //     available bytes, as long as we're in the main loop, and
  //     if not, allow_fast_path = false.
  //   - The output will always have 32 spare bytes (see
  //     MaxCompressedLength).
  assert(len > 0);  // Zero-length literals are disallowed
  int n = len - 1;
  if (allow_fast_path && len <= 16) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);

    UnalignedCopy128(literal, op);
    return op + len;
  }

  if (n < 60) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);
  } else {
    int count = (Bits::Log2Floor(n) >> 3) + 1;
    assert(count >= 1);
    assert(count <= 4);
    *op++ = LITERAL | ((59 + count) << 2);
    // Encode in upcoming bytes.
    // Write 4 bytes, though we may care about only 1 of them. The output buffer
    // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
    // here and there is a std::memcpy() of size 'len' below.
    LittleEndian::Store32(op, n);
    op += count;
  }
  // When allow_fast_path is true, we can overwrite up to 16 bytes.
  if (allow_fast_path) {
    char* destination = op;
    const char* source = literal;
    const char* end = destination + len;
    do {
      std::memcpy(destination, source, 16);
      destination += 16;
      source += 16;
    } while (destination < end);
  } else {
    std::memcpy(op, literal, len);
  }
  return op + len;
}

template <bool len_less_than_12>
static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
  assert(len <= 64);
  assert(len >= 4);
  assert(offset < 65536);
  assert(len_less_than_12 == (len < 12));

  if (len_less_than_12) {
    uint32_t u = (len << 2) + (offset << 8);
    uint32_t copy1 = COPY_1_BYTE_OFFSET - (4 << 2) + ((offset >> 3) & 0xe0);
    uint32_t copy2 = COPY_2_BYTE_OFFSET - (1 << 2);
    // It turns out that offset < 2048 is a difficult-to-predict branch;
    // `perf record` shows it accounting for the highest percentage of branch
    // misses in benchmarks. The code below is branch free: the data dependency
    // chain that bottlenecks the throughput is so long that a few extra
    // instructions are completely free (IPC << 6 because of data deps).
    u += offset < 2048 ? copy1 : copy2;
    LittleEndian::Store32(op, u);
    op += offset < 2048 ? 2 : 3;
  } else {
    // Write 4 bytes, though we only care about 3 of them. The output buffer
    // is required to have some slack, so the extra byte won't overrun it.
    uint32_t u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
    LittleEndian::Store32(op, u);
    op += 3;
  }
  return op;
}

template <bool len_less_than_12>
static inline char* EmitCopy(char* op, size_t offset, size_t len) {
  assert(len_less_than_12 == (len < 12));
  if (len_less_than_12) {
    return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
  } else {
    // A special case for len <= 64 might help, but so far measurements suggest
    // it's in the noise.

    // Emit 64 byte copies but make sure to keep at least four bytes reserved.
    while (SNAPPY_PREDICT_FALSE(len >= 68)) {
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
      len -= 64;
    }

    // One or two copies will now finish the job.
    if (len > 64) {
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
      len -= 60;
    }

    // Emit remainder.
    if (len < 12) {
      op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
    } else {
      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
    }
    return op;
  }
}
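
// Worked example of the copy encodings above (illustrative only, the values
// are not taken from any particular input): EmitCopyAtMost64 with len == 7
// and offset == 100 takes the branch-free copy-1 path and stores the two
// bytes 0x0d 0x64 (tag 0b000'011'01: offset high bits 0, len - 4 == 3,
// type 1, followed by the low offset byte 100). A copy with offset >= 2048
// or len >= 12 instead emits a three-byte copy-2 tag with a 16-bit offset.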

bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  uint32_t v = 0;
  const char* limit = start + n;
  if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
    *result = v;
    return true;
  } else {
    return false;
  }
}

namespace {
uint32_t CalculateTableSize(uint32_t input_size) {
  static_assert(
      kMaxHashTableSize >= kMinHashTableSize,
      "kMaxHashTableSize should be greater than or equal to kMinHashTableSize.");
  if (input_size > kMaxHashTableSize) {
    return kMaxHashTableSize;
  }
  if (input_size < kMinHashTableSize) {
    return kMinHashTableSize;
  }
  // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
  // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
  return 2u << Bits::Log2Floor(input_size - 1);
}
}  // namespace

namespace internal {
WorkingMemory::WorkingMemory(size_t input_size) {
  const size_t max_fragment_size = std::min(input_size, kBlockSize);
  const size_t table_size = CalculateTableSize(max_fragment_size);
  size_ = table_size * sizeof(*table_) + max_fragment_size +
          MaxCompressedLength(max_fragment_size);
  mem_ = std::allocator<char>().allocate(size_);
  table_ = reinterpret_cast<uint16_t*>(mem_);
  input_ = mem_ + table_size * sizeof(*table_);
  output_ = input_ + max_fragment_size;
}

WorkingMemory::~WorkingMemory() {
  std::allocator<char>().deallocate(mem_, size_);
}

uint16_t* WorkingMemory::GetHashTable(size_t fragment_size,
                                      int* table_size) const {
  const size_t htsize = CalculateTableSize(fragment_size);
  memset(table_, 0, htsize * sizeof(*table_));
  *table_size = htsize;
  return table_;
}
}  // end namespace internal

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input" is at most "kBlockSize" bytes long.
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input.size())" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
namespace internal {
char* CompressFragment(const char* input, size_t input_size, char* op,
                       uint16_t* table, const int table_size) {
  // "ip" is the input pointer, and "op" is the output pointer.
  const char* ip = input;
  assert(input_size <= kBlockSize);
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
  const uint32_t mask = 2 * (table_size - 1);
  const char* ip_end = input + input_size;
  const char* base_ip = ip;

  const size_t kInputMarginBytes = 15;
  if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
    const char* ip_limit = input + input_size - kInputMarginBytes;

    for (uint32_t preload = LittleEndian::Load32(ip + 1);;) {
      // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
      // [next_emit, ip_end) after the main loop.
      const char* next_emit = ip++;
      uint64_t data = LittleEndian::Load64(ip);
      // The body of this loop calls EmitLiteral once and then EmitCopy one or
      // more times. (The exception is that when we're close to exhausting
      // the input we goto emit_remainder.)
      //
      // In the first iteration of this loop we're just starting, so
      // there's nothing to copy, so calling EmitLiteral once is
      // necessary. And we only start a new iteration when the
      // current iteration has determined that a call to EmitLiteral will
      // precede the next call to EmitCopy (if any).
      //
      // Step 1: Scan forward in the input looking for a 4-byte-long match.
      // If we get close to exhausting the input then goto emit_remainder.
      //
      // Heuristic match skipping: If 32 bytes are scanned with no matches
      // found, start looking only at every other byte. If 32 more bytes are
      // scanned (or skipped), look at every third byte, etc. When a match is
      // found, immediately go back to looking at every byte. This is a small
      // loss (~5% performance, ~0.1% density) for compressible data due to more
      // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
      // win since the compressor quickly "realizes" the data is incompressible
      // and doesn't bother looking for matches everywhere.
      //
      // The "skip" variable keeps track of how many bytes there are since the
      // last match; dividing it by 32 (i.e. right-shifting by five) gives the
      // number of bytes to move ahead for each iteration.
      uint32_t skip = 32;

      const char* candidate;
      if (ip_limit - ip >= 16) {
        auto delta = ip - base_ip;
        for (int j = 0; j < 4; ++j) {
          for (int k = 0; k < 4; ++k) {
            int i = 4 * j + k;
            // These for-loops are meant to be unrolled. So we can freely
            // special case the first iteration to use the value already
            // loaded in preload.
            uint32_t dword = i == 0 ? preload : static_cast<uint32_t>(data);
            assert(dword == LittleEndian::Load32(ip + i));
            uint16_t* table_entry = TableEntry(table, dword, mask);
            candidate = base_ip + *table_entry;
            assert(candidate >= base_ip);
            assert(candidate < ip + i);
            *table_entry = delta + i;
            if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) {
              *op = LITERAL | (i << 2);
              UnalignedCopy128(next_emit, op + 1);
              ip += i;
              op = op + i + 2;
              goto emit_match;
            }
            data >>= 8;
          }
          data = LittleEndian::Load64(ip + 4 * j + 4);
        }
        ip += 16;
        skip += 16;
      }
      while (true) {
        assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
        uint16_t* table_entry = TableEntry(table, data, mask);
        uint32_t bytes_between_hash_lookups = skip >> 5;
        skip += bytes_between_hash_lookups;
        const char* next_ip = ip + bytes_between_hash_lookups;
        if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
          ip = next_emit;
          goto emit_remainder;
        }
        candidate = base_ip + *table_entry;
        assert(candidate >= base_ip);
        assert(candidate < ip);

        *table_entry = ip - base_ip;
        if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
                                 LittleEndian::Load32(candidate))) {
          break;
        }
        data = LittleEndian::Load32(next_ip);
        ip = next_ip;
      }

      // Step 2: A 4-byte match has been found. We'll later see if more
      // than 4 bytes match. But, prior to the match, input
      // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
      assert(next_emit + 16 <= ip_end);
      op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);

      // Step 3: Call EmitCopy, and then see if another EmitCopy could
      // be our next move. Repeat until we find no match for the
      // input immediately after what was consumed by the last EmitCopy call.
      //
      // If we exit this loop normally then we need to call EmitLiteral next,
      // though we don't yet know how big the literal will be. We handle that
      // by proceeding to the next iteration of the main loop. We also can exit
      // this loop via goto if we get close to exhausting the input.
    emit_match:
      do {
        // We have a 4-byte match at ip, and no need to emit any
        // "literal bytes" prior to ip.
        const char* base = ip;
        std::pair<size_t, bool> p =
            FindMatchLength(candidate + 4, ip + 4, ip_end, &data);
        size_t matched = 4 + p.first;
        ip += matched;
        size_t offset = base - candidate;
        assert(0 == memcmp(base, candidate, matched));
        if (p.second) {
          op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
        } else {
          op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
        }
        if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
          goto emit_remainder;
        }
        // Expect 5 bytes to match
        assert((data & 0xFFFFFFFFFF) ==
               (LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
        // We are now looking for a 4-byte match again. We read
        // table[Hash(ip, mask)] for that. To improve compression,
        // we also update table[Hash(ip - 1, mask)] and table[Hash(ip, mask)].
        *TableEntry(table, LittleEndian::Load32(ip - 1), mask) =
            ip - base_ip - 1;
        uint16_t* table_entry = TableEntry(table, data, mask);
        candidate = base_ip + *table_entry;
        *table_entry = ip - base_ip;
        // Measurements on the benchmarks have shown the following probabilities
        // for the loop to exit (i.e. the avg. number of iterations is the
        // reciprocal).
        // BM_Flat/6  txt1    p = 0.3-0.4
        // BM_Flat/7  txt2    p = 0.35
        // BM_Flat/8  txt3    p = 0.3-0.4
        // BM_Flat/9  txt3    p = 0.34-0.4
        // BM_Flat/10 pb      p = 0.4
        // BM_Flat/11 gaviota p = 0.1
        // BM_Flat/12 cp      p = 0.5
        // BM_Flat/13 c       p = 0.3
      } while (static_cast<uint32_t>(data) == LittleEndian::Load32(candidate));
      // Because the least significant 5 bytes matched, we can utilize data
      // for the next iteration.
      preload = data >> 8;
    }
  }

emit_remainder:
  // Emit the remaining bytes as a literal
  if (ip < ip_end) {
    op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);
  }

  return op;
}
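
// Illustrative driver for the fragment compressor above (a sketch, not one of
// the entry points defined in this file; the real callers wire this up through
// the Source/Sink plumbing further below). `src`, `src_len` and
// `compressed_buf` are assumed to be caller-provided, with `compressed_buf`
// at least MaxCompressedLength(src_len) bytes and src_len <= kBlockSize:
//
//   WorkingMemory wmem(src_len);
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(src_len, &table_size);
//   char* end = CompressFragment(src, src_len, compressed_buf, table,
//                                table_size);
//   size_t compressed_len = end - compressed_buf;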

char* CompressFragmentDoubleHash(const char* input, size_t input_size, char* op,
                                 uint16_t* table, const int table_size,
                                 uint16_t* table2, const int table_size2) {
  (void)table_size2;
  assert(table_size == table_size2);
  // "ip" is the input pointer, and "op" is the output pointer.
  const char* ip = input;
  assert(input_size <= kBlockSize);
  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
  const uint32_t mask = 2 * (table_size - 1);
  const char* ip_end = input + input_size;
  const char* base_ip = ip;

  const size_t kInputMarginBytes = 15;
  if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
    const char* ip_limit = input + input_size - kInputMarginBytes;

    for (;;) {
      const char* next_emit = ip++;
      uint64_t data = LittleEndian::Load64(ip);
      uint32_t skip = 512;

      const char* candidate;
      uint32_t candidate_length;
      while (true) {
        assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
        uint16_t* table_entry2 = TableEntry8ByteMatch(table2, data, mask);
        uint32_t bytes_between_hash_lookups = skip >> 9;
        skip++;
        const char* next_ip = ip + bytes_between_hash_lookups;
        if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
          ip = next_emit;
          goto emit_remainder;
        }
        candidate = base_ip + *table_entry2;
        assert(candidate >= base_ip);
        assert(candidate < ip);

        *table_entry2 = ip - base_ip;
        if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
                                 LittleEndian::Load32(candidate))) {
          candidate_length =
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
          break;
        }

        uint16_t* table_entry = TableEntry4ByteMatch(table, data, mask);
        candidate = base_ip + *table_entry;
        assert(candidate >= base_ip);
        assert(candidate < ip);

        *table_entry = ip - base_ip;
        if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
                                 LittleEndian::Load32(candidate))) {
          candidate_length =
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
          table_entry2 =
              TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask);
          auto candidate2 = base_ip + *table_entry2;
          size_t candidate_length2 =
              FindMatchLengthPlain(candidate2, ip + 1, ip_end);
          if (candidate_length2 > candidate_length) {
            *table_entry2 = ip - base_ip;
            candidate = candidate2;
            candidate_length = candidate_length2;
            ++ip;
          }
          break;
        }
        data = LittleEndian::Load64(next_ip);
        ip = next_ip;
      }
      // Backtrack to the point it matches fully.
      while (ip > next_emit && candidate > base_ip &&
             *(ip - 1) == *(candidate - 1)) {
        --ip;
        --candidate;
        ++candidate_length;
      }
      *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 1), mask) =
          ip - base_ip + 1;
      *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip + 2), mask) =
          ip - base_ip + 2;
      *TableEntry4ByteMatch(table, LittleEndian::Load32(ip + 1), mask) =
          ip - base_ip + 1;
      // Step 2: A 4-byte or 8-byte match has been found.
      // We'll later see if more than 4 bytes match. But, prior to the match,
      // input bytes [next_emit, ip) are unmatched. Emit them as
      // "literal bytes."
      assert(next_emit + 16 <= ip_end);
      if (ip - next_emit > 0) {
        op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit,
                                                   ip - next_emit);
      }
      // Step 3: Call EmitCopy, and then see if another EmitCopy could
      // be our next move. Repeat until we find no match for the
      // input immediately after what was consumed by the last EmitCopy call.
      //
      // If we exit this loop normally then we need to call EmitLiteral next,
      // though we don't yet know how big the literal will be. We handle that
      // by proceeding to the next iteration of the main loop. We also can exit
      // this loop via goto if we get close to exhausting the input.
      do {
        // We have a 4-byte match at ip, and no need to emit any
        // "literal bytes" prior to ip.
        const char* base = ip;
        ip += candidate_length;
        size_t offset = base - candidate;
        if (candidate_length < 12) {
          op =
              EmitCopy</*len_less_than_12=*/true>(op, offset, candidate_length);
        } else {
          op = EmitCopy</*len_less_than_12=*/false>(op, offset,
                                                    candidate_length);
        }
        if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
          goto emit_remainder;
        }
        // We are now looking for a 4-byte match again. We read
        // table[Hash(ip, mask)] for that. To improve compression,
        // we also update several previous table entries.
        if (ip - base_ip > 7) {
          *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 7), mask) =
              ip - base_ip - 7;
          *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 4), mask) =
              ip - base_ip - 4;
        }
        *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 3), mask) =
            ip - base_ip - 3;
        *TableEntry8ByteMatch(table2, LittleEndian::Load64(ip - 2), mask) =
            ip - base_ip - 2;
        *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 2), mask) =
            ip - base_ip - 2;
        *TableEntry4ByteMatch(table, LittleEndian::Load32(ip - 1), mask) =
            ip - base_ip - 1;

        uint16_t* table_entry =
            TableEntry8ByteMatch(table2, LittleEndian::Load64(ip), mask);
        candidate = base_ip + *table_entry;
        *table_entry = ip - base_ip;
        if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) {
          candidate_length =
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
          continue;
        }
        table_entry =
            TableEntry4ByteMatch(table, LittleEndian::Load32(ip), mask);
        candidate = base_ip + *table_entry;
        *table_entry = ip - base_ip;
        if (LittleEndian::Load32(ip) == LittleEndian::Load32(candidate)) {
          candidate_length =
              FindMatchLengthPlain(candidate + 4, ip + 4, ip_end) + 4;
          continue;
        }
        break;
      } while (true);
    }
  }

emit_remainder:
  // Emit the remaining bytes as a literal
  if (ip < ip_end) {
    op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);
  }

  return op;
}
}  // end namespace internal

static inline void Report(int token, const char *algorithm,
                          size_t compressed_size, size_t uncompressed_size) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  (void)token;
  (void)algorithm;
  (void)compressed_size;
  (void)uncompressed_size;
}

// Signature of output types needed by decompression code.
// The decompression code is templatized on a type that obeys this
// signature so that we do not pay virtual function call overhead in
// the middle of a tight decompression loop.
//
// class DecompressionWriter {
//  public:
//   // Called before decompression
//   void SetExpectedLength(size_t length);
//
//   // For performance a writer may choose to donate the cursor variable to the
//   // decompression function. The decompression will inject it in all its
//   // function calls to the writer. Keeping the important output cursor as a
//   // function local stack variable allows the compiler to keep it in
//   // register, which greatly aids performance by avoiding loads and stores of
//   // this variable in the fast path loop iterations.
//   T GetOutputPtr() const;
//
//   // At end of decompression the loop donates the ownership of the cursor
//   // variable back to the writer by calling this function.
//   void SetOutputPtr(T op);
//
//   // Called after decompression
//   bool CheckLength() const;
//
//   // Called repeatedly during decompression
//   // Each function gets a pointer to the op (output pointer), that the writer
//   // can use and update. Note it's important that these functions get fully
//   // inlined so that no actual address of the local variable needs to be
//   // taken.
//   bool Append(const char* ip, size_t length, T* op);
//   bool AppendFromSelf(uint32_t offset, size_t length, T* op);
//
//   // The rules for how TryFastAppend differs from Append are somewhat
//   // convoluted:
//   //
//   //  - TryFastAppend is allowed to decline (return false) at any
//   //    time, for any reason -- just "return false" would be
//   //    a perfectly legal implementation of TryFastAppend.
//   //    The intention is for TryFastAppend to allow a fast path
//   //    in the common case of a small append.
//   //  - TryFastAppend is allowed to read up to <available> bytes
//   //    from the input buffer, whereas Append is allowed to read
//   //    <length>. However, if it returns true, it must leave
//   //    at least five (kMaximumTagLength) bytes in the input buffer
//   //    afterwards, so that there is always enough space to read the
//   //    next tag without checking for a refill.
//   //  - TryFastAppend must always decline (return false)
//   //    if <length> is 61 or more, as in this case the literal length is not
//   //    decoded fully. In practice, this should not be a big problem,
//   //    as it is unlikely that one would implement a fast path accepting
//   //    this much data.
//   //
//   bool TryFastAppend(const char* ip, size_t available, size_t length, T* op);
// };

static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) {
  assert(n >= 0);
  assert(n <= 4);
#if SNAPPY_HAVE_BMI2
  return _bzhi_u32(v, 8 * n);
#else
  // This needs to be wider than uint32_t otherwise `mask << 32` will be
  // undefined.
  uint64_t mask = 0xffffffff;
  return v & ~(mask << (8 * n));
#endif
}

static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
  assert(shift < 32);
  static const uint8_t masks[] = {
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
  return (value & masks[shift]) != 0;
}
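
// Worked examples for the two helpers above (illustrative only):
// ExtractLowBytes(0xaabbccdd, 2) keeps the low two bytes and returns
// 0x0000ccdd, on both the BMI2 and the portable path. LeftShiftOverflows
// checks whether `value << shift` loses bits when accumulated into a 32-bit
// result: with shift == 28 only the low 4 bits of value fit, so the mask is
// 0xf0 and any value >= 0x10 overflows.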

inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  (void)dst;
  return offset != 0;
}

// Copies between size bytes and 64 bytes from src to dest. size cannot exceed
// 64. More than size bytes, but never exceeding 64, might be copied if doing
// so gives better performance. [src, src + size) must not overlap with
// [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).
void MemCopy64(char* dst, const void* src, size_t size) {
  // Always copy this many bytes. If that's below size then copy the full 64.
  constexpr int kShortMemCopy = 32;

  assert(size <= 64);
  assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
                                        dst) ||
         std::less_equal<const void*>()(dst + size, src));

  // We know that src and dst are at least size bytes apart. However, because we
  // might copy more than size bytes the copy still might overlap past size.
  // E.g. if src and dst appear consecutively in memory (src + size >= dst).
  // TODO: Investigate wider copies on other platforms.
#if defined(__x86_64__) && defined(__AVX__)
  assert(kShortMemCopy <= 32);
  __m256i data = _mm256_lddqu_si256(static_cast<const __m256i *>(src));
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), data);
  // Profiling shows that nearly all copies are short.
  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
    data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
  }
#else
  std::memmove(dst, src, kShortMemCopy);
  // Profiling shows that nearly all copies are short.
  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
    std::memmove(dst + kShortMemCopy,
                 static_cast<const uint8_t*>(src) + kShortMemCopy,
                 64 - kShortMemCopy);
  }
#endif
}

void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  (void)dst;
  (void)src;
  (void)size;
}

void ClearDeferred(const void** deferred_src, size_t* deferred_length,
                   uint8_t* safe_source) {
  *deferred_src = safe_source;
  *deferred_length = 0;
}

void DeferMemCopy(const void** deferred_src, size_t* deferred_length,
                  const void* src, size_t length) {
  *deferred_src = src;
  *deferred_length = length;
}

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
  const uint8_t*& ip = *ip_p;
  // This section is crucial for the throughput of the decompression loop.
  // The latency of an iteration is fundamentally constrained by the
  // following data chain on ip.
  // ip -> c = Load(ip) -> delta1 = (c & 3)        -> ip += delta1 or delta2
  //                       delta2 = ((c >> 2) + 1)    ip++
  // This is different from X86 optimizations because ARM has conditional add
  // instruction (csinc) and it removes several register moves.
  const size_t tag_type = *tag & 3;
  const bool is_literal = (tag_type == 0);
  if (is_literal) {
    size_t next_literal_tag = (*tag >> 2) + 1;
    *tag = ip[next_literal_tag];
    ip += next_literal_tag + 1;
  } else {
    *tag = ip[tag_type];
    ip += tag_type + 1;
  }
  return tag_type;
}

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
  const uint8_t*& ip = *ip_p;
  // This section is crucial for the throughput of the decompression loop.
  // The latency of an iteration is fundamentally constrained by the
  // following data chain on ip.
  // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
  //                       ip2 = ip + 2 + (c >> 2)
  // This amounts to 8 cycles.
  // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
  size_t literal_len = *tag >> 2;
  size_t tag_type = *tag;
  bool is_literal;
#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
  // TODO clang misses the fact that the (c & 3) already correctly
  // sets the zero flag.
  asm("and $3, %k[tag_type]\n\t"
      : [tag_type] "+r"(tag_type), "=@ccz"(is_literal)
      :: "cc");
#else
  tag_type &= 3;
  is_literal = (tag_type == 0);
#endif
  // TODO
  // This code is subtle. Loading the values first and then cmov has less
  // latency than cmov ip and then load. However clang would move the loads
  // in an optimization phase, volatile prevents this transformation.
  // Note that we have enough slop bytes (64) that the loads are always valid.
  size_t tag_literal =
      static_cast<const volatile uint8_t*>(ip)[1 + literal_len];
  size_t tag_copy = static_cast<const volatile uint8_t*>(ip)[tag_type];
  *tag = is_literal ? tag_literal : tag_copy;
  const uint8_t* ip_copy = ip + 1 + tag_type;
  const uint8_t* ip_literal = ip + 2 + literal_len;
  ip = is_literal ? ip_literal : ip_copy;
#if defined(__GNUC__) && defined(__x86_64__)
  // TODO Clang is "optimizing" zero-extension (a totally free
  // operation); this means that after the cmov of tag, it emits another movzb
  // tag, byte(tag). It really matters as it's on the core chain. This dummy
  // asm persuades clang to do the zero-extension at the load (it's automatic),
  // removing the expensive movzb.
  asm("" ::"r"(tag_copy));
#endif
  return tag_type;
}

// Extracts the offset for copy-1 and copy-2; returns 0 for literals or copy-4.
inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
  // For x86 non-static storage works better. For ARM static storage is better.
  // TODO: Once the array is recognized as a register, improve the
  // readability for x86.
#if defined(__x86_64__)
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
  uint16_t result;
  memcpy(&result,
         reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
         sizeof(result));
  return val & result;
#elif defined(__aarch64__)
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
  return val & static_cast<uint32_t>(
                   (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
#else
  static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};
  return val & kExtractMasks[tag_type];
#endif
};

// Core decompression loop, when there is enough data available.
// Decompresses the input buffer [ip, ip_limit) into the output buffer
// [op, op_limit_min_slop). Returns when either we are too close to the end
// of the input buffer, or we exceed op_limit_min_slop, or when an exceptional
// tag is encountered (literal of length > 60) or a copy-4.
// Returns {ip, op} at the points it stopped decoding.
// TODO This function probably does not need to be inlined, as it
// should decode large chunks at a time. This allows runtime dispatch to
// implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte memcpy).
template <typename T>
std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
    const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,
    ptrdiff_t op_limit_min_slop) {
  // If deferred_src is invalid point it here.
  uint8_t safe_source[64];
  const void* deferred_src;
  size_t deferred_length;
  ClearDeferred(&deferred_src, &deferred_length, safe_source);

  // We unroll the inner loop twice so we need twice the spare room.
  op_limit_min_slop -= kSlopBytes;
  if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) {
    const uint8_t* const ip_limit_min_slop = ip_limit - 2 * kSlopBytes - 1;
    ip++;
    // ip points just past the tag and we are touching at maximum kSlopBytes
    // in an iteration.
    size_t tag = ip[-1];
#if defined(__clang__) && defined(__aarch64__)
    // Workaround for https://bugs.llvm.org/show_bug.cgi?id=51317
    // when loading 1 byte, clang for aarch64 doesn't realize that it(ldrb)
    // comes with free zero-extension, so clang generates another
    // 'and xn, xm, 0xff' before it use that as the offset. This 'and' is
    // redundant and can be removed by adding this dummy asm, which gives
    // clang a hint that we're doing the zero-extension at the load.
    asm("" ::"r"(tag));
#endif
    do {
      // The throughput is limited by instructions; unrolling the inner loop
      // twice reduces the number of instructions checking limits and also
      // leads to fewer mov's.

      SNAPPY_PREFETCH(ip + 128);
      for (int i = 0; i < 2; i++) {
        const uint8_t* old_ip = ip;
        assert(tag == ip[-1]);
        // For literals tag_type = 0, hence we will always obtain 0 from
        // ExtractOffset. For literals offset will thus be kLiteralOffset.
        ptrdiff_t len_minus_offset = kLengthMinusOffset[tag];
        uint32_t next;
#if defined(__aarch64__)
        size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);
        // We never need more than 16 bits. Doing a Load16 allows the compiler
        // to elide the masking operation in ExtractOffset.
        next = LittleEndian::Load16(old_ip);
#else
        size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);
        next = LittleEndian::Load32(old_ip);
#endif
        size_t len = len_minus_offset & 0xFF;
        ptrdiff_t extracted = ExtractOffset(next, tag_type);
        ptrdiff_t len_min_offset = len_minus_offset - extracted;
        if (SNAPPY_PREDICT_FALSE(len_minus_offset > extracted)) {
          if (SNAPPY_PREDICT_FALSE(len & 0x80)) {
            // Exceptional case (long literal or copy 4).
            // Actually doing the copy here is negatively impacting the main
            // loop due to compiler incorrectly allocating a register for
            // this fallback. Hence we just break.
          break_loop:
            ip = old_ip;
            goto exit;
          }
          // Only copy-1 or copy-2 tags can get here.
          assert(tag_type == 1 || tag_type == 2);
          std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;
          // Guard against copies before the buffer start.
          // Execute any deferred MemCopy since we write to dst here.
          MemCopy64(op_base + op, deferred_src, deferred_length);
          op += deferred_length;
          ClearDeferred(&deferred_src, &deferred_length, safe_source);
          if (SNAPPY_PREDICT_FALSE(delta < 0 ||
                                   !Copy64BytesWithPatternExtension(
                                       op_base + op, len - len_min_offset))) {
            goto break_loop;
          }
          // We aren't deferring this copy so add length right away.
1455 op += len;
1456 continue;
1457 }
1458 std::ptrdiff_t delta = (op + deferred_length) + len_min_offset - len;
1459 if (SNAPPY_PREDICT_FALSE(delta < 0)) {
1460 // Because literals are given a spurious offset (256), this will trigger
1461 // at the start of a block, while op is still smaller than 256.
1462 if (tag_type != 0) goto break_loop;
1463 MemCopy64(op_base + op, deferred_src, deferred_length);
1464 op += deferred_length;
1465 DeferMemCopy(&deferred_src, &deferred_length, old_ip, len);
1466 continue;
1467 }
1468
1469 // For copies we need to copy from op_base + delta, for literals
1470 // we need to copy from ip instead of from the stream.
1471 const void* from =
1472 tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
1473 MemCopy64(op_base + op, deferred_src, deferred_length);
1474 op += deferred_length;
1475 DeferMemCopy(&deferred_src, &deferred_length, from, len);
1476 }
1477 } while (ip < ip_limit_min_slop &&
1478 static_cast<ptrdiff_t>(op + deferred_length) < op_limit_min_slop);
1479 exit:
1480 ip--;
1481 assert(ip <= ip_limit);
1482 }
1483 // If we deferred a copy then we can perform it now. If we are up to date
1484 // then we might not have enough slop bytes and could run past the end.
1485 if (deferred_length) {
1486 MemCopy64(op_base + op, deferred_src, deferred_length);
1487 op += deferred_length;
1488 ClearDeferred(&deferred_src, &deferred_length, safe_source);
1489 }
1490 return {ip, op};
1491 }
1492
1493 // Helper class for decompression
1494 class SnappyDecompressor {
1495 private:
1496 Source* reader_; // Underlying source of bytes to decompress
1497 const char* ip_; // Points to next buffered byte
1498 const char* ip_limit_; // Points just past buffered bytes
1499 // If ip < ip_limit_min_maxtaglen_ it's safe to read kMaximumTagLength from
1500 // buffer.
1501 const char* ip_limit_min_maxtaglen_;
1502 uint64_t peeked_; // Bytes peeked from reader (need to skip)
1503 bool eof_; // Hit end of input without an error?
1504 char scratch_[kMaximumTagLength]; // See RefillTag().
1505
1506 // Ensure that all of the tag metadata for the next tag is available
1507 // in [ip_..ip_limit_-1]. Also ensures that [ip,ip+4] is readable even
1508 // if (ip_limit_ - ip_ < 5).
1509 //
1510 // Returns true on success, false on error or end of input.
1511 bool RefillTag();
1512
1513 void ResetLimit(const char* ip) {
1514 ip_limit_min_maxtaglen_ =
1515 ip_limit_ - std::min<ptrdiff_t>(ip_limit_ - ip, kMaximumTagLength - 1);
1516 }
1517
1518 public:
1519 explicit SnappyDecompressor(Source* reader)
1520 : reader_(reader), ip_(NULL), ip_limit_(NULL), peeked_(0), eof_(false) {}
1521
1522 ~SnappyDecompressor() {
1523 // Advance past any bytes we peeked at from the reader
1524 reader_->Skip(peeked_);
1525 }
1526
1527 // Returns true iff we have hit the end of the input without an error.
1528 bool eof() const { return eof_; }
1529
1530 // Read the uncompressed length stored at the start of the compressed data.
1531 // On success, stores the length in *result and returns true.
1532 // On failure, returns false.
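// For example, an uncompressed length of 64 is stored as the single byte
// 0x40, while 2097150 (0x1FFFFE) is stored as the three bytes 0xFE 0xFF 0x7F:
// each byte carries 7 payload bits, least-significant group first, and a set
// high bit means another byte follows.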
1533 bool ReadUncompressedLength(uint32_t* result) { 1534 assert(ip_ == NULL); // Must not have read anything yet 1535 // Length is encoded in 1..5 bytes 1536 *result = 0; 1537 uint32_t shift = 0; 1538 while (true) { 1539 if (shift >= 32) return false; 1540 size_t n; 1541 const char* ip = reader_->Peek(&n); 1542 if (n == 0) return false; 1543 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip)); 1544 reader_->Skip(1); 1545 uint32_t val = c & 0x7f; 1546 if (LeftShiftOverflows(static_cast<uint8_t>(val), shift)) return false; 1547 *result |= val << shift; 1548 if (c < 128) { 1549 break; 1550 } 1551 shift += 7; 1552 } 1553 return true; 1554 } 1555 1556 // Process the next item found in the input. 1557 // Returns true if successful, false on error or end of input. 1558 template <class Writer> 1559 #if defined(__GNUC__) && defined(__x86_64__) 1560 __attribute__((aligned(32))) 1561 #endif 1562 void 1563 DecompressAllTags(Writer* writer) { 1564 const char* ip = ip_; 1565 ResetLimit(ip); 1566 auto op = writer->GetOutputPtr(); 1567 // We could have put this refill fragment only at the beginning of the loop. 1568 // However, duplicating it at the end of each branch gives the compiler more 1569 // scope to optimize the <ip_limit_ - ip> expression based on the local 1570 // context, which overall increases speed. 1571 #define MAYBE_REFILL() \ 1572 if (SNAPPY_PREDICT_FALSE(ip >= ip_limit_min_maxtaglen_)) { \ 1573 ip_ = ip; \ 1574 if (SNAPPY_PREDICT_FALSE(!RefillTag())) goto exit; \ 1575 ip = ip_; \ 1576 ResetLimit(ip); \ 1577 } \ 1578 preload = static_cast<uint8_t>(*ip) 1579 1580 // At the start of the for loop below the least significant byte of preload 1581 // contains the tag. 1582 uint32_t preload; 1583 MAYBE_REFILL(); 1584 for (;;) { 1585 { 1586 ptrdiff_t op_limit_min_slop; 1587 auto op_base = writer->GetBase(&op_limit_min_slop); 1588 if (op_base) { 1589 auto res = 1590 DecompressBranchless(reinterpret_cast<const uint8_t*>(ip), 1591 reinterpret_cast<const uint8_t*>(ip_limit_), 1592 op - op_base, op_base, op_limit_min_slop); 1593 ip = reinterpret_cast<const char*>(res.first); 1594 op = op_base + res.second; 1595 MAYBE_REFILL(); 1596 } 1597 } 1598 const uint8_t c = static_cast<uint8_t>(preload); 1599 ip++; 1600 1601 // Ratio of iterations that have LITERAL vs non-LITERAL for different 1602 // inputs. 1603 // 1604 // input LITERAL NON_LITERAL 1605 // ----------------------------------- 1606 // html|html4|cp 23% 77% 1607 // urls 36% 64% 1608 // jpg 47% 53% 1609 // pdf 19% 81% 1610 // txt[1-4] 25% 75% 1611 // pb 24% 76% 1612 // bin 24% 76% 1613 if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) { 1614 size_t literal_length = (c >> 2) + 1u; 1615 if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length, &op)) { 1616 assert(literal_length < 61); 1617 ip += literal_length; 1618 // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend() 1619 // will not return true unless there's already at least five spare 1620 // bytes in addition to the literal. 1621 preload = static_cast<uint8_t>(*ip); 1622 continue; 1623 } 1624 if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) { 1625 // Long literal. 
1626 const size_t literal_length_length = literal_length - 60; 1627 literal_length = 1628 ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) + 1629 1; 1630 ip += literal_length_length; 1631 } 1632 1633 size_t avail = ip_limit_ - ip; 1634 while (avail < literal_length) { 1635 if (!writer->Append(ip, avail, &op)) goto exit; 1636 literal_length -= avail; 1637 reader_->Skip(peeked_); 1638 size_t n; 1639 ip = reader_->Peek(&n); 1640 avail = n; 1641 peeked_ = avail; 1642 if (avail == 0) goto exit; 1643 ip_limit_ = ip + avail; 1644 ResetLimit(ip); 1645 } 1646 if (!writer->Append(ip, literal_length, &op)) goto exit; 1647 ip += literal_length; 1648 MAYBE_REFILL(); 1649 } else { 1650 if (SNAPPY_PREDICT_FALSE((c & 3) == COPY_4_BYTE_OFFSET)) { 1651 const size_t copy_offset = LittleEndian::Load32(ip); 1652 const size_t length = (c >> 2) + 1; 1653 ip += 4; 1654 1655 if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit; 1656 } else { 1657 const ptrdiff_t entry = kLengthMinusOffset[c]; 1658 preload = LittleEndian::Load32(ip); 1659 const uint32_t trailer = ExtractLowBytes(preload, c & 3); 1660 const uint32_t length = entry & 0xff; 1661 assert(length > 0); 1662 1663 // copy_offset/256 is encoded in bits 8..10. By just fetching 1664 // those bits, we get copy_offset (since the bit-field starts at 1665 // bit 8). 1666 const uint32_t copy_offset = trailer - entry + length; 1667 if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit; 1668 1669 ip += (c & 3); 1670 // By using the result of the previous load we reduce the critical 1671 // dependency chain of ip to 4 cycles. 1672 preload >>= (c & 3) * 8; 1673 if (ip < ip_limit_min_maxtaglen_) continue; 1674 } 1675 MAYBE_REFILL(); 1676 } 1677 } 1678 #undef MAYBE_REFILL 1679 exit: 1680 writer->SetOutputPtr(op); 1681 } 1682 }; 1683 1684 constexpr uint32_t CalculateNeeded(uint8_t tag) { 1685 return ((tag & 3) == 0 && tag >= (60 * 4)) 1686 ? (tag >> 2) - 58 1687 : (0x05030201 >> ((tag * 8) & 31)) & 0xFF; 1688 } 1689 1690 #if __cplusplus >= 201402L 1691 constexpr bool VerifyCalculateNeeded() { 1692 for (int i = 0; i < 1; i++) { 1693 if (CalculateNeeded(i) != static_cast<uint32_t>((char_table[i] >> 11)) + 1) 1694 return false; 1695 } 1696 return true; 1697 } 1698 1699 // Make sure CalculateNeeded is correct by verifying it against the established 1700 // table encoding the number of added bytes needed. 1701 static_assert(VerifyCalculateNeeded(), ""); 1702 #endif // c++14 1703 1704 bool SnappyDecompressor::RefillTag() { 1705 const char* ip = ip_; 1706 if (ip == ip_limit_) { 1707 // Fetch a new fragment from the reader 1708 reader_->Skip(peeked_); // All peeked bytes are used up 1709 size_t n; 1710 ip = reader_->Peek(&n); 1711 peeked_ = n; 1712 eof_ = (n == 0); 1713 if (eof_) return false; 1714 ip_limit_ = ip + n; 1715 } 1716 1717 // Read the tag character 1718 assert(ip < ip_limit_); 1719 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip)); 1720 // At this point make sure that the data for the next tag is consecutive. 1721 // For copy 1 this means the next 2 bytes (tag and 1 byte offset) 1722 // For copy 2 the next 3 bytes (tag and 2 byte offset) 1723 // For copy 4 the next 5 bytes (tag and 4 byte offset) 1724 // For all small literals we only need 1 byte buf for literals 60...63 the 1725 // length is encoded in 1...4 extra bytes. 
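// For example, CalculateNeeded() yields 2 for any copy-1 tag (tag plus one
// offset byte), 3 for copy-2, 5 for copy-4, 1 for a short literal, and for
// the long-literal tags with length marker 60..63 it yields 2..5 (tag plus
// 1..4 extra length bytes).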
1726 const uint32_t needed = CalculateNeeded(c); 1727 assert(needed <= sizeof(scratch_)); 1728 1729 // Read more bytes from reader if needed 1730 uint64_t nbuf = ip_limit_ - ip; 1731 if (nbuf < needed) { 1732 // Stitch together bytes from ip and reader to form the word 1733 // contents. We store the needed bytes in "scratch_". They 1734 // will be consumed immediately by the caller since we do not 1735 // read more than we need. 1736 std::memmove(scratch_, ip, nbuf); 1737 reader_->Skip(peeked_); // All peeked bytes are used up 1738 peeked_ = 0; 1739 while (nbuf < needed) { 1740 size_t length; 1741 const char* src = reader_->Peek(&length); 1742 if (length == 0) return false; 1743 uint64_t to_add = std::min<uint64_t>(needed - nbuf, length); 1744 std::memcpy(scratch_ + nbuf, src, to_add); 1745 nbuf += to_add; 1746 reader_->Skip(to_add); 1747 } 1748 assert(nbuf == needed); 1749 ip_ = scratch_; 1750 ip_limit_ = scratch_ + needed; 1751 } else if (nbuf < kMaximumTagLength) { 1752 // Have enough bytes, but move into scratch_ so that we do not 1753 // read past end of input 1754 std::memmove(scratch_, ip, nbuf); 1755 reader_->Skip(peeked_); // All peeked bytes are used up 1756 peeked_ = 0; 1757 ip_ = scratch_; 1758 ip_limit_ = scratch_ + nbuf; 1759 } else { 1760 // Pass pointer to buffer returned by reader_. 1761 ip_ = ip; 1762 } 1763 return true; 1764 } 1765 1766 template <typename Writer> 1767 static bool InternalUncompress(Source* r, Writer* writer) { 1768 // Read the uncompressed length from the front of the compressed input 1769 SnappyDecompressor decompressor(r); 1770 uint32_t uncompressed_len = 0; 1771 if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false; 1772 1773 return InternalUncompressAllTags(&decompressor, writer, r->Available(), 1774 uncompressed_len); 1775 } 1776 1777 template <typename Writer> 1778 static bool InternalUncompressAllTags(SnappyDecompressor* decompressor, 1779 Writer* writer, uint32_t compressed_len, 1780 uint32_t uncompressed_len) { 1781 int token = 0; 1782 Report(token, "snappy_uncompress", compressed_len, uncompressed_len); 1783 1784 writer->SetExpectedLength(uncompressed_len); 1785 1786 // Process the entire input 1787 decompressor->DecompressAllTags(writer); 1788 writer->Flush(); 1789 return (decompressor->eof() && writer->CheckLength()); 1790 } 1791 1792 bool GetUncompressedLength(Source* source, uint32_t* result) { 1793 SnappyDecompressor decompressor(source); 1794 return decompressor.ReadUncompressedLength(result); 1795 } 1796 1797 size_t Compress(Source* reader, Sink* writer) { 1798 return Compress(reader, writer, CompressionOptions{}); 1799 } 1800 1801 size_t Compress(Source* reader, Sink* writer, CompressionOptions options) { 1802 assert(options.level == 1 || options.level == 2); 1803 int token = 0; 1804 size_t written = 0; 1805 size_t N = reader->Available(); 1806 assert(N <= 0xFFFFFFFFu); 1807 const size_t uncompressed_size = N; 1808 char ulength[Varint::kMax32]; 1809 char* p = Varint::Encode32(ulength, N); 1810 writer->Append(ulength, p - ulength); 1811 written += (p - ulength); 1812 1813 internal::WorkingMemory wmem(N); 1814 1815 while (N > 0) { 1816 // Get next block to compress (without copying if possible) 1817 size_t fragment_size; 1818 const char* fragment = reader->Peek(&fragment_size); 1819 assert(fragment_size != 0); // premature end of input 1820 const size_t num_to_read = std::min(N, kBlockSize); 1821 size_t bytes_read = fragment_size; 1822 1823 size_t pending_advance = 0; 1824 if (bytes_read >= num_to_read) { 1825 // 
Buffer returned by reader is large enough 1826 pending_advance = num_to_read; 1827 fragment_size = num_to_read; 1828 } else { 1829 char* scratch = wmem.GetScratchInput(); 1830 std::memcpy(scratch, fragment, bytes_read); 1831 reader->Skip(bytes_read); 1832 1833 while (bytes_read < num_to_read) { 1834 fragment = reader->Peek(&fragment_size); 1835 size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read); 1836 std::memcpy(scratch + bytes_read, fragment, n); 1837 bytes_read += n; 1838 reader->Skip(n); 1839 } 1840 assert(bytes_read == num_to_read); 1841 fragment = scratch; 1842 fragment_size = num_to_read; 1843 } 1844 assert(fragment_size == num_to_read); 1845 1846 // Get encoding table for compression 1847 int table_size; 1848 uint16_t* table = wmem.GetHashTable(num_to_read, &table_size); 1849 1850 // Compress input_fragment and append to dest 1851 int max_output = MaxCompressedLength(num_to_read); 1852 1853 // Since we encode kBlockSize regions followed by a region 1854 // which is <= kBlockSize in length, a previously allocated 1855 // scratch_output[] region is big enough for this iteration. 1856 // Need a scratch buffer for the output, in case the byte sink doesn't 1857 // have room for us directly. 1858 char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput()); 1859 char* end = nullptr; 1860 if (options.level == 1) { 1861 end = internal::CompressFragment(fragment, fragment_size, dest, table, 1862 table_size); 1863 } else if (options.level == 2) { 1864 end = internal::CompressFragmentDoubleHash( 1865 fragment, fragment_size, dest, table, table_size >> 1, 1866 table + (table_size >> 1), table_size >> 1); 1867 } 1868 writer->Append(dest, end - dest); 1869 written += (end - dest); 1870 1871 N -= num_to_read; 1872 reader->Skip(pending_advance); 1873 } 1874 1875 Report(token, "snappy_compress", written, uncompressed_size); 1876 return written; 1877 } 1878 1879 // ----------------------------------------------------------------------- 1880 // IOVec interfaces 1881 // ----------------------------------------------------------------------- 1882 1883 // A `Source` implementation that yields the contents of an `iovec` array. Note 1884 // that `total_size` is the total number of bytes to be read from the elements 1885 // of `iov` (_not_ the total number of elements in `iov`). 1886 class SnappyIOVecReader : public Source { 1887 public: 1888 SnappyIOVecReader(const struct iovec* iov, size_t total_size) 1889 : curr_iov_(iov), 1890 curr_pos_(total_size > 0 ? reinterpret_cast<const char*>(iov->iov_base) 1891 : nullptr), 1892 curr_size_remaining_(total_size > 0 ? iov->iov_len : 0), 1893 total_size_remaining_(total_size) { 1894 // Skip empty leading `iovec`s. 1895 if (total_size > 0 && curr_size_remaining_ == 0) Advance(); 1896 } 1897 1898 ~SnappyIOVecReader() override = default; 1899 1900 size_t Available() const override { return total_size_remaining_; } 1901 1902 const char* Peek(size_t* len) override { 1903 *len = curr_size_remaining_; 1904 return curr_pos_; 1905 } 1906 1907 void Skip(size_t n) override { 1908 while (n >= curr_size_remaining_ && n > 0) { 1909 n -= curr_size_remaining_; 1910 Advance(); 1911 } 1912 curr_size_remaining_ -= n; 1913 total_size_remaining_ -= n; 1914 curr_pos_ += n; 1915 } 1916 1917 private: 1918 // Advances to the next nonempty `iovec` and updates related variables. 
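// For example (illustrative sizes only), with iovec lengths {0, 3, 0, 5} and
// total_size 8, the constructor already skips the leading empty element,
// Peek() then serves the 3-byte buffer, and Advance() steps over the empty
// third element once Skip() has consumed those 3 bytes.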
1919 void Advance() { 1920 do { 1921 assert(total_size_remaining_ >= curr_size_remaining_); 1922 total_size_remaining_ -= curr_size_remaining_; 1923 if (total_size_remaining_ == 0) { 1924 curr_pos_ = nullptr; 1925 curr_size_remaining_ = 0; 1926 return; 1927 } 1928 ++curr_iov_; 1929 curr_pos_ = reinterpret_cast<const char*>(curr_iov_->iov_base); 1930 curr_size_remaining_ = curr_iov_->iov_len; 1931 } while (curr_size_remaining_ == 0); 1932 } 1933 1934 // The `iovec` currently being read. 1935 const struct iovec* curr_iov_; 1936 // The location in `curr_iov_` currently being read. 1937 const char* curr_pos_; 1938 // The amount of unread data in `curr_iov_`. 1939 size_t curr_size_remaining_; 1940 // The amount of unread data in the entire input array. 1941 size_t total_size_remaining_; 1942 }; 1943 1944 // A type that writes to an iovec. 1945 // Note that this is not a "ByteSink", but a type that matches the 1946 // Writer template argument to SnappyDecompressor::DecompressAllTags(). 1947 class SnappyIOVecWriter { 1948 private: 1949 // output_iov_end_ is set to iov + count and used to determine when 1950 // the end of the iovs is reached. 1951 const struct iovec* output_iov_end_; 1952 1953 #if !defined(NDEBUG) 1954 const struct iovec* output_iov_; 1955 #endif // !defined(NDEBUG) 1956 1957 // Current iov that is being written into. 1958 const struct iovec* curr_iov_; 1959 1960 // Pointer to current iov's write location. 1961 char* curr_iov_output_; 1962 1963 // Remaining bytes to write into curr_iov_output. 1964 size_t curr_iov_remaining_; 1965 1966 // Total bytes decompressed into output_iov_ so far. 1967 size_t total_written_; 1968 1969 // Maximum number of bytes that will be decompressed into output_iov_. 1970 size_t output_limit_; 1971 1972 static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) { 1973 return reinterpret_cast<char*>(iov->iov_base) + offset; 1974 } 1975 1976 public: 1977 // Does not take ownership of iov. iov must be valid during the 1978 // entire lifetime of the SnappyIOVecWriter. 1979 inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count) 1980 : output_iov_end_(iov + iov_count), 1981 #if !defined(NDEBUG) 1982 output_iov_(iov), 1983 #endif // !defined(NDEBUG) 1984 curr_iov_(iov), 1985 curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base) 1986 : nullptr), 1987 curr_iov_remaining_(iov_count ? iov->iov_len : 0), 1988 total_written_(0), 1989 output_limit_(-1) { 1990 } 1991 1992 inline void SetExpectedLength(size_t len) { output_limit_ = len; } 1993 1994 inline bool CheckLength() const { return total_written_ == output_limit_; } 1995 1996 inline bool Append(const char* ip, size_t len, char**) { 1997 if (total_written_ + len > output_limit_) { 1998 return false; 1999 } 2000 2001 return AppendNoCheck(ip, len); 2002 } 2003 2004 char* GetOutputPtr() { return nullptr; } 2005 char* GetBase(ptrdiff_t*) { return nullptr; } 2006 void SetOutputPtr(char* op) { 2007 // TODO: Switch to [[maybe_unused]] when we can assume C++17. 2008 (void)op; 2009 } 2010 2011 inline bool AppendNoCheck(const char* ip, size_t len) { 2012 while (len > 0) { 2013 if (curr_iov_remaining_ == 0) { 2014 // This iovec is full. Go to the next one. 
2015 if (curr_iov_ + 1 >= output_iov_end_) { 2016 return false; 2017 } 2018 ++curr_iov_; 2019 curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base); 2020 curr_iov_remaining_ = curr_iov_->iov_len; 2021 } 2022 2023 const size_t to_write = std::min(len, curr_iov_remaining_); 2024 std::memcpy(curr_iov_output_, ip, to_write); 2025 curr_iov_output_ += to_write; 2026 curr_iov_remaining_ -= to_write; 2027 total_written_ += to_write; 2028 ip += to_write; 2029 len -= to_write; 2030 } 2031 2032 return true; 2033 } 2034 2035 inline bool TryFastAppend(const char* ip, size_t available, size_t len, 2036 char**) { 2037 const size_t space_left = output_limit_ - total_written_; 2038 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 && 2039 curr_iov_remaining_ >= 16) { 2040 // Fast path, used for the majority (about 95%) of invocations. 2041 UnalignedCopy128(ip, curr_iov_output_); 2042 curr_iov_output_ += len; 2043 curr_iov_remaining_ -= len; 2044 total_written_ += len; 2045 return true; 2046 } 2047 2048 return false; 2049 } 2050 2051 inline bool AppendFromSelf(size_t offset, size_t len, char**) { 2052 // See SnappyArrayWriter::AppendFromSelf for an explanation of 2053 // the "offset - 1u" trick. 2054 if (offset - 1u >= total_written_) { 2055 return false; 2056 } 2057 const size_t space_left = output_limit_ - total_written_; 2058 if (len > space_left) { 2059 return false; 2060 } 2061 2062 // Locate the iovec from which we need to start the copy. 2063 const iovec* from_iov = curr_iov_; 2064 size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_; 2065 while (offset > 0) { 2066 if (from_iov_offset >= offset) { 2067 from_iov_offset -= offset; 2068 break; 2069 } 2070 2071 offset -= from_iov_offset; 2072 --from_iov; 2073 #if !defined(NDEBUG) 2074 assert(from_iov >= output_iov_); 2075 #endif // !defined(NDEBUG) 2076 from_iov_offset = from_iov->iov_len; 2077 } 2078 2079 // Copy <len> bytes starting from the iovec pointed to by from_iov_index to 2080 // the current iovec. 2081 while (len > 0) { 2082 assert(from_iov <= curr_iov_); 2083 if (from_iov != curr_iov_) { 2084 const size_t to_copy = 2085 std::min(from_iov->iov_len - from_iov_offset, len); 2086 AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy); 2087 len -= to_copy; 2088 if (len > 0) { 2089 ++from_iov; 2090 from_iov_offset = 0; 2091 } 2092 } else { 2093 size_t to_copy = curr_iov_remaining_; 2094 if (to_copy == 0) { 2095 // This iovec is full. Go to the next one. 
2096 if (curr_iov_ + 1 >= output_iov_end_) { 2097 return false; 2098 } 2099 ++curr_iov_; 2100 curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base); 2101 curr_iov_remaining_ = curr_iov_->iov_len; 2102 continue; 2103 } 2104 if (to_copy > len) { 2105 to_copy = len; 2106 } 2107 assert(to_copy > 0); 2108 2109 IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset), 2110 curr_iov_output_, curr_iov_output_ + to_copy, 2111 curr_iov_output_ + curr_iov_remaining_); 2112 curr_iov_output_ += to_copy; 2113 curr_iov_remaining_ -= to_copy; 2114 from_iov_offset += to_copy; 2115 total_written_ += to_copy; 2116 len -= to_copy; 2117 } 2118 } 2119 2120 return true; 2121 } 2122 2123 inline void Flush() {} 2124 }; 2125 2126 bool RawUncompressToIOVec(const char* compressed, size_t compressed_length, 2127 const struct iovec* iov, size_t iov_cnt) { 2128 ByteArraySource reader(compressed, compressed_length); 2129 return RawUncompressToIOVec(&reader, iov, iov_cnt); 2130 } 2131 2132 bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov, 2133 size_t iov_cnt) { 2134 SnappyIOVecWriter output(iov, iov_cnt); 2135 return InternalUncompress(compressed, &output); 2136 } 2137 2138 // ----------------------------------------------------------------------- 2139 // Flat array interfaces 2140 // ----------------------------------------------------------------------- 2141 2142 // A type that writes to a flat array. 2143 // Note that this is not a "ByteSink", but a type that matches the 2144 // Writer template argument to SnappyDecompressor::DecompressAllTags(). 2145 class SnappyArrayWriter { 2146 private: 2147 char* base_; 2148 char* op_; 2149 char* op_limit_; 2150 // If op < op_limit_min_slop_ then it's safe to unconditionally write 2151 // kSlopBytes starting at op. 2152 char* op_limit_min_slop_; 2153 2154 public: 2155 inline explicit SnappyArrayWriter(char* dst) 2156 : base_(dst), 2157 op_(dst), 2158 op_limit_(dst), 2159 op_limit_min_slop_(dst) {} // Safe default see invariant. 2160 2161 inline void SetExpectedLength(size_t len) { 2162 op_limit_ = op_ + len; 2163 // Prevent pointer from being past the buffer. 2164 op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, len); 2165 } 2166 2167 inline bool CheckLength() const { return op_ == op_limit_; } 2168 2169 char* GetOutputPtr() { return op_; } 2170 char* GetBase(ptrdiff_t* op_limit_min_slop) { 2171 *op_limit_min_slop = op_limit_min_slop_ - base_; 2172 return base_; 2173 } 2174 void SetOutputPtr(char* op) { op_ = op; } 2175 2176 inline bool Append(const char* ip, size_t len, char** op_p) { 2177 char* op = *op_p; 2178 const size_t space_left = op_limit_ - op; 2179 if (space_left < len) return false; 2180 std::memcpy(op, ip, len); 2181 *op_p = op + len; 2182 return true; 2183 } 2184 2185 inline bool TryFastAppend(const char* ip, size_t available, size_t len, 2186 char** op_p) { 2187 char* op = *op_p; 2188 const size_t space_left = op_limit_ - op; 2189 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) { 2190 // Fast path, used for the majority (about 95%) of invocations. 2191 UnalignedCopy128(ip, op); 2192 *op_p = op + len; 2193 return true; 2194 } else { 2195 return false; 2196 } 2197 } 2198 2199 SNAPPY_ATTRIBUTE_ALWAYS_INLINE 2200 inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) { 2201 assert(len > 0); 2202 char* const op = *op_p; 2203 assert(op >= base_); 2204 char* const op_end = op + len; 2205 2206 // Check if we try to append from before the start of the buffer. 
2207 if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - base_) < offset)) 2208 return false; 2209 2210 if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) || 2211 op >= op_limit_min_slop_ || offset < len)) { 2212 if (op_end > op_limit_ || offset == 0) return false; 2213 *op_p = IncrementalCopy(op - offset, op, op_end, op_limit_); 2214 return true; 2215 } 2216 std::memmove(op, op - offset, kSlopBytes); 2217 *op_p = op_end; 2218 return true; 2219 } 2220 inline size_t Produced() const { 2221 assert(op_ >= base_); 2222 return op_ - base_; 2223 } 2224 inline void Flush() {} 2225 }; 2226 2227 bool RawUncompress(const char* compressed, size_t compressed_length, 2228 char* uncompressed) { 2229 ByteArraySource reader(compressed, compressed_length); 2230 return RawUncompress(&reader, uncompressed); 2231 } 2232 2233 bool RawUncompress(Source* compressed, char* uncompressed) { 2234 SnappyArrayWriter output(uncompressed); 2235 return InternalUncompress(compressed, &output); 2236 } 2237 2238 bool Uncompress(const char* compressed, size_t compressed_length, 2239 std::string* uncompressed) { 2240 size_t ulength; 2241 if (!GetUncompressedLength(compressed, compressed_length, &ulength)) { 2242 return false; 2243 } 2244 // On 32-bit builds: max_size() < kuint32max. Check for that instead 2245 // of crashing (e.g., consider externally specified compressed data). 2246 if (ulength > uncompressed->max_size()) { 2247 return false; 2248 } 2249 STLStringResizeUninitialized(uncompressed, ulength); 2250 return RawUncompress(compressed, compressed_length, 2251 string_as_array(uncompressed)); 2252 } 2253 2254 // A Writer that drops everything on the floor and just does validation 2255 class SnappyDecompressionValidator { 2256 private: 2257 size_t expected_; 2258 size_t produced_; 2259 2260 public: 2261 inline SnappyDecompressionValidator() : expected_(0), produced_(0) {} 2262 inline void SetExpectedLength(size_t len) { expected_ = len; } 2263 size_t GetOutputPtr() { return produced_; } 2264 size_t GetBase(ptrdiff_t* op_limit_min_slop) { 2265 *op_limit_min_slop = std::numeric_limits<ptrdiff_t>::max() - kSlopBytes + 1; 2266 return 1; 2267 } 2268 void SetOutputPtr(size_t op) { produced_ = op; } 2269 inline bool CheckLength() const { return expected_ == produced_; } 2270 inline bool Append(const char* ip, size_t len, size_t* produced) { 2271 // TODO: Switch to [[maybe_unused]] when we can assume C++17. 2272 (void)ip; 2273 2274 *produced += len; 2275 return *produced <= expected_; 2276 } 2277 inline bool TryFastAppend(const char* ip, size_t available, size_t length, 2278 size_t* produced) { 2279 // TODO: Switch to [[maybe_unused]] when we can assume C++17. 2280 (void)ip; 2281 (void)available; 2282 (void)length; 2283 (void)produced; 2284 2285 return false; 2286 } 2287 inline bool AppendFromSelf(size_t offset, size_t len, size_t* produced) { 2288 // See SnappyArrayWriter::AppendFromSelf for an explanation of 2289 // the "offset - 1u" trick. 
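// With unsigned arithmetic, offset == 0 wraps to SIZE_MAX, so this single
// comparison rejects both a zero offset and an offset that reaches before the
// start of the produced output. For example, with *produced == 10, offset == 0
// and offset == 11 are rejected, while offset == 10 (a copy starting at the
// very first byte produced) passes.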
2290 if (*produced <= offset - 1u) return false; 2291 *produced += len; 2292 return *produced <= expected_; 2293 } 2294 inline void Flush() {} 2295 }; 2296 2297 bool IsValidCompressedBuffer(const char* compressed, size_t compressed_length) { 2298 ByteArraySource reader(compressed, compressed_length); 2299 SnappyDecompressionValidator writer; 2300 return InternalUncompress(&reader, &writer); 2301 } 2302 2303 bool IsValidCompressed(Source* compressed) { 2304 SnappyDecompressionValidator writer; 2305 return InternalUncompress(compressed, &writer); 2306 } 2307 2308 void RawCompress(const char* input, size_t input_length, char* compressed, 2309 size_t* compressed_length) { 2310 RawCompress(input, input_length, compressed, compressed_length, 2311 CompressionOptions{}); 2312 } 2313 2314 void RawCompress(const char* input, size_t input_length, char* compressed, 2315 size_t* compressed_length, CompressionOptions options) { 2316 ByteArraySource reader(input, input_length); 2317 UncheckedByteArraySink writer(compressed); 2318 Compress(&reader, &writer, options); 2319 2320 // Compute how many bytes were added 2321 *compressed_length = (writer.CurrentDestination() - compressed); 2322 } 2323 2324 void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length, 2325 char* compressed, size_t* compressed_length) { 2326 RawCompressFromIOVec(iov, uncompressed_length, compressed, compressed_length, 2327 CompressionOptions{}); 2328 } 2329 2330 void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length, 2331 char* compressed, size_t* compressed_length, 2332 CompressionOptions options) { 2333 SnappyIOVecReader reader(iov, uncompressed_length); 2334 UncheckedByteArraySink writer(compressed); 2335 Compress(&reader, &writer, options); 2336 2337 // Compute how many bytes were added. 2338 *compressed_length = writer.CurrentDestination() - compressed; 2339 } 2340 2341 size_t Compress(const char* input, size_t input_length, 2342 std::string* compressed) { 2343 return Compress(input, input_length, compressed, CompressionOptions{}); 2344 } 2345 2346 size_t Compress(const char* input, size_t input_length, std::string* compressed, 2347 CompressionOptions options) { 2348 // Pre-grow the buffer to the max length of the compressed output 2349 STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length)); 2350 2351 size_t compressed_length; 2352 RawCompress(input, input_length, string_as_array(compressed), 2353 &compressed_length, options); 2354 compressed->erase(compressed_length); 2355 return compressed_length; 2356 } 2357 2358 size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt, 2359 std::string* compressed) { 2360 return CompressFromIOVec(iov, iov_cnt, compressed, CompressionOptions{}); 2361 } 2362 2363 size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt, 2364 std::string* compressed, CompressionOptions options) { 2365 // Compute the number of bytes to be compressed. 2366 size_t uncompressed_length = 0; 2367 for (size_t i = 0; i < iov_cnt; ++i) { 2368 uncompressed_length += iov[i].iov_len; 2369 } 2370 2371 // Pre-grow the buffer to the max length of the compressed output. 
2372 STLStringResizeUninitialized(compressed, MaxCompressedLength( 2373 uncompressed_length)); 2374 2375 size_t compressed_length; 2376 RawCompressFromIOVec(iov, uncompressed_length, string_as_array(compressed), 2377 &compressed_length, options); 2378 compressed->erase(compressed_length); 2379 return compressed_length; 2380 } 2381 2382 // ----------------------------------------------------------------------- 2383 // Sink interface 2384 // ----------------------------------------------------------------------- 2385 2386 // A type that decompresses into a Sink. The template parameter 2387 // Allocator must export one method "char* Allocate(int size);", which 2388 // allocates a buffer of "size" and appends that to the destination. 2389 template <typename Allocator> 2390 class SnappyScatteredWriter { 2391 Allocator allocator_; 2392 2393 // We need random access into the data generated so far. Therefore 2394 // we keep track of all of the generated data as an array of blocks. 2395 // All of the blocks except the last have length kBlockSize. 2396 std::vector<char*> blocks_; 2397 size_t expected_; 2398 2399 // Total size of all fully generated blocks so far 2400 size_t full_size_; 2401 2402 // Pointer into current output block 2403 char* op_base_; // Base of output block 2404 char* op_ptr_; // Pointer to next unfilled byte in block 2405 char* op_limit_; // Pointer just past block 2406 // If op < op_limit_min_slop_ then it's safe to unconditionally write 2407 // kSlopBytes starting at op. 2408 char* op_limit_min_slop_; 2409 2410 inline size_t Size() const { return full_size_ + (op_ptr_ - op_base_); } 2411 2412 bool SlowAppend(const char* ip, size_t len); 2413 bool SlowAppendFromSelf(size_t offset, size_t len); 2414 2415 public: 2416 inline explicit SnappyScatteredWriter(const Allocator& allocator) 2417 : allocator_(allocator), 2418 full_size_(0), 2419 op_base_(NULL), 2420 op_ptr_(NULL), 2421 op_limit_(NULL), 2422 op_limit_min_slop_(NULL) {} 2423 char* GetOutputPtr() { return op_ptr_; } 2424 char* GetBase(ptrdiff_t* op_limit_min_slop) { 2425 *op_limit_min_slop = op_limit_min_slop_ - op_base_; 2426 return op_base_; 2427 } 2428 void SetOutputPtr(char* op) { op_ptr_ = op; } 2429 2430 inline void SetExpectedLength(size_t len) { 2431 assert(blocks_.empty()); 2432 expected_ = len; 2433 } 2434 2435 inline bool CheckLength() const { return Size() == expected_; } 2436 2437 // Return the number of bytes actually uncompressed so far 2438 inline size_t Produced() const { return Size(); } 2439 2440 inline bool Append(const char* ip, size_t len, char** op_p) { 2441 char* op = *op_p; 2442 size_t avail = op_limit_ - op; 2443 if (len <= avail) { 2444 // Fast path 2445 std::memcpy(op, ip, len); 2446 *op_p = op + len; 2447 return true; 2448 } else { 2449 op_ptr_ = op; 2450 bool res = SlowAppend(ip, len); 2451 *op_p = op_ptr_; 2452 return res; 2453 } 2454 } 2455 2456 inline bool TryFastAppend(const char* ip, size_t available, size_t length, 2457 char** op_p) { 2458 char* op = *op_p; 2459 const int space_left = op_limit_ - op; 2460 if (length <= 16 && available >= 16 + kMaximumTagLength && 2461 space_left >= 16) { 2462 // Fast path, used for the majority (about 95%) of invocations. 2463 UnalignedCopy128(ip, op); 2464 *op_p = op + length; 2465 return true; 2466 } else { 2467 return false; 2468 } 2469 } 2470 2471 inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) { 2472 char* op = *op_p; 2473 assert(op >= op_base_); 2474 // Check if we try to append from before the start of the buffer. 
2475 if (SNAPPY_PREDICT_FALSE((kSlopBytes < 64 && len > kSlopBytes) || 2476 static_cast<size_t>(op - op_base_) < offset || 2477 op >= op_limit_min_slop_ || offset < len)) { 2478 if (offset == 0) return false; 2479 if (SNAPPY_PREDICT_FALSE(static_cast<size_t>(op - op_base_) < offset || 2480 op + len > op_limit_)) { 2481 op_ptr_ = op; 2482 bool res = SlowAppendFromSelf(offset, len); 2483 *op_p = op_ptr_; 2484 return res; 2485 } 2486 *op_p = IncrementalCopy(op - offset, op, op + len, op_limit_); 2487 return true; 2488 } 2489 // Fast path 2490 char* const op_end = op + len; 2491 std::memmove(op, op - offset, kSlopBytes); 2492 *op_p = op_end; 2493 return true; 2494 } 2495 2496 // Called at the end of the decompress. We ask the allocator 2497 // write all blocks to the sink. 2498 inline void Flush() { allocator_.Flush(Produced()); } 2499 }; 2500 2501 template <typename Allocator> 2502 bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) { 2503 size_t avail = op_limit_ - op_ptr_; 2504 while (len > avail) { 2505 // Completely fill this block 2506 std::memcpy(op_ptr_, ip, avail); 2507 op_ptr_ += avail; 2508 assert(op_limit_ - op_ptr_ == 0); 2509 full_size_ += (op_ptr_ - op_base_); 2510 len -= avail; 2511 ip += avail; 2512 2513 // Bounds check 2514 if (full_size_ + len > expected_) return false; 2515 2516 // Make new block 2517 size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_); 2518 op_base_ = allocator_.Allocate(bsize); 2519 op_ptr_ = op_base_; 2520 op_limit_ = op_base_ + bsize; 2521 op_limit_min_slop_ = op_limit_ - std::min<size_t>(kSlopBytes - 1, bsize); 2522 2523 blocks_.push_back(op_base_); 2524 avail = bsize; 2525 } 2526 2527 std::memcpy(op_ptr_, ip, len); 2528 op_ptr_ += len; 2529 return true; 2530 } 2531 2532 template <typename Allocator> 2533 bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset, 2534 size_t len) { 2535 // Overflow check 2536 // See SnappyArrayWriter::AppendFromSelf for an explanation of 2537 // the "offset - 1u" trick. 2538 const size_t cur = Size(); 2539 if (offset - 1u >= cur) return false; 2540 if (expected_ - cur < len) return false; 2541 2542 // Currently we shouldn't ever hit this path because Compress() chops the 2543 // input into blocks and does not create cross-block copies. However, it is 2544 // nice if we do not rely on that, since we can get better compression if we 2545 // allow cross-block copies and thus might want to change the compressor in 2546 // the future. 2547 // TODO Replace this with a properly optimized path. This is not 2548 // triggered right now. But this is so super slow, that it would regress 2549 // performance unacceptably if triggered. 2550 size_t src = cur - offset; 2551 char* op = op_ptr_; 2552 while (len-- > 0) { 2553 char c = blocks_[src >> kBlockLog][src & (kBlockSize - 1)]; 2554 if (!Append(&c, 1, &op)) { 2555 op_ptr_ = op; 2556 return false; 2557 } 2558 src++; 2559 } 2560 op_ptr_ = op; 2561 return true; 2562 } 2563 2564 class SnappySinkAllocator { 2565 public: 2566 explicit SnappySinkAllocator(Sink* dest) : dest_(dest) {} 2567 2568 char* Allocate(int size) { 2569 Datablock block(new char[size], size); 2570 blocks_.push_back(block); 2571 return block.data; 2572 } 2573 2574 // We flush only at the end, because the writer wants 2575 // random access to the blocks and once we hand the 2576 // block over to the sink, we can't access it anymore. 2577 // Also we don't write more than has been actually written 2578 // to the blocks. 
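// For example, if the final block was allocated with kBlockSize bytes but only
// part of it was filled, the std::min() below trims the last
// AppendAndTakeOwnership() call so the sink never receives unwritten bytes.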
2579 void Flush(size_t size) { 2580 size_t size_written = 0; 2581 for (Datablock& block : blocks_) { 2582 size_t block_size = std::min<size_t>(block.size, size - size_written); 2583 dest_->AppendAndTakeOwnership(block.data, block_size, 2584 &SnappySinkAllocator::Deleter, NULL); 2585 size_written += block_size; 2586 } 2587 blocks_.clear(); 2588 } 2589 2590 private: 2591 struct Datablock { 2592 char* data; 2593 size_t size; 2594 Datablock(char* p, size_t s) : data(p), size(s) {} 2595 }; 2596 2597 static void Deleter(void* arg, const char* bytes, size_t size) { 2598 // TODO: Switch to [[maybe_unused]] when we can assume C++17. 2599 (void)arg; 2600 (void)size; 2601 2602 delete[] bytes; 2603 } 2604 2605 Sink* dest_; 2606 std::vector<Datablock> blocks_; 2607 2608 // Note: copying this object is allowed 2609 }; 2610 2611 size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) { 2612 SnappySinkAllocator allocator(uncompressed); 2613 SnappyScatteredWriter<SnappySinkAllocator> writer(allocator); 2614 InternalUncompress(compressed, &writer); 2615 return writer.Produced(); 2616 } 2617 2618 bool Uncompress(Source* compressed, Sink* uncompressed) { 2619 // Read the uncompressed length from the front of the compressed input 2620 SnappyDecompressor decompressor(compressed); 2621 uint32_t uncompressed_len = 0; 2622 if (!decompressor.ReadUncompressedLength(&uncompressed_len)) { 2623 return false; 2624 } 2625 2626 char c; 2627 size_t allocated_size; 2628 char* buf = uncompressed->GetAppendBufferVariable(1, uncompressed_len, &c, 1, 2629 &allocated_size); 2630 2631 const size_t compressed_len = compressed->Available(); 2632 // If we can get a flat buffer, then use it, otherwise do block by block 2633 // uncompression 2634 if (allocated_size >= uncompressed_len) { 2635 SnappyArrayWriter writer(buf); 2636 bool result = InternalUncompressAllTags(&decompressor, &writer, 2637 compressed_len, uncompressed_len); 2638 uncompressed->Append(buf, writer.Produced()); 2639 return result; 2640 } else { 2641 SnappySinkAllocator allocator(uncompressed); 2642 SnappyScatteredWriter<SnappySinkAllocator> writer(allocator); 2643 return InternalUncompressAllTags(&decompressor, &writer, compressed_len, 2644 uncompressed_len); 2645 } 2646 } 2647 2648 } // namespace snappy
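// A minimal usage sketch of the flat-buffer entry points defined above
// (illustrative only; the local variable names are placeholders):
//
//   std::string compressed;
//   snappy::Compress(input_data, input_size, &compressed);
//
//   std::string restored;
//   if (!snappy::Uncompress(compressed.data(), compressed.size(), &restored)) {
//     // The input was corrupt or truncated.
//   }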