snappy-internal.h
// Copyright 2008 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internals shared between the Snappy implementation and its unittest.

#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_

#include <utility>

#include "snappy-stubs-internal.h"

#if SNAPPY_HAVE_SSSE3
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <emmintrin.h>
#include <tmmintrin.h>
#endif

#if SNAPPY_HAVE_NEON
#include <arm_neon.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
#else
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
#endif

namespace snappy {
namespace internal {

#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
#if SNAPPY_HAVE_SSSE3
using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
inline V128 V128_Load(const V128* src);

// Load 128 bits of integer data. `src` does not need to be aligned.
inline V128 V128_LoadU(const V128* src);

// Store 128 bits of integer data. `dst` does not need to be aligned.
inline void V128_StoreU(V128* dst, V128 val);

// Shuffle packed 8-bit integers using a shuffle mask.
// Each packed integer in the shuffle mask must be in [0,16).
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);

// Constructs V128 with 16 chars |c|.
inline V128 V128_DupChar(char c);
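
// Example (illustrative sketch, not part of the API): byte-reversing a
// 16-byte block with these wrappers. `src` and `dst` are hypothetical
// char buffers of at least 16 bytes; `mask_bytes` is 16-byte aligned
// because V128_Load requires an aligned source.
//
//   alignas(16) const uint8_t mask_bytes[16] = {15, 14, 13, 12, 11, 10, 9, 8,
//                                               7, 6, 5, 4, 3, 2, 1, 0};
//   V128 mask = V128_Load(reinterpret_cast<const V128*>(mask_bytes));
//   V128 data = V128_LoadU(reinterpret_cast<const V128*>(src));
//   V128_StoreU(reinterpret_cast<V128*>(dst), V128_Shuffle(data, mask));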

#if SNAPPY_HAVE_SSSE3
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  return _mm_shuffle_epi8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }

#elif SNAPPY_HAVE_NEON
inline V128 V128_Load(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline void V128_StoreU(V128* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
  return vqtbl1q_u8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
 public:
  explicit WorkingMemory(size_t input_size);
  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
  char* mem_;        // the allocated memory, never nullptr
  size_t size_;      // the size of the allocated memory, never 0
  uint16_t* table_;  // the pointer to the hashtable
  char* input_;      // the pointer to the input scratch buffer
  char* output_;     // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
  void operator=(const WorkingMemory&);
};

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
char* CompressFragment(const char* input,
                       size_t input_length,
                       char* op,
                       uint16_t* table,
                       const int table_size);
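
// Example (illustrative sketch, not part of the API): compressing one
// fragment with scratch space from WorkingMemory. `input` and `input_len`
// are hypothetical and must satisfy the REQUIRES clauses above (in
// particular input_len <= kBlockSize):
//
//   WorkingMemory wmem(input_len);
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(input_len, &table_size);
//   char* out = wmem.GetScratchOutput();
//   char* end = CompressFragment(input, input_len, out, table, table_size);
//   size_t compressed_size = end - out;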

// Find the largest n such that
//
//   s1[0,n-1] == s2[0,n-1]
//   and n <= (s2_limit - s2).
//
// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition populate *data with the next 5 bytes from the end of the match.
// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point
// is that on some architectures this can be done faster in this routine than
// by subsequent loading from s2 + n.
//
// Separate implementation for 64-bit, little-endian CPUs.
#if !SNAPPY_IS_BIG_ENDIAN && \
    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
     defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

  // This block isn't necessary for correctness; we could just start looping
  // immediately. As an optimization though, it is useful. It creates some not
  // uncommon code paths that determine, without extra effort, whether the
  // match length is less than 8. In short, we are hoping to avoid a
  // conditional branch, and perhaps get better code layout from the C++
  // compiler.
  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
      // This code is critical for performance. The reason is that it
      // determines how much to advance `ip` (s2). This obviously depends on
      // both the loads from the `candidate` (s1) and from `ip`. Furthermore
      // the next `candidate` depends on the advanced `ip` calculated here
      // through a load, hash and new candidate hash lookup (a lot of cycles).
      // This makes s1 (i.e. `candidate`) the variable that limits throughput.
      // This is the reason we go through hoops to have this function update
      // `data` for the next iteration. The straightforward code would compute
      // *data as
      //
      //   *data = UNALIGNED_LOAD64(s2 + matched_bytes)  (latency of 5 cycles),
      //
      // and use it as input for the hash table lookup that finds the next
      // candidate. However this forces the load onto the data dependency
      // chain of s1, because matched_bytes directly depends on s1. However
      // matched_bytes is 0..7, so we can also calculate *data by
      //
      //   *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
      //                      matched_bytes);
      //
      // The loads no longer depend on s1 and are thus off the bottleneck.
      // The straightforward implementation on x86_64 would be to use
      //
      //   shrd rax, rdx, cl  (cl being matched_bytes * 8)
      //
      // Unfortunately shrd with a variable shift has a 4 cycle latency, so
      // this only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle
      // variable shift instruction but can only shift 64 bits. If we focus on
      // just obtaining the least significant 4 bytes, we can obtain them by
      //
      //   *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
      //                           UNALIGNED_LOAD64(s2 + 4) >>
      //                               ((matched_bytes & 3) * 8));
      //
      // Written like above this is not a big win: the conditional move would
      // be a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
      // However matched_bytes < 4 is equal to
      // static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
      // (tzcnt), which takes 3 cycles.
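      //
      // Worked example (hypothetical values): if the first four bytes match
      // and the fifth differs, FindLSBSetNonZero64(xorval) returns a shift in
      // [32, 40), so matched_bytes = shift >> 3 = 4. The low 32 bits of
      // xorval are zero, so the conditional move below replaces a2 with
      // a3 = UNALIGNED_LOAD64(s2 + 4); and since (shift & 24) == 0 for any
      // shift in that range, `a2 >> (shift & (3 * 8))` yields the bytes
      // starting at s2 + 4 == s2 + matched_bytes, exactly what *data needs.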
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      // Ideally this would just be
      //
      //   a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
      //
      // However clang correctly infers that the above statement participates
      // in a critical data dependency chain and thus, unfortunately, refuses
      // to use a conditional move (it's tuned to cut data dependencies). In
      // this case there is a longer parallel chain anyway AND this will be
      // fairly unpredictable.
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval)
          : "cc");
#endif
      *data = a2 >> (shift & (3 * 8));
      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
    }
  }
  SNAPPY_PREFETCH(s1 + 64);
  SNAPPY_PREFETCH(s2 + 64);

  // Find out how long the match is. We loop over the data 64 bits at a
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval)
          : "cc");
#endif
      *data = a2 >> (shift & (3 * 8));
      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
  }
  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
    if (s1[matched] == *s2) {
      ++s2;
      ++matched;
    } else {
      if (s2 <= s2_limit - 8) {
        *data = UNALIGNED_LOAD64(s2);
      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
  return std::pair<size_t, bool>(matched, matched < 8);
}
#else
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;

  while (s2 <= s2_limit - 4 &&
         UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
    s2 += 4;
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
}
#endif
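
// Example (illustrative sketch, not part of the API): matching "snappy-one"
// against "snappy-two" with either implementation above.
//
//   const char* s1 = "snappy-one";
//   const char* s2 = "snappy-two";
//   uint64_t data = 0;
//   std::pair<size_t, bool> r = FindMatchLength(s1, s2, s2 + 10, &data);
//   // r.first == 7 (the shared prefix "snappy-"); r.second == true (7 < 8).
//   // `data` is left untouched: fewer than 8 bytes remain before s2_limit
//   // at the point of mismatch.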

static inline size_t FindMatchLengthPlain(const char* s1, const char* s2,
                                          const char* s2_limit) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;

  while (s2 <= s2_limit - 8 &&
         UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
    s2 += 8;
    matched += 8;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 8) {
    uint64_t x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero64(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  return matched;
}

// Lookup tables for decompression code. Give --snappy_dump_decompression_table
// to the unit test to recompute char_table.

enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
};
static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.

// Data stored per entry in lookup table:
//      Range   Bits-used       Description
//      ------------------------------------
//      1..64   0..7            Literal/copy length encoded in opcode byte
//      0..7    8..10           Copy offset encoded in opcode byte / 256
//      0..4    11..13          Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed, for
// efficiency reasons:
//      (1) Extracting a byte is faster than a bit-field
//      (2) It properly aligns copy offset so we do not need a <<8
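//
// Worked example: tag byte 0x01 encodes COPY_1_BYTE_OFFSET with length 4 and
// no upper offset bits, and char_table[0x01] == 0x0804 decodes as:
//      length                   = 0x0804 & 0xff        = 4
//      copy offset / 256        = (0x0804 >> 8) & 0x7  = 0
//      extra bytes after opcode = (0x0804 >> 11) & 0x7 = 1  (the offset byte)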
static constexpr uint16_t char_table[256] = {
    // clang-format off
    0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
    0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
    0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
    0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
    0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
    0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
    0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
    0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
    0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
    0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
    0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
    0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
    0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
    0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
    0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
    0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
    0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
    0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
    0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
    0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
    0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
    0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
    0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
    0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
    0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
    0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
    0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
    0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
    0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
    0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
    0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
    0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
    // clang-format on
};

}  // end namespace internal
}  // end namespace snappy

#endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_