tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

snappy-internal.h (16857B)


      1 // Copyright 2008 Google Inc. All Rights Reserved.
      2 //
      3 // Redistribution and use in source and binary forms, with or without
      4 // modification, are permitted provided that the following conditions are
      5 // met:
      6 //
      7 //     * Redistributions of source code must retain the above copyright
      8 // notice, this list of conditions and the following disclaimer.
      9 //     * Redistributions in binary form must reproduce the above
     10 // copyright notice, this list of conditions and the following disclaimer
     11 // in the documentation and/or other materials provided with the
     12 // distribution.
     13 //     * Neither the name of Google Inc. nor the names of its
     14 // contributors may be used to endorse or promote products derived from
     15 // this software without specific prior written permission.
     16 //
     17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 //
     29 // Internals shared between the Snappy implementation and its unittest.
     30 
     31 #ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
     32 #define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
     33 
     34 #include <utility>
     35 
     36 #include "snappy-stubs-internal.h"
     37 
     38 #if SNAPPY_HAVE_SSSE3
     39 // Please do not replace with <x86intrin.h> or with headers that assume more
     40 // advanced SSE versions without checking with all the OWNERS.
     41 #include <emmintrin.h>
     42 #include <tmmintrin.h>
     43 #endif
     44 
     45 #if SNAPPY_HAVE_NEON
     46 #include <arm_neon.h>
     47 #endif
     48 
     49 #if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
     50 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
     51 #else
     52 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
     53 #endif
     54 
     55 namespace snappy {
     56 namespace internal {
     57 
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
// Platform-neutral 128-bit vector type, mapped onto the native SIMD register
// type for the architecture in use.
#if SNAPPY_HAVE_SSSE3
using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
inline V128 V128_Load(const V128* src);

// Load 128 bits of integer data. `src` does not need to be aligned.
inline V128 V128_LoadU(const V128* src);

// Store 128 bits of integer data. `dst` does not need to be aligned.
inline void V128_StoreU(V128* dst, V128 val);

// Shuffle packed 8-bit integers using a shuffle mask.
// Each packed integer in the shuffle mask must be in [0,16).
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);

// Constructs V128 with 16 chars |c|.
inline V128 V128_DupChar(char c);

#if SNAPPY_HAVE_SSSE3
// x86 implementations: thin wrappers over the SSE2/SSSE3 intrinsics.
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  return _mm_shuffle_epi8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }

#elif SNAPPY_HAVE_NEON
// NEON implementations. vld1q_u8 has no alignment requirement, so the
// "aligned" load is implemented identically to the unaligned one.
inline V128 V128_Load(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline void V128_StoreU(V128* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  // NOTE(review): vminvq_u8 returns an unsigned value, so the `>= 0` half of
  // this assert is always true; only the `<= 15` upper-bound check is
  // meaningful (it enforces the [0,16) mask contract stated above).
  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
  return vqtbl1q_u8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
    115 
    116 // Working memory performs a single allocation to hold all scratch space
    117 // required for compression.
    118 class WorkingMemory {
    119 public:
    120  explicit WorkingMemory(size_t input_size);
    121  ~WorkingMemory();
    122 
    123  // Allocates and clears a hash table using memory in "*this",
    124  // stores the number of buckets in "*table_size" and returns a pointer to
    125  // the base of the hash table.
    126  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
    127  char* GetScratchInput() const { return input_; }
    128  char* GetScratchOutput() const { return output_; }
    129 
    130 private:
    131  char* mem_;        // the allocated memory, never nullptr
    132  size_t size_;      // the size of the allocated memory, never 0
    133  uint16_t* table_;  // the pointer to the hashtable
    134  char* input_;      // the pointer to the input scratch buffer
    135  char* output_;     // the pointer to the output scratch buffer
    136 
    137  // No copying
    138  WorkingMemory(const WorkingMemory&);
    139  void operator=(const WorkingMemory&);
    140 };
    141 
// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
//
// Declaration only; the implementation lives outside this header. "table" is
// the scratch hash table (e.g. as returned by WorkingMemory::GetHashTable).
char* CompressFragment(const char* input,
                       size_t input_length,
                       char* op,
                       uint16_t* table,
                       const int table_size);
    158 
    159 // Find the largest n such that
    160 //
    161 //   s1[0,n-1] == s2[0,n-1]
    162 //   and n <= (s2_limit - s2).
    163 //
    164 // Return make_pair(n, n < 8).
    165 // Does not read *s2_limit or beyond.
    166 // Does not read *(s1 + (s2_limit - s2)) or beyond.
    167 // Requires that s2_limit >= s2.
    168 //
    169 // In addition populate *data with the next 5 bytes from the end of the match.
    170 // This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is
    171 // that on some arch's this can be done faster in this routine than subsequent
    172 // loading from s2 + n.
    173 //
    174 // Separate implementation for 64-bit, little-endian cpus.
    175 #if !SNAPPY_IS_BIG_ENDIAN && \
    176    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
    177     defined(ARCH_ARM))
// 64-bit little-endian implementation; see the contract comment above the
// #if for the full pre/post-conditions.
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

  // This block isn't necessary for correctness; we could just start looping
  // immediately.  As an optimization though, it is useful.  It creates some not
  // uncommon code paths that determine, without extra effort, whether the match
  // length is less than 8.  In short, we are hoping to avoid a conditional
  // branch, and perhaps get better code layout from the C++ compiler.
  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
      // This code is critical for performance. The reason is that it determines
      // how much to advance `ip` (s2). This obviously depends on both the loads
      // from the `candidate` (s1) and `ip`. Furthermore the next `candidate`
      // depends on the advanced `ip` calculated here through a load, hash and
      // new candidate hash lookup (a lot of cycles). This makes s1 (ie.
      // `candidate`) the variable that limits throughput. This is the reason we
      // go through hoops to have this function update `data` for the next iter.
      // The straightforward code would use *data, given by
      //
      // *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles),
      //
      // as input for the hash table lookup to find next candidate. However
      // this forces the load on the data dependency chain of s1, because
      // matched_bytes directly depends on s1. However matched_bytes is 0..7, so
      // we can also calculate *data by
      //
      // *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
      //                    matched_bytes);
      //
      // The loads do not depend on s1 anymore and are thus off the bottleneck.
      // The straightforward implementation on x86_64 would be to use
      //
      // shrd rax, rdx, cl  (cl being matched_bytes * 8)
      //
      // unfortunately shrd with a variable shift has a 4 cycle latency. So this
      // only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable
      // shift instruction but can only shift 64 bits. If we focus on just
      // obtaining the least significant 4 bytes, we can obtain this by
      //
      // *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
      //     UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8);
      //
      // Written like above this is not a big win, the conditional move would be
      // a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
      // However matched_bytes < 4 is equal to
      // static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
      // (tzcnt), which takes 3 cycles.
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      // Ideally this would just be
      //
      // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
      //
      // However clang correctly infers that the above statement participates on
      // a critical data dependency chain and thus, unfortunately, refuses to
      // use a conditional move (it's tuned to cut data dependencies). In this
      // case there is a longer parallel chain anyway AND this will be fairly
      // unpredictable.
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval)
          : "cc");
#endif
      *data = a2 >> (shift & (3 * 8));
      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
    }
  }
  SNAPPY_PREFETCH(s1 + 64);
  SNAPPY_PREFETCH(s2 + 64);

  // Find out how long the match is. We loop over the data 64 bits at a
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      // Same cmov trick as in the fast path above: keep the *data
      // computation off the s1-dependent critical chain.
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval)
          : "cc");
#endif
      *data = a2 >> (shift & (3 * 8));
      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
  }
  // Fewer than 16 bytes of s2 remain: finish byte by byte, populating *data
  // only when a full 8-byte load would stay within [s2, s2_limit).
  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
    if (s1[matched] == *s2) {
      ++s2;
      ++matched;
    } else {
      if (s2 <= s2_limit - 8) {
        *data = UNALIGNED_LOAD64(s2);
      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
  return std::pair<size_t, bool>(matched, matched < 8);
}
    307 #else
// Portable fallback: compare 32 bits at a time, then resolve the final
// partial word with a bit scan (little-endian only) or a byte loop.
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  // NOTE(review): `int` (not size_t) — fine while inputs are bounded by
  // kBlockSize, but verify callers never pass >2GiB windows.
  int matched = 0;

  // Consume identical 4-byte words.
  while (s2 <= s2_limit - 4 &&
         UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
    s2 += 4;
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
    // On little-endian, the index of the first mismatching byte is the bit
    // index of the lowest set bit of the XOR, divided by 8.
    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    // Big-endian, or fewer than 4 bytes left: finish byte by byte.
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  // Populate *data for the caller's next hash lookup when 8 bytes remain.
  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
}
    335 #endif
    336 
    337 static inline size_t FindMatchLengthPlain(const char* s1, const char* s2,
    338                                          const char* s2_limit) {
    339  // Implementation based on the x86-64 version, above.
    340  assert(s2_limit >= s2);
    341  int matched = 0;
    342 
    343  while (s2 <= s2_limit - 8 &&
    344         UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
    345    s2 += 8;
    346    matched += 8;
    347  }
    348  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 8) {
    349    uint64_t x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
    350    int matching_bits = Bits::FindLSBSetNonZero64(x);
    351    matched += matching_bits >> 3;
    352    s2 += matching_bits >> 3;
    353  } else {
    354    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
    355      ++s2;
    356      ++matched;
    357    }
    358  }
    359  return matched;
    360 }
    361 
    362 // Lookup tables for decompression code.  Give --snappy_dump_decompression_table
    363 // to the unit test to recompute char_table.
    364 
// Snappy tag types, stored in the low two bits of each opcode byte.
enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,  // 2-byte offset follows the opcode
  COPY_4_BYTE_OFFSET = 3   // 4-byte offset follows the opcode
};
static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.
    372 
// Data stored per entry in lookup table:
//      Range   Bits-used       Description
//      ------------------------------------
//      1..64   0..7            Literal/copy length encoded in opcode byte
//      0..7    8..10           Copy offset encoded in opcode byte / 256
//      0..4    11..13          Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed
// because of efficiency reasons:
//      (1) Extracting a byte is faster than a bit-field
//      (2) It properly aligns copy offset so we do not need a <<8
//
// Machine-generated data (see the --snappy_dump_decompression_table note
// above); do not edit entries by hand.
static constexpr uint16_t char_table[256] = {
    // clang-format off
  0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
  0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
  0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
  0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
  0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
  0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
  0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
  0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
  0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
  0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
  0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
  0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
  0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
  0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
  0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
  0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
  0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
  0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
  0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
  0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
  0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
  0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
  0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
  0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
  0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
  0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
  0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
  0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
  0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
  0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
  0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
    // clang-format on
};
    420 
    421 }  // end namespace internal
    422 }  // end namespace snappy
    423 
    424 #endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_