tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

hash_to_binary_tree_inc.h (13081B)


      1 /* NOLINT(build/header_guard) */
      2 /* Copyright 2016 Google Inc. All Rights Reserved.
      3 
      4   Distributed under MIT license.
      5   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      6 */
      7 
      8 /* template parameters: FN, BUCKET_BITS, MAX_TREE_COMP_LENGTH,
      9                        MAX_TREE_SEARCH_DEPTH */
     10 
     11 /* A (forgetful) hash table where each hash bucket contains a binary tree of
     12   sequences whose first 4 bytes share the same hash code.
     13   Each sequence is MAX_TREE_COMP_LENGTH long and is identified by its starting
     14   position in the input data. The binary tree is sorted by the lexicographic
     15   order of the sequences, and it is also a max-heap with respect to the
     16   starting positions. */
     17 
     18 #define HashToBinaryTree HASHER()
     19 
     20 #define BUCKET_SIZE (1 << BUCKET_BITS)
     21 
     22 static BROTLI_INLINE size_t FN(HashTypeLength)(void) { return 4; }
     23 static BROTLI_INLINE size_t FN(StoreLookahead)(void) {
     24  return MAX_TREE_COMP_LENGTH;
     25 }
     26 
     27 static uint32_t FN(HashBytes)(const uint8_t* BROTLI_RESTRICT data) {
     28  uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
     29  /* The higher bits contain more mixture from the multiplication,
     30     so we take our results from there. */
     31  return h >> (32 - BUCKET_BITS);
     32 }
     33 
     34 typedef struct HashToBinaryTree {
     35  /* The window size minus 1 */
     36  size_t window_mask_;
     37 
     38  /* Hash table that maps the 4-byte hashes of the sequence to the last
     39     position where this hash was found, which is the root of the binary
     40     tree of sequences that share this hash bucket. */
     41  uint32_t* buckets_;  /* uint32_t[BUCKET_SIZE]; */
     42 
     43  /* A position used to mark a non-existent sequence, i.e. a tree is empty if
     44     its root is at invalid_pos_ and a node is a leaf if both its children
     45     are at invalid_pos_. */
     46  uint32_t invalid_pos_;
     47 
     48  /* --- Dynamic size members --- */
     49 
     50  /* The union of the binary trees of each hash bucket. The root of the tree
     51     corresponding to a hash is a sequence starting at buckets_[hash] and
     52     the left and right children of a sequence starting at pos are
     53     forest_[2 * pos] and forest_[2 * pos + 1]. */
     54  uint32_t* forest_;  /* uint32_t[2 * num_nodes] */
     55 } HashToBinaryTree;
     56 
     57 static void FN(Initialize)(
     58    HasherCommon* common, HashToBinaryTree* BROTLI_RESTRICT self,
     59    const BrotliEncoderParams* params) {
     60  self->buckets_ = (uint32_t*)common->extra[0];
     61  self->forest_ = (uint32_t*)common->extra[1];
     62 
     63  self->window_mask_ = (1u << params->lgwin) - 1u;
     64  self->invalid_pos_ = (uint32_t)(0 - self->window_mask_);
     65 }
     66 
     67 static void FN(Prepare)
     68    (HashToBinaryTree* BROTLI_RESTRICT self, BROTLI_BOOL one_shot,
     69    size_t input_size, const uint8_t* BROTLI_RESTRICT data) {
     70  uint32_t invalid_pos = self->invalid_pos_;
     71  uint32_t i;
     72  uint32_t* BROTLI_RESTRICT buckets = self->buckets_;
     73  BROTLI_UNUSED(data);
     74  BROTLI_UNUSED(one_shot);
     75  BROTLI_UNUSED(input_size);
     76  for (i = 0; i < BUCKET_SIZE; i++) {
     77    buckets[i] = invalid_pos;
     78  }
     79 }
     80 
     81 static BROTLI_INLINE void FN(HashMemAllocInBytes)(
     82    const BrotliEncoderParams* params, BROTLI_BOOL one_shot,
     83    size_t input_size, size_t* alloc_size) {
     84  size_t num_nodes = (size_t)1 << params->lgwin;
     85  if (one_shot && input_size < num_nodes) {
     86    num_nodes = input_size;
     87  }
     88  alloc_size[0] = sizeof(uint32_t) * BUCKET_SIZE;
     89  alloc_size[1] = 2 * sizeof(uint32_t) * num_nodes;
     90 }
     91 
     92 static BROTLI_INLINE size_t FN(LeftChildIndex)(
     93    HashToBinaryTree* BROTLI_RESTRICT self,
     94    const size_t pos) {
     95  return 2 * (pos & self->window_mask_);
     96 }
     97 
     98 static BROTLI_INLINE size_t FN(RightChildIndex)(
     99    HashToBinaryTree* BROTLI_RESTRICT self,
    100    const size_t pos) {
    101  return 2 * (pos & self->window_mask_) + 1;
    102 }
    103 
    104 /* Stores the hash of the next 4 bytes and in a single tree-traversal, the
    105   hash bucket's binary tree is searched for matches and is re-rooted at the
    106   current position.
    107 
    108   If less than MAX_TREE_COMP_LENGTH data is available, the hash bucket of the
    109   current position is searched for matches, but the state of the hash table
    110   is not changed, since we can not know the final sorting order of the
    111   current (incomplete) sequence.
    112 
    113   This function must be called with increasing cur_ix positions. */
    114 static BROTLI_INLINE BackwardMatch* FN(StoreAndFindMatches)(
    115    HashToBinaryTree* BROTLI_RESTRICT self, const uint8_t* BROTLI_RESTRICT data,
    116    const size_t cur_ix, const size_t ring_buffer_mask, const size_t max_length,
    117    const size_t max_backward, size_t* const BROTLI_RESTRICT best_len,
    118    BackwardMatch* BROTLI_RESTRICT matches) {
    119  const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    120  const size_t max_comp_len =
    121      BROTLI_MIN(size_t, max_length, MAX_TREE_COMP_LENGTH);
    122  const BROTLI_BOOL should_reroot_tree =
    123      TO_BROTLI_BOOL(max_length >= MAX_TREE_COMP_LENGTH);
    124  const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
    125  uint32_t* BROTLI_RESTRICT buckets = self->buckets_;
    126  uint32_t* BROTLI_RESTRICT forest = self->forest_;
    127  size_t prev_ix = buckets[key];
    128  /* The forest index of the rightmost node of the left subtree of the new
    129     root, updated as we traverse and re-root the tree of the hash bucket. */
    130  size_t node_left = FN(LeftChildIndex)(self, cur_ix);
    131  /* The forest index of the leftmost node of the right subtree of the new
    132     root, updated as we traverse and re-root the tree of the hash bucket. */
    133  size_t node_right = FN(RightChildIndex)(self, cur_ix);
    134  /* The match length of the rightmost node of the left subtree of the new
    135     root, updated as we traverse and re-root the tree of the hash bucket. */
    136  size_t best_len_left = 0;
    137  /* The match length of the leftmost node of the right subtree of the new
    138     root, updated as we traverse and re-root the tree of the hash bucket. */
    139  size_t best_len_right = 0;
    140  size_t depth_remaining;
    141  if (should_reroot_tree) {
    142    buckets[key] = (uint32_t)cur_ix;
    143  }
    144  for (depth_remaining = MAX_TREE_SEARCH_DEPTH; ; --depth_remaining) {
    145    const size_t backward = cur_ix - prev_ix;
    146    const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
    147    if (backward == 0 || backward > max_backward || depth_remaining == 0) {
    148      if (should_reroot_tree) {
    149        forest[node_left] = self->invalid_pos_;
    150        forest[node_right] = self->invalid_pos_;
    151      }
    152      break;
    153    }
    154    {
    155      const size_t cur_len = BROTLI_MIN(size_t, best_len_left, best_len_right);
    156      size_t len;
    157      BROTLI_DCHECK(cur_len <= MAX_TREE_COMP_LENGTH);
    158      len = cur_len +
    159          FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
    160                                   &data[prev_ix_masked + cur_len],
    161                                   max_length - cur_len);
    162      BROTLI_DCHECK(
    163          0 == memcmp(&data[cur_ix_masked], &data[prev_ix_masked], len));
    164      if (matches && len > *best_len) {
    165        *best_len = len;
    166        InitBackwardMatch(matches++, backward, len);
    167      }
    168      if (len >= max_comp_len) {
    169        if (should_reroot_tree) {
    170          forest[node_left] = forest[FN(LeftChildIndex)(self, prev_ix)];
    171          forest[node_right] = forest[FN(RightChildIndex)(self, prev_ix)];
    172        }
    173        break;
    174      }
    175      if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
    176        best_len_left = len;
    177        if (should_reroot_tree) {
    178          forest[node_left] = (uint32_t)prev_ix;
    179        }
    180        node_left = FN(RightChildIndex)(self, prev_ix);
    181        prev_ix = forest[node_left];
    182      } else {
    183        best_len_right = len;
    184        if (should_reroot_tree) {
    185          forest[node_right] = (uint32_t)prev_ix;
    186        }
    187        node_right = FN(LeftChildIndex)(self, prev_ix);
    188        prev_ix = forest[node_right];
    189      }
    190    }
    191  }
    192  return matches;
    193 }
    194 
    195 /* Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
    196   length of max_length and stores the position cur_ix in the hash table.
    197 
    198   Sets *num_matches to the number of matches found, and stores the found
    199   matches in matches[0] to matches[*num_matches - 1]. The matches will be
    200   sorted by strictly increasing length and (non-strictly) increasing
    201   distance. */
    202 static BROTLI_INLINE size_t FN(FindAllMatches)(
    203    HashToBinaryTree* BROTLI_RESTRICT self,
    204    const BrotliEncoderDictionary* dictionary,
    205    const uint8_t* BROTLI_RESTRICT data,
    206    const size_t ring_buffer_mask, const size_t cur_ix,
    207    const size_t max_length, const size_t max_backward,
    208    const size_t dictionary_distance, const BrotliEncoderParams* params,
    209    BackwardMatch* matches) {
    210  BackwardMatch* const orig_matches = matches;
    211  const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    212  size_t best_len = 1;
    213  const size_t short_match_max_backward =
    214      params->quality != HQ_ZOPFLIFICATION_QUALITY ? 16 : 64;
    215  size_t stop = cur_ix - short_match_max_backward;
    216  uint32_t dict_matches[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1];
    217  size_t i;
    218  if (cur_ix < short_match_max_backward) { stop = 0; }
    219  for (i = cur_ix - 1; i > stop && best_len <= 2; --i) {
    220    size_t prev_ix = i;
    221    const size_t backward = cur_ix - prev_ix;
    222    if (BROTLI_PREDICT_FALSE(backward > max_backward)) {
    223      break;
    224    }
    225    prev_ix &= ring_buffer_mask;
    226    if (data[cur_ix_masked] != data[prev_ix] ||
    227        data[cur_ix_masked + 1] != data[prev_ix + 1]) {
    228      continue;
    229    }
    230    {
    231      const size_t len =
    232          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
    233                                   max_length);
    234      if (len > best_len) {
    235        best_len = len;
    236        InitBackwardMatch(matches++, backward, len);
    237      }
    238    }
    239  }
    240  if (best_len < max_length) {
    241    matches = FN(StoreAndFindMatches)(self, data, cur_ix,
    242        ring_buffer_mask, max_length, max_backward, &best_len, matches);
    243  }
    244  for (i = 0; i <= BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN; ++i) {
    245    dict_matches[i] = kInvalidMatch;
    246  }
    247  {
    248    size_t minlen = BROTLI_MAX(size_t, 4, best_len + 1);
    249    if (BrotliFindAllStaticDictionaryMatches(dictionary,
    250        &data[cur_ix_masked], minlen, max_length, &dict_matches[0])) {
    251      size_t maxlen = BROTLI_MIN(
    252          size_t, BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN, max_length);
    253      size_t l;
    254      for (l = minlen; l <= maxlen; ++l) {
    255        uint32_t dict_id = dict_matches[l];
    256        if (dict_id < kInvalidMatch) {
    257          size_t distance = dictionary_distance + (dict_id >> 5) + 1;
    258          if (distance <= params->dist.max_distance) {
    259            InitDictionaryBackwardMatch(matches++, distance, l, dict_id & 31);
    260          }
    261        }
    262      }
    263    }
    264  }
    265  return (size_t)(matches - orig_matches);
    266 }
    267 
    268 /* Stores the hash of the next 4 bytes and re-roots the binary tree at the
    269   current sequence, without returning any matches.
    270   REQUIRES: ix + MAX_TREE_COMP_LENGTH <= end-of-current-block */
    271 static BROTLI_INLINE void FN(Store)(HashToBinaryTree* BROTLI_RESTRICT self,
    272    const uint8_t* BROTLI_RESTRICT data,
    273    const size_t mask, const size_t ix) {
    274  /* Maximum distance is window size - 16, see section 9.1. of the spec. */
    275  const size_t max_backward = self->window_mask_ - BROTLI_WINDOW_GAP + 1;
    276  FN(StoreAndFindMatches)(self, data, ix, mask, MAX_TREE_COMP_LENGTH,
    277      max_backward, NULL, NULL);
    278 }
    279 
    280 static BROTLI_INLINE void FN(StoreRange)(HashToBinaryTree* BROTLI_RESTRICT self,
    281    const uint8_t* BROTLI_RESTRICT data, const size_t mask,
    282    const size_t ix_start, const size_t ix_end) {
    283  size_t i = ix_start;
    284  size_t j = ix_start;
    285  if (ix_start + 63 <= ix_end) {
    286    i = ix_end - 63;
    287  }
    288  if (ix_start + 512 <= i) {
    289    for (; j < i; j += 8) {
    290      FN(Store)(self, data, mask, j);
    291    }
    292  }
    293  for (; i < ix_end; ++i) {
    294    FN(Store)(self, data, mask, i);
    295  }
    296 }
    297 
    298 static BROTLI_INLINE void FN(StitchToPreviousBlock)(
    299    HashToBinaryTree* BROTLI_RESTRICT self,
    300    size_t num_bytes, size_t position, const uint8_t* ringbuffer,
    301    size_t ringbuffer_mask) {
    302  if (num_bytes >= FN(HashTypeLength)() - 1 &&
    303      position >= MAX_TREE_COMP_LENGTH) {
    304    /* Store the last `MAX_TREE_COMP_LENGTH - 1` positions in the hasher.
    305       These could not be calculated before, since they require knowledge
    306       of both the previous and the current block. */
    307    const size_t i_start = position - MAX_TREE_COMP_LENGTH + 1;
    308    const size_t i_end = BROTLI_MIN(size_t, position, i_start + num_bytes);
    309    size_t i;
    310    for (i = i_start; i < i_end; ++i) {
    311      /* Maximum distance is window size - 16, see section 9.1. of the spec.
    312         Furthermore, we have to make sure that we don't look further back
    313         from the start of the next block than the window size, otherwise we
    314         could access already overwritten areas of the ring-buffer. */
    315      const size_t max_backward =
    316          self->window_mask_ - BROTLI_MAX(size_t,
    317                                          BROTLI_WINDOW_GAP - 1,
    318                                          position - i);
    319      /* We know that i + MAX_TREE_COMP_LENGTH <= position + num_bytes, i.e. the
    320         end of the current block and that we have at least
    321         MAX_TREE_COMP_LENGTH tail in the ring-buffer. */
    322      FN(StoreAndFindMatches)(self, ringbuffer, i, ringbuffer_mask,
    323          MAX_TREE_COMP_LENGTH, max_backward, NULL, NULL);
    324    }
    325  }
    326 }
    327 
    328 #undef BUCKET_SIZE
    329 
    330 #undef HashToBinaryTree