tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

entropy_encode.c (14568B)


      1 /* Copyright 2010 Google Inc. All Rights Reserved.
      2 
      3   Distributed under MIT license.
      4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* Entropy encoding (Huffman) utilities. */
      8 
      9 #include "entropy_encode.h"
     10 
     11 #include "../common/constants.h"
     12 #include "../common/platform.h"
     13 
     14 #if defined(__cplusplus) || defined(c_plusplus)
     15 extern "C" {
     16 #endif
     17 
     18 const BROTLI_MODEL("small") size_t kBrotliShellGaps[] = {132, 57, 23, 10, 4, 1};
     19 
     20 BROTLI_BOOL BrotliSetDepth(
     21    int p0, HuffmanTree* pool, uint8_t* depth, int max_depth) {
     22  int stack[16];
     23  int level = 0;
     24  int p = p0;
     25  BROTLI_DCHECK(max_depth <= 15);
     26  stack[0] = -1;
     27  while (BROTLI_TRUE) {
     28    if (pool[p].index_left_ >= 0) {
     29      level++;
     30      if (level > max_depth) return BROTLI_FALSE;
     31      stack[level] = pool[p].index_right_or_value_;
     32      p = pool[p].index_left_;
     33      continue;
     34    } else {
     35      depth[pool[p].index_right_or_value_] = (uint8_t)level;
     36    }
     37    while (level >= 0 && stack[level] == -1) level--;
     38    if (level < 0) return BROTLI_TRUE;
     39    p = stack[level];
     40    stack[level] = -1;
     41  }
     42 }
     43 
     44 /* Sort the root nodes, least popular first. */
     45 static BROTLI_INLINE BROTLI_BOOL SortHuffmanTree(
     46    const HuffmanTree* v0, const HuffmanTree* v1) {
     47  if (v0->total_count_ != v1->total_count_) {
     48    return TO_BROTLI_BOOL(v0->total_count_ < v1->total_count_);
     49  }
     50  return TO_BROTLI_BOOL(v0->index_right_or_value_ > v1->index_right_or_value_);
     51 }
     52 
     53 /* This function will create a Huffman tree.
     54 
     55   The catch here is that the tree cannot be arbitrarily deep.
     56   Brotli specifies a maximum depth of 15 bits for "code trees"
     57   and 7 bits for "code length code trees."
     58 
     59   count_limit is the value that is to be faked as the minimum value
     60   and this minimum value is raised until the tree matches the
     61   maximum length requirement.
     62 
     63   This algorithm is not of excellent performance for very long data blocks,
     64   especially when population counts are longer than 2**tree_limit, but
     65   we are not planning to use this with extremely long blocks.
     66 
     67   See http://en.wikipedia.org/wiki/Huffman_coding */
     68 void BrotliCreateHuffmanTree(const uint32_t* data,
     69                             const size_t length,
     70                             const int tree_limit,
     71                             HuffmanTree* tree,
     72                             uint8_t* depth) {
     73  uint32_t count_limit;
     74  HuffmanTree sentinel;
     75  InitHuffmanTree(&sentinel, BROTLI_UINT32_MAX, -1, -1);
     76  /* For block sizes below 64 kB, we never need to do a second iteration
     77     of this loop. Probably all of our block sizes will be smaller than
     78     that, so this loop is mostly of academic interest. If we actually
     79     would need this, we would be better off with the Katajainen algorithm. */
     80  for (count_limit = 1; ; count_limit *= 2) {
     81    size_t n = 0;
     82    size_t i;
     83    size_t j;
     84    size_t k;
     85    for (i = length; i != 0;) {
     86      --i;
     87      if (data[i]) {
     88        const uint32_t count = BROTLI_MAX(uint32_t, data[i], count_limit);
     89        InitHuffmanTree(&tree[n++], count, -1, (int16_t)i);
     90      }
     91    }
     92 
     93    if (n == 1) {
     94      depth[tree[0].index_right_or_value_] = 1;  /* Only one element. */
     95      break;
     96    }
     97 
     98    SortHuffmanTreeItems(tree, n, SortHuffmanTree);
     99 
    100    /* The nodes are:
    101       [0, n): the sorted leaf nodes that we start with.
    102       [n]: we add a sentinel here.
    103       [n + 1, 2n): new parent nodes are added here, starting from
    104                    (n+1). These are naturally in ascending order.
    105       [2n]: we add a sentinel at the end as well.
    106       There will be (2n+1) elements at the end. */
    107    tree[n] = sentinel;
    108    tree[n + 1] = sentinel;
    109 
    110    i = 0;      /* Points to the next leaf node. */
    111    j = n + 1;  /* Points to the next non-leaf node. */
    112    for (k = n - 1; k != 0; --k) {
    113      size_t left, right;
    114      if (tree[i].total_count_ <= tree[j].total_count_) {
    115        left = i;
    116        ++i;
    117      } else {
    118        left = j;
    119        ++j;
    120      }
    121      if (tree[i].total_count_ <= tree[j].total_count_) {
    122        right = i;
    123        ++i;
    124      } else {
    125        right = j;
    126        ++j;
    127      }
    128 
    129      {
    130        /* The sentinel node becomes the parent node. */
    131        size_t j_end = 2 * n - k;
    132        tree[j_end].total_count_ =
    133            tree[left].total_count_ + tree[right].total_count_;
    134        tree[j_end].index_left_ = (int16_t)left;
    135        tree[j_end].index_right_or_value_ = (int16_t)right;
    136 
    137        /* Add back the last sentinel node. */
    138        tree[j_end + 1] = sentinel;
    139      }
    140    }
    141    if (BrotliSetDepth((int)(2 * n - 1), &tree[0], depth, tree_limit)) {
    142      /* We need to pack the Huffman tree in tree_limit bits. If this was not
    143         successful, add fake entities to the lowest values and retry. */
    144      break;
    145    }
    146  }
    147 }
    148 
    149 static void Reverse(uint8_t* v, size_t start, size_t end) {
    150  --end;
    151  while (start < end) {
    152    uint8_t tmp = v[start];
    153    v[start] = v[end];
    154    v[end] = tmp;
    155    ++start;
    156    --end;
    157  }
    158 }
    159 
    160 static void BrotliWriteHuffmanTreeRepetitions(
    161    const uint8_t previous_value,
    162    const uint8_t value,
    163    size_t repetitions,
    164    size_t* tree_size,
    165    uint8_t* tree,
    166    uint8_t* extra_bits_data) {
    167  BROTLI_DCHECK(repetitions > 0);
    168  if (previous_value != value) {
    169    tree[*tree_size] = value;
    170    extra_bits_data[*tree_size] = 0;
    171    ++(*tree_size);
    172    --repetitions;
    173  }
    174  if (repetitions == 7) {
    175    tree[*tree_size] = value;
    176    extra_bits_data[*tree_size] = 0;
    177    ++(*tree_size);
    178    --repetitions;
    179  }
    180  if (repetitions < 3) {
    181    size_t i;
    182    for (i = 0; i < repetitions; ++i) {
    183      tree[*tree_size] = value;
    184      extra_bits_data[*tree_size] = 0;
    185      ++(*tree_size);
    186    }
    187  } else {
    188    size_t start = *tree_size;
    189    repetitions -= 3;
    190    while (BROTLI_TRUE) {
    191      tree[*tree_size] = BROTLI_REPEAT_PREVIOUS_CODE_LENGTH;
    192      extra_bits_data[*tree_size] = repetitions & 0x3;
    193      ++(*tree_size);
    194      repetitions >>= 2;
    195      if (repetitions == 0) {
    196        break;
    197      }
    198      --repetitions;
    199    }
    200    Reverse(tree, start, *tree_size);
    201    Reverse(extra_bits_data, start, *tree_size);
    202  }
    203 }
    204 
    205 static void BrotliWriteHuffmanTreeRepetitionsZeros(
    206    size_t repetitions,
    207    size_t* tree_size,
    208    uint8_t* tree,
    209    uint8_t* extra_bits_data) {
    210  if (repetitions == 11) {
    211    tree[*tree_size] = 0;
    212    extra_bits_data[*tree_size] = 0;
    213    ++(*tree_size);
    214    --repetitions;
    215  }
    216  if (repetitions < 3) {
    217    size_t i;
    218    for (i = 0; i < repetitions; ++i) {
    219      tree[*tree_size] = 0;
    220      extra_bits_data[*tree_size] = 0;
    221      ++(*tree_size);
    222    }
    223  } else {
    224    size_t start = *tree_size;
    225    repetitions -= 3;
    226    while (BROTLI_TRUE) {
    227      tree[*tree_size] = BROTLI_REPEAT_ZERO_CODE_LENGTH;
    228      extra_bits_data[*tree_size] = repetitions & 0x7;
    229      ++(*tree_size);
    230      repetitions >>= 3;
    231      if (repetitions == 0) {
    232        break;
    233      }
    234      --repetitions;
    235    }
    236    Reverse(tree, start, *tree_size);
    237    Reverse(extra_bits_data, start, *tree_size);
    238  }
    239 }
    240 
    241 void BrotliOptimizeHuffmanCountsForRle(size_t length, uint32_t* counts,
    242                                       uint8_t* good_for_rle) {
    243  size_t nonzero_count = 0;
    244  size_t stride;
    245  size_t limit;
    246  size_t sum;
    247  const size_t streak_limit = 1240;
    248  /* Let's make the Huffman code more compatible with RLE encoding. */
    249  size_t i;
    250  for (i = 0; i < length; i++) {
    251    if (counts[i]) {
    252      ++nonzero_count;
    253    }
    254  }
    255  if (nonzero_count < 16) {
    256    return;
    257  }
    258  while (length != 0 && counts[length - 1] == 0) {
    259    --length;
    260  }
    261  if (length == 0) {
    262    return;  /* All zeros. */
    263  }
    264  /* Now counts[0..length - 1] does not have trailing zeros. */
    265  {
    266    size_t nonzeros = 0;
    267    uint32_t smallest_nonzero = 1 << 30;
    268    for (i = 0; i < length; ++i) {
    269      if (counts[i] != 0) {
    270        ++nonzeros;
    271        if (smallest_nonzero > counts[i]) {
    272          smallest_nonzero = counts[i];
    273        }
    274      }
    275    }
    276    if (nonzeros < 5) {
    277      /* Small histogram will model it well. */
    278      return;
    279    }
    280    if (smallest_nonzero < 4) {
    281      size_t zeros = length - nonzeros;
    282      if (zeros < 6) {
    283        for (i = 1; i < length - 1; ++i) {
    284          if (counts[i - 1] != 0 && counts[i] == 0 && counts[i + 1] != 0) {
    285            counts[i] = 1;
    286          }
    287        }
    288      }
    289    }
    290    if (nonzeros < 28) {
    291      return;
    292    }
    293  }
    294  /* 2) Let's mark all population counts that already can be encoded
    295     with an RLE code. */
    296  memset(good_for_rle, 0, length);
    297  {
    298    /* Let's not spoil any of the existing good RLE codes.
    299       Mark any seq of 0's that is longer as 5 as a good_for_rle.
    300       Mark any seq of non-0's that is longer as 7 as a good_for_rle. */
    301    uint32_t symbol = counts[0];
    302    size_t step = 0;
    303    for (i = 0; i <= length; ++i) {
    304      if (i == length || counts[i] != symbol) {
    305        if ((symbol == 0 && step >= 5) ||
    306            (symbol != 0 && step >= 7)) {
    307          size_t k;
    308          for (k = 0; k < step; ++k) {
    309            good_for_rle[i - k - 1] = 1;
    310          }
    311        }
    312        step = 1;
    313        if (i != length) {
    314          symbol = counts[i];
    315        }
    316      } else {
    317        ++step;
    318      }
    319    }
    320  }
    321  /* 3) Let's replace those population counts that lead to more RLE codes.
    322     Math here is in 24.8 fixed point representation. */
    323  stride = 0;
    324  limit = 256 * (counts[0] + counts[1] + counts[2]) / 3 + 420;
    325  sum = 0;
    326  for (i = 0; i <= length; ++i) {
    327    if (i == length || good_for_rle[i] ||
    328        (i != 0 && good_for_rle[i - 1]) ||
    329        (256 * counts[i] - limit + streak_limit) >= 2 * streak_limit) {
    330      if (stride >= 4 || (stride >= 3 && sum == 0)) {
    331        size_t k;
    332        /* The stride must end, collapse what we have, if we have enough (4). */
    333        size_t count = (sum + stride / 2) / stride;
    334        if (count == 0) {
    335          count = 1;
    336        }
    337        if (sum == 0) {
    338          /* Don't make an all zeros stride to be upgraded to ones. */
    339          count = 0;
    340        }
    341        for (k = 0; k < stride; ++k) {
    342          /* We don't want to change value at counts[i],
    343             that is already belonging to the next stride. Thus - 1. */
    344          counts[i - k - 1] = (uint32_t)count;
    345        }
    346      }
    347      stride = 0;
    348      sum = 0;
    349      if (i < length - 2) {
    350        /* All interesting strides have a count of at least 4, */
    351        /* at least when non-zeros. */
    352        limit = 256 * (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 420;
    353      } else if (i < length) {
    354        limit = 256 * counts[i];
    355      } else {
    356        limit = 0;
    357      }
    358    }
    359    ++stride;
    360    if (i != length) {
    361      sum += counts[i];
    362      if (stride >= 4) {
    363        limit = (256 * sum + stride / 2) / stride;
    364      }
    365      if (stride == 4) {
    366        limit += 120;
    367      }
    368    }
    369  }
    370 }
    371 
    372 static void DecideOverRleUse(const uint8_t* depth, const size_t length,
    373                             BROTLI_BOOL* use_rle_for_non_zero,
    374                             BROTLI_BOOL* use_rle_for_zero) {
    375  size_t total_reps_zero = 0;
    376  size_t total_reps_non_zero = 0;
    377  size_t count_reps_zero = 1;
    378  size_t count_reps_non_zero = 1;
    379  size_t i;
    380  for (i = 0; i < length;) {
    381    const uint8_t value = depth[i];
    382    size_t reps = 1;
    383    size_t k;
    384    for (k = i + 1; k < length && depth[k] == value; ++k) {
    385      ++reps;
    386    }
    387    if (reps >= 3 && value == 0) {
    388      total_reps_zero += reps;
    389      ++count_reps_zero;
    390    }
    391    if (reps >= 4 && value != 0) {
    392      total_reps_non_zero += reps;
    393      ++count_reps_non_zero;
    394    }
    395    i += reps;
    396  }
    397  *use_rle_for_non_zero =
    398      TO_BROTLI_BOOL(total_reps_non_zero > count_reps_non_zero * 2);
    399  *use_rle_for_zero = TO_BROTLI_BOOL(total_reps_zero > count_reps_zero * 2);
    400 }
    401 
    402 void BrotliWriteHuffmanTree(const uint8_t* depth,
    403                            size_t length,
    404                            size_t* tree_size,
    405                            uint8_t* tree,
    406                            uint8_t* extra_bits_data) {
    407  uint8_t previous_value = BROTLI_INITIAL_REPEATED_CODE_LENGTH;
    408  size_t i;
    409  BROTLI_BOOL use_rle_for_non_zero = BROTLI_FALSE;
    410  BROTLI_BOOL use_rle_for_zero = BROTLI_FALSE;
    411 
    412  /* Throw away trailing zeros. */
    413  size_t new_length = length;
    414  for (i = 0; i < length; ++i) {
    415    if (depth[length - i - 1] == 0) {
    416      --new_length;
    417    } else {
    418      break;
    419    }
    420  }
    421 
    422  /* First gather statistics on if it is a good idea to do RLE. */
    423  if (length > 50) {
    424    /* Find RLE coding for longer codes.
    425       Shorter codes seem not to benefit from RLE. */
    426    DecideOverRleUse(depth, new_length,
    427                     &use_rle_for_non_zero, &use_rle_for_zero);
    428  }
    429 
    430  /* Actual RLE coding. */
    431  for (i = 0; i < new_length;) {
    432    const uint8_t value = depth[i];
    433    size_t reps = 1;
    434    if ((value != 0 && use_rle_for_non_zero) ||
    435        (value == 0 && use_rle_for_zero)) {
    436      size_t k;
    437      for (k = i + 1; k < new_length && depth[k] == value; ++k) {
    438        ++reps;
    439      }
    440    }
    441    if (value == 0) {
    442      BrotliWriteHuffmanTreeRepetitionsZeros(
    443          reps, tree_size, tree, extra_bits_data);
    444    } else {
    445      BrotliWriteHuffmanTreeRepetitions(previous_value,
    446                                        value, reps, tree_size,
    447                                        tree, extra_bits_data);
    448      previous_value = value;
    449    }
    450    i += reps;
    451  }
    452 }
    453 
    454 static uint16_t BrotliReverseBits(size_t num_bits, uint16_t bits) {
    455  static const size_t BROTLI_MODEL("small") kLut[16] =
    456  {  /* Pre-reversed 4-bit values. */
    457    0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
    458    0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F
    459  };
    460  size_t retval = kLut[bits & 0x0F];
    461  size_t i;
    462  for (i = 4; i < num_bits; i += 4) {
    463    retval <<= 4;
    464    bits = (uint16_t)(bits >> 4);
    465    retval |= kLut[bits & 0x0F];
    466  }
    467  retval >>= ((0 - num_bits) & 0x03);
    468  return (uint16_t)retval;
    469 }
    470 
    471 /* 0..15 are values for bits */
    472 #define MAX_HUFFMAN_BITS 16
    473 
    474 void BrotliConvertBitDepthsToSymbols(const uint8_t* depth,
    475                                     size_t len,
    476                                     uint16_t* bits) {
    477  /* In Brotli, all bit depths are [1..15]
    478     0 bit depth means that the symbol does not exist. */
    479  uint16_t bl_count[MAX_HUFFMAN_BITS] = { 0 };
    480  uint16_t next_code[MAX_HUFFMAN_BITS];
    481  size_t i;
    482  int code = 0;
    483  for (i = 0; i < len; ++i) {
    484    ++bl_count[depth[i]];
    485  }
    486  bl_count[0] = 0;
    487  next_code[0] = 0;
    488  for (i = 1; i < MAX_HUFFMAN_BITS; ++i) {
    489    code = (code + bl_count[i - 1]) << 1;
    490    next_code[i] = (uint16_t)code;
    491  }
    492  for (i = 0; i < len; ++i) {
    493    if (depth[i]) {
    494      bits[i] = BrotliReverseBits(depth[i], next_code[depth[i]]++);
    495    }
    496  }
    497 }
    498 
    499 #if defined(__cplusplus) || defined(c_plusplus)
    500 }  /* extern "C" */
    501 #endif