tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cluster_inc.h (11826B)


      1 /* NOLINT(build/header_guard) */
      2 /* Copyright 2013 Google Inc. All Rights Reserved.
      3 
      4   Distributed under MIT license.
      5   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      6 */
      7 
      8 /* template parameters: FN, CODE */
      9 
     10 #define HistogramType FN(Histogram)
     11 
     12 /* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
     13   it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
     14 BROTLI_INTERNAL void FN(BrotliCompareAndPushToQueue)(
     15    const HistogramType* out, HistogramType* tmp, const uint32_t* cluster_size,
     16    uint32_t idx1, uint32_t idx2, size_t max_num_pairs, HistogramPair* pairs,
     17    size_t* num_pairs) CODE({
     18  BROTLI_BOOL is_good_pair = BROTLI_FALSE;
     19  HistogramPair p;
     20  p.idx1 = p.idx2 = 0;
     21  p.cost_diff = p.cost_combo = 0;
     22  if (idx1 == idx2) {
     23    return;
     24  }
     25  if (idx2 < idx1) {
     26    uint32_t t = idx2;
     27    idx2 = idx1;
     28    idx1 = t;
     29  }
     30  p.idx1 = idx1;
     31  p.idx2 = idx2;
     32  p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
     33  p.cost_diff -= out[idx1].bit_cost_;
     34  p.cost_diff -= out[idx2].bit_cost_;
     35 
     36  if (out[idx1].total_count_ == 0) {
     37    p.cost_combo = out[idx2].bit_cost_;
     38    is_good_pair = BROTLI_TRUE;
     39  } else if (out[idx2].total_count_ == 0) {
     40    p.cost_combo = out[idx1].bit_cost_;
     41    is_good_pair = BROTLI_TRUE;
     42  } else {
     43    double threshold = *num_pairs == 0 ? 1e99 :
     44        BROTLI_MAX(double, 0.0, pairs[0].cost_diff);
     45    double cost_combo;
     46    *tmp = out[idx1];
     47    FN(HistogramAddHistogram)(tmp, &out[idx2]);
     48    cost_combo = FN(BrotliPopulationCost)(tmp);
     49    if (cost_combo < threshold - p.cost_diff) {
     50      p.cost_combo = cost_combo;
     51      is_good_pair = BROTLI_TRUE;
     52    }
     53  }
     54  if (is_good_pair) {
     55    p.cost_diff += p.cost_combo;
     56    if (*num_pairs > 0 && HistogramPairIsLess(&pairs[0], &p)) {
     57      /* Replace the top of the queue if needed. */
     58      if (*num_pairs < max_num_pairs) {
     59        pairs[*num_pairs] = pairs[0];
     60        ++(*num_pairs);
     61      }
     62      pairs[0] = p;
     63    } else if (*num_pairs < max_num_pairs) {
     64      pairs[*num_pairs] = p;
     65      ++(*num_pairs);
     66    }
     67  }
     68 })
     69 
     70 BROTLI_INTERNAL size_t FN(BrotliHistogramCombine)(HistogramType* out,
     71                                                  HistogramType* tmp,
     72                                                  uint32_t* cluster_size,
     73                                                  uint32_t* symbols,
     74                                                  uint32_t* clusters,
     75                                                  HistogramPair* pairs,
     76                                                  size_t num_clusters,
     77                                                  size_t symbols_size,
     78                                                  size_t max_clusters,
     79                                                  size_t max_num_pairs) CODE({
     80  double cost_diff_threshold = 0.0;
     81  size_t min_cluster_size = 1;
     82  size_t num_pairs = 0;
     83 
     84  {
     85    /* We maintain a vector of histogram pairs, with the property that the pair
     86       with the maximum bit cost reduction is the first. */
     87    size_t idx1;
     88    for (idx1 = 0; idx1 < num_clusters; ++idx1) {
     89      size_t idx2;
     90      for (idx2 = idx1 + 1; idx2 < num_clusters; ++idx2) {
     91        FN(BrotliCompareAndPushToQueue)(out, tmp, cluster_size, clusters[idx1],
     92            clusters[idx2], max_num_pairs, &pairs[0], &num_pairs);
     93      }
     94    }
     95  }
     96 
     97  while (num_clusters > min_cluster_size) {
     98    uint32_t best_idx1;
     99    uint32_t best_idx2;
    100    size_t i;
    101    if (pairs[0].cost_diff >= cost_diff_threshold) {
    102      cost_diff_threshold = 1e99;
    103      min_cluster_size = max_clusters;
    104      continue;
    105    }
    106    /* Take the best pair from the top of heap. */
    107    best_idx1 = pairs[0].idx1;
    108    best_idx2 = pairs[0].idx2;
    109    FN(HistogramAddHistogram)(&out[best_idx1], &out[best_idx2]);
    110    out[best_idx1].bit_cost_ = pairs[0].cost_combo;
    111    cluster_size[best_idx1] += cluster_size[best_idx2];
    112    for (i = 0; i < symbols_size; ++i) {
    113      if (symbols[i] == best_idx2) {
    114        symbols[i] = best_idx1;
    115      }
    116    }
    117    for (i = 0; i < num_clusters; ++i) {
    118      if (clusters[i] == best_idx2) {
    119        memmove(&clusters[i], &clusters[i + 1],
    120                (num_clusters - i - 1) * sizeof(clusters[0]));
    121        break;
    122      }
    123    }
    124    --num_clusters;
    125    {
    126      /* Remove pairs intersecting the just combined best pair. */
    127      size_t copy_to_idx = 0;
    128      for (i = 0; i < num_pairs; ++i) {
    129        HistogramPair* p = &pairs[i];
    130        if (p->idx1 == best_idx1 || p->idx2 == best_idx1 ||
    131            p->idx1 == best_idx2 || p->idx2 == best_idx2) {
    132          /* Remove invalid pair from the queue. */
    133          continue;
    134        }
    135        if (HistogramPairIsLess(&pairs[0], p)) {
    136          /* Replace the top of the queue if needed. */
    137          HistogramPair front = pairs[0];
    138          pairs[0] = *p;
    139          pairs[copy_to_idx] = front;
    140        } else {
    141          pairs[copy_to_idx] = *p;
    142        }
    143        ++copy_to_idx;
    144      }
    145      num_pairs = copy_to_idx;
    146    }
    147 
    148    /* Push new pairs formed with the combined histogram to the heap. */
    149    for (i = 0; i < num_clusters; ++i) {
    150      FN(BrotliCompareAndPushToQueue)(out, tmp, cluster_size, best_idx1,
    151          clusters[i], max_num_pairs, &pairs[0], &num_pairs);
    152    }
    153  }
    154  return num_clusters;
    155 })
    156 
    157 /* What is the bit cost of moving histogram from cur_symbol to candidate. */
    158 BROTLI_INTERNAL double FN(BrotliHistogramBitCostDistance)(
    159    const HistogramType* histogram, const HistogramType* candidate,
    160    HistogramType* tmp) CODE({
    161  if (histogram->total_count_ == 0) {
    162    return 0.0;
    163  } else {
    164    *tmp = *histogram;
    165    FN(HistogramAddHistogram)(tmp, candidate);
    166    return FN(BrotliPopulationCost)(tmp) - candidate->bit_cost_;
    167  }
    168 })
    169 
    170 /* Find the best 'out' histogram for each of the 'in' histograms.
    171   When called, clusters[0..num_clusters) contains the unique values from
    172   symbols[0..in_size), but this property is not preserved in this function.
    173   Note: we assume that out[]->bit_cost_ is already up-to-date. */
    174 BROTLI_INTERNAL void FN(BrotliHistogramRemap)(const HistogramType* in,
    175    size_t in_size, const uint32_t* clusters, size_t num_clusters,
    176    HistogramType* out, HistogramType* tmp, uint32_t* symbols) CODE({
    177  size_t i;
    178  for (i = 0; i < in_size; ++i) {
    179    uint32_t best_out = i == 0 ? symbols[0] : symbols[i - 1];
    180    double best_bits =
    181        FN(BrotliHistogramBitCostDistance)(&in[i], &out[best_out], tmp);
    182    size_t j;
    183    for (j = 0; j < num_clusters; ++j) {
    184      const double cur_bits =
    185          FN(BrotliHistogramBitCostDistance)(&in[i], &out[clusters[j]], tmp);
    186      if (cur_bits < best_bits) {
    187        best_bits = cur_bits;
    188        best_out = clusters[j];
    189      }
    190    }
    191    symbols[i] = best_out;
    192  }
    193 
    194  /* Recompute each out based on raw and symbols. */
    195  for (i = 0; i < num_clusters; ++i) {
    196    FN(HistogramClear)(&out[clusters[i]]);
    197  }
    198  for (i = 0; i < in_size; ++i) {
    199    FN(HistogramAddHistogram)(&out[symbols[i]], &in[i]);
    200  }
    201 })
    202 
    203 /* Reorders elements of the out[0..length) array and changes values in
    204   symbols[0..length) array in the following way:
    205     * when called, symbols[] contains indexes into out[], and has N unique
    206       values (possibly N < length)
    207     * on return, symbols'[i] = f(symbols[i]) and
    208                  out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
    209       where f is a bijection between the range of symbols[] and [0..N), and
    210       the first occurrences of values in symbols'[i] come in consecutive
    211       increasing order.
    212   Returns N, the number of unique values in symbols[]. */
    213 BROTLI_INTERNAL size_t FN(BrotliHistogramReindex)(MemoryManager* m,
    214    HistogramType* out, uint32_t* symbols, size_t length) CODE({
    215  static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX;
    216  uint32_t* new_index = BROTLI_ALLOC(m, uint32_t, length);
    217  uint32_t next_index;
    218  HistogramType* tmp;
    219  size_t i;
    220  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return 0;
    221  for (i = 0; i < length; ++i) {
    222      new_index[i] = kInvalidIndex;
    223  }
    224  next_index = 0;
    225  for (i = 0; i < length; ++i) {
    226    if (new_index[symbols[i]] == kInvalidIndex) {
    227      new_index[symbols[i]] = next_index;
    228      ++next_index;
    229    }
    230  }
    231  /* TODO(eustas): by using idea of "cycle-sort" we can avoid allocation of
    232     tmp and reduce the number of copying by the factor of 2. */
    233  tmp = BROTLI_ALLOC(m, HistogramType, next_index);
    234  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(tmp)) return 0;
    235  next_index = 0;
    236  for (i = 0; i < length; ++i) {
    237    if (new_index[symbols[i]] == next_index) {
    238      tmp[next_index] = out[symbols[i]];
    239      ++next_index;
    240    }
    241    symbols[i] = new_index[symbols[i]];
    242  }
    243  BROTLI_FREE(m, new_index);
    244  for (i = 0; i < next_index; ++i) {
    245    out[i] = tmp[i];
    246  }
    247  BROTLI_FREE(m, tmp);
    248  return next_index;
    249 })
    250 
    251 BROTLI_INTERNAL void FN(BrotliClusterHistograms)(
    252    MemoryManager* m, const HistogramType* in, const size_t in_size,
    253    size_t max_histograms, HistogramType* out, size_t* out_size,
    254    uint32_t* histogram_symbols) CODE({
    255  uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, in_size);
    256  uint32_t* clusters = BROTLI_ALLOC(m, uint32_t, in_size);
    257  size_t num_clusters = 0;
    258  const size_t max_input_histograms = 64;
    259  size_t pairs_capacity = max_input_histograms * max_input_histograms / 2;
    260  /* For the first pass of clustering, we allow all pairs. */
    261  HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity + 1);
    262  /* TODO(eustas): move to "persistent" arena? */
    263  HistogramType* tmp = BROTLI_ALLOC(m, HistogramType, 1);
    264  size_t i;
    265 
    266  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(cluster_size) ||
    267      BROTLI_IS_NULL(clusters) || BROTLI_IS_NULL(pairs)|| BROTLI_IS_NULL(tmp)) {
    268    return;
    269  }
    270 
    271  for (i = 0; i < in_size; ++i) {
    272    cluster_size[i] = 1;
    273  }
    274 
    275  for (i = 0; i < in_size; ++i) {
    276    out[i] = in[i];
    277    out[i].bit_cost_ = FN(BrotliPopulationCost)(&in[i]);
    278    histogram_symbols[i] = (uint32_t)i;
    279  }
    280 
    281  for (i = 0; i < in_size; i += max_input_histograms) {
    282    size_t num_to_combine =
    283        BROTLI_MIN(size_t, in_size - i, max_input_histograms);
    284    size_t num_new_clusters;
    285    size_t j;
    286    for (j = 0; j < num_to_combine; ++j) {
    287      clusters[num_clusters + j] = (uint32_t)(i + j);
    288    }
    289    num_new_clusters =
    290        FN(BrotliHistogramCombine)(out, tmp, cluster_size,
    291                                   &histogram_symbols[i],
    292                                   &clusters[num_clusters], pairs,
    293                                   num_to_combine, num_to_combine,
    294                                   max_histograms, pairs_capacity);
    295    num_clusters += num_new_clusters;
    296  }
    297 
    298  {
    299    /* For the second pass, we limit the total number of histogram pairs.
    300       After this limit is reached, we only keep searching for the best pair. */
    301    size_t max_num_pairs = BROTLI_MIN(size_t,
    302        64 * num_clusters, (num_clusters / 2) * num_clusters);
    303    BROTLI_ENSURE_CAPACITY(
    304        m, HistogramPair, pairs, pairs_capacity, max_num_pairs + 1);
    305    if (BROTLI_IS_OOM(m)) return;
    306 
    307    /* Collapse similar histograms. */
    308    num_clusters = FN(BrotliHistogramCombine)(out, tmp, cluster_size,
    309                                              histogram_symbols, clusters,
    310                                              pairs, num_clusters, in_size,
    311                                              max_histograms, max_num_pairs);
    312  }
    313  BROTLI_FREE(m, pairs);
    314  BROTLI_FREE(m, cluster_size);
    315  /* Find the optimal map from original histograms to the final ones. */
    316  FN(BrotliHistogramRemap)(in, in_size, clusters, num_clusters,
    317                           out, tmp, histogram_symbols);
    318  BROTLI_FREE(m, tmp);
    319  BROTLI_FREE(m, clusters);
    320  /* Convert the context map to a canonical form. */
    321  *out_size = FN(BrotliHistogramReindex)(m, out, histogram_symbols, in_size);
    322  if (BROTLI_IS_OOM(m)) return;
    323 })
    324 
    325 #undef HistogramType