tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

BloomFilter.h (10684B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /*
      8 * A counting Bloom filter implementation.  This allows consumers to
      9 * do fast probabilistic "is item X in set Y?" testing which will
     10 * never answer "no" when the correct answer is "yes" (but might
     11 * incorrectly answer "yes" when the correct answer is "no").
     12 */
     13 
     14 #ifndef mozilla_BloomFilter_h
     15 #define mozilla_BloomFilter_h
     16 
     17 #include "mozilla/Attributes.h"
     18 #include "mozilla/Likely.h"
     19 
     20 #include <climits>
     21 #include <cstdint>
     22 #include <cstring>
     23 
     24 namespace mozilla {
     25 
     26 /*
     27 * This class implements a classic Bloom filter as described at
     28 * <http://en.wikipedia.org/wiki/Bloom_filter>.  This allows quick
     29 * probabilistic answers to the question "is object X in set Y?" where the
     30 * contents of Y might not be time-invariant.  The probabilistic nature of the
     31 * test means that sometimes the answer will be "yes" when it should be "no".
     32 * If the answer is "no", then X is guaranteed not to be in Y.
     33 *
     34 * The filter is parametrized on KeySize, which is the size of the key
     35 * generated by each of hash functions used by the filter, in bits,
     36 * and the type of object T being added and removed.  T must implement
     37 * a |uint32_t hash() const| method which returns a uint32_t hash key
     38 * that will be used to generate the two separate hash functions for
     39 * the Bloom filter.  This hash key MUST be well-distributed for good
     40 * results!  KeySize is not allowed to be larger than 16.
     41 *
     42 * The filter uses exactly 2**KeySize bit (2**(KeySize-3) bytes) of memory.
     43 * From now on we will refer to the memory used by the filter as M.
     44 *
     45 * The expected rate of incorrect "yes" answers depends on M and on
     46 * the number N of objects in set Y.  As long as N is small compared
     47 * to M, the rate of such answers is expected to be approximately
     48 * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
     49 * elements then using a KeySize of 12 gives a reasonably low
     50 * incorrect answer rate.  A KeySize of 12 has the additional benefit
     51 * of using exactly one page for the filter in typical hardware
     52 * configurations.
     53 */
     54 template <unsigned KeySize, class T>
     55 class BitBloomFilter {
     56  /*
     57   * A counting Bloom filter with 8-bit counters.  For now we assume
     58   * that having two hash functions is enough, but we may revisit that
     59   * decision later.
     60   *
     61   * The filter uses an array with 2**KeySize entries.
     62   *
     63   * Assuming a well-distributed hash function, a Bloom filter with
     64   * array size M containing N elements and
     65   * using k hash function has expected false positive rate exactly
     66   *
     67   * $  (1 - (1 - 1/M)^{kN})^k  $
     68   *
     69   * because each array slot has a
     70   *
     71   * $  (1 - 1/M)^{kN}  $
     72   *
     73   * chance of being 0, and the expected false positive rate is the
     74   * probability that all of the k hash functions will hit a nonzero
     75   * slot.
     76   *
     77   * For reasonable assumptions (M large, kN large, which should both
     78   * hold if we're worried about false positives) about M and kN this
     79   * becomes approximately
     80   *
     81   * $$  (1 - \exp(-kN/M))^k   $$
     82   *
     83   * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
     84   * or in other words
     85   *
     86   * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
     87   *
     88   * where r is the false positive rate.  This can be used to compute
     89   * the desired KeySize for a given load N and false positive rate r.
     90   *
     91   * If N/M is assumed small, then the false positive rate can
     92   * further be approximated as 4*N^2/M^2.  So increasing KeySize by
     93   * 1, which doubles M, reduces the false positive rate by about a
     94   * factor of 4, and a false positive rate of 1% corresponds to
     95   * about M/N == 20.
     96   *
     97   * What this means in practice is that for a few hundred keys using a
     98   * KeySize of 12 gives false positive rates on the order of 0.25-4%.
     99   *
    100   * Similarly, using a KeySize of 10 would lead to a 4% false
    101   * positive rate for N == 100 and to quite bad false positive
    102   * rates for larger N.
    103   */
    104 public:
    105  BitBloomFilter() {
    106    static_assert(KeySize >= 3, "KeySize too small");
    107    static_assert(KeySize <= kKeyShift, "KeySize too big");
    108 
    109    // XXX: Should we have a custom operator new using calloc instead and
    110    // require that we're allocated via the operator?
    111    clear();
    112  }
    113 
    114  /*
    115   * Clear the filter.  This should be done before reusing it.
    116   */
    117  void clear();
    118 
    119  /*
    120   * Add an item to the filter.
    121   */
    122  void add(const T* aValue);
    123 
    124  /*
    125   * Check whether the filter might contain an item.  This can
    126   * sometimes return true even if the item is not in the filter,
    127   * but will never return false for items that are actually in the
    128   * filter.
    129   */
    130  bool mightContain(const T* aValue) const;
    131 
    132  /*
    133   * Methods for add/contain when we already have a hash computed
    134   */
    135  void add(uint32_t aHash);
    136  bool mightContain(uint32_t aHash) const;
    137 
    138 private:
    139  static const size_t kArraySize = (1 << (KeySize - 3));
    140  static const uint32_t kKeyMask = (1 << KeySize) - 1;
    141  static const uint32_t kKeyShift = 16;
    142 
    143  static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
    144  static uint32_t hash2(uint32_t aHash) {
    145    return (aHash >> kKeyShift) & kKeyMask;
    146  }
    147 
    148  bool getSlot(uint32_t aHash) const {
    149    uint32_t index = aHash / 8;
    150    uint8_t shift = aHash % 8;
    151    uint8_t mask = 1 << shift;
    152    return !!(mBits[index] & mask);
    153  }
    154 
    155  void setSlot(uint32_t aHash) {
    156    uint32_t index = aHash / 8;
    157    uint8_t shift = aHash % 8;
    158    uint8_t bit = 1 << shift;
    159    mBits[index] |= bit;
    160  }
    161 
    162  bool getFirstSlot(uint32_t aHash) const { return getSlot(hash1(aHash)); }
    163  bool getSecondSlot(uint32_t aHash) const { return getSlot(hash2(aHash)); }
    164 
    165  void setFirstSlot(uint32_t aHash) { setSlot(hash1(aHash)); }
    166  void setSecondSlot(uint32_t aHash) { setSlot(hash2(aHash)); }
    167 
    168  uint8_t mBits[kArraySize];
    169 };
    170 
    171 template <unsigned KeySize, class T>
    172 inline void BitBloomFilter<KeySize, T>::clear() {
    173  memset(mBits, 0, kArraySize);
    174 }
    175 
    176 template <unsigned KeySize, class T>
    177 inline void BitBloomFilter<KeySize, T>::add(uint32_t aHash) {
    178  setFirstSlot(aHash);
    179  setSecondSlot(aHash);
    180 }
    181 
    182 template <unsigned KeySize, class T>
    183 MOZ_ALWAYS_INLINE void BitBloomFilter<KeySize, T>::add(const T* aValue) {
    184  uint32_t hash = aValue->hash();
    185  return add(hash);
    186 }
    187 
    188 template <unsigned KeySize, class T>
    189 MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
    190    uint32_t aHash) const {
    191  // Check that all the slots for this hash contain something
    192  return getFirstSlot(aHash) && getSecondSlot(aHash);
    193 }
    194 
    195 template <unsigned KeySize, class T>
    196 MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
    197    const T* aValue) const {
    198  uint32_t hash = aValue->hash();
    199  return mightContain(hash);
    200 }
    201 
    202 /*
    203 * This class implements a counting Bloom filter as described at
    204 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
    205 * 8-bit counters.
    206 *
    207 * Compared to `BitBloomFilter`, this class supports 'remove' operation.
    208 *
    209 * The filter uses exactly 2**KeySize bytes of memory.
    210 *
    211 * Other characteristics are the same as BitBloomFilter.
    212 */
    213 template <unsigned KeySize, class T>
    214 class CountingBloomFilter {
    215 public:
    216  CountingBloomFilter() {
    217    static_assert(KeySize <= kKeyShift, "KeySize too big");
    218 
    219    clear();
    220  }
    221 
    222  /*
    223   * Clear the filter.  This should be done before reusing it, because
    224   * just removing all items doesn't clear counters that hit the upper
    225   * bound.
    226   */
    227  void clear();
    228 
    229  /*
    230   * Add an item to the filter.
    231   */
    232  void add(const T* aValue);
    233 
    234  /*
    235   * Remove an item from the filter.
    236   */
    237  void remove(const T* aValue);
    238 
    239  /*
    240   * Check whether the filter might contain an item.  This can
    241   * sometimes return true even if the item is not in the filter,
    242   * but will never return false for items that are actually in the
    243   * filter.
    244   */
    245  bool mightContain(const T* aValue) const;
    246 
    247  /*
    248   * Methods for add/remove/contain when we already have a hash computed
    249   */
    250  void add(uint32_t aHash);
    251  void remove(uint32_t aHash);
    252  bool mightContain(uint32_t aHash) const;
    253 
    254 private:
    255  static const size_t kArraySize = (1 << KeySize);
    256  static const uint32_t kKeyMask = (1 << KeySize) - 1;
    257  static const uint32_t kKeyShift = 16;
    258 
    259  static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
    260  static uint32_t hash2(uint32_t aHash) {
    261    return (aHash >> kKeyShift) & kKeyMask;
    262  }
    263 
    264  uint8_t& firstSlot(uint32_t aHash) { return mCounters[hash1(aHash)]; }
    265  uint8_t& secondSlot(uint32_t aHash) { return mCounters[hash2(aHash)]; }
    266 
    267  const uint8_t& firstSlot(uint32_t aHash) const {
    268    return mCounters[hash1(aHash)];
    269  }
    270  const uint8_t& secondSlot(uint32_t aHash) const {
    271    return mCounters[hash2(aHash)];
    272  }
    273 
    274  static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
    275 
    276  uint8_t mCounters[kArraySize];
    277 };
    278 
    279 template <unsigned KeySize, class T>
    280 inline void CountingBloomFilter<KeySize, T>::clear() {
    281  memset(mCounters, 0, kArraySize);
    282 }
    283 
    284 template <unsigned KeySize, class T>
    285 inline void CountingBloomFilter<KeySize, T>::add(uint32_t aHash) {
    286  uint8_t& slot1 = firstSlot(aHash);
    287  if (MOZ_LIKELY(!full(slot1))) {
    288    ++slot1;
    289  }
    290  uint8_t& slot2 = secondSlot(aHash);
    291  if (MOZ_LIKELY(!full(slot2))) {
    292    ++slot2;
    293  }
    294 }
    295 
    296 template <unsigned KeySize, class T>
    297 MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::add(const T* aValue) {
    298  uint32_t hash = aValue->hash();
    299  return add(hash);
    300 }
    301 
    302 template <unsigned KeySize, class T>
    303 inline void CountingBloomFilter<KeySize, T>::remove(uint32_t aHash) {
    304  // If the slots are full, we don't know whether we bumped them to be
    305  // there when we added or not, so just leave them full.
    306  uint8_t& slot1 = firstSlot(aHash);
    307  if (MOZ_LIKELY(!full(slot1))) {
    308    --slot1;
    309  }
    310  uint8_t& slot2 = secondSlot(aHash);
    311  if (MOZ_LIKELY(!full(slot2))) {
    312    --slot2;
    313  }
    314 }
    315 
    316 template <unsigned KeySize, class T>
    317 MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::remove(
    318    const T* aValue) {
    319  uint32_t hash = aValue->hash();
    320  remove(hash);
    321 }
    322 
    323 template <unsigned KeySize, class T>
    324 MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
    325    uint32_t aHash) const {
    326  // Check that all the slots for this hash contain something
    327  return firstSlot(aHash) && secondSlot(aHash);
    328 }
    329 
    330 template <unsigned KeySize, class T>
    331 MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
    332    const T* aValue) const {
    333  uint32_t hash = aValue->hash();
    334  return mightContain(hash);
    335 }
    336 
    337 }  // namespace mozilla
    338 
    339 #endif /* mozilla_BloomFilter_h */