tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

bloom.rs (11810B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
      4 
      5 //! Counting and non-counting Bloom filters tuned for use as ancestor filters
      6 //! for selector matching.
      7 
      8 use std::fmt::{self, Debug};
      9 
     10 // The top 8 bits of the 32-bit hash value are not used by the bloom filter.
     11 // Consumers may rely on this to pack hashes more efficiently.
     12 pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
     13 const KEY_SIZE: usize = 12;
     14 
     15 const ARRAY_SIZE: usize = 1 << KEY_SIZE;
     16 const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
     17 
     18 /// A counting Bloom filter with 8-bit counters.
     19 pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
     20 
     21 /// A counting Bloom filter with parameterized storage to handle
     22 /// counters of different sizes.  For now we assume that having two hash
     23 /// functions is enough, but we may revisit that decision later.
     24 ///
     25 /// The filter uses an array with 2**KeySize entries.
     26 ///
     27 /// Assuming a well-distributed hash function, a Bloom filter with
     28 /// array size M containing N elements and
     29 /// using k hash function has expected false positive rate exactly
     30 ///
     31 /// $  (1 - (1 - 1/M)^{kN})^k  $
     32 ///
     33 /// because each array slot has a
     34 ///
     35 /// $  (1 - 1/M)^{kN}  $
     36 ///
     37 /// chance of being 0, and the expected false positive rate is the
     38 /// probability that all of the k hash functions will hit a nonzero
     39 /// slot.
     40 ///
     41 /// For reasonable assumptions (M large, kN large, which should both
     42 /// hold if we're worried about false positives) about M and kN this
     43 /// becomes approximately
     44 ///
     45 /// $$  (1 - \exp(-kN/M))^k   $$
     46 ///
     47 /// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
     48 /// or in other words
     49 ///
     50 /// $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
     51 ///
     52 /// where r is the false positive rate.  This can be used to compute
     53 /// the desired KeySize for a given load N and false positive rate r.
     54 ///
     55 /// If N/M is assumed small, then the false positive rate can
     56 /// further be approximated as 4*N^2/M^2.  So increasing KeySize by
     57 /// 1, which doubles M, reduces the false positive rate by about a
     58 /// factor of 4, and a false positive rate of 1% corresponds to
     59 /// about M/N == 20.
     60 ///
     61 /// What this means in practice is that for a few hundred keys using a
     62 /// KeySize of 12 gives false positive rates on the order of 0.25-4%.
     63 ///
     64 /// Similarly, using a KeySize of 10 would lead to a 4% false
     65 /// positive rate for N == 100 and to quite bad false positive
     66 /// rates for larger N.
     67 #[derive(Clone, Default)]
     68 pub struct CountingBloomFilter<S>
     69 where
     70    S: BloomStorage,
     71 {
     72    storage: S,
     73 }
     74 
     75 impl<S> CountingBloomFilter<S>
     76 where
     77    S: BloomStorage,
     78 {
     79    /// Creates a new bloom filter.
     80    #[inline]
     81    pub fn new() -> Self {
     82        Default::default()
     83    }
     84 
     85    #[inline]
     86    pub fn clear(&mut self) {
     87        self.storage = Default::default();
     88    }
     89 
     90    // Slow linear accessor to make sure the bloom filter is zeroed. This should
     91    // never be used in release builds.
     92    #[cfg(debug_assertions)]
     93    pub fn is_zeroed(&self) -> bool {
     94        self.storage.is_zeroed()
     95    }
     96 
     97    #[cfg(not(debug_assertions))]
     98    pub fn is_zeroed(&self) -> bool {
     99        unreachable!()
    100    }
    101 
    102    /// Inserts an item with a particular hash into the bloom filter.
    103    #[inline]
    104    pub fn insert_hash(&mut self, hash: u32) {
    105        self.storage.adjust_first_slot(hash, true);
    106        self.storage.adjust_second_slot(hash, true);
    107    }
    108 
    109    /// Removes an item with a particular hash from the bloom filter.
    110    #[inline]
    111    pub fn remove_hash(&mut self, hash: u32) {
    112        self.storage.adjust_first_slot(hash, false);
    113        self.storage.adjust_second_slot(hash, false);
    114    }
    115 
    116    /// Check whether the filter might contain an item with the given hash.
    117    /// This can sometimes return true even if the item is not in the filter,
    118    /// but will never return false for items that are actually in the filter.
    119    #[inline]
    120    pub fn might_contain_hash(&self, hash: u32) -> bool {
    121        !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)
    122    }
    123 }
    124 
    125 impl<S> Debug for CountingBloomFilter<S>
    126 where
    127    S: BloomStorage,
    128 {
    129    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    130        let mut slots_used = 0;
    131        for i in 0..ARRAY_SIZE {
    132            if !self.storage.slot_is_empty(i) {
    133                slots_used += 1;
    134            }
    135        }
    136        write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
    137    }
    138 }
    139 
    140 pub trait BloomStorage: Clone + Default {
    141    fn slot_is_empty(&self, index: usize) -> bool;
    142    fn adjust_slot(&mut self, index: usize, increment: bool);
    143    fn is_zeroed(&self) -> bool;
    144 
    145    #[inline]
    146    fn first_slot_is_empty(&self, hash: u32) -> bool {
    147        self.slot_is_empty(Self::first_slot_index(hash))
    148    }
    149 
    150    #[inline]
    151    fn second_slot_is_empty(&self, hash: u32) -> bool {
    152        self.slot_is_empty(Self::second_slot_index(hash))
    153    }
    154 
    155    #[inline]
    156    fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
    157        self.adjust_slot(Self::first_slot_index(hash), increment)
    158    }
    159 
    160    #[inline]
    161    fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
    162        self.adjust_slot(Self::second_slot_index(hash), increment)
    163    }
    164 
    165    #[inline]
    166    fn first_slot_index(hash: u32) -> usize {
    167        hash1(hash) as usize
    168    }
    169 
    170    #[inline]
    171    fn second_slot_index(hash: u32) -> usize {
    172        hash2(hash) as usize
    173    }
    174 }
    175 
    176 /// Storage class for a CountingBloomFilter that has 8-bit counters.
    177 pub struct BloomStorageU8 {
    178    counters: [u8; ARRAY_SIZE],
    179 }
    180 
    181 impl BloomStorage for BloomStorageU8 {
    182    #[inline]
    183    fn adjust_slot(&mut self, index: usize, increment: bool) {
    184        let slot = &mut self.counters[index];
    185        if *slot != 0xff {
    186            // full
    187            if increment {
    188                *slot += 1;
    189            } else {
    190                *slot -= 1;
    191            }
    192        }
    193    }
    194 
    195    #[inline]
    196    fn slot_is_empty(&self, index: usize) -> bool {
    197        self.counters[index] == 0
    198    }
    199 
    200    #[inline]
    201    fn is_zeroed(&self) -> bool {
    202        self.counters.iter().all(|x| *x == 0)
    203    }
    204 }
    205 
    206 impl Default for BloomStorageU8 {
    207    fn default() -> Self {
    208        BloomStorageU8 {
    209            counters: [0; ARRAY_SIZE],
    210        }
    211    }
    212 }
    213 
    214 impl Clone for BloomStorageU8 {
    215    fn clone(&self) -> Self {
    216        BloomStorageU8 {
    217            counters: self.counters,
    218        }
    219    }
    220 }
    221 
    222 /// Storage class for a CountingBloomFilter that has 1-bit counters.
    223 pub struct BloomStorageBool {
    224    counters: [u8; ARRAY_SIZE / 8],
    225 }
    226 
    227 impl BloomStorage for BloomStorageBool {
    228    #[inline]
    229    fn adjust_slot(&mut self, index: usize, increment: bool) {
    230        let bit = 1 << (index % 8);
    231        let byte = &mut self.counters[index / 8];
    232 
    233        // Since we have only one bit for storage, decrementing it
    234        // should never do anything.  Assert against an accidental
    235        // decrementing of a bit that was never set.
    236        assert!(
    237            increment || (*byte & bit) != 0,
    238            "should not decrement if slot is already false"
    239        );
    240 
    241        if increment {
    242            *byte |= bit;
    243        }
    244    }
    245 
    246    #[inline]
    247    fn slot_is_empty(&self, index: usize) -> bool {
    248        let bit = 1 << (index % 8);
    249        (self.counters[index / 8] & bit) == 0
    250    }
    251 
    252    #[inline]
    253    fn is_zeroed(&self) -> bool {
    254        self.counters.iter().all(|x| *x == 0)
    255    }
    256 }
    257 
    258 impl Default for BloomStorageBool {
    259    fn default() -> Self {
    260        BloomStorageBool {
    261            counters: [0; ARRAY_SIZE / 8],
    262        }
    263    }
    264 }
    265 
    266 impl Clone for BloomStorageBool {
    267    fn clone(&self) -> Self {
    268        BloomStorageBool {
    269            counters: self.counters,
    270        }
    271    }
    272 }
    273 
    274 #[inline]
    275 fn hash1(hash: u32) -> u32 {
    276    hash & KEY_MASK
    277 }
    278 
    279 #[inline]
    280 fn hash2(hash: u32) -> u32 {
    281    (hash >> KEY_SIZE) & KEY_MASK
    282 }
    283 
    284 #[test]
    285 fn create_and_insert_some_stuff() {
    286    use rustc_hash::FxHasher;
    287    use std::hash::{Hash, Hasher};
    288    use std::mem::transmute;
    289 
    290    fn hash_as_str(i: usize) -> u32 {
    291        let mut hasher = FxHasher::default();
    292        let s = i.to_string();
    293        s.hash(&mut hasher);
    294        let hash: u64 = hasher.finish();
    295        (hash >> 32) as u32 ^ (hash as u32)
    296    }
    297 
    298    let mut bf = BloomFilter::new();
    299 
    300    // Statically assert that ARRAY_SIZE is a multiple of 8, which
    301    // BloomStorageBool relies on.
    302    unsafe {
    303        transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
    304    }
    305 
    306    for i in 0_usize..1000 {
    307        bf.insert_hash(hash_as_str(i));
    308    }
    309 
    310    for i in 0_usize..1000 {
    311        assert!(bf.might_contain_hash(hash_as_str(i)));
    312    }
    313 
    314    let false_positives = (1001_usize..2000)
    315        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
    316        .count();
    317 
    318    assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%.
    319 
    320    for i in 0_usize..100 {
    321        bf.remove_hash(hash_as_str(i));
    322    }
    323 
    324    for i in 100_usize..1000 {
    325        assert!(bf.might_contain_hash(hash_as_str(i)));
    326    }
    327 
    328    let false_positives = (0_usize..100)
    329        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
    330        .count();
    331 
    332    assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
    333 
    334    bf.clear();
    335 
    336    for i in 0_usize..2000 {
    337        assert!(!bf.might_contain_hash(hash_as_str(i)));
    338    }
    339 }
    340 
    341 #[cfg(feature = "bench")]
    342 #[cfg(test)]
    343 mod bench {
    344    extern crate test;
    345    use super::BloomFilter;
    346 
    347    #[derive(Default)]
    348    struct HashGenerator(u32);
    349 
    350    impl HashGenerator {
    351        fn next(&mut self) -> u32 {
    352            // Each hash is split into two twelve-bit segments, which are used
    353            // as an index into an array. We increment each by 64 so that we
    354            // hit the next cache line, and then another 1 so that our wrapping
    355            // behavior leads us to different entries each time.
    356            //
    357            // Trying to simulate cold caches is rather difficult with the cargo
    358            // benchmarking setup, so it may all be moot depending on the number
    359            // of iterations that end up being run. But we might as well.
    360            self.0 += (65) + (65 << super::KEY_SIZE);
    361            self.0
    362        }
    363    }
    364 
    365    #[bench]
    366    fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
    367        b.iter(|| {
    368            let mut gen1 = HashGenerator::default();
    369            let mut gen2 = HashGenerator::default();
    370            let mut bf = BloomFilter::new();
    371            for _ in 0_usize..1000 {
    372                bf.insert_hash(gen1.next());
    373            }
    374            for _ in 0_usize..100 {
    375                bf.remove_hash(gen2.next());
    376            }
    377            for _ in 100_usize..200 {
    378                test::black_box(bf.might_contain_hash(gen2.next()));
    379            }
    380        });
    381    }
    382 
    383    #[bench]
    384    fn might_contain_10(b: &mut test::Bencher) {
    385        let bf = BloomFilter::new();
    386        let mut gen = HashGenerator::default();
    387        b.iter(|| {
    388            for _ in 0..10 {
    389                test::black_box(bf.might_contain_hash(gen.next()));
    390            }
    391        });
    392    }
    393 
    394    #[bench]
    395    fn clear(b: &mut test::Bencher) {
    396        let mut bf = Box::new(BloomFilter::new());
    397        b.iter(|| test::black_box(&mut bf).clear());
    398    }
    399 
    400    #[bench]
    401    fn insert_10(b: &mut test::Bencher) {
    402        let mut bf = BloomFilter::new();
    403        let mut gen = HashGenerator::default();
    404        b.iter(|| {
    405            for _ in 0..10 {
    406                test::black_box(bf.insert_hash(gen.next()));
    407            }
    408        });
    409    }
    410 
    411    #[bench]
    412    fn remove_10(b: &mut test::Bencher) {
    413        let mut bf = BloomFilter::new();
    414        let mut gen = HashGenerator::default();
    415        // Note: this will underflow, and that's ok.
    416        b.iter(|| {
    417            for _ in 0..10 {
    418                bf.remove_hash(gen.next())
    419            }
    420        });
    421    }
    422 }