bloom.rs (11810B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ 4 5 //! Counting and non-counting Bloom filters tuned for use as ancestor filters 6 //! for selector matching. 7 8 use std::fmt::{self, Debug}; 9 10 // The top 8 bits of the 32-bit hash value are not used by the bloom filter. 11 // Consumers may rely on this to pack hashes more efficiently. 12 pub const BLOOM_HASH_MASK: u32 = 0x00ffffff; 13 const KEY_SIZE: usize = 12; 14 15 const ARRAY_SIZE: usize = 1 << KEY_SIZE; 16 const KEY_MASK: u32 = (1 << KEY_SIZE) - 1; 17 18 /// A counting Bloom filter with 8-bit counters. 19 pub type BloomFilter = CountingBloomFilter<BloomStorageU8>; 20 21 /// A counting Bloom filter with parameterized storage to handle 22 /// counters of different sizes. For now we assume that having two hash 23 /// functions is enough, but we may revisit that decision later. 24 /// 25 /// The filter uses an array with 2**KeySize entries. 26 /// 27 /// Assuming a well-distributed hash function, a Bloom filter with 28 /// array size M containing N elements and 29 /// using k hash function has expected false positive rate exactly 30 /// 31 /// $ (1 - (1 - 1/M)^{kN})^k $ 32 /// 33 /// because each array slot has a 34 /// 35 /// $ (1 - 1/M)^{kN} $ 36 /// 37 /// chance of being 0, and the expected false positive rate is the 38 /// probability that all of the k hash functions will hit a nonzero 39 /// slot. 40 /// 41 /// For reasonable assumptions (M large, kN large, which should both 42 /// hold if we're worried about false positives) about M and kN this 43 /// becomes approximately 44 /// 45 /// $$ (1 - \exp(-kN/M))^k $$ 46 /// 47 /// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, 48 /// or in other words 49 /// 50 /// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ 51 /// 52 /// where r is the false positive rate. This can be used to compute 53 /// the desired KeySize for a given load N and false positive rate r. 54 /// 55 /// If N/M is assumed small, then the false positive rate can 56 /// further be approximated as 4*N^2/M^2. So increasing KeySize by 57 /// 1, which doubles M, reduces the false positive rate by about a 58 /// factor of 4, and a false positive rate of 1% corresponds to 59 /// about M/N == 20. 60 /// 61 /// What this means in practice is that for a few hundred keys using a 62 /// KeySize of 12 gives false positive rates on the order of 0.25-4%. 63 /// 64 /// Similarly, using a KeySize of 10 would lead to a 4% false 65 /// positive rate for N == 100 and to quite bad false positive 66 /// rates for larger N. 67 #[derive(Clone, Default)] 68 pub struct CountingBloomFilter<S> 69 where 70 S: BloomStorage, 71 { 72 storage: S, 73 } 74 75 impl<S> CountingBloomFilter<S> 76 where 77 S: BloomStorage, 78 { 79 /// Creates a new bloom filter. 80 #[inline] 81 pub fn new() -> Self { 82 Default::default() 83 } 84 85 #[inline] 86 pub fn clear(&mut self) { 87 self.storage = Default::default(); 88 } 89 90 // Slow linear accessor to make sure the bloom filter is zeroed. This should 91 // never be used in release builds. 92 #[cfg(debug_assertions)] 93 pub fn is_zeroed(&self) -> bool { 94 self.storage.is_zeroed() 95 } 96 97 #[cfg(not(debug_assertions))] 98 pub fn is_zeroed(&self) -> bool { 99 unreachable!() 100 } 101 102 /// Inserts an item with a particular hash into the bloom filter. 103 #[inline] 104 pub fn insert_hash(&mut self, hash: u32) { 105 self.storage.adjust_first_slot(hash, true); 106 self.storage.adjust_second_slot(hash, true); 107 } 108 109 /// Removes an item with a particular hash from the bloom filter. 110 #[inline] 111 pub fn remove_hash(&mut self, hash: u32) { 112 self.storage.adjust_first_slot(hash, false); 113 self.storage.adjust_second_slot(hash, false); 114 } 115 116 /// Check whether the filter might contain an item with the given hash. 117 /// This can sometimes return true even if the item is not in the filter, 118 /// but will never return false for items that are actually in the filter. 119 #[inline] 120 pub fn might_contain_hash(&self, hash: u32) -> bool { 121 !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash) 122 } 123 } 124 125 impl<S> Debug for CountingBloomFilter<S> 126 where 127 S: BloomStorage, 128 { 129 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 130 let mut slots_used = 0; 131 for i in 0..ARRAY_SIZE { 132 if !self.storage.slot_is_empty(i) { 133 slots_used += 1; 134 } 135 } 136 write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE) 137 } 138 } 139 140 pub trait BloomStorage: Clone + Default { 141 fn slot_is_empty(&self, index: usize) -> bool; 142 fn adjust_slot(&mut self, index: usize, increment: bool); 143 fn is_zeroed(&self) -> bool; 144 145 #[inline] 146 fn first_slot_is_empty(&self, hash: u32) -> bool { 147 self.slot_is_empty(Self::first_slot_index(hash)) 148 } 149 150 #[inline] 151 fn second_slot_is_empty(&self, hash: u32) -> bool { 152 self.slot_is_empty(Self::second_slot_index(hash)) 153 } 154 155 #[inline] 156 fn adjust_first_slot(&mut self, hash: u32, increment: bool) { 157 self.adjust_slot(Self::first_slot_index(hash), increment) 158 } 159 160 #[inline] 161 fn adjust_second_slot(&mut self, hash: u32, increment: bool) { 162 self.adjust_slot(Self::second_slot_index(hash), increment) 163 } 164 165 #[inline] 166 fn first_slot_index(hash: u32) -> usize { 167 hash1(hash) as usize 168 } 169 170 #[inline] 171 fn second_slot_index(hash: u32) -> usize { 172 hash2(hash) as usize 173 } 174 } 175 176 /// Storage class for a CountingBloomFilter that has 8-bit counters. 177 pub struct BloomStorageU8 { 178 counters: [u8; ARRAY_SIZE], 179 } 180 181 impl BloomStorage for BloomStorageU8 { 182 #[inline] 183 fn adjust_slot(&mut self, index: usize, increment: bool) { 184 let slot = &mut self.counters[index]; 185 if *slot != 0xff { 186 // full 187 if increment { 188 *slot += 1; 189 } else { 190 *slot -= 1; 191 } 192 } 193 } 194 195 #[inline] 196 fn slot_is_empty(&self, index: usize) -> bool { 197 self.counters[index] == 0 198 } 199 200 #[inline] 201 fn is_zeroed(&self) -> bool { 202 self.counters.iter().all(|x| *x == 0) 203 } 204 } 205 206 impl Default for BloomStorageU8 { 207 fn default() -> Self { 208 BloomStorageU8 { 209 counters: [0; ARRAY_SIZE], 210 } 211 } 212 } 213 214 impl Clone for BloomStorageU8 { 215 fn clone(&self) -> Self { 216 BloomStorageU8 { 217 counters: self.counters, 218 } 219 } 220 } 221 222 /// Storage class for a CountingBloomFilter that has 1-bit counters. 223 pub struct BloomStorageBool { 224 counters: [u8; ARRAY_SIZE / 8], 225 } 226 227 impl BloomStorage for BloomStorageBool { 228 #[inline] 229 fn adjust_slot(&mut self, index: usize, increment: bool) { 230 let bit = 1 << (index % 8); 231 let byte = &mut self.counters[index / 8]; 232 233 // Since we have only one bit for storage, decrementing it 234 // should never do anything. Assert against an accidental 235 // decrementing of a bit that was never set. 236 assert!( 237 increment || (*byte & bit) != 0, 238 "should not decrement if slot is already false" 239 ); 240 241 if increment { 242 *byte |= bit; 243 } 244 } 245 246 #[inline] 247 fn slot_is_empty(&self, index: usize) -> bool { 248 let bit = 1 << (index % 8); 249 (self.counters[index / 8] & bit) == 0 250 } 251 252 #[inline] 253 fn is_zeroed(&self) -> bool { 254 self.counters.iter().all(|x| *x == 0) 255 } 256 } 257 258 impl Default for BloomStorageBool { 259 fn default() -> Self { 260 BloomStorageBool { 261 counters: [0; ARRAY_SIZE / 8], 262 } 263 } 264 } 265 266 impl Clone for BloomStorageBool { 267 fn clone(&self) -> Self { 268 BloomStorageBool { 269 counters: self.counters, 270 } 271 } 272 } 273 274 #[inline] 275 fn hash1(hash: u32) -> u32 { 276 hash & KEY_MASK 277 } 278 279 #[inline] 280 fn hash2(hash: u32) -> u32 { 281 (hash >> KEY_SIZE) & KEY_MASK 282 } 283 284 #[test] 285 fn create_and_insert_some_stuff() { 286 use rustc_hash::FxHasher; 287 use std::hash::{Hash, Hasher}; 288 use std::mem::transmute; 289 290 fn hash_as_str(i: usize) -> u32 { 291 let mut hasher = FxHasher::default(); 292 let s = i.to_string(); 293 s.hash(&mut hasher); 294 let hash: u64 = hasher.finish(); 295 (hash >> 32) as u32 ^ (hash as u32) 296 } 297 298 let mut bf = BloomFilter::new(); 299 300 // Statically assert that ARRAY_SIZE is a multiple of 8, which 301 // BloomStorageBool relies on. 302 unsafe { 303 transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]); 304 } 305 306 for i in 0_usize..1000 { 307 bf.insert_hash(hash_as_str(i)); 308 } 309 310 for i in 0_usize..1000 { 311 assert!(bf.might_contain_hash(hash_as_str(i))); 312 } 313 314 let false_positives = (1001_usize..2000) 315 .filter(|i| bf.might_contain_hash(hash_as_str(*i))) 316 .count(); 317 318 assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%. 319 320 for i in 0_usize..100 { 321 bf.remove_hash(hash_as_str(i)); 322 } 323 324 for i in 100_usize..1000 { 325 assert!(bf.might_contain_hash(hash_as_str(i))); 326 } 327 328 let false_positives = (0_usize..100) 329 .filter(|i| bf.might_contain_hash(hash_as_str(*i))) 330 .count(); 331 332 assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%. 333 334 bf.clear(); 335 336 for i in 0_usize..2000 { 337 assert!(!bf.might_contain_hash(hash_as_str(i))); 338 } 339 } 340 341 #[cfg(feature = "bench")] 342 #[cfg(test)] 343 mod bench { 344 extern crate test; 345 use super::BloomFilter; 346 347 #[derive(Default)] 348 struct HashGenerator(u32); 349 350 impl HashGenerator { 351 fn next(&mut self) -> u32 { 352 // Each hash is split into two twelve-bit segments, which are used 353 // as an index into an array. We increment each by 64 so that we 354 // hit the next cache line, and then another 1 so that our wrapping 355 // behavior leads us to different entries each time. 356 // 357 // Trying to simulate cold caches is rather difficult with the cargo 358 // benchmarking setup, so it may all be moot depending on the number 359 // of iterations that end up being run. But we might as well. 360 self.0 += (65) + (65 << super::KEY_SIZE); 361 self.0 362 } 363 } 364 365 #[bench] 366 fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) { 367 b.iter(|| { 368 let mut gen1 = HashGenerator::default(); 369 let mut gen2 = HashGenerator::default(); 370 let mut bf = BloomFilter::new(); 371 for _ in 0_usize..1000 { 372 bf.insert_hash(gen1.next()); 373 } 374 for _ in 0_usize..100 { 375 bf.remove_hash(gen2.next()); 376 } 377 for _ in 100_usize..200 { 378 test::black_box(bf.might_contain_hash(gen2.next())); 379 } 380 }); 381 } 382 383 #[bench] 384 fn might_contain_10(b: &mut test::Bencher) { 385 let bf = BloomFilter::new(); 386 let mut gen = HashGenerator::default(); 387 b.iter(|| { 388 for _ in 0..10 { 389 test::black_box(bf.might_contain_hash(gen.next())); 390 } 391 }); 392 } 393 394 #[bench] 395 fn clear(b: &mut test::Bencher) { 396 let mut bf = Box::new(BloomFilter::new()); 397 b.iter(|| test::black_box(&mut bf).clear()); 398 } 399 400 #[bench] 401 fn insert_10(b: &mut test::Bencher) { 402 let mut bf = BloomFilter::new(); 403 let mut gen = HashGenerator::default(); 404 b.iter(|| { 405 for _ in 0..10 { 406 test::black_box(bf.insert_hash(gen.next())); 407 } 408 }); 409 } 410 411 #[bench] 412 fn remove_10(b: &mut test::Bencher) { 413 let mut bf = BloomFilter::new(); 414 let mut gen = HashGenerator::default(); 415 // Note: this will underflow, and that's ok. 416 b.iter(|| { 417 for _ in 0..10 { 418 bf.remove_hash(gen.next()) 419 } 420 }); 421 } 422 }