BloomFilter.h (10684B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* 8 * A counting Bloom filter implementation. This allows consumers to 9 * do fast probabilistic "is item X in set Y?" testing which will 10 * never answer "no" when the correct answer is "yes" (but might 11 * incorrectly answer "yes" when the correct answer is "no"). 12 */ 13 14 #ifndef mozilla_BloomFilter_h 15 #define mozilla_BloomFilter_h 16 17 #include "mozilla/Attributes.h" 18 #include "mozilla/Likely.h" 19 20 #include <climits> 21 #include <cstdint> 22 #include <cstring> 23 24 namespace mozilla { 25 26 /* 27 * This class implements a classic Bloom filter as described at 28 * <http://en.wikipedia.org/wiki/Bloom_filter>. This allows quick 29 * probabilistic answers to the question "is object X in set Y?" where the 30 * contents of Y might not be time-invariant. The probabilistic nature of the 31 * test means that sometimes the answer will be "yes" when it should be "no". 32 * If the answer is "no", then X is guaranteed not to be in Y. 33 * 34 * The filter is parametrized on KeySize, which is the size of the key 35 * generated by each of hash functions used by the filter, in bits, 36 * and the type of object T being added and removed. T must implement 37 * a |uint32_t hash() const| method which returns a uint32_t hash key 38 * that will be used to generate the two separate hash functions for 39 * the Bloom filter. This hash key MUST be well-distributed for good 40 * results! KeySize is not allowed to be larger than 16. 41 * 42 * The filter uses exactly 2**KeySize bit (2**(KeySize-3) bytes) of memory. 43 * From now on we will refer to the memory used by the filter as M. 44 * 45 * The expected rate of incorrect "yes" answers depends on M and on 46 * the number N of objects in set Y. As long as N is small compared 47 * to M, the rate of such answers is expected to be approximately 48 * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred 49 * elements then using a KeySize of 12 gives a reasonably low 50 * incorrect answer rate. A KeySize of 12 has the additional benefit 51 * of using exactly one page for the filter in typical hardware 52 * configurations. 53 */ 54 template <unsigned KeySize, class T> 55 class BitBloomFilter { 56 /* 57 * A counting Bloom filter with 8-bit counters. For now we assume 58 * that having two hash functions is enough, but we may revisit that 59 * decision later. 60 * 61 * The filter uses an array with 2**KeySize entries. 62 * 63 * Assuming a well-distributed hash function, a Bloom filter with 64 * array size M containing N elements and 65 * using k hash function has expected false positive rate exactly 66 * 67 * $ (1 - (1 - 1/M)^{kN})^k $ 68 * 69 * because each array slot has a 70 * 71 * $ (1 - 1/M)^{kN} $ 72 * 73 * chance of being 0, and the expected false positive rate is the 74 * probability that all of the k hash functions will hit a nonzero 75 * slot. 76 * 77 * For reasonable assumptions (M large, kN large, which should both 78 * hold if we're worried about false positives) about M and kN this 79 * becomes approximately 80 * 81 * $$ (1 - \exp(-kN/M))^k $$ 82 * 83 * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, 84 * or in other words 85 * 86 * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ 87 * 88 * where r is the false positive rate. This can be used to compute 89 * the desired KeySize for a given load N and false positive rate r. 90 * 91 * If N/M is assumed small, then the false positive rate can 92 * further be approximated as 4*N^2/M^2. So increasing KeySize by 93 * 1, which doubles M, reduces the false positive rate by about a 94 * factor of 4, and a false positive rate of 1% corresponds to 95 * about M/N == 20. 96 * 97 * What this means in practice is that for a few hundred keys using a 98 * KeySize of 12 gives false positive rates on the order of 0.25-4%. 99 * 100 * Similarly, using a KeySize of 10 would lead to a 4% false 101 * positive rate for N == 100 and to quite bad false positive 102 * rates for larger N. 103 */ 104 public: 105 BitBloomFilter() { 106 static_assert(KeySize >= 3, "KeySize too small"); 107 static_assert(KeySize <= kKeyShift, "KeySize too big"); 108 109 // XXX: Should we have a custom operator new using calloc instead and 110 // require that we're allocated via the operator? 111 clear(); 112 } 113 114 /* 115 * Clear the filter. This should be done before reusing it. 116 */ 117 void clear(); 118 119 /* 120 * Add an item to the filter. 121 */ 122 void add(const T* aValue); 123 124 /* 125 * Check whether the filter might contain an item. This can 126 * sometimes return true even if the item is not in the filter, 127 * but will never return false for items that are actually in the 128 * filter. 129 */ 130 bool mightContain(const T* aValue) const; 131 132 /* 133 * Methods for add/contain when we already have a hash computed 134 */ 135 void add(uint32_t aHash); 136 bool mightContain(uint32_t aHash) const; 137 138 private: 139 static const size_t kArraySize = (1 << (KeySize - 3)); 140 static const uint32_t kKeyMask = (1 << KeySize) - 1; 141 static const uint32_t kKeyShift = 16; 142 143 static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; } 144 static uint32_t hash2(uint32_t aHash) { 145 return (aHash >> kKeyShift) & kKeyMask; 146 } 147 148 bool getSlot(uint32_t aHash) const { 149 uint32_t index = aHash / 8; 150 uint8_t shift = aHash % 8; 151 uint8_t mask = 1 << shift; 152 return !!(mBits[index] & mask); 153 } 154 155 void setSlot(uint32_t aHash) { 156 uint32_t index = aHash / 8; 157 uint8_t shift = aHash % 8; 158 uint8_t bit = 1 << shift; 159 mBits[index] |= bit; 160 } 161 162 bool getFirstSlot(uint32_t aHash) const { return getSlot(hash1(aHash)); } 163 bool getSecondSlot(uint32_t aHash) const { return getSlot(hash2(aHash)); } 164 165 void setFirstSlot(uint32_t aHash) { setSlot(hash1(aHash)); } 166 void setSecondSlot(uint32_t aHash) { setSlot(hash2(aHash)); } 167 168 uint8_t mBits[kArraySize]; 169 }; 170 171 template <unsigned KeySize, class T> 172 inline void BitBloomFilter<KeySize, T>::clear() { 173 memset(mBits, 0, kArraySize); 174 } 175 176 template <unsigned KeySize, class T> 177 inline void BitBloomFilter<KeySize, T>::add(uint32_t aHash) { 178 setFirstSlot(aHash); 179 setSecondSlot(aHash); 180 } 181 182 template <unsigned KeySize, class T> 183 MOZ_ALWAYS_INLINE void BitBloomFilter<KeySize, T>::add(const T* aValue) { 184 uint32_t hash = aValue->hash(); 185 return add(hash); 186 } 187 188 template <unsigned KeySize, class T> 189 MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain( 190 uint32_t aHash) const { 191 // Check that all the slots for this hash contain something 192 return getFirstSlot(aHash) && getSecondSlot(aHash); 193 } 194 195 template <unsigned KeySize, class T> 196 MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain( 197 const T* aValue) const { 198 uint32_t hash = aValue->hash(); 199 return mightContain(hash); 200 } 201 202 /* 203 * This class implements a counting Bloom filter as described at 204 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with 205 * 8-bit counters. 206 * 207 * Compared to `BitBloomFilter`, this class supports 'remove' operation. 208 * 209 * The filter uses exactly 2**KeySize bytes of memory. 210 * 211 * Other characteristics are the same as BitBloomFilter. 212 */ 213 template <unsigned KeySize, class T> 214 class CountingBloomFilter { 215 public: 216 CountingBloomFilter() { 217 static_assert(KeySize <= kKeyShift, "KeySize too big"); 218 219 clear(); 220 } 221 222 /* 223 * Clear the filter. This should be done before reusing it, because 224 * just removing all items doesn't clear counters that hit the upper 225 * bound. 226 */ 227 void clear(); 228 229 /* 230 * Add an item to the filter. 231 */ 232 void add(const T* aValue); 233 234 /* 235 * Remove an item from the filter. 236 */ 237 void remove(const T* aValue); 238 239 /* 240 * Check whether the filter might contain an item. This can 241 * sometimes return true even if the item is not in the filter, 242 * but will never return false for items that are actually in the 243 * filter. 244 */ 245 bool mightContain(const T* aValue) const; 246 247 /* 248 * Methods for add/remove/contain when we already have a hash computed 249 */ 250 void add(uint32_t aHash); 251 void remove(uint32_t aHash); 252 bool mightContain(uint32_t aHash) const; 253 254 private: 255 static const size_t kArraySize = (1 << KeySize); 256 static const uint32_t kKeyMask = (1 << KeySize) - 1; 257 static const uint32_t kKeyShift = 16; 258 259 static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; } 260 static uint32_t hash2(uint32_t aHash) { 261 return (aHash >> kKeyShift) & kKeyMask; 262 } 263 264 uint8_t& firstSlot(uint32_t aHash) { return mCounters[hash1(aHash)]; } 265 uint8_t& secondSlot(uint32_t aHash) { return mCounters[hash2(aHash)]; } 266 267 const uint8_t& firstSlot(uint32_t aHash) const { 268 return mCounters[hash1(aHash)]; 269 } 270 const uint8_t& secondSlot(uint32_t aHash) const { 271 return mCounters[hash2(aHash)]; 272 } 273 274 static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; } 275 276 uint8_t mCounters[kArraySize]; 277 }; 278 279 template <unsigned KeySize, class T> 280 inline void CountingBloomFilter<KeySize, T>::clear() { 281 memset(mCounters, 0, kArraySize); 282 } 283 284 template <unsigned KeySize, class T> 285 inline void CountingBloomFilter<KeySize, T>::add(uint32_t aHash) { 286 uint8_t& slot1 = firstSlot(aHash); 287 if (MOZ_LIKELY(!full(slot1))) { 288 ++slot1; 289 } 290 uint8_t& slot2 = secondSlot(aHash); 291 if (MOZ_LIKELY(!full(slot2))) { 292 ++slot2; 293 } 294 } 295 296 template <unsigned KeySize, class T> 297 MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::add(const T* aValue) { 298 uint32_t hash = aValue->hash(); 299 return add(hash); 300 } 301 302 template <unsigned KeySize, class T> 303 inline void CountingBloomFilter<KeySize, T>::remove(uint32_t aHash) { 304 // If the slots are full, we don't know whether we bumped them to be 305 // there when we added or not, so just leave them full. 306 uint8_t& slot1 = firstSlot(aHash); 307 if (MOZ_LIKELY(!full(slot1))) { 308 --slot1; 309 } 310 uint8_t& slot2 = secondSlot(aHash); 311 if (MOZ_LIKELY(!full(slot2))) { 312 --slot2; 313 } 314 } 315 316 template <unsigned KeySize, class T> 317 MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::remove( 318 const T* aValue) { 319 uint32_t hash = aValue->hash(); 320 remove(hash); 321 } 322 323 template <unsigned KeySize, class T> 324 MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain( 325 uint32_t aHash) const { 326 // Check that all the slots for this hash contain something 327 return firstSlot(aHash) && secondSlot(aHash); 328 } 329 330 template <unsigned KeySize, class T> 331 MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain( 332 const T* aValue) const { 333 uint32_t hash = aValue->hash(); 334 return mightContain(hash); 335 } 336 337 } // namespace mozilla 338 339 #endif /* mozilla_BloomFilter_h */