bloomfilt.c (3764B)
1 /* Copyright (c) 2003-2004, Roger Dingledine 2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. 3 * Copyright (c) 2007-2021, The Tor Project, Inc. */ 4 /* See LICENSE for licensing information */ 5 6 /** 7 * \file bloomfilt.c 8 * \brief Uses bitarray_t to implement a bloom filter. 9 **/ 10 11 #include <stdlib.h> 12 13 #include "lib/malloc/malloc.h" 14 #include "lib/container/bloomfilt.h" 15 #include "lib/intmath/bits.h" 16 #include "lib/log/util_bug.h" 17 #include "ext/siphash.h" 18 19 /** How many bloom-filter bits we set per address. This is twice the 20 * BLOOMFILT_N_HASHES value, since we split the siphash output into two 32-bit 21 * values. */ 22 #define N_BITS_PER_ITEM (BLOOMFILT_N_HASHES * 2) 23 24 struct bloomfilt_t { 25 /** siphash keys to make BLOOMFILT_N_HASHES independent hashes for each 26 * items. */ 27 struct sipkey key[BLOOMFILT_N_HASHES]; 28 bloomfilt_hash_fn hashfn; /**< Function used to generate hashes */ 29 uint32_t mask; /**< One less than the number of bits in <b>ba</b>; always 30 * one less than a power of two. */ 31 bitarray_t *ba; /**< A bit array to implement the Bloom filter. */ 32 }; 33 34 #define BIT(set, n) ((n) & (set)->mask) 35 36 /** Add the element <b>item</b> to <b>set</b>. */ 37 void 38 bloomfilt_add(bloomfilt_t *set, 39 const void *item) 40 { 41 int i; 42 for (i = 0; i < BLOOMFILT_N_HASHES; ++i) { 43 uint64_t h = set->hashfn(&set->key[i], item); 44 uint32_t high_bits = (uint32_t)(h >> 32); 45 uint32_t low_bits = (uint32_t)(h); 46 bitarray_set(set->ba, BIT(set, high_bits)); 47 bitarray_set(set->ba, BIT(set, low_bits)); 48 } 49 } 50 51 /** If <b>item</b> is in <b>set</b>, return nonzero. Otherwise, 52 * <em>probably</em> return zero. */ 53 int 54 bloomfilt_probably_contains(const bloomfilt_t *set, 55 const void *item) 56 { 57 int i, matches = 0; 58 for (i = 0; i < BLOOMFILT_N_HASHES; ++i) { 59 uint64_t h = set->hashfn(&set->key[i], item); 60 uint32_t high_bits = (uint32_t)(h >> 32); 61 uint32_t low_bits = (uint32_t)(h); 62 // Note that !! is necessary here, since bitarray_is_set does not 63 // necessarily return 1 on true. 64 matches += !! bitarray_is_set(set->ba, BIT(set, high_bits)); 65 matches += !! bitarray_is_set(set->ba, BIT(set, low_bits)); 66 } 67 return matches == N_BITS_PER_ITEM; 68 } 69 70 /** Return a newly allocated bloomfilt_t, optimized to hold a total of 71 * <b>max_elements</b> elements with a reasonably low false positive weight. 72 * 73 * Uses the siphash-based function <b>hashfn</b> to compute hard-to-collide 74 * functions of the items, and the key material <b>random_key</b> to 75 * key the hash. There must be BLOOMFILT_KEY_LEN bytes in the supplied key. 76 **/ 77 bloomfilt_t * 78 bloomfilt_new(int max_elements, 79 bloomfilt_hash_fn hashfn, 80 const uint8_t *random_key) 81 { 82 /* The probability of false positives is about P=(1 - exp(-kn/m))^k, where k 83 * is the number of hash functions per entry, m is the bits in the array, 84 * and n is the number of elements inserted. For us, k==4, n<=max_elements, 85 * and m==n_bits= approximately max_elements*32. This gives 86 * P<(1-exp(-4*n/(32*n)))^4 == (1-exp(1/-8))^4 == .00019 87 * 88 * It would be more optimal in space vs false positives to get this false 89 * positive rate by going for k==13, and m==18.5n, but we also want to 90 * conserve CPU, and k==13 is pretty big. 91 */ 92 int n_bits = 1u << (tor_log2(max_elements)+5); 93 bloomfilt_t *r = tor_malloc(sizeof(bloomfilt_t)); 94 r->mask = n_bits - 1; 95 r->ba = bitarray_init_zero(n_bits); 96 97 tor_assert(sizeof(r->key) == BLOOMFILT_KEY_LEN); 98 memcpy(r->key, random_key, sizeof(r->key)); 99 100 r->hashfn = hashfn; 101 102 return r; 103 } 104 105 /** Free all storage held in <b>set</b>. */ 106 void 107 bloomfilt_free_(bloomfilt_t *set) 108 { 109 if (!set) 110 return; 111 bitarray_free(set->ba); 112 tor_free(set); 113 }