neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

hashtab.c (13853B)


      1 /// @file hashtab.c
      2 ///
      3 /// Handling of a hashtable with Vim-specific properties.
      4 ///
      5 /// Each item in a hashtable has a NUL terminated string key. A key can appear
      6 /// only once in the table.
      7 ///
      8 /// A hash number is computed from the key for quick lookup. When the hashes
      9 /// of two different keys point to the same entry an algorithm is used to
     10 /// iterate over other entries in the table until the right one is found.
     11 /// To make the iteration work removed keys are different from entries where a
     12 /// key was never present.
     13 ///
     14 /// The mechanism has been partly based on how Python Dictionaries are
     15 /// implemented. The algorithm is from Knuth Vol. 3, Sec. 6.4.
     16 ///
     17 /// The hashtable grows to accommodate more entries when needed. At least 1/3
     18 /// of the entries is empty to keep the lookup efficient (at the cost of extra
     19 /// memory).
     20 
     21 #include <assert.h>
     22 #include <inttypes.h>
     23 #include <stdbool.h>
     24 #include <string.h>
     25 
     26 #include "nvim/ascii_defs.h"
     27 #include "nvim/gettext_defs.h"
     28 #include "nvim/hashtab.h"
     29 #include "nvim/macros_defs.h"
     30 #include "nvim/memory.h"
     31 #include "nvim/message.h"
     32 #include "nvim/vim_defs.h"
     33 
     34 // Magic value for algorithm that walks through the array.
     35 #define PERTURB_SHIFT 5
     36 
     37 #include "hashtab.c.generated.h"
     38 
     39 char hash_removed;
     40 
     41 /// Initialize an empty hash table.
     42 void hash_init(hashtab_T *ht)
     43 {
     44  // This zeroes all "ht_" entries and all the "hi_key" in "ht_smallarray".
     45  CLEAR_POINTER(ht);
     46  ht->ht_array = ht->ht_smallarray;
     47  ht->ht_mask = HT_INIT_SIZE - 1;
     48 }
     49 
     50 /// Free the array of a hash table without freeing contained values.
     51 ///
     52 /// If "ht" is not freed (after calling this) then you should call hash_init()
     53 /// right next!
     54 void hash_clear(hashtab_T *ht)
     55 {
     56  if (ht->ht_array != ht->ht_smallarray) {
     57    xfree(ht->ht_array);
     58  }
     59 }
     60 
     61 /// Free the array of a hash table and all contained values.
     62 ///
     63 /// @param off the offset from start of value to start of key (@see hashitem_T).
     64 void hash_clear_all(hashtab_T *ht, unsigned off)
     65 {
     66  size_t todo = ht->ht_used;
     67  for (hashitem_T *hi = ht->ht_array; todo > 0; hi++) {
     68    if (!HASHITEM_EMPTY(hi)) {
     69      xfree(hi->hi_key - off);
     70      todo--;
     71    }
     72  }
     73  hash_clear(ht);
     74 }
     75 
     76 /// Find item for given "key" in hashtable "ht".
     77 ///
     78 /// @param key The key of the looked-for item. Must not be NULL.
     79 ///
     80 /// @return Pointer to the hash item corresponding to the given key.
     81 ///         If not found, then return pointer to the empty item that would be
     82 ///         used for that key.
     83 ///         WARNING: Returned pointer becomes invalid as soon as the hash table
     84 ///                  is changed in any way.
     85 hashitem_T *hash_find(const hashtab_T *const ht, const char *const key)
     86 {
     87  return hash_lookup(ht, key, strlen(key), hash_hash(key));
     88 }
     89 
     90 /// Like hash_find, but key is not NUL-terminated
     91 ///
     92 /// @param[in]  ht  Hashtab to look in.
     93 /// @param[in]  key  Key of the looked-for item. Must not be NULL.
     94 /// @param[in]  len  Key length.
     95 ///
     96 /// @return Pointer to the hash item corresponding to the given key.
     97 ///         If not found, then return pointer to the empty item that would be
     98 ///         used for that key.
     99 ///
    100 ///         @warning Returned pointer becomes invalid as soon as the hash table
    101 ///                  is changed in any way.
    102 hashitem_T *hash_find_len(const hashtab_T *const ht, const char *const key, const size_t len)
    103 {
    104  return hash_lookup(ht, key, len, hash_hash_len(key, len));
    105 }
    106 
    107 /// Like hash_find(), but caller computes "hash".
    108 ///
    109 /// @param[in]  key  The key of the looked-for item. Must not be NULL.
    110 /// @param[in]  key_len  Key length.
    111 /// @param[in]  hash  The precomputed hash for the key.
    112 ///
    113 /// @return Pointer to the hashitem corresponding to the given key.
    114 ///         If not found, then return pointer to the empty item that would be
    115 ///         used for that key.
    116 ///         WARNING: Returned pointer becomes invalid as soon as the hash table
    117 ///                  is changed in any way.
    118 hashitem_T *hash_lookup(const hashtab_T *const ht, const char *const key, const size_t key_len,
    119                        const hash_T hash)
    120 {
    121 #ifdef HT_DEBUG
    122  hash_count_lookup++;
    123 #endif
    124 
    125  // Quickly handle the most common situations:
    126  // - return if there is no item at all
    127  // - skip over a removed item
    128  // - return if the item matches
    129  hash_T idx = hash & ht->ht_mask;
    130  hashitem_T *hi = &ht->ht_array[idx];
    131 
    132  if (hi->hi_key == NULL) {
    133    return hi;
    134  }
    135 
    136  hashitem_T *freeitem = NULL;
    137  if (hi->hi_key == HI_KEY_REMOVED) {
    138    freeitem = hi;
    139  } else if ((hi->hi_hash == hash)
    140             && (strncmp(hi->hi_key, key, key_len) == 0)
    141             && hi->hi_key[key_len] == NUL) {
    142    return hi;
    143  }
    144 
    145  // Need to search through the table to find the key. The algorithm
    146  // to step through the table starts with large steps, gradually becoming
    147  // smaller down to (1/4 table size + 1). This means it goes through all
    148  // table entries in the end.
    149  // When we run into a NULL key it's clear that the key isn't there.
    150  // Return the first available slot found (can be a slot of a removed
    151  // item).
    152  for (hash_T perturb = hash;; perturb >>= PERTURB_SHIFT) {
    153 #ifdef HT_DEBUG
    154    // count a "miss" for hashtab lookup
    155    hash_count_perturb++;
    156 #endif
    157    idx = 5 * idx + perturb + 1;
    158    hi = &ht->ht_array[idx & ht->ht_mask];
    159 
    160    if (hi->hi_key == NULL) {
    161      return freeitem == NULL ? hi : freeitem;
    162    }
    163 
    164    if ((hi->hi_hash == hash)
    165        && (hi->hi_key != HI_KEY_REMOVED)
    166        && (strncmp(hi->hi_key, key, key_len) == 0)
    167        && hi->hi_key[key_len] == NUL) {
    168      return hi;
    169    }
    170 
    171    if ((hi->hi_key == HI_KEY_REMOVED) && (freeitem == NULL)) {
    172      freeitem = hi;
    173    }
    174  }
    175 }
    176 
    177 /// Print the efficiency of hashtable lookups.
    178 ///
    179 /// Useful when trying different hash algorithms.
    180 /// Called when exiting.
    181 void hash_debug_results(void)
    182 {
    183 #ifdef HT_DEBUG
    184  fprintf(stderr, "\r\n\r\n\r\n\r\n");
    185  fprintf(stderr, "Number of hashtable lookups: %" PRId64 "\r\n",
    186          (int64_t)hash_count_lookup);
    187  fprintf(stderr, "Number of perturb loops: %" PRId64 "\r\n",
    188          (int64_t)hash_count_perturb);
    189  fprintf(stderr, "Percentage of perturb loops: %" PRId64 "%%\r\n",
    190          (int64_t)(hash_count_perturb * 100 / hash_count_lookup));
    191 #endif
    192 }
    193 
    194 /// Add (empty) item for key `key` to hashtable `ht`.
    195 ///
    196 /// @param key Pointer to the key for the new item. The key has to be contained
    197 ///            in the new item (@see hashitem_T). Must not be NULL.
    198 ///
    199 /// @return OK   if success.
    200 ///         FAIL if key already present
    201 int hash_add(hashtab_T *ht, char *key)
    202 {
    203  hash_T hash = hash_hash(key);
    204  hashitem_T *hi = hash_lookup(ht, key, strlen(key), hash);
    205  if (!HASHITEM_EMPTY(hi)) {
    206    siemsg(_("E685: Internal error: hash_add(): duplicate key \"%s\""), key);
    207    return FAIL;
    208  }
    209  hash_add_item(ht, hi, key, hash);
    210  return OK;
    211 }
    212 
    213 /// Add item "hi" for key "key" to hashtable "ht".
    214 ///
    215 /// @param hi   The hash item to be used. Must have been obtained through
    216 ///             hash_lookup() and point to an empty item.
    217 /// @param key  Pointer to the key for the new item. The key has to be contained
    218 ///             in the new item (@see hashitem_T). Must not be NULL.
    219 /// @param hash The precomputed hash value for the key.
    220 void hash_add_item(hashtab_T *ht, hashitem_T *hi, char *key, hash_T hash)
    221 {
    222  ht->ht_used++;
    223  ht->ht_changed++;
    224  if (hi->hi_key == NULL) {
    225    ht->ht_filled++;
    226  }
    227  hi->hi_key = key;
    228  hi->hi_hash = hash;
    229 
    230  // When the space gets low may resize the array.
    231  hash_may_resize(ht, 0);
    232 }
    233 
    234 /// Remove item "hi" from hashtable "ht".
    235 ///
    236 /// Caller must take care of freeing the item itself.
    237 ///
    238 /// @param hi The hash item to be removed.
    239 ///           It must have been obtained with hash_lookup().
    240 void hash_remove(hashtab_T *ht, hashitem_T *hi)
    241 {
    242  ht->ht_used--;
    243  ht->ht_changed++;
    244  hi->hi_key = HI_KEY_REMOVED;
    245  hash_may_resize(ht, 0);
    246 }
    247 
    248 /// Lock hashtable (prevent changes in ht_array).
    249 ///
    250 /// Don't use this when items are to be added!
    251 /// Must call hash_unlock() later.
    252 void hash_lock(hashtab_T *ht)
    253 {
    254  ht->ht_locked++;
    255 }
    256 
    257 /// Unlock hashtable (allow changes in ht_array again).
    258 ///
    259 /// Table will be resized (shrunk) when necessary.
    260 /// This must balance a call to hash_lock().
    261 void hash_unlock(hashtab_T *ht)
    262 {
    263  ht->ht_locked--;
    264  hash_may_resize(ht, 0);
    265 }
    266 
    267 /// Resize hashtable (new size can be given or automatically computed).
    268 ///
    269 /// @param minitems Minimum number of items the new table should hold.
    270 ///                 If zero, new size will depend on currently used items:
    271 ///                 - Shrink when too much empty space.
    272 ///                 - Grow when not enough empty space.
    273 ///                 If non-zero, passed minitems will be used.
    274 static void hash_may_resize(hashtab_T *ht, size_t minitems)
    275 {
    276  // Don't resize a locked table.
    277  if (ht->ht_locked > 0) {
    278    return;
    279  }
    280 
    281 #ifdef HT_DEBUG
    282  if (ht->ht_used > ht->ht_filled) {
    283    emsg("hash_may_resize(): more used than filled");
    284  }
    285 
    286  if (ht->ht_filled >= ht->ht_mask + 1) {
    287    emsg("hash_may_resize(): table completely filled");
    288  }
    289 #endif
    290 
    291  size_t minsize;
    292  const size_t oldsize = ht->ht_mask + 1;
    293  if (minitems == 0) {
    294    // Return quickly for small tables with at least two NULL items.
    295    // items are required for the lookup to decide a key isn't there.
    296    if ((ht->ht_filled < HT_INIT_SIZE - 1)
    297        && (ht->ht_array == ht->ht_smallarray)) {
    298      return;
    299    }
    300 
    301    // Grow or refill the array when it's more than 2/3 full (including
    302    // removed items, so that they get cleaned up).
    303    // Shrink the array when it's less than 1/5 full. When growing it is
    304    // at least 1/4 full (avoids repeated grow-shrink operations)
    305    if ((ht->ht_filled * 3 < oldsize * 2) && (ht->ht_used > oldsize / 5)) {
    306      return;
    307    }
    308 
    309    if (ht->ht_used > 1000) {
    310      // it's big, don't make too much room
    311      minsize = ht->ht_used * 2;
    312    } else {
    313      // make plenty of room
    314      minsize = ht->ht_used * 4;
    315    }
    316  } else {
    317    // Use specified size.
    318    minitems = MAX(minitems, ht->ht_used);
    319    // array is up to 2/3 full
    320    minsize = (minitems * 3 + 1) / 2;
    321  }
    322 
    323  size_t newsize = HT_INIT_SIZE;
    324  while (newsize < minsize) {
    325    // make sure it's always a power of 2
    326    newsize <<= 1;
    327    // assert newsize didn't overflow
    328    assert(newsize != 0);
    329  }
    330 
    331  bool newarray_is_small = newsize == HT_INIT_SIZE;
    332 
    333  if (!newarray_is_small && newsize == oldsize && ht->ht_filled * 3 < oldsize * 2) {
    334    // The hashtab is already at the desired size, and there are not too
    335    // many removed items, bail out.
    336    return;
    337  }
    338 
    339  bool keep_smallarray = newarray_is_small
    340                         && ht->ht_array == ht->ht_smallarray;
    341 
    342  // Make sure that oldarray and newarray do not overlap,
    343  // so that copying is possible.
    344  hashitem_T temparray[HT_INIT_SIZE];
    345  hashitem_T *oldarray = keep_smallarray
    346                         ? memcpy(temparray, ht->ht_smallarray, sizeof(temparray))
    347                         : ht->ht_array;
    348 
    349  if (newarray_is_small) {
    350    CLEAR_FIELD(ht->ht_smallarray);
    351  }
    352  hashitem_T *newarray = newarray_is_small
    353                         ? ht->ht_smallarray
    354                         : xcalloc(newsize, sizeof(hashitem_T));
    355 
    356  // Move all the items from the old array to the new one, placing them in
    357  // the right spot. The new array won't have any removed items, thus this
    358  // is also a cleanup action.
    359  hash_T newmask = newsize - 1;
    360  size_t todo = ht->ht_used;
    361 
    362  for (hashitem_T *olditem = oldarray; todo > 0; olditem++) {
    363    if (HASHITEM_EMPTY(olditem)) {
    364      continue;
    365    }
    366    // The algorithm to find the spot to add the item is identical to
    367    // the algorithm to find an item in hash_lookup(). But we only
    368    // need to search for a NULL key, thus it's simpler.
    369    hash_T newi = olditem->hi_hash & newmask;
    370    hashitem_T *newitem = &newarray[newi];
    371    if (newitem->hi_key != NULL) {
    372      for (hash_T perturb = olditem->hi_hash;; perturb >>= PERTURB_SHIFT) {
    373        newi = 5 * newi + perturb + 1;
    374        newitem = &newarray[newi & newmask];
    375        if (newitem->hi_key == NULL) {
    376          break;
    377        }
    378      }
    379    }
    380    *newitem = *olditem;
    381    todo--;
    382  }
    383 
    384  if (ht->ht_array != ht->ht_smallarray) {
    385    xfree(ht->ht_array);
    386  }
    387  ht->ht_array = newarray;
    388  ht->ht_mask = newmask;
    389  ht->ht_filled = ht->ht_used;
    390  ht->ht_changed++;
    391 }
    392 
    393 #define HASH_CYCLE_BODY(hash, p) \
    394  hash = hash * 101 + *p++
    395 
    396 /// Get the hash number for a key.
    397 ///
    398 /// If you think you know a better hash function: Compile with HT_DEBUG set and
    399 /// run a script that uses hashtables a lot. Vim will then print statistics
    400 /// when exiting. Try that with the current hash algorithm and yours. The
    401 /// lower the percentage the better.
    402 hash_T hash_hash(const char *key)
    403 {
    404  hash_T hash = (uint8_t)(*key);
    405 
    406  if (hash == 0) {
    407    return 0;
    408  }
    409 
    410  // A simplistic algorithm that appears to do very well.
    411  // Suggested by George Reilly.
    412  const uint8_t *p = (uint8_t *)key + 1;
    413  while (*p != NUL) {
    414    HASH_CYCLE_BODY(hash, p);
    415  }
    416 
    417  return hash;
    418 }
    419 
    420 /// Get the hash number for a key that is not a NUL-terminated string
    421 ///
    422 /// @warning Function does not check whether key contains NUL. But you will not
    423 ///          be able to get hash entry in this case.
    424 ///
    425 /// @param[in]  key  Key.
    426 /// @param[in]  len  Key length.
    427 ///
    428 /// @return Key hash.
    429 hash_T hash_hash_len(const char *key, const size_t len)
    430  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
    431 {
    432  if (len == 0) {
    433    return 0;
    434  }
    435 
    436  hash_T hash = *(uint8_t *)key;
    437  const uint8_t *end = (uint8_t *)key + len;
    438 
    439  const uint8_t *p = (const uint8_t *)key + 1;
    440  while (p < end) {
    441    HASH_CYCLE_BODY(hash, p);
    442  }
    443 
    444  return hash;
    445 }
    446 
    447 #undef HASH_CYCLE_BODY
    448 
    449 /// Function to get HI_KEY_REMOVED value
    450 ///
    451 /// Used for testing because luajit ffi does not allow getting addresses of
    452 /// globals.
    453 const char *_hash_key_removed(void)
    454  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
    455 {
    456  return HI_KEY_REMOVED;
    457 }