hashtab.c (13853B)
1 /// @file hashtab.c 2 /// 3 /// Handling of a hashtable with Vim-specific properties. 4 /// 5 /// Each item in a hashtable has a NUL terminated string key. A key can appear 6 /// only once in the table. 7 /// 8 /// A hash number is computed from the key for quick lookup. When the hashes 9 /// of two different keys point to the same entry an algorithm is used to 10 /// iterate over other entries in the table until the right one is found. 11 /// To make the iteration work removed keys are different from entries where a 12 /// key was never present. 13 /// 14 /// The mechanism has been partly based on how Python Dictionaries are 15 /// implemented. The algorithm is from Knuth Vol. 3, Sec. 6.4. 16 /// 17 /// The hashtable grows to accommodate more entries when needed. At least 1/3 18 /// of the entries is empty to keep the lookup efficient (at the cost of extra 19 /// memory). 20 21 #include <assert.h> 22 #include <inttypes.h> 23 #include <stdbool.h> 24 #include <string.h> 25 26 #include "nvim/ascii_defs.h" 27 #include "nvim/gettext_defs.h" 28 #include "nvim/hashtab.h" 29 #include "nvim/macros_defs.h" 30 #include "nvim/memory.h" 31 #include "nvim/message.h" 32 #include "nvim/vim_defs.h" 33 34 // Magic value for algorithm that walks through the array. 35 #define PERTURB_SHIFT 5 36 37 #include "hashtab.c.generated.h" 38 39 char hash_removed; 40 41 /// Initialize an empty hash table. 42 void hash_init(hashtab_T *ht) 43 { 44 // This zeroes all "ht_" entries and all the "hi_key" in "ht_smallarray". 45 CLEAR_POINTER(ht); 46 ht->ht_array = ht->ht_smallarray; 47 ht->ht_mask = HT_INIT_SIZE - 1; 48 } 49 50 /// Free the array of a hash table without freeing contained values. 51 /// 52 /// If "ht" is not freed (after calling this) then you should call hash_init() 53 /// right next! 54 void hash_clear(hashtab_T *ht) 55 { 56 if (ht->ht_array != ht->ht_smallarray) { 57 xfree(ht->ht_array); 58 } 59 } 60 61 /// Free the array of a hash table and all contained values. 62 /// 63 /// @param off the offset from start of value to start of key (@see hashitem_T). 64 void hash_clear_all(hashtab_T *ht, unsigned off) 65 { 66 size_t todo = ht->ht_used; 67 for (hashitem_T *hi = ht->ht_array; todo > 0; hi++) { 68 if (!HASHITEM_EMPTY(hi)) { 69 xfree(hi->hi_key - off); 70 todo--; 71 } 72 } 73 hash_clear(ht); 74 } 75 76 /// Find item for given "key" in hashtable "ht". 77 /// 78 /// @param key The key of the looked-for item. Must not be NULL. 79 /// 80 /// @return Pointer to the hash item corresponding to the given key. 81 /// If not found, then return pointer to the empty item that would be 82 /// used for that key. 83 /// WARNING: Returned pointer becomes invalid as soon as the hash table 84 /// is changed in any way. 85 hashitem_T *hash_find(const hashtab_T *const ht, const char *const key) 86 { 87 return hash_lookup(ht, key, strlen(key), hash_hash(key)); 88 } 89 90 /// Like hash_find, but key is not NUL-terminated 91 /// 92 /// @param[in] ht Hashtab to look in. 93 /// @param[in] key Key of the looked-for item. Must not be NULL. 94 /// @param[in] len Key length. 95 /// 96 /// @return Pointer to the hash item corresponding to the given key. 97 /// If not found, then return pointer to the empty item that would be 98 /// used for that key. 99 /// 100 /// @warning Returned pointer becomes invalid as soon as the hash table 101 /// is changed in any way. 102 hashitem_T *hash_find_len(const hashtab_T *const ht, const char *const key, const size_t len) 103 { 104 return hash_lookup(ht, key, len, hash_hash_len(key, len)); 105 } 106 107 /// Like hash_find(), but caller computes "hash". 108 /// 109 /// @param[in] key The key of the looked-for item. Must not be NULL. 110 /// @param[in] key_len Key length. 111 /// @param[in] hash The precomputed hash for the key. 112 /// 113 /// @return Pointer to the hashitem corresponding to the given key. 114 /// If not found, then return pointer to the empty item that would be 115 /// used for that key. 116 /// WARNING: Returned pointer becomes invalid as soon as the hash table 117 /// is changed in any way. 118 hashitem_T *hash_lookup(const hashtab_T *const ht, const char *const key, const size_t key_len, 119 const hash_T hash) 120 { 121 #ifdef HT_DEBUG 122 hash_count_lookup++; 123 #endif 124 125 // Quickly handle the most common situations: 126 // - return if there is no item at all 127 // - skip over a removed item 128 // - return if the item matches 129 hash_T idx = hash & ht->ht_mask; 130 hashitem_T *hi = &ht->ht_array[idx]; 131 132 if (hi->hi_key == NULL) { 133 return hi; 134 } 135 136 hashitem_T *freeitem = NULL; 137 if (hi->hi_key == HI_KEY_REMOVED) { 138 freeitem = hi; 139 } else if ((hi->hi_hash == hash) 140 && (strncmp(hi->hi_key, key, key_len) == 0) 141 && hi->hi_key[key_len] == NUL) { 142 return hi; 143 } 144 145 // Need to search through the table to find the key. The algorithm 146 // to step through the table starts with large steps, gradually becoming 147 // smaller down to (1/4 table size + 1). This means it goes through all 148 // table entries in the end. 149 // When we run into a NULL key it's clear that the key isn't there. 150 // Return the first available slot found (can be a slot of a removed 151 // item). 152 for (hash_T perturb = hash;; perturb >>= PERTURB_SHIFT) { 153 #ifdef HT_DEBUG 154 // count a "miss" for hashtab lookup 155 hash_count_perturb++; 156 #endif 157 idx = 5 * idx + perturb + 1; 158 hi = &ht->ht_array[idx & ht->ht_mask]; 159 160 if (hi->hi_key == NULL) { 161 return freeitem == NULL ? hi : freeitem; 162 } 163 164 if ((hi->hi_hash == hash) 165 && (hi->hi_key != HI_KEY_REMOVED) 166 && (strncmp(hi->hi_key, key, key_len) == 0) 167 && hi->hi_key[key_len] == NUL) { 168 return hi; 169 } 170 171 if ((hi->hi_key == HI_KEY_REMOVED) && (freeitem == NULL)) { 172 freeitem = hi; 173 } 174 } 175 } 176 177 /// Print the efficiency of hashtable lookups. 178 /// 179 /// Useful when trying different hash algorithms. 180 /// Called when exiting. 181 void hash_debug_results(void) 182 { 183 #ifdef HT_DEBUG 184 fprintf(stderr, "\r\n\r\n\r\n\r\n"); 185 fprintf(stderr, "Number of hashtable lookups: %" PRId64 "\r\n", 186 (int64_t)hash_count_lookup); 187 fprintf(stderr, "Number of perturb loops: %" PRId64 "\r\n", 188 (int64_t)hash_count_perturb); 189 fprintf(stderr, "Percentage of perturb loops: %" PRId64 "%%\r\n", 190 (int64_t)(hash_count_perturb * 100 / hash_count_lookup)); 191 #endif 192 } 193 194 /// Add (empty) item for key `key` to hashtable `ht`. 195 /// 196 /// @param key Pointer to the key for the new item. The key has to be contained 197 /// in the new item (@see hashitem_T). Must not be NULL. 198 /// 199 /// @return OK if success. 200 /// FAIL if key already present 201 int hash_add(hashtab_T *ht, char *key) 202 { 203 hash_T hash = hash_hash(key); 204 hashitem_T *hi = hash_lookup(ht, key, strlen(key), hash); 205 if (!HASHITEM_EMPTY(hi)) { 206 siemsg(_("E685: Internal error: hash_add(): duplicate key \"%s\""), key); 207 return FAIL; 208 } 209 hash_add_item(ht, hi, key, hash); 210 return OK; 211 } 212 213 /// Add item "hi" for key "key" to hashtable "ht". 214 /// 215 /// @param hi The hash item to be used. Must have been obtained through 216 /// hash_lookup() and point to an empty item. 217 /// @param key Pointer to the key for the new item. The key has to be contained 218 /// in the new item (@see hashitem_T). Must not be NULL. 219 /// @param hash The precomputed hash value for the key. 220 void hash_add_item(hashtab_T *ht, hashitem_T *hi, char *key, hash_T hash) 221 { 222 ht->ht_used++; 223 ht->ht_changed++; 224 if (hi->hi_key == NULL) { 225 ht->ht_filled++; 226 } 227 hi->hi_key = key; 228 hi->hi_hash = hash; 229 230 // When the space gets low may resize the array. 231 hash_may_resize(ht, 0); 232 } 233 234 /// Remove item "hi" from hashtable "ht". 235 /// 236 /// Caller must take care of freeing the item itself. 237 /// 238 /// @param hi The hash item to be removed. 239 /// It must have been obtained with hash_lookup(). 240 void hash_remove(hashtab_T *ht, hashitem_T *hi) 241 { 242 ht->ht_used--; 243 ht->ht_changed++; 244 hi->hi_key = HI_KEY_REMOVED; 245 hash_may_resize(ht, 0); 246 } 247 248 /// Lock hashtable (prevent changes in ht_array). 249 /// 250 /// Don't use this when items are to be added! 251 /// Must call hash_unlock() later. 252 void hash_lock(hashtab_T *ht) 253 { 254 ht->ht_locked++; 255 } 256 257 /// Unlock hashtable (allow changes in ht_array again). 258 /// 259 /// Table will be resized (shrunk) when necessary. 260 /// This must balance a call to hash_lock(). 261 void hash_unlock(hashtab_T *ht) 262 { 263 ht->ht_locked--; 264 hash_may_resize(ht, 0); 265 } 266 267 /// Resize hashtable (new size can be given or automatically computed). 268 /// 269 /// @param minitems Minimum number of items the new table should hold. 270 /// If zero, new size will depend on currently used items: 271 /// - Shrink when too much empty space. 272 /// - Grow when not enough empty space. 273 /// If non-zero, passed minitems will be used. 274 static void hash_may_resize(hashtab_T *ht, size_t minitems) 275 { 276 // Don't resize a locked table. 277 if (ht->ht_locked > 0) { 278 return; 279 } 280 281 #ifdef HT_DEBUG 282 if (ht->ht_used > ht->ht_filled) { 283 emsg("hash_may_resize(): more used than filled"); 284 } 285 286 if (ht->ht_filled >= ht->ht_mask + 1) { 287 emsg("hash_may_resize(): table completely filled"); 288 } 289 #endif 290 291 size_t minsize; 292 const size_t oldsize = ht->ht_mask + 1; 293 if (minitems == 0) { 294 // Return quickly for small tables with at least two NULL items. 295 // items are required for the lookup to decide a key isn't there. 296 if ((ht->ht_filled < HT_INIT_SIZE - 1) 297 && (ht->ht_array == ht->ht_smallarray)) { 298 return; 299 } 300 301 // Grow or refill the array when it's more than 2/3 full (including 302 // removed items, so that they get cleaned up). 303 // Shrink the array when it's less than 1/5 full. When growing it is 304 // at least 1/4 full (avoids repeated grow-shrink operations) 305 if ((ht->ht_filled * 3 < oldsize * 2) && (ht->ht_used > oldsize / 5)) { 306 return; 307 } 308 309 if (ht->ht_used > 1000) { 310 // it's big, don't make too much room 311 minsize = ht->ht_used * 2; 312 } else { 313 // make plenty of room 314 minsize = ht->ht_used * 4; 315 } 316 } else { 317 // Use specified size. 318 minitems = MAX(minitems, ht->ht_used); 319 // array is up to 2/3 full 320 minsize = (minitems * 3 + 1) / 2; 321 } 322 323 size_t newsize = HT_INIT_SIZE; 324 while (newsize < minsize) { 325 // make sure it's always a power of 2 326 newsize <<= 1; 327 // assert newsize didn't overflow 328 assert(newsize != 0); 329 } 330 331 bool newarray_is_small = newsize == HT_INIT_SIZE; 332 333 if (!newarray_is_small && newsize == oldsize && ht->ht_filled * 3 < oldsize * 2) { 334 // The hashtab is already at the desired size, and there are not too 335 // many removed items, bail out. 336 return; 337 } 338 339 bool keep_smallarray = newarray_is_small 340 && ht->ht_array == ht->ht_smallarray; 341 342 // Make sure that oldarray and newarray do not overlap, 343 // so that copying is possible. 344 hashitem_T temparray[HT_INIT_SIZE]; 345 hashitem_T *oldarray = keep_smallarray 346 ? memcpy(temparray, ht->ht_smallarray, sizeof(temparray)) 347 : ht->ht_array; 348 349 if (newarray_is_small) { 350 CLEAR_FIELD(ht->ht_smallarray); 351 } 352 hashitem_T *newarray = newarray_is_small 353 ? ht->ht_smallarray 354 : xcalloc(newsize, sizeof(hashitem_T)); 355 356 // Move all the items from the old array to the new one, placing them in 357 // the right spot. The new array won't have any removed items, thus this 358 // is also a cleanup action. 359 hash_T newmask = newsize - 1; 360 size_t todo = ht->ht_used; 361 362 for (hashitem_T *olditem = oldarray; todo > 0; olditem++) { 363 if (HASHITEM_EMPTY(olditem)) { 364 continue; 365 } 366 // The algorithm to find the spot to add the item is identical to 367 // the algorithm to find an item in hash_lookup(). But we only 368 // need to search for a NULL key, thus it's simpler. 369 hash_T newi = olditem->hi_hash & newmask; 370 hashitem_T *newitem = &newarray[newi]; 371 if (newitem->hi_key != NULL) { 372 for (hash_T perturb = olditem->hi_hash;; perturb >>= PERTURB_SHIFT) { 373 newi = 5 * newi + perturb + 1; 374 newitem = &newarray[newi & newmask]; 375 if (newitem->hi_key == NULL) { 376 break; 377 } 378 } 379 } 380 *newitem = *olditem; 381 todo--; 382 } 383 384 if (ht->ht_array != ht->ht_smallarray) { 385 xfree(ht->ht_array); 386 } 387 ht->ht_array = newarray; 388 ht->ht_mask = newmask; 389 ht->ht_filled = ht->ht_used; 390 ht->ht_changed++; 391 } 392 393 #define HASH_CYCLE_BODY(hash, p) \ 394 hash = hash * 101 + *p++ 395 396 /// Get the hash number for a key. 397 /// 398 /// If you think you know a better hash function: Compile with HT_DEBUG set and 399 /// run a script that uses hashtables a lot. Vim will then print statistics 400 /// when exiting. Try that with the current hash algorithm and yours. The 401 /// lower the percentage the better. 402 hash_T hash_hash(const char *key) 403 { 404 hash_T hash = (uint8_t)(*key); 405 406 if (hash == 0) { 407 return 0; 408 } 409 410 // A simplistic algorithm that appears to do very well. 411 // Suggested by George Reilly. 412 const uint8_t *p = (uint8_t *)key + 1; 413 while (*p != NUL) { 414 HASH_CYCLE_BODY(hash, p); 415 } 416 417 return hash; 418 } 419 420 /// Get the hash number for a key that is not a NUL-terminated string 421 /// 422 /// @warning Function does not check whether key contains NUL. But you will not 423 /// be able to get hash entry in this case. 424 /// 425 /// @param[in] key Key. 426 /// @param[in] len Key length. 427 /// 428 /// @return Key hash. 429 hash_T hash_hash_len(const char *key, const size_t len) 430 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT 431 { 432 if (len == 0) { 433 return 0; 434 } 435 436 hash_T hash = *(uint8_t *)key; 437 const uint8_t *end = (uint8_t *)key + len; 438 439 const uint8_t *p = (const uint8_t *)key + 1; 440 while (p < end) { 441 HASH_CYCLE_BODY(hash, p); 442 } 443 444 return hash; 445 } 446 447 #undef HASH_CYCLE_BODY 448 449 /// Function to get HI_KEY_REMOVED value 450 /// 451 /// Used for testing because luajit ffi does not allow getting addresses of 452 /// globals. 453 const char *_hash_key_removed(void) 454 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT 455 { 456 return HI_KEY_REMOVED; 457 }