tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

HashFunctions.h (14613B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /* Utilities for hashing. */
      8 
      9 /*
     10 * This file exports functions for hashing data down to a uint32_t (a.k.a.
     11 * mozilla::HashNumber), including:
     12 *
     13 *  - HashString    Hash a char* or char16_t/wchar_t* of known or unknown
     14 *                  length.
     15 *
     16 *  - HashBytes     Hash a byte array of known length.
     17 *
     18 *  - HashGeneric   Hash one or more values.  Currently, we support uint32_t,
     19 *                  types which can be implicitly cast to uint32_t, data
     20 *                  pointers, and function pointers.
     21 *
     22 *  - AddToHash     Add one or more values to the given hash.  This supports the
     23 *                  same list of types as HashGeneric.
     24 *
     25 *
     26 * You can chain these functions together to hash complex objects.  For example:
     27 *
     28 *  class ComplexObject
     29 *  {
     30 *    char* mStr;
     31 *    uint32_t mUint1, mUint2;
     32 *    void (*mCallbackFn)();
     33 *
     34 *  public:
     35 *    HashNumber hash()
     36 *    {
     37 *      HashNumber hash = HashString(mStr);
     38 *      hash = AddToHash(hash, mUint1, mUint2);
     39 *      return AddToHash(hash, mCallbackFn);
     40 *    }
     41 *  };
     42 *
     43 * If you want to hash an nsAString or nsACString, use the HashString functions
     44 * in nsHashKeys.h.
     45 */
     46 
     47 #ifndef mozilla_HashFunctions_h
     48 #define mozilla_HashFunctions_h
     49 
     50 #include "mozilla/Attributes.h"
     51 #include "mozilla/MathAlgorithms.h"
     52 #include "mozilla/Types.h"
     53 #include "mozilla/WrappingOperations.h"
     54 
     55 #include <stdint.h>
     56 #include <type_traits>
     57 
     58 namespace mozilla {
     59 
     60 using HashNumber = uint32_t;
     61 static const uint32_t kHashNumberBits = 32;
     62 
     63 /**
     64 * The golden ratio as a 32-bit fixed-point value.
     65 */
     66 static const HashNumber kGoldenRatioU32 = 0x9E3779B9U;
     67 
     68 /*
     69 * Given a raw hash code, h, return a number that can be used to select a hash
     70 * bucket.
     71 *
     72 * This function aims to produce as uniform an output distribution as possible,
     73 * especially in the most significant (leftmost) bits, even though the input
     74 * distribution may be highly nonrandom, given the constraints that this must
     75 * be deterministic and quick to compute.
     76 *
     77 * Since the leftmost bits of the result are best, the hash bucket index is
     78 * computed by doing ScrambleHashCode(h) / (2^32/N) or the equivalent
     79 * right-shift, not ScrambleHashCode(h) % N or the equivalent bit-mask.
     80 */
     81 constexpr HashNumber ScrambleHashCode(HashNumber h) {
     82  /*
     83   * Simply returning h would not cause any hash tables to produce wrong
     84   * answers. But it can produce pathologically bad performance: The caller
     85   * right-shifts the result, keeping only the highest bits. The high bits of
     86   * hash codes are very often completely entropy-free. (So are the lowest
     87   * bits.)
     88   *
     89   * So we use Fibonacci hashing, as described in Knuth, The Art of Computer
     90   * Programming, 6.4. This mixes all the bits of the input hash code h.
     91   *
     92   * The value of goldenRatio is taken from the hex expansion of the golden
     93   * ratio, which starts 1.9E3779B9.... This value is especially good if
     94   * values with consecutive hash codes are stored in a hash table; see Knuth
     95   * for details.
     96   */
     97  return mozilla::WrappingMultiply(h, kGoldenRatioU32);
     98 }
     99 
    100 namespace detail {
    101 
    102 MOZ_NO_SANITIZE_UNSIGNED_OVERFLOW
    103 constexpr HashNumber RotateLeft5(HashNumber aValue) {
    104  return (aValue << 5) | (aValue >> 27);
    105 }
    106 
    107 constexpr HashNumber AddU32ToHash(HashNumber aHash, uint32_t aValue) {
    108  /*
    109   * This is the meat of all our hash routines.  This hash function is not
    110   * particularly sophisticated, but it seems to work well for our mostly
    111   * plain-text inputs.  Implementation notes follow.
    112   *
    113   * Our use of the golden ratio here is arbitrary; we could pick almost any
    114   * number which:
    115   *
    116   *  * is odd (because otherwise, all our hash values will be even)
    117   *
    118   *  * has a reasonably-even mix of 1's and 0's (consider the extreme case
    119   *    where we multiply by 0x3 or 0xeffffff -- this will not produce good
    120   *    mixing across all bits of the hash).
    121   *
    122   * The rotation length of 5 is also arbitrary, although an odd number is again
    123   * preferable so our hash explores the whole universe of possible rotations.
    124   *
    125   * Finally, we multiply by the golden ratio *after* xor'ing, not before.
    126   * Otherwise, if |aHash| is 0 (as it often is for the beginning of a
    127   * message), the expression
    128   *
    129   *   mozilla::WrappingMultiply(kGoldenRatioU32, RotateLeft5(aHash))
    130   *   |xor|
    131   *   aValue
    132   *
    133   * evaluates to |aValue|.
    134   *
    135   * (Number-theoretic aside: Because any odd number |m| is relatively prime to
    136   * our modulus (2**32), the list
    137   *
    138   *    [x * m (mod 2**32) for 0 <= x < 2**32]
    139   *
    140   * has no duplicate elements.  This means that multiplying by |m| does not
    141   * cause us to skip any possible hash values.
    142   *
    143   * It's also nice if |m| has large-ish order mod 2**32 -- that is, if the
    144   * smallest k such that m**k == 1 (mod 2**32) is large -- so we can safely
    145   * multiply our hash value by |m| a few times without negating the
    146   * multiplicative effect.  Our golden ratio constant has order 2**29, which is
    147   * more than enough for our purposes.)
    148   */
    149  return mozilla::WrappingMultiply(kGoldenRatioU32,
    150                                   RotateLeft5(aHash) ^ aValue);
    151 }
    152 
    153 /**
    154 * AddUintNToHash takes sizeof(int_type) as a template parameter.
    155 * Changes to these functions need to be propagated to
    156 * MacroAssembler::prepareHashNonGCThing, which inlines them manually for
    157 * the JIT.
    158 */
    159 template <size_t Size>
    160 constexpr HashNumber AddUintNToHash(HashNumber aHash, uint64_t aValue) {
    161  return AddU32ToHash(aHash, static_cast<uint32_t>(aValue));
    162 }
    163 
    164 template <>
    165 inline HashNumber AddUintNToHash<8>(HashNumber aHash, uint64_t aValue) {
    166  uint32_t v1 = static_cast<uint32_t>(aValue);
    167  uint32_t v2 = static_cast<uint32_t>(aValue >> 32);
    168  return AddU32ToHash(AddU32ToHash(aHash, v1), v2);
    169 }
    170 
    171 } /* namespace detail */
    172 
    173 /**
    174 * AddToHash takes a hash and some values and returns a new hash based on the
    175 * inputs.
    176 *
    177 * Currently, we support hashing uint32_t's, values which we can implicitly
    178 * convert to uint32_t, data pointers, and function pointers.
    179 */
    180 template <typename T, bool TypeIsNotIntegral = !std::is_integral_v<T>,
    181          bool TypeIsNotEnum = !std::is_enum_v<T>,
    182          std::enable_if_t<TypeIsNotIntegral && TypeIsNotEnum, int> = 0>
    183 [[nodiscard]] inline HashNumber AddToHash(HashNumber aHash, T aA) {
    184  /*
    185   * Try to convert |A| to uint32_t implicitly.  If this works, great.  If not,
    186   * we'll error out.
    187   */
    188  return detail::AddU32ToHash(aHash, aA);
    189 }
    190 
    191 template <typename A>
    192 [[nodiscard]] inline HashNumber AddToHash(HashNumber aHash, A* aA) {
    193  /*
    194   * You might think this function should just take a void*.  But then we'd only
    195   * catch data pointers and couldn't handle function pointers.
    196   */
    197 
    198  static_assert(sizeof(aA) == sizeof(uintptr_t), "Strange pointer!");
    199 
    200  return detail::AddUintNToHash<sizeof(uintptr_t)>(aHash, uintptr_t(aA));
    201 }
    202 
    203 // We use AddUintNToHash() for hashing all integral types.  8-byte integral
    204 // types are treated the same as 64-bit pointers, and smaller integral types are
    205 // first implicitly converted to 32 bits and then passed to AddUintNToHash()
    206 // to be hashed.
    207 template <typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
    208 [[nodiscard]] constexpr HashNumber AddToHash(HashNumber aHash, T aA) {
    209  return detail::AddUintNToHash<sizeof(T)>(aHash, aA);
    210 }
    211 
    212 template <typename T, std::enable_if_t<std::is_enum_v<T>, int> = 0>
    213 [[nodiscard]] constexpr HashNumber AddToHash(HashNumber aHash, T aA) {
    214  // Hash using AddUintNToHash with the underlying type of the enum type
    215  using UnderlyingType = typename std::underlying_type<T>::type;
    216  return detail::AddUintNToHash<sizeof(UnderlyingType)>(
    217      aHash, static_cast<UnderlyingType>(aA));
    218 }
    219 
    220 template <typename A, typename... Args>
    221 [[nodiscard]] HashNumber AddToHash(HashNumber aHash, A aArg, Args... aArgs) {
    222  return AddToHash(AddToHash(aHash, aArg), aArgs...);
    223 }
    224 
    225 /**
    226 * The HashGeneric class of functions let you hash one or more values.
    227 *
    228 * If you want to hash together two values x and y, calling HashGeneric(x, y) is
    229 * much better than calling AddToHash(x, y), because AddToHash(x, y) assumes
    230 * that x has already been hashed.
    231 */
    232 template <typename... Args>
    233 [[nodiscard]] inline HashNumber HashGeneric(Args... aArgs) {
    234  return AddToHash(0, aArgs...);
    235 }
    236 
    237 /**
    238 * Hash successive |*aIter| until |!*aIter|, i.e. til null-termination.
    239 *
    240 * This function is *not* named HashString like the non-template overloads
    241 * below.  Some users define HashString overloads and pass inexactly-matching
    242 * values to them -- but an inexactly-matching value would match this overload
    243 * instead!  We follow the general rule and don't mix and match template and
    244 * regular overloads to avoid this.
    245 *
    246 * If you have the string's length, call HashStringKnownLength: it may be
    247 * marginally faster.
    248 */
    249 template <typename Iterator>
    250 [[nodiscard]] constexpr HashNumber HashStringUntilZero(Iterator aIter) {
    251  HashNumber hash = 0;
    252  for (; auto c = *aIter; ++aIter) {
    253    hash = AddToHash(hash, c);
    254  }
    255  return hash;
    256 }
    257 
    258 /**
    259 * Hash successive |aIter[i]| up to |i == aLength|.
    260 */
    261 template <typename Iterator>
    262 [[nodiscard]] constexpr HashNumber HashStringKnownLength(Iterator aIter,
    263                                                         size_t aLength) {
    264  HashNumber hash = 0;
    265  for (size_t i = 0; i < aLength; i++) {
    266    hash = AddToHash(hash, aIter[i]);
    267  }
    268  return hash;
    269 }
    270 
    271 /**
    272 * The HashString overloads below do just what you'd expect.
    273 *
    274 * These functions are non-template functions so that users can 1) overload them
    275 * with their own types 2) in a way that allows implicit conversions to happen.
    276 */
    277 [[nodiscard]] inline HashNumber HashString(const char* aStr) {
    278  // Use the |const unsigned char*| version of the above so that all ordinary
    279  // character data hashes identically.
    280  return HashStringUntilZero(reinterpret_cast<const unsigned char*>(aStr));
    281 }
    282 
    283 [[nodiscard]] inline HashNumber HashString(const char* aStr, size_t aLength) {
    284  // Delegate to the |const unsigned char*| version of the above to share
    285  // template instantiations.
    286  return HashStringKnownLength(reinterpret_cast<const unsigned char*>(aStr),
    287                               aLength);
    288 }
    289 
    290 [[nodiscard]] inline HashNumber HashString(const unsigned char* aStr,
    291                                           size_t aLength) {
    292  return HashStringKnownLength(aStr, aLength);
    293 }
    294 
    295 [[nodiscard]] constexpr HashNumber HashString(const char16_t* aStr) {
    296  return HashStringUntilZero(aStr);
    297 }
    298 
    299 [[nodiscard]] inline HashNumber HashString(const char16_t* aStr,
    300                                           size_t aLength) {
    301  return HashStringKnownLength(aStr, aLength);
    302 }
    303 
    304 /**
    305 * HashString overloads for |wchar_t| on platforms where it isn't |char16_t|.
    306 */
    307 template <typename WCharT, typename = typename std::enable_if<
    308                               std::is_same<WCharT, wchar_t>::value &&
    309                               !std::is_same<wchar_t, char16_t>::value>::type>
    310 [[nodiscard]] inline HashNumber HashString(const WCharT* aStr) {
    311  return HashStringUntilZero(aStr);
    312 }
    313 
    314 template <typename WCharT, typename = typename std::enable_if<
    315                               std::is_same<WCharT, wchar_t>::value &&
    316                               !std::is_same<wchar_t, char16_t>::value>::type>
    317 [[nodiscard]] inline HashNumber HashString(const WCharT* aStr, size_t aLength) {
    318  return HashStringKnownLength(aStr, aLength);
    319 }
    320 
    321 /**
    322 * Hash some number of bytes.
    323 *
    324 * This hash walks word-by-word, rather than byte-by-byte, so you won't get the
    325 * same result out of HashBytes as you would out of HashString.
    326 */
    327 [[nodiscard]] extern MFBT_API HashNumber HashBytes(const void* bytes,
    328                                                   size_t aLength,
    329                                                   HashNumber startingHash = 0);
    330 
    331 /**
    332 * A pseudorandom function mapping 32-bit integers to 32-bit integers.
    333 *
    334 * This is for when you're feeding private data (like pointer values or credit
    335 * card numbers) to a non-crypto hash function (like HashBytes) and then using
    336 * the hash code for something that untrusted parties could observe (like a JS
    337 * Map). Plug in a HashCodeScrambler before that last step to avoid leaking the
    338 * private data.
    339 *
    340 * By itself, this does not prevent hash-flooding DoS attacks, because an
    341 * attacker can still generate many values with exactly equal hash codes by
    342 * attacking the non-crypto hash function alone. Equal hash codes will, of
    343 * course, still be equal however much you scramble them.
    344 *
    345 * The algorithm is SipHash-1-3. See <https://131002.net/siphash/>.
    346 */
    347 class HashCodeScrambler {
    348  struct SipHasher;
    349 
    350  uint64_t mK0, mK1;
    351 
    352 public:
    353  /** Creates a new scrambler with the given 128-bit key. */
    354  constexpr HashCodeScrambler(uint64_t aK0, uint64_t aK1)
    355      : mK0(aK0), mK1(aK1) {}
    356 
    357  /**
    358   * Scramble a hash code. Always produces the same result for the same
    359   * combination of key and hash code.
    360   */
    361  HashNumber scramble(HashNumber aHashCode) const {
    362    SipHasher hasher(mK0, mK1);
    363    return HashNumber(hasher.sipHash(aHashCode));
    364  }
    365 
    366  static constexpr size_t offsetOfMK0() {
    367    return offsetof(HashCodeScrambler, mK0);
    368  }
    369 
    370  static constexpr size_t offsetOfMK1() {
    371    return offsetof(HashCodeScrambler, mK1);
    372  }
    373 
    374 private:
    375  struct SipHasher {
    376    SipHasher(uint64_t aK0, uint64_t aK1) {
    377      // 1. Initialization.
    378      mV0 = aK0 ^ UINT64_C(0x736f6d6570736575);
    379      mV1 = aK1 ^ UINT64_C(0x646f72616e646f6d);
    380      mV2 = aK0 ^ UINT64_C(0x6c7967656e657261);
    381      mV3 = aK1 ^ UINT64_C(0x7465646279746573);
    382    }
    383 
    384    uint64_t sipHash(uint64_t aM) {
    385      // 2. Compression.
    386      mV3 ^= aM;
    387      sipRound();
    388      mV0 ^= aM;
    389 
    390      // 3. Finalization.
    391      mV2 ^= 0xff;
    392      for (int i = 0; i < 3; i++) sipRound();
    393      return mV0 ^ mV1 ^ mV2 ^ mV3;
    394    }
    395 
    396    void sipRound() {
    397      mV0 = WrappingAdd(mV0, mV1);
    398      mV1 = RotateLeft(mV1, 13);
    399      mV1 ^= mV0;
    400      mV0 = RotateLeft(mV0, 32);
    401      mV2 = WrappingAdd(mV2, mV3);
    402      mV3 = RotateLeft(mV3, 16);
    403      mV3 ^= mV2;
    404      mV0 = WrappingAdd(mV0, mV3);
    405      mV3 = RotateLeft(mV3, 21);
    406      mV3 ^= mV0;
    407      mV2 = WrappingAdd(mV2, mV1);
    408      mV1 = RotateLeft(mV1, 17);
    409      mV1 ^= mV2;
    410      mV2 = RotateLeft(mV2, 32);
    411    }
    412 
    413    uint64_t mV0, mV1, mV2, mV3;
    414  };
    415 };
    416 
    417 } /* namespace mozilla */
    418 
    419 #endif /* mozilla_HashFunctions_h */