tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

special-case.h (4748B)


      1 // Copyright 2019 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_REGEXP_SPECIAL_CASE_H_
      6 #define V8_REGEXP_SPECIAL_CASE_H_
      7 
      8 #ifdef V8_INTL_SUPPORT
      9 #include "irregexp/RegExpShim.h"
     10 
     11 #include "unicode/uchar.h"
     12 #include "unicode/uniset.h"
     13 #include "unicode/unistr.h"
     14 
     15 namespace v8 {
     16 namespace internal {
     17 
     18 // Sets of Unicode characters that need special handling under "i" mode
     19 
     20 // For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
     21 // defines slightly different case-folding rules than Unicode. An
     22 // input character should match a pattern character if the result of
     23 // the Canonicalize algorithm is the same for both characters.
     24 //
     25 // Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
     26 // c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
     27 // string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
     28 // https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
     29 // the precise definition.
     30 //
     31 // While compiling such regular expressions, we need to compute the
     32 // set of characters that should match a given input character. (See
     33 // GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
     34 // For almost all characters, this can be efficiently computed using
     35 // UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
     36 // the remaining special cases.
     37 //
     38 // For a character c, the rules are as follows:
     39 //
     40 // 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
     41 //    UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
     42 //    containing c will produce the set of characters that should
     43 //    match /c/i (or /[c]/i), and only those characters.
     44 //
     45 // 2. If c is in IgnoreSet, then the only character it should match is
     46 //    itself. However, closeOver will add additional incorrect
     47 //    matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
     48 //    (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
     49 //    "SS".  Step 3.e therefore requires that 'ß' canonicalizes to
     50 //    itself, and should not match 'ẞ'. In these cases, we can skip
     51 //    the closeOver entirely, because it will never add an equivalent
     52 //    character.
     53 //
     54 // 3. If c is in SpecialAddSet, then it should match at least one
     55 //    character other than itself. However, closeOver will add at
     56 //    least one additional incorrect match. For example, consider the
     57 //    letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
     58 //    K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
     59 //    SIGN should not match either of the other two characters. As a
     60 //    result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
     61 //    IgnoreSet). To find the correct matches for characters in
     62 //    SpecialAddSet, we closeOver the original character, but filter
     63 //    out the results that do not have the same canonical value.
     64 //
     65 // The contents of these sets are calculated at build time by
     66 // src/regexp/gen-regexp-special-case.cc, which generates
     67 // gen/src/regexp/special-case.cc. This is done by iterating over the
     68 // result of closeOver for each BMP character, and finding sets for
     69 // which at least one character has a different canonical value than
     70 // another character. Characters that match no other characters in
     71 // their equivalence class are added to IgnoreSet. Characters that
     72 // match at least one other character are added to SpecialAddSet.
     73 
     74 class RegExpCaseFolding final : public AllStatic {
     75 public:
     76  static const icu::UnicodeSet& IgnoreSet();
     77  static const icu::UnicodeSet& SpecialAddSet();
     78 
     79  // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
     80  // Canonicalize) step 3, which is used to determine whether
     81  // characters match when ignoreCase is true and unicode is false.
     82  static UChar32 Canonicalize(UChar32 ch) {
     83    // a. Assert: ch is a UTF-16 code unit.
     84    CHECK_LE(ch, 0xffff);
     85 
     86    // b. Let s be the String value consisting of the single code unit ch.
     87    icu::UnicodeString s(ch);
     88 
     89    // c. Let u be the same result produced as if by performing the algorithm
     90    // for String.prototype.toUpperCase using s as the this value.
     91    // d. Assert: Type(u) is String.
     92    icu::UnicodeString& u = s.toUpper();
     93 
     94    // e. If u does not consist of a single code unit, return ch.
     95    if (u.length() != 1) {
     96      return ch;
     97    }
     98 
     99    // f. Let cu be u's single code unit element.
    100    UChar32 cu = u.char32At(0);
    101 
    102    // g. If the value of ch >= 128 and the value of cu < 128, return ch.
    103    if (ch >= 128 && cu < 128) {
    104      return ch;
    105    }
    106 
    107    // h. Return cu.
    108    return cu;
    109  }
    110 };
    111 
    112 }  // namespace internal
    113 }  // namespace v8
    114 
    115 #endif  // V8_INTL_SUPPORT
    116 
    117 #endif  // V8_REGEXP_SPECIAL_CASE_H_