tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gen-regexp-special-case.cc (6062B)


      1 // Copyright 2020 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
      9 
     10 #include "irregexp/imported/special-case.h"
     11 
     12 namespace v8 {
     13 namespace internal {
     14 
     15 static const base::uc32 kSurrogateStart = 0xd800;
     16 static const base::uc32 kSurrogateEnd = 0xdfff;
     17 static const base::uc32 kNonBmpStart = 0x10000;
     18 
     19 // The following code generates "src/regexp/special-case.cc".
     20 void PrintSet(std::ofstream& out, const char* name,
     21              const icu::UnicodeSet& set) {
     22  out << "icu::UnicodeSet Build" << name << "() {\n"
     23      << "  icu::UnicodeSet set;\n";
     24  for (int32_t i = 0; i < set.getRangeCount(); i++) {
     25    if (set.getRangeStart(i) == set.getRangeEnd(i)) {
     26      out << "  set.add(0x" << set.getRangeStart(i) << ");\n";
     27    } else {
     28      out << "  set.add(0x" << set.getRangeStart(i) << ", 0x"
     29          << set.getRangeEnd(i) << ");\n";
     30    }
     31  }
     32  out << "  set.freeze();\n"
     33      << "  return set;\n"
     34      << "}\n\n";
     35 
     36  out << "struct " << name << "Data {\n"
     37      << "  " << name << "Data() : set(Build" << name << "()) {}\n"
     38      << "  const icu::UnicodeSet set;\n"
     39      << "};\n\n";
     40 
     41  out << "//static\n"
     42      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
     43      << "  static base::LazyInstance<" << name << "Data>::type set =\n"
     44      << "      LAZY_INSTANCE_INITIALIZER;\n"
     45      << "  return set.Pointer()->set;\n"
     46      << "}\n\n";
     47 }
     48 
     49 void PrintSpecial(std::ofstream& out) {
     50  icu::UnicodeSet current;
     51  icu::UnicodeSet special_add;
     52  icu::UnicodeSet ignore;
     53  UErrorCode status = U_ZERO_ERROR;
     54  icu::UnicodeSet upper("[\\p{Lu}]", status);
     55  CHECK(U_SUCCESS(status));
     56 
     57  // Iterate through all chars in BMP except surrogates.
     58  for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
     59    if (i >= static_cast<UChar32>(kSurrogateStart) &&
     60        i <= static_cast<UChar32>(kSurrogateEnd)) {
     61      continue;  // Ignore surrogate range
     62    }
     63    current.set(i, i);
     64    current.closeOver(USET_CASE_INSENSITIVE);
     65 
     66    // Check to see if all characters in the case-folding equivalence
     67    // class as defined by UnicodeSet::closeOver all map to the same
     68    // canonical value.
     69    UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
     70    bool class_has_matching_canonical_char = false;
     71    bool class_has_non_matching_canonical_char = false;
     72    for (int32_t j = 0; j < current.getRangeCount(); j++) {
     73      for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
     74           c++) {
     75        if (c == i) {
     76          continue;
     77        }
     78        UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
     79        if (canonical == other_canonical) {
     80          class_has_matching_canonical_char = true;
     81        } else {
     82          class_has_non_matching_canonical_char = true;
     83        }
     84      }
     85    }
     86    // If any other character in i's equivalence class has a
     87    // different canonical value, then i needs special handling.  If
     88    // no other character shares a canonical value with i, we can
     89    // ignore i when adding alternatives for case-independent
     90    // comparison.  If at least one other character shares a
     91    // canonical value, then i needs special handling.
     92    if (class_has_non_matching_canonical_char) {
     93      if (class_has_matching_canonical_char) {
     94        special_add.add(i);
     95      } else {
     96        ignore.add(i);
     97      }
     98    }
     99  }
    100 
    101  // Verify that no Unicode equivalence class contains two non-trivial
    102  // JS equivalence classes. Every character in SpecialAddSet has the
    103  // same canonical value as every other non-IgnoreSet character in
    104  // its Unicode equivalence class. Therefore, if we call closeOver on
    105  // a set containing no IgnoreSet characters, the only characters
    106  // that must be removed from the result are in IgnoreSet. This fact
    107  // is used in CharacterRange::AddCaseEquivalents.
    108  for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
    109    for (UChar32 c = special_add.getRangeStart(i);
    110         c <= special_add.getRangeEnd(i); c++) {
    111      UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
    112      current.set(c, c);
    113      current.closeOver(USET_CASE_INSENSITIVE);
    114      current.removeAll(ignore);
    115      for (int32_t j = 0; j < current.getRangeCount(); j++) {
    116        for (UChar32 c2 = current.getRangeStart(j);
    117             c2 <= current.getRangeEnd(j); c2++) {
    118          CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
    119        }
    120      }
    121    }
    122  }
    123 
    124  PrintSet(out, "IgnoreSet", ignore);
    125  PrintSet(out, "SpecialAddSet", special_add);
    126 }
    127 
    128 void WriteHeader(const char* header_filename) {
    129  std::ofstream out(header_filename);
    130  out << std::hex << std::setfill('0') << std::setw(4);
    131  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
    132      << "// Use of this source code is governed by a BSD-style license that\n"
    133      << "// can be found in the LICENSE file.\n\n"
    134      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
    135      << "// The following functions are used to build UnicodeSets\n"
    136      << "// for special cases where the case-folding algorithm used by\n"
    137      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
    138      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
    139      << "// Semantics: Canonicalize) step 3.\n\n"
    140      << "#ifdef V8_INTL_SUPPORT\n"
    141      << "#include \"src/base/lazy-instance.h\"\n\n"
    142      << "#include \"src/regexp/special-case.h\"\n\n"
    143      << "#include \"unicode/uniset.h\"\n"
    144      << "namespace v8 {\n"
    145      << "namespace internal {\n\n";
    146 
    147  PrintSpecial(out);
    148 
    149  out << "\n"
    150      << "}  // namespace internal\n"
    151      << "}  // namespace v8\n"
    152      << "#endif  // V8_INTL_SUPPORT\n";
    153 }
    154 
    155 }  // namespace internal
    156 }  // namespace v8
    157 
    158 int main(int argc, const char** argv) {
    159  if (argc != 2) {
    160    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
    161    std::exit(1);
    162  }
    163  v8::internal::WriteHeader(argv[1]);
    164 
    165  return 0;
    166 }