gen-regexp-special-case.cc (6062B)
1 // Copyright 2020 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <fstream> 6 #include <iomanip> 7 #include <iostream> 8 #include <sstream> 9 10 #include "irregexp/imported/special-case.h" 11 12 namespace v8 { 13 namespace internal { 14 15 static const base::uc32 kSurrogateStart = 0xd800; 16 static const base::uc32 kSurrogateEnd = 0xdfff; 17 static const base::uc32 kNonBmpStart = 0x10000; 18 19 // The following code generates "src/regexp/special-case.cc". 20 void PrintSet(std::ofstream& out, const char* name, 21 const icu::UnicodeSet& set) { 22 out << "icu::UnicodeSet Build" << name << "() {\n" 23 << " icu::UnicodeSet set;\n"; 24 for (int32_t i = 0; i < set.getRangeCount(); i++) { 25 if (set.getRangeStart(i) == set.getRangeEnd(i)) { 26 out << " set.add(0x" << set.getRangeStart(i) << ");\n"; 27 } else { 28 out << " set.add(0x" << set.getRangeStart(i) << ", 0x" 29 << set.getRangeEnd(i) << ");\n"; 30 } 31 } 32 out << " set.freeze();\n" 33 << " return set;\n" 34 << "}\n\n"; 35 36 out << "struct " << name << "Data {\n" 37 << " " << name << "Data() : set(Build" << name << "()) {}\n" 38 << " const icu::UnicodeSet set;\n" 39 << "};\n\n"; 40 41 out << "//static\n" 42 << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" 43 << " static base::LazyInstance<" << name << "Data>::type set =\n" 44 << " LAZY_INSTANCE_INITIALIZER;\n" 45 << " return set.Pointer()->set;\n" 46 << "}\n\n"; 47 } 48 49 void PrintSpecial(std::ofstream& out) { 50 icu::UnicodeSet current; 51 icu::UnicodeSet special_add; 52 icu::UnicodeSet ignore; 53 UErrorCode status = U_ZERO_ERROR; 54 icu::UnicodeSet upper("[\\p{Lu}]", status); 55 CHECK(U_SUCCESS(status)); 56 57 // Iterate through all chars in BMP except surrogates. 58 for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { 59 if (i >= static_cast<UChar32>(kSurrogateStart) && 60 i <= static_cast<UChar32>(kSurrogateEnd)) { 61 continue; // Ignore surrogate range 62 } 63 current.set(i, i); 64 current.closeOver(USET_CASE_INSENSITIVE); 65 66 // Check to see if all characters in the case-folding equivalence 67 // class as defined by UnicodeSet::closeOver all map to the same 68 // canonical value. 69 UChar32 canonical = RegExpCaseFolding::Canonicalize(i); 70 bool class_has_matching_canonical_char = false; 71 bool class_has_non_matching_canonical_char = false; 72 for (int32_t j = 0; j < current.getRangeCount(); j++) { 73 for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); 74 c++) { 75 if (c == i) { 76 continue; 77 } 78 UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); 79 if (canonical == other_canonical) { 80 class_has_matching_canonical_char = true; 81 } else { 82 class_has_non_matching_canonical_char = true; 83 } 84 } 85 } 86 // If any other character in i's equivalence class has a 87 // different canonical value, then i needs special handling. If 88 // no other character shares a canonical value with i, we can 89 // ignore i when adding alternatives for case-independent 90 // comparison. If at least one other character shares a 91 // canonical value, then i needs special handling. 92 if (class_has_non_matching_canonical_char) { 93 if (class_has_matching_canonical_char) { 94 special_add.add(i); 95 } else { 96 ignore.add(i); 97 } 98 } 99 } 100 101 // Verify that no Unicode equivalence class contains two non-trivial 102 // JS equivalence classes. Every character in SpecialAddSet has the 103 // same canonical value as every other non-IgnoreSet character in 104 // its Unicode equivalence class. Therefore, if we call closeOver on 105 // a set containing no IgnoreSet characters, the only characters 106 // that must be removed from the result are in IgnoreSet. This fact 107 // is used in CharacterRange::AddCaseEquivalents. 108 for (int32_t i = 0; i < special_add.getRangeCount(); i++) { 109 for (UChar32 c = special_add.getRangeStart(i); 110 c <= special_add.getRangeEnd(i); c++) { 111 UChar32 canonical = RegExpCaseFolding::Canonicalize(c); 112 current.set(c, c); 113 current.closeOver(USET_CASE_INSENSITIVE); 114 current.removeAll(ignore); 115 for (int32_t j = 0; j < current.getRangeCount(); j++) { 116 for (UChar32 c2 = current.getRangeStart(j); 117 c2 <= current.getRangeEnd(j); c2++) { 118 CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); 119 } 120 } 121 } 122 } 123 124 PrintSet(out, "IgnoreSet", ignore); 125 PrintSet(out, "SpecialAddSet", special_add); 126 } 127 128 void WriteHeader(const char* header_filename) { 129 std::ofstream out(header_filename); 130 out << std::hex << std::setfill('0') << std::setw(4); 131 out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" 132 << "// Use of this source code is governed by a BSD-style license that\n" 133 << "// can be found in the LICENSE file.\n\n" 134 << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" 135 << "// The following functions are used to build UnicodeSets\n" 136 << "// for special cases where the case-folding algorithm used by\n" 137 << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" 138 << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" 139 << "// Semantics: Canonicalize) step 3.\n\n" 140 << "#ifdef V8_INTL_SUPPORT\n" 141 << "#include \"src/base/lazy-instance.h\"\n\n" 142 << "#include \"src/regexp/special-case.h\"\n\n" 143 << "#include \"unicode/uniset.h\"\n" 144 << "namespace v8 {\n" 145 << "namespace internal {\n\n"; 146 147 PrintSpecial(out); 148 149 out << "\n" 150 << "} // namespace internal\n" 151 << "} // namespace v8\n" 152 << "#endif // V8_INTL_SUPPORT\n"; 153 } 154 155 } // namespace internal 156 } // namespace v8 157 158 int main(int argc, const char** argv) { 159 if (argc != 2) { 160 std::cerr << "Usage: " << argv[0] << " <output filename>\n"; 161 std::exit(1); 162 } 163 v8::internal::WriteHeader(argv[1]); 164 165 return 0; 166 }