regexst.cpp (6708B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // regexst.h 5 // 6 // Copyright (C) 2004-2015, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains class RegexStaticSets 10 // 11 // This class is internal to the regular expression implementation. 12 // For the public Regular Expression API, see the file "unicode/regex.h" 13 // 14 // RegexStaticSets groups together the common UnicodeSets that are needed 15 // for compiling or executing RegularExpressions. This grouping simplifies 16 // the thread safe lazy creation and sharing of these sets across 17 // all instances of regular expressions. 18 // 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 22 23 #include "unicode/unistr.h" 24 #include "unicode/uniset.h" 25 #include "unicode/uchar.h" 26 #include "unicode/regex.h" 27 #include "uprops.h" 28 #include "cmemory.h" 29 #include "cstring.h" 30 #include "uassert.h" 31 #include "ucln_in.h" 32 #include "umutex.h" 33 34 #include "regexcst.h" // Contains state table for the regex pattern parser. 35 // generated by a Perl script. 36 #include "regexst.h" 37 38 U_NAMESPACE_BEGIN 39 40 // "Rule Char" Characters are those with special meaning, and therefore 41 // need to be escaped to appear as literals in a regexp. 42 constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\."; 43 44 // 45 // The backslash escape characters that ICU's unescape() function will handle. 46 // 47 constexpr char16_t const *gUnescapeChars = u"acefnrtuUx"; 48 49 // 50 // Unicode Set pattern for Regular Expression \w 51 // 52 constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]"; 53 54 // 55 // Unicode Set Definitions for Regular Expression \s 56 // 57 constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]"; 58 59 // 60 // UnicodeSets used in implementation of Grapheme Cluster detection, \X 61 // 62 constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]"; 63 constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]"; 64 constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]"; 65 constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]"; 66 constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]"; 67 constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]"; 68 constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]"; 69 70 71 RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; 72 UInitOnce gStaticSetsInitOnce {}; 73 74 75 RegexStaticSets::RegexStaticSets(UErrorCode *status) { 76 // Initialize the shared static sets to their correct values. 77 fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze(); 78 fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze(); 79 fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze(); 80 fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze(); 81 fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze(); 82 fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze(); 83 fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze(); 84 fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze(); 85 fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze(); 86 fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze(); 87 88 89 // 90 // "Normal" is the set of characters that don't need special handling 91 // when finding grapheme cluster boundaries. 92 // 93 fPropSets[URX_GC_NORMAL].complement(); 94 fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); 95 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); 96 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); 97 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); 98 fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); 99 fPropSets[URX_GC_NORMAL].freeze(); 100 101 // Initialize the 8-bit fast bit sets from the parallel full 102 // UnicodeSets. 103 // 104 // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? 105 // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" 106 // This runs in exponential time, making it easy to adjust the time for 107 // convenient measuring. 108 // 109 // This 8 bit optimization dates from the early days of ICU, 110 // with a less optimized UnicodeSet. At the time, the difference 111 // was substantial. 112 113 for (int32_t i=0; i<URX_LAST_SET; i++) { 114 fPropSets8[i].init(&fPropSets[i]); 115 } 116 117 // Sets used while parsing rules, but not referenced from the parse state table 118 fRuleSets[kRuleSet_rule_char-128] 119 .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze(); 120 121 fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze(); 122 fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze(); 123 fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128]; 124 125 // Finally, initialize an empty UText string for utility purposes 126 fEmptyText = utext_openUChars(nullptr, nullptr, 0, status); 127 128 } 129 130 131 RegexStaticSets::~RegexStaticSets() { 132 fRuleDigitsAlias = nullptr; 133 utext_close(fEmptyText); 134 } 135 136 137 //------------------------------------------------------------------------------ 138 // 139 // regex_cleanup Memory cleanup function, free/delete all 140 // cached memory. Called by ICU's u_cleanup() function. 141 // 142 //------------------------------------------------------------------------------ 143 144 U_CDECL_BEGIN 145 static UBool U_CALLCONV 146 regex_cleanup() { 147 delete RegexStaticSets::gStaticSets; 148 RegexStaticSets::gStaticSets = nullptr; 149 gStaticSetsInitOnce.reset(); 150 return true; 151 } 152 153 static void U_CALLCONV initStaticSets(UErrorCode &status) { 154 U_ASSERT(RegexStaticSets::gStaticSets == nullptr); 155 ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup); 156 RegexStaticSets::gStaticSets = new RegexStaticSets(&status); 157 if (U_FAILURE(status)) { 158 delete RegexStaticSets::gStaticSets; 159 RegexStaticSets::gStaticSets = nullptr; 160 } 161 if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) { 162 status = U_MEMORY_ALLOCATION_ERROR; 163 } 164 } 165 U_CDECL_END 166 167 void RegexStaticSets::initGlobals(UErrorCode *status) { 168 umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status); 169 } 170 171 U_NAMESPACE_END 172 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS