tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

regexst.cpp (6708B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
//  regexst.cpp
      5 //
      6 //  Copyright (C) 2004-2015, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains class RegexStaticSets
     10 //
     11 //  This class is internal to the regular expression implementation.
     12 //  For the public Regular Expression API, see the file "unicode/regex.h"
     13 //
     14 //  RegexStaticSets groups together the common UnicodeSets that are needed
     15 //   for compiling or executing RegularExpressions.  This grouping simplifies
     16 //   the thread safe lazy creation and sharing of these sets across
     17 //   all instances of regular expressions.
     18 //
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     22 
     23 #include "unicode/unistr.h"
     24 #include "unicode/uniset.h"
     25 #include "unicode/uchar.h"
     26 #include "unicode/regex.h"
     27 #include "uprops.h"
     28 #include "cmemory.h"
     29 #include "cstring.h"
     30 #include "uassert.h"
     31 #include "ucln_in.h"
     32 #include "umutex.h"
     33 
     34 #include "regexcst.h"   // Contains state table for the regex pattern parser.
     35                        //   generated by a Perl script.
     36 #include "regexst.h"
     37 
     38 U_NAMESPACE_BEGIN
     39 
     40 // "Rule Char" Characters are those with special meaning, and therefore
     41 //    need to be escaped to appear as literals in a regexp.
     42 constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";
     43 
     44 //
     45 //   The backslash escape characters that ICU's unescape() function will handle.
     46 //
     47 constexpr char16_t const *gUnescapeChars = u"acefnrtuUx";
     48 
     49 //
     50 //  Unicode Set pattern for Regular Expression  \w
     51 //
     52 constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";
     53 
     54 //
     55 //  Unicode Set Definitions for Regular Expression  \s
     56 //
     57 constexpr  char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]";
     58 
     59 //
     60 //  UnicodeSets used in implementation of Grapheme Cluster detection, \X
     61 //
     62 constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
     63 constexpr char16_t const *gGC_ExtendPattern  = u"[\\p{Grapheme_Extend}]";
     64 constexpr char16_t const *gGC_LPattern       = u"[\\p{Hangul_Syllable_Type=L}]";
     65 constexpr char16_t const *gGC_VPattern       = u"[\\p{Hangul_Syllable_Type=V}]";
     66 constexpr char16_t const *gGC_TPattern       = u"[\\p{Hangul_Syllable_Type=T}]";
     67 constexpr char16_t const *gGC_LVPattern      = u"[\\p{Hangul_Syllable_Type=LV}]";
     68 constexpr char16_t const *gGC_LVTPattern     = u"[\\p{Hangul_Syllable_Type=LVT}]";
     69 
     70 
// The one shared RegexStaticSets instance. It is allocated by initStaticSets()
//   and is deleted and set back to nullptr by regex_cleanup().
RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
// Init-once control object handed to umtx_initOnce() by initGlobals();
//   regex_cleanup() calls reset() on it.
UInitOnce gStaticSetsInitOnce {};
     73 
     74 
     75 RegexStaticSets::RegexStaticSets(UErrorCode *status) {
     76    // Initialize the shared static sets to their correct values.
     77    fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
     78    fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
     79    fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
     80    fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze();
     81    fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze();
     82    fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze();
     83    fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze();
     84    fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze();
     85    fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze();
     86    fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze();
     87    
     88 
     89    //
     90    //  "Normal" is the set of characters that don't need special handling
     91    //            when finding grapheme cluster boundaries.
     92    //
     93    fPropSets[URX_GC_NORMAL].complement();
     94    fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
     95    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
     96    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
     97    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
     98    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
     99    fPropSets[URX_GC_NORMAL].freeze();
    100 
    101    // Initialize the 8-bit fast bit sets from the parallel full
    102    //   UnicodeSets.
    103    //
    104    // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
    105    //       Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
    106    //       This runs in exponential time, making it easy to adjust the time for
    107    //       convenient measuring.
    108    //
    109    //       This 8 bit optimization dates from the early days of ICU,
    110    //       with a less optimized UnicodeSet. At the time, the difference
    111    //       was substantial.
    112 
    113    for (int32_t i=0; i<URX_LAST_SET; i++) {
    114        fPropSets8[i].init(&fPropSets[i]);
    115    }
    116 
    117    // Sets used while parsing rules, but not referenced from the parse state table
    118    fRuleSets[kRuleSet_rule_char-128]
    119            .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();
    120 
    121    fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
    122    fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
    123    fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
    124    
    125    // Finally, initialize an empty UText string for utility purposes
    126    fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);
    127    
    128 }
    129 
    130 
    131 RegexStaticSets::~RegexStaticSets() {
    132    fRuleDigitsAlias = nullptr;
    133    utext_close(fEmptyText);
    134 }
    135 
    136 
    137 //------------------------------------------------------------------------------
    138 //
    139 //   regex_cleanup      Memory cleanup function, free/delete all
    140 //                      cached memory.  Called by ICU's u_cleanup() function.
    141 //
    142 //------------------------------------------------------------------------------
    143 
    144 U_CDECL_BEGIN
    145 static UBool U_CALLCONV
    146 regex_cleanup() {
    147    delete RegexStaticSets::gStaticSets;
    148    RegexStaticSets::gStaticSets = nullptr;
    149    gStaticSetsInitOnce.reset();
    150    return true;
    151 }
    152 
    153 static void U_CALLCONV initStaticSets(UErrorCode &status) {
    154    U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
    155    ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
    156    RegexStaticSets::gStaticSets = new RegexStaticSets(&status);
    157    if (U_FAILURE(status)) {
    158        delete RegexStaticSets::gStaticSets;
    159        RegexStaticSets::gStaticSets = nullptr;
    160    }
    161    if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) {
    162        status = U_MEMORY_ALLOCATION_ERROR;
    163    }
    164 }
    165 U_CDECL_END
    166 
    167 void RegexStaticSets::initGlobals(UErrorCode *status) {
    168    umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
    169 }
    170 
    171 U_NAMESPACE_END
    172 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS