tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

uniset_props.cpp (39796B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  uniset_props.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2004aug25
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Character property dependent functions moved here from uniset.cpp
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/uniset.h"
     23 #include "unicode/parsepos.h"
     24 #include "unicode/uchar.h"
     25 #include "unicode/uscript.h"
     26 #include "unicode/symtable.h"
     27 #include "unicode/uset.h"
     28 #include "unicode/locid.h"
     29 #include "unicode/brkiter.h"
     30 #include "uset_imp.h"
     31 #include "ruleiter.h"
     32 #include "cmemory.h"
     33 #include "ucln_cmn.h"
     34 #include "util.h"
     35 #include "uvector.h"
     36 #include "uprops.h"
     37 #include "propname.h"
     38 #include "normalizer2impl.h"
     39 #include "uinvchar.h"
     40 #include "uprops.h"
     41 #include "charstr.h"
     42 #include "cstring.h"
     43 #include "mutex.h"
     44 #include "umutex.h"
     45 #include "uassert.h"
     46 #include "hash.h"
     47 
     48 U_NAMESPACE_USE
     49 
     50 namespace {
     51 
     52 // Special property set IDs
     53 constexpr char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
     54 constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F]
     55 constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]
     56 
     57 // Unicode name property alias
     58 constexpr char16_t NAME_PROP[] = u"na";
     59 
     60 }  // namespace
     61 
     62 // Cached sets ------------------------------------------------------------- ***
     63 
     64 U_CDECL_BEGIN
     65 static UBool U_CALLCONV uset_cleanup();
     66 
     67 static UnicodeSet *uni32Singleton;
     68 static icu::UInitOnce uni32InitOnce {};
     69 
     70 /**
     71 * Cleanup function for UnicodeSet
     72 */
     73 static UBool U_CALLCONV uset_cleanup() {
     74    delete uni32Singleton;
     75    uni32Singleton = nullptr;
     76    uni32InitOnce.reset();
     77    return true;
     78 }
     79 
     80 U_CDECL_END
     81 
     82 U_NAMESPACE_BEGIN
     83 
     84 namespace {
     85 
     86 // Cache some sets for other services -------------------------------------- ***
     87 void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
     88    U_ASSERT(uni32Singleton == nullptr);
     89    uni32Singleton = new UnicodeSet(UnicodeString(u"[:age=3.2:]"), errorCode);
     90    if(uni32Singleton==nullptr) {
     91        errorCode=U_MEMORY_ALLOCATION_ERROR;
     92    } else {
     93        uni32Singleton->freeze();
     94    }
     95    ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
     96 }
     97 
     98 
     99 U_CFUNC UnicodeSet *
    100 uniset_getUnicode32Instance(UErrorCode &errorCode) {
    101    umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
    102    return uni32Singleton;
    103 }
    104 
// helper functions for matching of pattern syntax pieces ------------------ ***
// NOTE: an earlier wording said these functions are parallel to "the PERL_OPEN
// etc. strings above"; no such strings are defined above in this file.

// Using these functions is not only faster than UnicodeString::compare() and
// caseCompare(); they also make UnicodeSet work for simple patterns when
// no Unicode properties data is available - when caseCompare() fails.
    111 
    112 inline UBool
    113 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
    114    char16_t c;
    115    return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
    116 }
    117 
    118 /*static inline UBool
    119 isPerlClose(const UnicodeString &pattern, int32_t pos) {
    120    return pattern.charAt(pos)==u'}';
    121 }*/
    122 
    123 inline UBool
    124 isNameOpen(const UnicodeString &pattern, int32_t pos) {
    125    return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
    126 }
    127 
    128 inline UBool
    129 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
    130    return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
    131 }
    132 
    133 /*static inline UBool
    134 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
    135    return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
    136 }*/
    137 
    138 // TODO memory debugging provided inside uniset.cpp
    139 // could be made available here but probably obsolete with use of modern
    140 // memory leak checker tools
    141 #define _dbgct(me)
    142 
    143 }  // namespace
    144 
    145 //----------------------------------------------------------------
    146 // Constructors &c
    147 //----------------------------------------------------------------
    148 
    149 /**
    150 * Constructs a set from the given pattern, optionally ignoring
    151 * white space.  See the class description for the syntax of the
    152 * pattern language.
    153 * @param pattern a string specifying what characters are in the set
    154 */
    155 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
    156                       UErrorCode& status) {
    157    applyPattern(pattern, status);
    158    _dbgct(this);
    159 }
    160 
    161 //----------------------------------------------------------------
    162 // Public API
    163 //----------------------------------------------------------------
    164 
    165 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
    166                                     UErrorCode& status) {
    167    // Equivalent to
    168    //   return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
    169    // but without dependency on closeOver().
    170    ParsePosition pos(0);
    171    applyPatternIgnoreSpace(pattern, pos, nullptr, status);
    172    if (U_FAILURE(status)) return *this;
    173 
    174    int32_t i = pos.getIndex();
    175    // Skip over trailing whitespace
    176    ICU_Utility::skipWhitespace(pattern, i, true);
    177    if (i != pattern.length()) {
    178        status = U_ILLEGAL_ARGUMENT_ERROR;
    179    }
    180    return *this;
    181 }
    182 
    183 void
    184 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
    185                                    ParsePosition& pos,
    186                                    const SymbolTable* symbols,
    187                                    UErrorCode& status) {
    188    if (U_FAILURE(status)) {
    189        return;
    190    }
    191    if (isFrozen()) {
    192        status = U_NO_WRITE_PERMISSION;
    193        return;
    194    }
    195    // Need to build the pattern in a temporary string because
    196    // _applyPattern calls add() etc., which set pat to empty.
    197    UnicodeString rebuiltPat;
    198    RuleCharacterIterator chars(pattern, symbols, pos);
    199    applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
    200    if (U_FAILURE(status)) return;
    201    if (chars.inVariable()) {
    202        // syntaxError(chars, "Extra chars in variable value");
    203        status = U_MALFORMED_SET;
    204        return;
    205    }
    206    setPattern(rebuiltPat);
    207 }
    208 
    209 /**
    210 * Return true if the given position, in the given pattern, appears
    211 * to be the start of a UnicodeSet pattern.
    212 */
    213 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
    214    return ((pos+1) < pattern.length() &&
    215            pattern.charAt(pos) == static_cast<char16_t>(91)/*[*/) ||
    216        resemblesPropertyPattern(pattern, pos);
    217 }
    218 
    219 //----------------------------------------------------------------
    220 // Implementation: Pattern parsing
    221 //----------------------------------------------------------------
    222 
    223 namespace {
    224 
    225 /**
    226 * A small all-inline class to manage a UnicodeSet pointer.  Add
    227 * operator->() etc. as needed.
    228 */
    229 class UnicodeSetPointer {
    230    UnicodeSet* p;
    231 public:
    232    inline UnicodeSetPointer() : p(nullptr) {}
    233    inline ~UnicodeSetPointer() { delete p; }
    234    inline UnicodeSet* pointer() { return p; }
    235    inline UBool allocate() {
    236        if (p == nullptr) {
    237            p = new UnicodeSet();
    238        }
    239        return p != nullptr;
    240    }
    241 };
    242 
    243 constexpr int32_t MAX_DEPTH = 100;
    244 
    245 }  // namespace
    246 
    247 /**
    248 * Parse the pattern from the given RuleCharacterIterator.  The
    249 * iterator is advanced over the parsed pattern.
    250 * @param chars iterator over the pattern characters.  Upon return
    251 * it will be advanced to the first character after the parsed
    252 * pattern, or the end of the iteration if all characters are
    253 * parsed.
    254 * @param symbols symbol table to use to parse and dereference
    255 * variables, or null if none.
    256 * @param rebuiltPat the pattern that was parsed, rebuilt or
    257 * copied from the input pattern, as appropriate.
    258 * @param options a bit mask of zero or more of the following:
    259 * IGNORE_SPACE, CASE.
    260 */
    261 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
    262                              const SymbolTable* symbols,
    263                              UnicodeString& rebuiltPat,
    264                              uint32_t options,
    265                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
    266                              int32_t depth,
    267                              UErrorCode& ec) {
    268    if (U_FAILURE(ec)) return;
    269    if (depth > MAX_DEPTH) {
    270        ec = U_ILLEGAL_ARGUMENT_ERROR;
    271        return;
    272    }
    273 
    274    // Syntax characters: [ ] ^ - & { }
    275 
    276    // Recognized special forms for chars, sets: c-c s-s s&s
    277 
    278    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
    279                   RuleCharacterIterator::PARSE_ESCAPES;
    280    if ((options & USET_IGNORE_SPACE) != 0) {
    281        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    282    }
    283 
    284    UnicodeString patLocal, buf;
    285    UBool usePat = false;
    286    UnicodeSetPointer scratch;
    287    RuleCharacterIterator::Pos backup;
    288 
    289    // mode: 0=before [, 1=between [...], 2=after ]
    290    // lastItem: 0=none, 1=char, 2=set
    291    int8_t lastItem = 0, mode = 0;
    292    UChar32 lastChar = 0;
    293    char16_t op = 0;
    294 
    295    UBool invert = false;
    296 
    297    clear();
    298 
    299    while (mode != 2 && !chars.atEnd()) {
    300        U_ASSERT((lastItem == 0 && op == 0) ||
    301                 (lastItem == 1 && (op == 0 || op == u'-')) ||
    302                 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
    303 
    304        UChar32 c = 0;
    305        UBool literal = false;
    306        UnicodeSet* nested = nullptr; // alias - do not delete
    307 
    308        // -------- Check for property pattern
    309 
    310        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
    311        int8_t setMode = 0;
    312        if (resemblesPropertyPattern(chars, opts)) {
    313            setMode = 2;
    314        }
    315 
    316        // -------- Parse '[' of opening delimiter OR nested set.
    317        // If there is a nested set, use `setMode' to define how
    318        // the set should be parsed.  If the '[' is part of the
    319        // opening delimiter for this pattern, parse special
    320        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
    321        // characters representing a nested set in the symbol
    322        // table.
    323 
    324        else {
    325            // Prepare to backup if necessary
    326            chars.getPos(backup);
    327            c = chars.next(opts, literal, ec);
    328            if (U_FAILURE(ec)) return;
    329 
    330            if (c == u'[' && !literal) {
    331                if (mode == 1) {
    332                    chars.setPos(backup); // backup
    333                    setMode = 1;
    334                } else {
    335                    // Handle opening '[' delimiter
    336                    mode = 1;
    337                    patLocal.append(u'[');
    338                    chars.getPos(backup); // prepare to backup
    339                    c = chars.next(opts, literal, ec); 
    340                    if (U_FAILURE(ec)) return;
    341                    if (c == u'^' && !literal) {
    342                        invert = true;
    343                        patLocal.append(u'^');
    344                        chars.getPos(backup); // prepare to backup
    345                        c = chars.next(opts, literal, ec);
    346                        if (U_FAILURE(ec)) return;
    347                    }
    348                    // Fall through to handle special leading '-';
    349                    // otherwise restart loop for nested [], \p{}, etc.
    350                    if (c == u'-') {
    351                        literal = true;
    352                        // Fall through to handle literal '-' below
    353                    } else {
    354                        chars.setPos(backup); // backup
    355                        continue;
    356                    }
    357                }
    358            } else if (symbols != nullptr) {
    359                const UnicodeFunctor *m = symbols->lookupMatcher(c);
    360                if (m != nullptr) {
    361                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
    362                    if (ms == nullptr) {
    363                        ec = U_MALFORMED_SET;
    364                        return;
    365                    }
    366                    // casting away const, but `nested' won't be modified
    367                    // (important not to modify stored set)
    368                    nested = const_cast<UnicodeSet*>(ms);
    369                    setMode = 3;
    370                }
    371            }
    372        }
    373 
    374        // -------- Handle a nested set.  This either is inline in
    375        // the pattern or represented by a stand-in that has
    376        // previously been parsed and was looked up in the symbol
    377        // table.
    378 
    379        if (setMode != 0) {
    380            if (lastItem == 1) {
    381                if (op != 0) {
    382                    // syntaxError(chars, "Char expected after operator");
    383                    ec = U_MALFORMED_SET;
    384                    return;
    385                }
    386                add(lastChar, lastChar);
    387                _appendToPat(patLocal, lastChar, false);
    388                lastItem = 0;
    389                op = 0;
    390            }
    391 
    392            if (op == u'-' || op == u'&') {
    393                patLocal.append(op);
    394            }
    395 
    396            if (nested == nullptr) {
    397                // lazy allocation
    398                if (!scratch.allocate()) {
    399                    ec = U_MEMORY_ALLOCATION_ERROR;
    400                    return;
    401                }
    402                nested = scratch.pointer();
    403            }
    404            switch (setMode) {
    405            case 1:
    406                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
    407                break;
    408            case 2:
    409                chars.skipIgnored(opts);
    410                nested->applyPropertyPattern(chars, patLocal, ec);
    411                if (U_FAILURE(ec)) return;
    412                break;
    413            case 3: // `nested' already parsed
    414                nested->_toPattern(patLocal, false);
    415                break;
    416            }
    417 
    418            usePat = true;
    419 
    420            if (mode == 0) {
    421                // Entire pattern is a category; leave parse loop
    422                *this = *nested;
    423                mode = 2;
    424                break;
    425            }
    426 
    427            switch (op) {
    428            case u'-':
    429                removeAll(*nested);
    430                break;
    431            case u'&':
    432                retainAll(*nested);
    433                break;
    434            case 0:
    435                addAll(*nested);
    436                break;
    437            }
    438 
    439            op = 0;
    440            lastItem = 2;
    441 
    442            continue;
    443        }
    444 
    445        if (mode == 0) {
    446            // syntaxError(chars, "Missing '['");
    447            ec = U_MALFORMED_SET;
    448            return;
    449        }
    450 
    451        // -------- Parse special (syntax) characters.  If the
    452        // current character is not special, or if it is escaped,
    453        // then fall through and handle it below.
    454 
    455        if (!literal) {
    456            switch (c) {
    457            case u']':
    458                if (lastItem == 1) {
    459                    add(lastChar, lastChar);
    460                    _appendToPat(patLocal, lastChar, false);
    461                }
    462                // Treat final trailing '-' as a literal
    463                if (op == u'-') {
    464                    add(op, op);
    465                    patLocal.append(op);
    466                } else if (op == u'&') {
    467                    // syntaxError(chars, "Trailing '&'");
    468                    ec = U_MALFORMED_SET;
    469                    return;
    470                }
    471                patLocal.append(u']');
    472                mode = 2;
    473                continue;
    474            case u'-':
    475                if (op == 0) {
    476                    if (lastItem != 0) {
    477                        op = static_cast<char16_t>(c);
    478                        continue;
    479                    } else {
    480                        // Treat final trailing '-' as a literal
    481                        add(c, c);
    482                        c = chars.next(opts, literal, ec);
    483                        if (U_FAILURE(ec)) return;
    484                        if (c == u']' && !literal) {
    485                            patLocal.append(u"-]", 2);
    486                            mode = 2;
    487                            continue;
    488                        }
    489                    }
    490                }
    491                // syntaxError(chars, "'-' not after char or set");
    492                ec = U_MALFORMED_SET;
    493                return;
    494            case u'&':
    495                if (lastItem == 2 && op == 0) {
    496                    op = static_cast<char16_t>(c);
    497                    continue;
    498                }
    499                // syntaxError(chars, "'&' not after set");
    500                ec = U_MALFORMED_SET;
    501                return;
    502            case u'^':
    503                // syntaxError(chars, "'^' not after '['");
    504                ec = U_MALFORMED_SET;
    505                return;
    506            case u'{':
    507                if (op != 0) {
    508                    // syntaxError(chars, "Missing operand after operator");
    509                    ec = U_MALFORMED_SET;
    510                    return;
    511                }
    512                if (lastItem == 1) {
    513                    add(lastChar, lastChar);
    514                    _appendToPat(patLocal, lastChar, false);
    515                }
    516                lastItem = 0;
    517                buf.truncate(0);
    518                {
    519                    UBool ok = false;
    520                    while (!chars.atEnd()) {
    521                        c = chars.next(opts, literal, ec);
    522                        if (U_FAILURE(ec)) return;
    523                        if (c == u'}' && !literal) {
    524                            ok = true;
    525                            break;
    526                        }
    527                        buf.append(c);
    528                    }
    529                    if (!ok) {
    530                        // syntaxError(chars, "Invalid multicharacter string");
    531                        ec = U_MALFORMED_SET;
    532                        return;
    533                    }
    534                }
    535                // We have new string. Add it to set and continue;
    536                // we don't need to drop through to the further
    537                // processing
    538                add(buf);
    539                patLocal.append(u'{');
    540                _appendToPat(patLocal, buf, false);
    541                patLocal.append(u'}');
    542                continue;
    543            case SymbolTable::SYMBOL_REF:
    544                //         symbols  nosymbols
    545                // [a-$]   error    error (ambiguous)
    546                // [a$]    anchor   anchor
    547                // [a-$x]  var "x"* literal '$'
    548                // [a-$.]  error    literal '$'
    549                // *We won't get here in the case of var "x"
    550                {
    551                    chars.getPos(backup);
    552                    c = chars.next(opts, literal, ec);
    553                    if (U_FAILURE(ec)) return;
    554                    UBool anchor = (c == u']' && !literal);
    555                    if (symbols == nullptr && !anchor) {
    556                        c = SymbolTable::SYMBOL_REF;
    557                        chars.setPos(backup);
    558                        break; // literal '$'
    559                    }
    560                    if (anchor && op == 0) {
    561                        if (lastItem == 1) {
    562                            add(lastChar, lastChar);
    563                            _appendToPat(patLocal, lastChar, false);
    564                        }
    565                        add(U_ETHER);
    566                        usePat = true;
    567                        patLocal.append(static_cast<char16_t>(SymbolTable::SYMBOL_REF));
    568                        patLocal.append(u']');
    569                        mode = 2;
    570                        continue;
    571                    }
    572                    // syntaxError(chars, "Unquoted '$'");
    573                    ec = U_MALFORMED_SET;
    574                    return;
    575                }
    576            default:
    577                break;
    578            }
    579        }
    580 
    581        // -------- Parse literal characters.  This includes both
    582        // escaped chars ("\u4E01") and non-syntax characters
    583        // ("a").
    584 
    585        switch (lastItem) {
    586        case 0:
    587            lastItem = 1;
    588            lastChar = c;
    589            break;
    590        case 1:
    591            if (op == u'-') {
    592                if (lastChar >= c) {
    593                    // Don't allow redundant (a-a) or empty (b-a) ranges;
    594                    // these are most likely typos.
    595                    // syntaxError(chars, "Invalid range");
    596                    ec = U_MALFORMED_SET;
    597                    return;
    598                }
    599                add(lastChar, c);
    600                _appendToPat(patLocal, lastChar, false);
    601                patLocal.append(op);
    602                _appendToPat(patLocal, c, false);
    603                lastItem = 0;
    604                op = 0;
    605            } else {
    606                add(lastChar, lastChar);
    607                _appendToPat(patLocal, lastChar, false);
    608                lastChar = c;
    609            }
    610            break;
    611        case 2:
    612            if (op != 0) {
    613                // syntaxError(chars, "Set expected after operator");
    614                ec = U_MALFORMED_SET;
    615                return;
    616            }
    617            lastChar = c;
    618            lastItem = 1;
    619            break;
    620        }
    621    }
    622 
    623    if (mode != 2) {
    624        // syntaxError(chars, "Missing ']'");
    625        ec = U_MALFORMED_SET;
    626        return;
    627    }
    628 
    629    chars.skipIgnored(opts);
    630 
    631    /**
    632     * Handle global flags (invert, case insensitivity).  If this
    633     * pattern should be compiled case-insensitive, then we need
    634     * to close over case BEFORE COMPLEMENTING.  This makes
    635     * patterns like /[^abc]/i work.
    636     */
    637    if ((options & USET_CASE_MASK) != 0) {
    638        (this->*caseClosure)(options);
    639    }
    640    if (invert) {
    641        complement().removeAllStrings();  // code point complement
    642    }
    643 
    644    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    645    // generated pattern.
    646    if (usePat) {
    647        rebuiltPat.append(patLocal);
    648    } else {
    649        _generatePattern(rebuiltPat, false);
    650    }
    651    if (isBogus() && U_SUCCESS(ec)) {
    652        // We likely ran out of memory. AHHH!
    653        ec = U_MEMORY_ALLOCATION_ERROR;
    654    }
    655 }
    656 
    657 //----------------------------------------------------------------
    658 // Property set implementation
    659 //----------------------------------------------------------------
    660 
    661 namespace {
    662 
    663 UBool numericValueFilter(UChar32 ch, void* context) {
    664    return u_getNumericValue(ch) == *static_cast<double*>(context);
    665 }
    666 
    667 UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
    668    int32_t value = *static_cast<int32_t*>(context);
    669    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
    670 }
    671 
    672 UBool versionFilter(UChar32 ch, void* context) {
    673    static const UVersionInfo none = { 0, 0, 0, 0 };
    674    UVersionInfo v;
    675    u_charAge(ch, v);
    676    UVersionInfo* version = static_cast<UVersionInfo*>(context);
    677    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
    678 }
    679 
    680 typedef struct {
    681    UProperty prop;
    682    int32_t value;
    683 } IntPropertyContext;
    684 
    685 UBool intPropertyFilter(UChar32 ch, void* context) {
    686    IntPropertyContext* c = static_cast<IntPropertyContext*>(context);
    687    return u_getIntPropertyValue(ch, c->prop) == c->value;
    688 }
    689 
    690 UBool scriptExtensionsFilter(UChar32 ch, void* context) {
    691    return uscript_hasScript(ch, *static_cast<UScriptCode*>(context));
    692 }
    693 
    694 UBool idTypeFilter(UChar32 ch, void* context) {
    695    return u_hasIDType(ch, *static_cast<UIdentifierType*>(context));
    696 }
    697 
    698 }  // namespace
    699 
    700 /**
    701 * Generic filter-based scanning code for UCD property UnicodeSets.
    702 */
    703 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
    704                             void* context,
    705                             const UnicodeSet* inclusions,
    706                             UErrorCode &status) {
    707    if (U_FAILURE(status)) return;
    708 
    709    // Logically, walk through all Unicode characters, noting the start
    710    // and end of each range for which filter.contain(c) is
    711    // true.  Add each range to a set.
    712    //
    713    // To improve performance, use an inclusions set which
    714    // encodes information about character ranges that are known
    715    // to have identical properties.
    716    // inclusions contains the first characters of
    717    // same-value ranges for the given property.
    718 
    719    clear();
    720 
    721    UChar32 startHasProperty = -1;
    722    int32_t limitRange = inclusions->getRangeCount();
    723 
    724    for (int j=0; j<limitRange; ++j) {
    725        // get current range
    726        UChar32 start = inclusions->getRangeStart(j);
    727        UChar32 end = inclusions->getRangeEnd(j);
    728 
    729        // for all the code points in the range, process
    730        for (UChar32 ch = start; ch <= end; ++ch) {
    731            // only add to this UnicodeSet on inflection points --
    732            // where the hasProperty value changes to false
    733            if ((*filter)(ch, context)) {
    734                if (startHasProperty < 0) {
    735                    startHasProperty = ch;
    736                }
    737            } else if (startHasProperty >= 0) {
    738                add(startHasProperty, ch-1);
    739                startHasProperty = -1;
    740            }
    741        }
    742    }
    743    if (startHasProperty >= 0) {
    744        add(startHasProperty, static_cast<UChar32>(0x10FFFF));
    745    }
    746    if (isBogus() && U_SUCCESS(status)) {
    747        // We likely ran out of memory. AHHH!
    748        status = U_MEMORY_ALLOCATION_ERROR;
    749    }
    750 }
    751 
    752 namespace {
    753 
    754 UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
    755    /* Note: we use ' ' in compiler code page */
    756    int32_t j = 0;
    757    char ch;
    758    --dstCapacity; /* make room for term. zero */
    759    while ((ch = *src++) != 0) {
    760        if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
    761            continue;
    762        }
    763        if (j >= dstCapacity) return false;
    764        dst[j++] = ch;
    765    }
    766    if (j > 0 && dst[j-1] == ' ') --j;
    767    dst[j] = 0;
    768    return true;
    769 }
    770 
    771 }  // namespace
    772 
    773 //----------------------------------------------------------------
    774 // Property set API
    775 //----------------------------------------------------------------
    776 
    777 #define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    778    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    779    return *this; \
    780 } UPRV_BLOCK_MACRO_END
    781 
    782 UnicodeSet&
    783 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
    784    if (U_FAILURE(ec) || isFrozen()) { return *this; }
    785    if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
    786        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
    787        applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
    788    } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
    789        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
    790        UScriptCode script = static_cast<UScriptCode>(value);
    791        applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
    792    } else if (prop == UCHAR_IDENTIFIER_TYPE) {
    793        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
    794        UIdentifierType idType = static_cast<UIdentifierType>(value);
    795        applyFilter(idTypeFilter, &idType, inclusions, ec);
    796    } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
    797        if (value == 0 || value == 1) {
    798            const USet *set = u_getBinaryPropertySet(prop, &ec);
    799            if (U_FAILURE(ec)) { return *this; }
    800            copyFrom(*UnicodeSet::fromUSet(set), true);
    801            if (value == 0) {
    802                complement().removeAllStrings();  // code point complement
    803            }
    804        } else {
    805            clear();
    806        }
    807    } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
    808        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
    809        IntPropertyContext c = {prop, value};
    810        applyFilter(intPropertyFilter, &c, inclusions, ec);
    811    } else {
    812        ec = U_ILLEGAL_ARGUMENT_ERROR;
    813    }
    814    return *this;
    815 }
    816 
    817 UnicodeSet&
    818 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
    819                               const UnicodeString& value,
    820                               UErrorCode& ec) {
    821    if (U_FAILURE(ec) || isFrozen()) return *this;
    822 
    823    // prop and value used to be converted to char * using the default
    824    // converter instead of the invariant conversion.
    825    // This should not be necessary because all Unicode property and value
    826    // names use only invariant characters.
    827    // If there are any variant characters, then we won't find them anyway.
    828    // Checking first avoids assertion failures in the conversion.
    829    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
    830        !uprv_isInvariantUString(value.getBuffer(), value.length())
    831    ) {
    832        FAIL(ec);
    833    }
    834    CharString pname, vname;
    835    pname.appendInvariantChars(prop, ec);
    836    vname.appendInvariantChars(value, ec);
    837    if (U_FAILURE(ec)) return *this;
    838 
    839    UProperty p;
    840    int32_t v;
    841    UBool invert = false;
    842 
    843    if (value.length() > 0) {
    844        p = u_getPropertyEnum(pname.data());
    845        if (p == UCHAR_INVALID_CODE) FAIL(ec);
    846 
    847        // Treat gc as gcm
    848        if (p == UCHAR_GENERAL_CATEGORY) {
    849            p = UCHAR_GENERAL_CATEGORY_MASK;
    850        }
    851 
    852        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
    853            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
    854            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
    855            v = u_getPropertyValueEnum(p, vname.data());
    856            if (v == UCHAR_INVALID_CODE) {
    857                // Handle numeric CCC
    858                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
    859                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
    860                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
    861                    char* end;
    862                    double val = uprv_strtod(vname.data(), &end);
    863                    // Anything between 0 and 255 is valid even if unused.
    864                    // Cast double->int only after range check.
    865                    // We catch NaN here because comparing it with both 0 and 255 will be false
    866                    // (as are all comparisons with NaN).
    867                    if (*end != 0 || !(0 <= val && val <= 255) ||
    868                            (v = static_cast<int32_t>(val)) != val) {
    869                        // non-integral value or outside 0..255, or trailing junk
    870                        FAIL(ec);
    871                    }
    872                } else {
    873                    FAIL(ec);
    874                }
    875            }
    876        }
    877 
    878        else {
    879 
    880            switch (p) {
    881            case UCHAR_NUMERIC_VALUE:
    882                {
    883                    char* end;
    884                    double val = uprv_strtod(vname.data(), &end);
    885                    if (*end != 0) {
    886                        FAIL(ec);
    887                    }
    888                    applyFilter(numericValueFilter, &val,
    889                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
    890                    return *this;
    891                }
    892            case UCHAR_NAME:
    893                {
    894                    // Must munge name, since u_charFromName() does not do
    895                    // 'loose' matching.
    896                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
    897                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
    898                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
    899                    if (U_SUCCESS(ec)) {
    900                        clear();
    901                        add(ch);
    902                        return *this;
    903                    } else {
    904                        FAIL(ec);
    905                    }
    906                }
    907            case UCHAR_UNICODE_1_NAME:
    908                // ICU 49 deprecates the Unicode_1_Name property APIs.
    909                FAIL(ec);
    910            case UCHAR_AGE:
    911                {
    912                    // Must munge name, since u_versionFromString() does not do
    913                    // 'loose' matching.
    914                    char buf[128];
    915                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
    916                    UVersionInfo version;
    917                    u_versionFromString(version, buf);
    918                    applyFilter(versionFilter, &version,
    919                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
    920                    return *this;
    921                }
    922            case UCHAR_SCRIPT_EXTENSIONS:
    923                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
    924                if (v == UCHAR_INVALID_CODE) {
    925                    FAIL(ec);
    926                }
    927                // fall through to calling applyIntPropertyValue()
    928                break;
    929            case UCHAR_IDENTIFIER_TYPE:
    930                v = u_getPropertyValueEnum(p, vname.data());
    931                if (v == UCHAR_INVALID_CODE) {
    932                    FAIL(ec);
    933                }
    934                // fall through to calling applyIntPropertyValue()
    935                break;
    936            default:
    937                // p is a non-binary, non-enumerated property that we
    938                // don't support (yet).
    939                FAIL(ec);
    940            }
    941        }
    942    }
    943 
    944    else {
    945        // value is empty.  Interpret as General Category, Script, or
    946        // Binary property.
    947        p = UCHAR_GENERAL_CATEGORY_MASK;
    948        v = u_getPropertyValueEnum(p, pname.data());
    949        if (v == UCHAR_INVALID_CODE) {
    950            p = UCHAR_SCRIPT;
    951            v = u_getPropertyValueEnum(p, pname.data());
    952            if (v == UCHAR_INVALID_CODE) {
    953                p = u_getPropertyEnum(pname.data());
    954                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
    955                    v = 1;
    956                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
    957                    set(MIN_VALUE, MAX_VALUE);
    958                    return *this;
    959                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
    960                    set(0, 0x7F);
    961                    return *this;
    962                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
    963                    // [:Assigned:]=[:^Cn:]
    964                    p = UCHAR_GENERAL_CATEGORY_MASK;
    965                    v = U_GC_CN_MASK;
    966                    invert = true;
    967                } else {
    968                    FAIL(ec);
    969                }
    970            }
    971        }
    972    }
    973 
    974    applyIntPropertyValue(p, v, ec);
    975    if(invert) {
    976        complement().removeAllStrings();  // code point complement
    977    }
    978 
    979    if (isBogus() && U_SUCCESS(ec)) {
    980        // We likely ran out of memory. AHHH!
    981        ec = U_MEMORY_ALLOCATION_ERROR;
    982    }
    983    return *this;
    984 }
    985 
    986 //----------------------------------------------------------------
    987 // Property set patterns
    988 //----------------------------------------------------------------
    989 
    990 /**
    991 * Return true if the given position, in the given pattern, appears
    992 * to be the start of a property set pattern.
    993 */
    994 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
    995                                           int32_t pos) {
    996    // Patterns are at least 5 characters long
    997    if ((pos+5) > pattern.length()) {
    998        return false;
    999    }
   1000 
   1001    // Look for an opening [:, [:^, \p, or \P
   1002    return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
   1003 }
   1004 
   1005 /**
   1006 * Return true if the given iterator appears to point at a
   1007 * property pattern.  Regardless of the result, return with the
   1008 * iterator unchanged.
   1009 * @param chars iterator over the pattern characters.  Upon return
   1010 * it will be unchanged.
   1011 * @param iterOpts RuleCharacterIterator options
   1012 */
   1013 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
   1014                                           int32_t iterOpts) {
   1015    // NOTE: literal will always be false, because we don't parse escapes.
   1016    UBool result = false, literal;
   1017    UErrorCode ec = U_ZERO_ERROR;
   1018    iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
   1019    RuleCharacterIterator::Pos pos;
   1020    chars.getPos(pos);
   1021    UChar32 c = chars.next(iterOpts, literal, ec);
   1022    if (c == u'[' || c == u'\\') {
   1023        UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
   1024                               literal, ec);
   1025        result = (c == u'[') ? (d == u':') :
   1026                               (d == u'N' || d == u'p' || d == u'P');
   1027    }
   1028    chars.setPos(pos);
   1029    return result && U_SUCCESS(ec);
   1030 }
   1031 
   1032 /**
   1033 * Parse the given property pattern at the given parse position.
   1034 */
   1035 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
   1036                                             ParsePosition& ppos,
   1037                                             UErrorCode &ec) {
   1038    int32_t pos = ppos.getIndex();
   1039 
   1040    UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
   1041    UBool isName = false; // true for \N{pat}, o/w false
   1042    UBool invert = false;
   1043 
   1044    if (U_FAILURE(ec)) return *this;
   1045 
   1046    // Minimum length is 5 characters, e.g. \p{L}
   1047    if ((pos+5) > pattern.length()) {
   1048        FAIL(ec);
   1049    }
   1050 
   1051    // On entry, ppos should point to one of the following locations:
   1052    // Look for an opening [:, [:^, \p, or \P
   1053    if (isPOSIXOpen(pattern, pos)) {
   1054        posix = true;
   1055        pos += 2;
   1056        pos = ICU_Utility::skipWhitespace(pattern, pos);
   1057        if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
   1058            ++pos;
   1059            invert = true;
   1060        }
   1061    } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
   1062        char16_t c = pattern.charAt(pos+1);
   1063        invert = (c == u'P');
   1064        isName = (c == u'N');
   1065        pos += 2;
   1066        pos = ICU_Utility::skipWhitespace(pattern, pos);
   1067        if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
   1068            // Syntax error; "\p" or "\P" not followed by "{"
   1069            FAIL(ec);
   1070        }
   1071    } else {
   1072        // Open delimiter not seen
   1073        FAIL(ec);
   1074    }
   1075 
   1076    // Look for the matching close delimiter, either :] or }
   1077    int32_t close;
   1078    if (posix) {
   1079      close = pattern.indexOf(u":]", 2, pos);
   1080    } else {
   1081      close = pattern.indexOf(u'}', pos);
   1082    }
   1083    if (close < 0) {
   1084        // Syntax error; close delimiter missing
   1085        FAIL(ec);
   1086    }
   1087 
   1088    // Look for an '=' sign.  If this is present, we will parse a
   1089    // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
   1090    // pattern.
   1091    int32_t equals = pattern.indexOf(u'=', pos);
   1092    UnicodeString propName, valueName;
   1093    if (equals >= 0 && equals < close && !isName) {
   1094        // Equals seen; parse medium/long pattern
   1095        pattern.extractBetween(pos, equals, propName);
   1096        pattern.extractBetween(equals+1, close, valueName);
   1097    }
   1098 
   1099    else {
   1100        // Handle case where no '=' is seen, and \N{}
   1101        pattern.extractBetween(pos, close, propName);
   1102            
   1103        // Handle \N{name}
   1104        if (isName) {
   1105            // This is a little inefficient since it means we have to
   1106            // parse NAME_PROP back to UCHAR_NAME even though we already
   1107            // know it's UCHAR_NAME.  If we refactor the API to
   1108            // support args of (UProperty, char*) then we can remove
   1109            // NAME_PROP and make this a little more efficient.
   1110            valueName = propName;
   1111            propName = NAME_PROP;
   1112        }
   1113    }
   1114 
   1115    applyPropertyAlias(propName, valueName, ec);
   1116 
   1117    if (U_SUCCESS(ec)) {
   1118        if (invert) {
   1119            complement().removeAllStrings();  // code point complement
   1120        }
   1121 
   1122        // Move to the limit position after the close delimiter if the
   1123        // parse succeeded.
   1124        ppos.setIndex(close + (posix ? 2 : 1));
   1125    }
   1126 
   1127    return *this;
   1128 }
   1129 
   1130 /**
   1131 * Parse a property pattern.
   1132 * @param chars iterator over the pattern characters.  Upon return
   1133 * it will be advanced to the first character after the parsed
   1134 * pattern, or the end of the iteration if all characters are
   1135 * parsed.
   1136 * @param rebuiltPat the pattern that was parsed, rebuilt or
   1137 * copied from the input pattern, as appropriate.
   1138 */
   1139 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
   1140                                      UnicodeString& rebuiltPat,
   1141                                      UErrorCode& ec) {
   1142    if (U_FAILURE(ec)) return;
   1143    UnicodeString pattern;
   1144    chars.lookahead(pattern);
   1145    ParsePosition pos(0);
   1146    applyPropertyPattern(pattern, pos, ec);
   1147    if (U_FAILURE(ec)) return;
   1148    if (pos.getIndex() == 0) {
   1149        // syntaxError(chars, "Invalid property pattern");
   1150        ec = U_MALFORMED_SET;
   1151        return;
   1152    }
   1153    chars.jumpahead(pos.getIndex());
   1154    rebuiltPat.append(pattern, 0, pos.getIndex());
   1155 }
   1156 
   1157 U_NAMESPACE_END