[ tor-browser ].git.dasho

uniset.cpp (72729B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   10/20/99    alan        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/parsepos.h"
     15 #include "unicode/symtable.h"
     16 #include "unicode/uniset.h"
     17 #include "unicode/ustring.h"
     18 #include "unicode/utf8.h"
     19 #include "unicode/utf16.h"
     20 #include "ruleiter.h"
     21 #include "cmemory.h"
     22 #include "cstring.h"
     23 #include "patternprops.h"
     24 #include "uelement.h"
     25 #include "util.h"
     26 #include "uvector.h"
     27 #include "charstr.h"
     28 #include "ustrfmt.h"
     29 #include "uassert.h"
     30 #include "bmpset.h"
     31 #include "unisetspan.h"
     32 
     33 // HIGH_VALUE > all valid values. 110000 for codepoints
     34 #define UNICODESET_HIGH 0x0110000
     35 
     36 // LOW <= all valid values. ZERO for codepoints
     37 #define UNICODESET_LOW 0x000000
     38 
     39 /** Max list [0, 1, 2, ..., max code point, HIGH] */
     40 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
     41 
     42 U_NAMESPACE_BEGIN
     43 
     44 SymbolTable::~SymbolTable() {}
     45 
     46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
     47 
     48 /**
     49 * Modify the given UChar32 variable so that it is in range, by
     50 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
     51 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
     52 * It modifies its argument in-place and also returns it.
     53 */
     54 static inline UChar32 pinCodePoint(UChar32& c) {
     55    if (c < UNICODESET_LOW) {
     56        c = UNICODESET_LOW;
     57    } else if (c > (UNICODESET_HIGH-1)) {
     58        c = (UNICODESET_HIGH-1);
     59    }
     60    return c;
     61 }
     62 
     63 //----------------------------------------------------------------
     64 // Debugging
     65 //----------------------------------------------------------------
     66 
     67 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
     68 // To enable the debugging, define the symbol DEBUG_MEM in the line
     69 // below.  This will result in text being sent to stdout that looks
     70 // like this:
     71 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
     72 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
     73 // Each line lists a construction (ct) or destruction (dt) event, the
     74 // object address, the number of outstanding objects after the event,
     75 // and the pattern of the object in question.
     76 
     77 // #define DEBUG_MEM
     78 
     79 #ifdef DEBUG_MEM
     80 #include <stdio.h>
     81 static int32_t _dbgCount = 0;
     82 
     83 static inline void _dbgct(UnicodeSet* set) {
     84    UnicodeString str;
     85    set->toPattern(str, true);
     86    char buf[40];
     87    str.extract(0, 39, buf, "");
     88    printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
     89 }
     90 
     91 static inline void _dbgdt(UnicodeSet* set) {
     92    UnicodeString str;
     93    set->toPattern(str, true);
     94    char buf[40];
     95    str.extract(0, 39, buf, "");
     96    printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
     97 }
     98 
     99 #else
    100 
    101 #define _dbgct(set)
    102 #define _dbgdt(set)
    103 
    104 #endif
    105 
    106 //----------------------------------------------------------------
    107 // UnicodeString in UVector support
    108 //----------------------------------------------------------------
    109 
    110 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
    111    dst->pointer = new UnicodeString(*static_cast<UnicodeString*>(src->pointer));
    112 }
    113 
    114 static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
    115    const UnicodeString& a = *static_cast<const UnicodeString*>(t1.pointer);
    116    const UnicodeString& b = *static_cast<const UnicodeString*>(t2.pointer);
    117    return a.compare(b);
    118 }
    119 
    120 UBool UnicodeSet::hasStrings() const {
    121    return strings_ != nullptr && !strings_->isEmpty();
    122 }
    123 
    124 int32_t UnicodeSet::stringsSize() const {
    125    return strings_ == nullptr ? 0 : strings_->size();
    126 }
    127 
    128 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
    129    return strings_ != nullptr && strings_->contains((void*) &s);
    130 }
    131 
    132 //----------------------------------------------------------------
    133 // Constructors &c
    134 //----------------------------------------------------------------
    135 
    136 /**
    137 * Constructs an empty set.
    138 */
    139 UnicodeSet::UnicodeSet() {
    140    list[0] = UNICODESET_HIGH;
    141    _dbgct(this);
    142 }
    143 
    144 /**
    145 * Constructs a set containing the given range. If <code>end >
    146 * start</code> then an empty set is created.
    147 *
    148 * @param start first character, inclusive, of range
    149 * @param end last character, inclusive, of range
    150 */
    151 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
    152    list[0] = UNICODESET_HIGH;
    153    add(start, end);
    154    _dbgct(this);
    155 }
    156 
    157 /**
    158 * Constructs a set that is identical to the given UnicodeSet.
    159 */
    160 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
    161    *this = o;
    162    _dbgct(this);
    163 }
    164 
    165 // Copy-construct as thawed.
    166 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
    167    if (ensureCapacity(o.len)) {
    168        // *this = o except for bmpSet and stringSpan
    169        len = o.len;
    170        uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
    171        if (o.hasStrings()) {
    172            UErrorCode status = U_ZERO_ERROR;
    173            if (!allocateStrings(status) ||
    174                    (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
    175                setToBogus();
    176                return;
    177            }
    178        }
    179        if (o.pat) {
    180            setPattern(o.pat, o.patLen);
    181        }
    182        _dbgct(this);
    183    }
    184 }
    185 
    186 /**
    187 * Destructs the set.
    188 */
    189 UnicodeSet::~UnicodeSet() {
    190    _dbgdt(this); // first!
    191    if (list != stackList) {
    192        uprv_free(list);
    193    }
    194    delete bmpSet;
    195    if (buffer != stackList) {
    196        uprv_free(buffer);
    197    }
    198    delete strings_;
    199    delete stringSpan;
    200    releasePattern();
    201 }
    202 
    203 /**
    204 * Assigns this object to be a copy of another.
    205 */
    206 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
    207    return copyFrom(o, false);
    208 }
    209 
    210 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
    211    if (this == &o) {
    212        return *this;
    213    }
    214    if (isFrozen()) {
    215        return *this;
    216    }
    217    if (o.isBogus()) {
    218        setToBogus();
    219        return *this;
    220    }
    221    if (!ensureCapacity(o.len)) {
    222        // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
    223        return *this;
    224    }
    225    len = o.len;
    226    uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
    227    if (o.bmpSet != nullptr && !asThawed) {
    228        bmpSet = new BMPSet(*o.bmpSet, list, len);
    229        if (bmpSet == nullptr) { // Check for memory allocation error.
    230            setToBogus();
    231            return *this;
    232        }
    233    }
    234    if (o.hasStrings()) {
    235        UErrorCode status = U_ZERO_ERROR;
    236        if ((strings_ == nullptr && !allocateStrings(status)) ||
    237                (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
    238            setToBogus();
    239            return *this;
    240        }
    241    } else if (hasStrings()) {
    242        strings_->removeAllElements();
    243    }
    244    if (o.stringSpan != nullptr && !asThawed) {
    245        stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings_);
    246        if (stringSpan == nullptr) { // Check for memory allocation error.
    247            setToBogus();
    248            return *this;
    249        }
    250    }
    251    releasePattern();
    252    if (o.pat) {
    253        setPattern(o.pat, o.patLen);
    254    }
    255    return *this;
    256 }
    257 
    258 /**
    259 * Returns a copy of this object.  All UnicodeMatcher objects have
    260 * to support cloning in order to allow classes using
    261 * UnicodeMatchers, such as Transliterator, to implement cloning.
    262 */
    263 UnicodeSet* UnicodeSet::clone() const {
    264    return new UnicodeSet(*this);
    265 }
    266 
    267 UnicodeSet *UnicodeSet::cloneAsThawed() const {
    268    return new UnicodeSet(*this, true);
    269 }
    270 
    271 /**
    272 * Compares the specified object with this set for equality.  Returns
    273 * <tt>true</tt> if the two sets
    274 * have the same size, and every member of the specified set is
    275 * contained in this set (or equivalently, every member of this set is
    276 * contained in the specified set).
    277 *
    278 * @param o set to be compared for equality with this set.
    279 * @return <tt>true</tt> if the specified set is equal to this set.
    280 */
    281 bool UnicodeSet::operator==(const UnicodeSet& o) const {
    282    if (len != o.len) return false;
    283    for (int32_t i = 0; i < len; ++i) {
    284        if (list[i] != o.list[i]) return false;
    285    }
    286    if (hasStrings() != o.hasStrings()) { return false; }
    287    if (hasStrings() && *strings_ != *o.strings_) return false;
    288    return true;
    289 }
    290 
    291 /**
    292 * Returns the hash code value for this set.
    293 *
    294 * @return the hash code value for this set.
    295 * @see Object#hashCode()
    296 */
    297 int32_t UnicodeSet::hashCode() const {
    298    uint32_t result = static_cast<uint32_t>(len);
    299    for (int32_t i = 0; i < len; ++i) {
    300        result *= 1000003u;
    301        result += list[i];
    302    }
    303    return static_cast<int32_t>(result);
    304 }
    305 
    306 //----------------------------------------------------------------
    307 // Public API
    308 //----------------------------------------------------------------
    309 
    310 /**
    311 * Returns the number of elements in this set (its cardinality),
    312 * Note than the elements of a set may include both individual
    313 * codepoints and strings.
    314 *
    315 * @return the number of elements in this set (its cardinality).
    316 */
    317 int32_t UnicodeSet::size() const {
    318    int32_t n = 0;
    319    int32_t count = getRangeCount();
    320    for (int32_t i = 0; i < count; ++i) {
    321        n += getRangeEnd(i) - getRangeStart(i) + 1;
    322    }
    323    return n + stringsSize();
    324 }
    325 
    326 /**
    327 * Returns <tt>true</tt> if this set contains no elements.
    328 *
    329 * @return <tt>true</tt> if this set contains no elements.
    330 */
    331 UBool UnicodeSet::isEmpty() const {
    332    return len == 1 && !hasStrings();
    333 }
    334 
    335 /**
    336 * Returns true if this set contains the given character.
    337 * @param c character to be checked for containment
    338 * @return true if the test condition is met
    339 */
    340 UBool UnicodeSet::contains(UChar32 c) const {
    341    // Set i to the index of the start item greater than ch
    342    // We know we will terminate without length test!
    343    // LATER: for large sets, add binary search
    344    //int32_t i = -1;
    345    //for (;;) {
    346    //    if (c < list[++i]) break;
    347    //}
    348    if (bmpSet != nullptr) {
    349        return bmpSet->contains(c);
    350    }
    351    if (stringSpan != nullptr) {
    352        return stringSpan->contains(c);
    353    }
    354    if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
    355        return false;
    356    }
    357    int32_t i = findCodePoint(c);
    358    return i & 1; // return true if odd
    359 }
    360 
    361 /**
    362 * Returns the smallest value i such that c < list[i].  Caller
    363 * must ensure that c is a legal value or this method will enter
    364 * an infinite loop.  This method performs a binary search.
    365 * @param c a character in the range MIN_VALUE..MAX_VALUE
    366 * inclusive
    367 * @return the smallest integer i in the range 0..len-1,
    368 * inclusive, such that c < list[i]
    369 */
    370 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
    371    /* Examples:
    372                                       findCodePoint(c)
    373       set              list[]         c=0 1 3 4 7 8
    374       ===              ==============   ===========
    375       []               [110000]         0 0 0 0 0 0
    376       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
    377       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
    378       [:Any:]          [0, 110000]      1 1 1 1 1 1
    379     */
    380 
    381    // Return the smallest i such that c < list[i].  Assume
    382    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    383    if (c < list[0])
    384        return 0;
    385    // High runner test.  c is often after the last range, so an
    386    // initial check for this condition pays off.
    387    int32_t lo = 0;
    388    int32_t hi = len - 1;
    389    if (lo >= hi || c >= list[hi-1])
    390        return hi;
    391    // invariant: c >= list[lo]
    392    // invariant: c < list[hi]
    393    for (;;) {
    394        int32_t i = (lo + hi) >> 1;
    395        if (i == lo) {
    396            break; // Found!
    397        } else if (c < list[i]) {
    398            hi = i;
    399        } else {
    400            lo = i;
    401        }
    402    }
    403    return hi;
    404 }
    405 
    406 /**
    407 * Returns true if this set contains every character
    408 * of the given range.
    409 * @param start first character, inclusive, of the range
    410 * @param end last character, inclusive, of the range
    411 * @return true if the test condition is met
    412 */
    413 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
    414    //int32_t i = -1;
    415    //for (;;) {
    416    //    if (start < list[++i]) break;
    417    //}
    418    int32_t i = findCodePoint(start);
    419    return ((i & 1) != 0 && end < list[i]);
    420 }
    421 
    422 /**
    423 * Returns <tt>true</tt> if this set contains the given
    424 * multicharacter string.
    425 * @param s string to be checked for containment
    426 * @return <tt>true</tt> if this set contains the specified string
    427 */
    428 UBool UnicodeSet::contains(const UnicodeString& s) const {
    429    int32_t cp = getSingleCP(s);
    430    if (cp < 0) {
    431        return stringsContains(s);
    432    } else {
    433        return contains(static_cast<UChar32>(cp));
    434    }
    435 }
    436 
    437 /**
    438 * Returns true if this set contains all the characters and strings
    439 * of the given set.
    440 * @param c set to be checked for containment
    441 * @return true if the test condition is met
    442 */
    443 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
    444    // The specified set is a subset if all of its pairs are contained in
    445    // this set.  It's possible to code this more efficiently in terms of
    446    // direct manipulation of the inversion lists if the need arises.
    447    int32_t n = c.getRangeCount();
    448    for (int i=0; i<n; ++i) {
    449        if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
    450            return false;
    451        }
    452    }
    453    return !c.hasStrings() || (strings_ != nullptr && strings_->containsAll(*c.strings_));
    454 }
    455 
    456 /**
    457 * Returns true if this set contains all the characters
    458 * of the given string.
    459 * @param s string containing characters to be checked for containment
    460 * @return true if the test condition is met
    461 */
    462 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
    463    return span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == s.length();
    464 }
    465 
    466 /**
    467 * Returns true if this set contains none of the characters
    468 * of the given range.
    469 * @param start first character, inclusive, of the range
    470 * @param end last character, inclusive, of the range
    471 * @return true if the test condition is met
    472 */
    473 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
    474    //int32_t i = -1;
    475    //for (;;) {
    476    //    if (start < list[++i]) break;
    477    //}
    478    int32_t i = findCodePoint(start);
    479    return ((i & 1) == 0 && end < list[i]);
    480 }
    481 
    482 /**
    483 * Returns true if this set contains none of the characters and strings
    484 * of the given set.
    485 * @param c set to be checked for containment
    486 * @return true if the test condition is met
    487 */
    488 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
    489    // The specified set is a subset if all of its pairs are contained in
    490    // this set.  It's possible to code this more efficiently in terms of
    491    // direct manipulation of the inversion lists if the need arises.
    492    int32_t n = c.getRangeCount();
    493    for (int32_t i=0; i<n; ++i) {
    494        if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
    495            return false;
    496        }
    497    }
    498    return strings_ == nullptr || !c.hasStrings() || strings_->containsNone(*c.strings_);
    499 }
    500 
    501 /**
    502 * Returns true if this set contains none of the characters
    503 * of the given string.
    504 * @param s string containing characters to be checked for containment
    505 * @return true if the test condition is met
    506 */
    507 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
    508    return span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == s.length();
    509 }
    510 
    511 /**
    512 * Returns <tt>true</tt> if this set contains any character whose low byte
    513 * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
    514 * indexing.
    515 */
    516 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
    517    /* The index value v, in the range [0,255], is contained in this set if
    518     * it is contained in any pair of this set.  Pairs either have the high
    519     * bytes equal, or unequal.  If the high bytes are equal, then we have
    520     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
    521     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
    522     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
    523     * time zone month containment logic.)
    524     */
    525    int32_t i;
    526    int32_t rangeCount=getRangeCount();
    527    for (i=0; i<rangeCount; ++i) {
    528        UChar32 low = getRangeStart(i);
    529        UChar32 high = getRangeEnd(i);
    530        if ((low & ~0xFF) == (high & ~0xFF)) {
    531            if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
    532                return true;
    533            }
    534        } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
    535            return true;
    536        }
    537    }
    538    if (hasStrings()) {
    539        for (i=0; i<strings_->size(); ++i) {
    540            const UnicodeString& s = *static_cast<const UnicodeString*>(strings_->elementAt(i));
    541            if (s.isEmpty()) {
    542                continue;  // skip the empty string
    543            }
    544            UChar32 c = s.char32At(0);
    545            if ((c & 0xFF) == v) {
    546                return true;
    547            }
    548        }
    549    }
    550    return false;
    551 }
    552 
    553 /**
    554 * Implementation of UnicodeMatcher::matches().  Always matches the
    555 * longest possible multichar string.
    556 */
    557 UMatchDegree UnicodeSet::matches(const Replaceable& text,
    558                                 int32_t& offset,
    559                                 int32_t limit,
    560                                 UBool incremental) {
    561    if (offset == limit) {
    562        if (contains(U_ETHER)) {
    563            return incremental ? U_PARTIAL_MATCH : U_MATCH;
    564        } else {
    565            return U_MISMATCH;
    566        }
    567    } else {
    568        if (hasStrings()) { // try strings first
    569 
    570            // might separate forward and backward loops later
    571            // for now they are combined
    572 
    573            // TODO Improve efficiency of this, at least in the forward
    574            // direction, if not in both.  In the forward direction we
    575            // can assume the strings are sorted.
    576 
    577            int32_t i;
    578            UBool forward = offset < limit;
    579 
    580            // firstChar is the leftmost char to match in the
    581            // forward direction or the rightmost char to match in
    582            // the reverse direction.
    583            char16_t firstChar = text.charAt(offset);
    584 
    585            // If there are multiple strings that can match we
    586            // return the longest match.
    587            int32_t highWaterLength = 0;
    588 
    589            for (i=0; i<strings_->size(); ++i) {
    590                const UnicodeString& trial = *static_cast<const UnicodeString*>(strings_->elementAt(i));
    591                if (trial.isEmpty()) {
    592                    continue;  // skip the empty string
    593                }
    594 
    595                char16_t c = trial.charAt(forward ? 0 : trial.length() - 1);
    596 
    597                // Strings are sorted, so we can optimize in the
    598                // forward direction.
    599                if (forward && c > firstChar) break;
    600                if (c != firstChar) continue;
    601 
    602                int32_t matchLen = matchRest(text, offset, limit, trial);
    603 
    604                if (incremental) {
    605                    int32_t maxLen = forward ? limit-offset : offset-limit;
    606                    if (matchLen == maxLen) {
    607                        // We have successfully matched but only up to limit.
    608                        return U_PARTIAL_MATCH;
    609                    }
    610                }
    611 
    612                if (matchLen == trial.length()) {
    613                    // We have successfully matched the whole string.
    614                    if (matchLen > highWaterLength) {
    615                        highWaterLength = matchLen;
    616                    }
    617                    // In the forward direction we know strings
    618                    // are sorted so we can bail early.
    619                    if (forward && matchLen < highWaterLength) {
    620                        break;
    621                    }
    622                    continue;
    623                }
    624            }
    625 
    626            // We've checked all strings without a partial match.
    627            // If we have full matches, return the longest one.
    628            if (highWaterLength != 0) {
    629                offset += forward ? highWaterLength : -highWaterLength;
    630                return U_MATCH;
    631            }
    632        }
    633        return UnicodeFilter::matches(text, offset, limit, incremental);
    634    }
    635 }
    636 
    637 /**
    638 * Returns the longest match for s in text at the given position.
    639 * If limit > start then match forward from start+1 to limit
    640 * matching all characters except s.charAt(0).  If limit < start,
    641 * go backward starting from start-1 matching all characters
    642 * except s.charAt(s.length()-1).  This method assumes that the
    643 * first character, text.charAt(start), matches s, so it does not
    644 * check it.
    645 * @param text the text to match
    646 * @param start the first character to match.  In the forward
    647 * direction, text.charAt(start) is matched against s.charAt(0).
    648 * In the reverse direction, it is matched against
    649 * s.charAt(s.length()-1).
    650 * @param limit the limit offset for matching, either last+1 in
    651 * the forward direction, or last-1 in the reverse direction,
    652 * where last is the index of the last character to match.
    653 * @return If part of s matches up to the limit, return |limit -
    654 * start|.  If all of s matches before reaching the limit, return
    655 * s.length().  If there is a mismatch between s and text, return
    656 * 0
    657 */
    658 int32_t UnicodeSet::matchRest(const Replaceable& text,
    659                              int32_t start, int32_t limit,
    660                              const UnicodeString& s) {
    661    int32_t i;
    662    int32_t maxLen;
    663    int32_t slen = s.length();
    664    if (start < limit) {
    665        maxLen = limit - start;
    666        if (maxLen > slen) maxLen = slen;
    667        for (i = 1; i < maxLen; ++i) {
    668            if (text.charAt(start + i) != s.charAt(i)) return 0;
    669        }
    670    } else {
    671        maxLen = start - limit;
    672        if (maxLen > slen) maxLen = slen;
    673        --slen; // <=> slen = s.length() - 1;
    674        for (i = 1; i < maxLen; ++i) {
    675            if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
    676        }
    677    }
    678    return maxLen;
    679 }
    680 
    681 /**
    682 * Implement of UnicodeMatcher
    683 */
    684 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
    685    toUnionTo.addAll(*this);
    686 }
    687 
    688 /**
    689 * Returns the index of the given character within this set, where
    690 * the set is ordered by ascending code point.  If the character
    691 * is not in this set, return -1.  The inverse of this method is
    692 * <code>charAt()</code>.
    693 * @return an index from 0..size()-1, or -1
    694 */
    695 int32_t UnicodeSet::indexOf(UChar32 c) const {
    696    if (c < MIN_VALUE || c > MAX_VALUE) {
    697        return -1;
    698    }
    699    int32_t i = 0;
    700    int32_t n = 0;
    701    for (;;) {
    702        UChar32 start = list[i++];
    703        if (c < start) {
    704            return -1;
    705        }
    706        UChar32 limit = list[i++];
    707        if (c < limit) {
    708            return n + c - start;
    709        }
    710        n += limit - start;
    711    }
    712 }
    713 
    714 /**
    715 * Returns the character at the given index within this set, where
    716 * the set is ordered by ascending code point.  If the index is
    717 * out of range, return (UChar32)-1.  The inverse of this method is
    718 * <code>indexOf()</code>.
    719 * @param index an index from 0..size()-1
    720 * @return the character at the given index, or (UChar32)-1.
    721 */
    722 UChar32 UnicodeSet::charAt(int32_t index) const {
    723    if (index >= 0) {
    724        // len2 is the largest even integer <= len, that is, it is len
    725        // for even values and len-1 for odd values.  With odd values
    726        // the last entry is UNICODESET_HIGH.
    727        int32_t len2 = len & ~1;
    728        for (int32_t i=0; i < len2;) {
    729            UChar32 start = list[i++];
    730            int32_t count = list[i++] - start;
    731            if (index < count) {
    732                return static_cast<UChar32>(start + index);
    733            }
    734            index -= count;
    735        }
    736    }
    737    return static_cast<UChar32>(-1);
    738 }
    739 
    740 /**
    741 * Make this object represent the range <code>start - end</code>.
    742 * If <code>end > start</code> then this object is set to an
    743 * an empty range.
    744 *
    745 * @param start first character in the set, inclusive
    746 * @rparam end last character in the set, inclusive
    747 */
    748 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
    749    clear();
    750    complement(start, end);
    751    return *this;
    752 }
    753 
    754 /**
    755 * Adds the specified range to this set if it is not already
    756 * present.  If this set already contains the specified range,
    757 * the call leaves this set unchanged.  If <code>end > start</code>
    758 * then an empty range is added, leaving the set unchanged.
    759 *
    760 * @param start first character, inclusive, of range to be added
    761 * to this set.
    762 * @param end last character, inclusive, of range to be added
    763 * to this set.
    764 */
    765 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
    766    if (pinCodePoint(start) < pinCodePoint(end)) {
    767        UChar32 limit = end + 1;
    768        // Fast path for adding a new range after the last one.
    769        // Odd list length: [..., lastStart, lastLimit, HIGH]
    770        if ((len & 1) != 0) {
    771            // If the list is empty, set lastLimit low enough to not be adjacent to 0.
    772            UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
    773            if (lastLimit <= start && !isFrozen() && !isBogus()) {
    774                if (lastLimit == start) {
    775                    // Extend the last range.
    776                    list[len - 2] = limit;
    777                    if (limit == UNICODESET_HIGH) {
    778                        --len;
    779                    }
    780                } else {
    781                    list[len - 1] = start;
    782                    if (limit < UNICODESET_HIGH) {
    783                        if (ensureCapacity(len + 2)) {
    784                            list[len++] = limit;
    785                            list[len++] = UNICODESET_HIGH;
    786                        }
    787                    } else {  // limit == UNICODESET_HIGH
    788                        if (ensureCapacity(len + 1)) {
    789                            list[len++] = UNICODESET_HIGH;
    790                        }
    791                    }
    792                }
    793                releasePattern();
    794                return *this;
    795            }
    796        }
    797        // This is slow. Could be much faster using findCodePoint(start)
    798        // and modifying the list, dealing with adjacent & overlapping ranges.
    799        UChar32 range[3] = { start, limit, UNICODESET_HIGH };
    800        add(range, 2, 0);
    801    } else if (start == end) {
    802        add(start);
    803    }
    804    return *this;
    805 }
    806 
    807 // #define DEBUG_US_ADD
    808 
    809 #ifdef DEBUG_US_ADD
    810 #include <stdio.h>
    811 void dump(UChar32 c) {
    812    if (c <= 0xFF) {
    813        printf("%c", (char)c);
    814    } else {
    815        printf("U+%04X", c);
    816    }
    817 }
    818 void dump(const UChar32* list, int32_t len) {
    819    printf("[");
    820    for (int32_t i=0; i<len; ++i) {
    821        if (i != 0) printf(", ");
    822        dump(list[i]);
    823    }
    824    printf("]");
    825 }
    826 #endif
    827 
    828 /**
    829 * Adds the specified character to this set if it is not already
    830 * present.  If this set already contains the specified character,
    831 * the call leaves this set unchanged.
    832 */
    833 UnicodeSet& UnicodeSet::add(UChar32 c) {
    834    // find smallest i such that c < list[i]
    835    // if odd, then it is IN the set
    836    // if even, then it is OUT of the set
    837    int32_t i = findCodePoint(pinCodePoint(c));
    838 
    839    // already in set?
    840    if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
    841 
    842    // HIGH is 0x110000
    843    // assert(list[len-1] == HIGH);
    844 
    845    // empty = [HIGH]
    846    // [start_0, limit_0, start_1, limit_1, HIGH]
    847 
    848    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
    849    //                             ^
    850    //                             list[i]
    851 
    852    // i == 0 means c is before the first range
    853 
    854 #ifdef DEBUG_US_ADD
    855    printf("Add of ");
    856    dump(c);
    857    printf(" found at %d", i);
    858    printf(": ");
    859    dump(list, len);
    860    printf(" => ");
    861 #endif
    862 
    863    if (c == list[i]-1) {
    864        // c is before start of next range
    865        list[i] = c;
    866        // if we touched the HIGH mark, then add a new one
    867        if (c == (UNICODESET_HIGH - 1)) {
    868            if (!ensureCapacity(len+1)) {
    869                // ensureCapacity will mark the object as Bogus if OOM failure happens.
    870                return *this;
    871            }
    872            list[len++] = UNICODESET_HIGH;
    873        }
    874        if (i > 0 && c == list[i-1]) {
    875            // collapse adjacent ranges
    876 
    877            // [..., start_k-1, c, c, limit_k, ..., HIGH]
    878            //                     ^
    879            //                     list[i]
    880 
    881            //for (int32_t k=i-1; k<len-2; ++k) {
    882            //    list[k] = list[k+2];
    883            //}
    884            UChar32* dst = list + i - 1;
    885            UChar32* src = dst + 2;
    886            UChar32* srclimit = list + len;
    887            while (src < srclimit) *(dst++) = *(src++);
    888 
    889            len -= 2;
    890        }
    891    }
    892 
    893    else if (i > 0 && c == list[i-1]) {
    894        // c is after end of prior range
    895        list[i-1]++;
    896        // no need to check for collapse here
    897    }
    898 
    899    else {
    900        // At this point we know the new char is not adjacent to
    901        // any existing ranges, and it is not 10FFFF.
    902 
    903 
    904        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
    905        //                             ^
    906        //                             list[i]
    907 
    908        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
    909        //                             ^
    910        //                             list[i]
    911 
    912        if (!ensureCapacity(len+2)) {
    913            // ensureCapacity will mark the object as Bogus if OOM failure happens.
    914            return *this;
    915        }
    916 
    917        UChar32 *p = list + i;
    918        uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
    919        list[i] = c;
    920        list[i+1] = c+1;
    921        len += 2;
    922    }
    923 
    924 #ifdef DEBUG_US_ADD
    925    dump(list, len);
    926    printf("\n");
    927 
    928    for (i=1; i<len; ++i) {
    929        if (list[i] <= list[i-1]) {
    930            // Corrupt array!
    931            printf("ERROR: list has been corrupted\n");
    932            exit(1);
    933        }
    934    }
    935 #endif
    936 
    937    releasePattern();
    938    return *this;
    939 }
    940 
    941 /**
    942 * Adds the specified multicharacter to this set if it is not already
    943 * present.  If this set already contains the multicharacter,
    944 * the call leaves this set unchanged.
    945 * Thus "ch" => {"ch"}
    946 *
    947 * @param s the source string
    948 * @return the modified set, for chaining
    949 */
    950 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
    951    if (isFrozen() || isBogus()) return *this;
    952    int32_t cp = getSingleCP(s);
    953    if (cp < 0) {
    954        if (!stringsContains(s)) {
    955            _add(s);
    956            releasePattern();
    957        }
    958    } else {
    959        add(static_cast<UChar32>(cp));
    960    }
    961    return *this;
    962 }
    963 
    964 /**
    965 * Adds the given string, in order, to 'strings_'.  The given string
    966 * must have been checked by the caller to not already be in 'strings_'.
    967 */
    968 void UnicodeSet::_add(const UnicodeString& s) {
    969    if (isFrozen() || isBogus()) {
    970        return;
    971    }
    972    UErrorCode ec = U_ZERO_ERROR;
    973    if (strings_ == nullptr && !allocateStrings(ec)) {
    974        setToBogus();
    975        return;
    976    }
    977    LocalPointer<UnicodeString> t(new UnicodeString(s));
    978    if (t.isNull()) { // Check for memory allocation error.
    979        setToBogus();
    980        return;
    981    }
    982    strings_->sortedInsert(t.orphan(), compareUnicodeString, ec);
    983    if (U_FAILURE(ec)) {
    984        setToBogus();
    985    }
    986 }
    987 
    988 /**
    989 * @return a code point IF the string consists of a single one.
    990 * otherwise returns -1.
    991 * @param string to test
    992 */
    993 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
    994    int32_t sLength = s.length();
    995    if (sLength == 1) return s.charAt(0);
    996    if (sLength == 2) {
    997        UChar32 cp = s.char32At(0);
    998        if (cp > 0xFFFF) { // is surrogate pair
    999            return cp;
   1000        }
   1001    }
   1002    return -1;
   1003 }
   1004 
   1005 /**
   1006 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
   1007 * If this set already any particular character, it has no effect on that character.
   1008 * @param the source string
   1009 * @return the modified set, for chaining
   1010 */
   1011 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
   1012    UChar32 cp;
   1013    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
   1014        cp = s.char32At(i);
   1015        add(cp);
   1016    }
   1017    return *this;
   1018 }
   1019 
   1020 /**
   1021 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1022 * If this set already any particular character, it has no effect on that character.
   1023 * @param the source string
   1024 * @return the modified set, for chaining
   1025 */
   1026 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
   1027    UnicodeSet set;
   1028    set.addAll(s);
   1029    retainAll(set);
   1030    return *this;
   1031 }
   1032 
   1033 /**
   1034 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1035 * If this set already any particular character, it has no effect on that character.
   1036 * @param the source string
   1037 * @return the modified set, for chaining
   1038 */
   1039 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
   1040    UnicodeSet set;
   1041    set.addAll(s);
   1042    complementAll(set);
   1043    return *this;
   1044 }
   1045 
   1046 /**
   1047 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1048 * If this set already any particular character, it has no effect on that character.
   1049 * @param the source string
   1050 * @return the modified set, for chaining
   1051 */
   1052 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
   1053    UnicodeSet set;
   1054    set.addAll(s);
   1055    removeAll(set);
   1056    return *this;
   1057 }
   1058 
   1059 UnicodeSet& UnicodeSet::removeAllStrings() {
   1060    if (!isFrozen() && hasStrings()) {
   1061        strings_->removeAllElements();
   1062        releasePattern();
   1063    }
   1064    return *this;
   1065 }
   1066 
   1067 
   1068 /**
   1069 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
   1070 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1071 * @param the source string
   1072 * @return a newly created set containing the given string
   1073 */
   1074 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
   1075    UnicodeSet *set = new UnicodeSet();
   1076    if (set != nullptr) { // Check for memory allocation error.
   1077        set->add(s);
   1078    }
   1079    return set;
   1080 }
   1081 
   1082 
   1083 /**
   1084 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
   1085 * @param the source string
   1086 * @return a newly created set containing the given characters
   1087 */
   1088 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
   1089    UnicodeSet *set = new UnicodeSet();
   1090    if (set != nullptr) { // Check for memory allocation error.
   1091        set->addAll(s);
   1092    }
   1093    return set;
   1094 }
   1095 
   1096 /**
   1097 * Retain only the elements in this set that are contained in the
   1098 * specified range.  If <code>end > start</code> then an empty range is
   1099 * retained, leaving the set empty.
   1100 *
   1101 * @param start first character, inclusive, of range to be retained
   1102 * to this set.
   1103 * @param end last character, inclusive, of range to be retained
   1104 * to this set.
   1105 */
   1106 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
   1107    if (pinCodePoint(start) <= pinCodePoint(end)) {
   1108        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
   1109        retain(range, 2, 0);
   1110    } else {
   1111        clear();
   1112    }
   1113    return *this;
   1114 }
   1115 
   1116 UnicodeSet& UnicodeSet::retain(UChar32 c) {
   1117    return retain(c, c);
   1118 }
   1119 
   1120 UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
   1121    if (isFrozen() || isBogus()) { return *this; }
   1122    UChar32 cp = getSingleCP(s);
   1123    if (cp < 0) {
   1124        bool isIn = stringsContains(s);
   1125        // Check for getRangeCount() first to avoid somewhat-expensive size()
   1126        // when there are single code points.
   1127        if (isIn && getRangeCount() == 0 && size() == 1) {
   1128            return *this;
   1129        }
   1130        clear();
   1131        if (isIn) {
   1132            _add(s);
   1133        }
   1134    } else {
   1135        retain(cp, cp);
   1136    }
   1137    return *this;
   1138 }
   1139 
   1140 /**
   1141 * Removes the specified range from this set if it is present.
   1142 * The set will not contain the specified range once the call
   1143 * returns.  If <code>end > start</code> then an empty range is
   1144 * removed, leaving the set unchanged.
   1145 *
   1146 * @param start first character, inclusive, of range to be removed
   1147 * from this set.
   1148 * @param end last character, inclusive, of range to be removed
   1149 * from this set.
   1150 */
   1151 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
   1152    if (pinCodePoint(start) <= pinCodePoint(end)) {
   1153        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
   1154        retain(range, 2, 2);
   1155    }
   1156    return *this;
   1157 }
   1158 
   1159 /**
   1160 * Removes the specified character from this set if it is present.
   1161 * The set will not contain the specified range once the call
   1162 * returns.
   1163 */
   1164 UnicodeSet& UnicodeSet::remove(UChar32 c) {
   1165    return remove(c, c);
   1166 }
   1167 
   1168 /**
   1169 * Removes the specified string from this set if it is present.
   1170 * The set will not contain the specified character once the call
   1171 * returns.
   1172 * @param the source string
   1173 * @return the modified set, for chaining
   1174 */
   1175 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
   1176    if (isFrozen() || isBogus()) return *this;
   1177    int32_t cp = getSingleCP(s);
   1178    if (cp < 0) {
   1179        if (strings_ != nullptr && strings_->removeElement((void*) &s)) {
   1180            releasePattern();
   1181        }
   1182    } else {
   1183        remove(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
   1184    }
   1185    return *this;
   1186 }
   1187 
   1188 /**
   1189 * Complements the specified range in this set.  Any character in
   1190 * the range will be removed if it is in this set, or will be
   1191 * added if it is not in this set.  If <code>end > start</code>
   1192 * then an empty range is xor'ed, leaving the set unchanged.
   1193 *
   1194 * @param start first character, inclusive, of range to be removed
   1195 * from this set.
   1196 * @param end last character, inclusive, of range to be removed
   1197 * from this set.
   1198 */
   1199 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
   1200    if (isFrozen() || isBogus()) {
   1201        return *this;
   1202    }
   1203    if (pinCodePoint(start) <= pinCodePoint(end)) {
   1204        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
   1205        exclusiveOr(range, 2, 0);
   1206    }
   1207    releasePattern();
   1208    return *this;
   1209 }
   1210 
   1211 UnicodeSet& UnicodeSet::complement(UChar32 c) {
   1212    return complement(c, c);
   1213 }
   1214 
   1215 /**
   1216 * This is equivalent to
   1217 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
   1218 */
   1219 UnicodeSet& UnicodeSet::complement() {
   1220    if (isFrozen() || isBogus()) {
   1221        return *this;
   1222    }
   1223    if (list[0] == UNICODESET_LOW) {
   1224        uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
   1225        --len;
   1226    } else {
   1227        if (!ensureCapacity(len+1)) {
   1228            return *this;
   1229        }
   1230        uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
   1231        list[0] = UNICODESET_LOW;
   1232        ++len;
   1233    }
   1234    releasePattern();
   1235    return *this;
   1236 }
   1237 
   1238 /**
   1239 * Complement the specified string in this set.
   1240 * The set will not contain the specified string once the call
   1241 * returns.
   1242 *
   1243 * @param s the string to complement
   1244 * @return this object, for chaining
   1245 */
   1246 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
   1247    if (isFrozen() || isBogus()) return *this;
   1248    int32_t cp = getSingleCP(s);
   1249    if (cp < 0) {
   1250        if (stringsContains(s)) {
   1251            strings_->removeElement((void*) &s);
   1252        } else {
   1253            _add(s);
   1254        }
   1255        releasePattern();
   1256    } else {
   1257        complement(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
   1258    }
   1259    return *this;
   1260 }
   1261 
   1262 /**
   1263 * Adds all of the elements in the specified set to this set if
   1264 * they're not already present.  This operation effectively
   1265 * modifies this set so that its value is the <i>union</i> of the two
   1266 * sets.  The behavior of this operation is unspecified if the specified
   1267 * collection is modified while the operation is in progress.
   1268 *
   1269 * @param c set whose elements are to be added to this set.
   1270 * @see #add(char, char)
   1271 */
   1272 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
   1273    if ( c.len>0 && c.list!=nullptr ) {
   1274        add(c.list, c.len, 0);
   1275    }
   1276 
   1277    // Add strings in order
   1278    if ( c.strings_!=nullptr ) {
   1279        for (int32_t i=0; i<c.strings_->size(); ++i) {
   1280            const UnicodeString* s = static_cast<const UnicodeString*>(c.strings_->elementAt(i));
   1281            if (!stringsContains(*s)) {
   1282                _add(*s);
   1283            }
   1284        }
   1285    }
   1286    return *this;
   1287 }
   1288 
   1289 /**
   1290 * Retains only the elements in this set that are contained in the
   1291 * specified set.  In other words, removes from this set all of
   1292 * its elements that are not contained in the specified set.  This
   1293 * operation effectively modifies this set so that its value is
   1294 * the <i>intersection</i> of the two sets.
   1295 *
   1296 * @param c set that defines which elements this set will retain.
   1297 */
   1298 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
   1299    if (isFrozen() || isBogus()) {
   1300        return *this;
   1301    }
   1302    retain(c.list, c.len, 0);
   1303    if (hasStrings()) {
   1304        if (!c.hasStrings()) {
   1305            strings_->removeAllElements();
   1306        } else {
   1307            strings_->retainAll(*c.strings_);
   1308        }
   1309    }
   1310    return *this;
   1311 }
   1312 
   1313 /**
   1314 * Removes from this set all of its elements that are contained in the
   1315 * specified set.  This operation effectively modifies this
   1316 * set so that its value is the <i>asymmetric set difference</i> of
   1317 * the two sets.
   1318 *
   1319 * @param c set that defines which elements will be removed from
   1320 *          this set.
   1321 */
   1322 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
   1323    if (isFrozen() || isBogus()) {
   1324        return *this;
   1325    }
   1326    retain(c.list, c.len, 2);
   1327    if (hasStrings() && c.hasStrings()) {
   1328        strings_->removeAll(*c.strings_);
   1329    }
   1330    return *this;
   1331 }
   1332 
   1333 /**
   1334 * Complements in this set all elements contained in the specified
   1335 * set.  Any character in the other set will be removed if it is
   1336 * in this set, or will be added if it is not in this set.
   1337 *
   1338 * @param c set that defines which elements will be xor'ed from
   1339 *          this set.
   1340 */
   1341 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
   1342    if (isFrozen() || isBogus()) {
   1343        return *this;
   1344    }
   1345    exclusiveOr(c.list, c.len, 0);
   1346 
   1347    if (c.strings_ != nullptr) {
   1348        for (int32_t i=0; i<c.strings_->size(); ++i) {
   1349            void* e = c.strings_->elementAt(i);
   1350            if (strings_ == nullptr || !strings_->removeElement(e)) {
   1351                _add(*static_cast<const UnicodeString*>(e));
   1352            }
   1353        }
   1354    }
   1355    return *this;
   1356 }
   1357 
   1358 /**
   1359 * Removes all of the elements from this set.  This set will be
   1360 * empty after this call returns.
   1361 */
   1362 UnicodeSet& UnicodeSet::clear() {
   1363    if (isFrozen()) {
   1364        return *this;
   1365    }
   1366    list[0] = UNICODESET_HIGH;
   1367    len = 1;
   1368    releasePattern();
   1369    if (strings_ != nullptr) {
   1370        strings_->removeAllElements();
   1371    }
   1372    // Remove bogus
   1373    fFlags = 0;
   1374    return *this;
   1375 }
   1376 
   1377 /**
   1378 * Iteration method that returns the number of ranges contained in
   1379 * this set.
   1380 * @see #getRangeStart
   1381 * @see #getRangeEnd
   1382 */
   1383 int32_t UnicodeSet::getRangeCount() const {
   1384    return len/2;
   1385 }
   1386 
   1387 /**
   1388 * Iteration method that returns the first character in the
   1389 * specified range of this set.
   1390 * @see #getRangeCount
   1391 * @see #getRangeEnd
   1392 */
   1393 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
   1394    return list[index*2];
   1395 }
   1396 
   1397 /**
   1398 * Iteration method that returns the last character in the
   1399 * specified range of this set.
   1400 * @see #getRangeStart
   1401 * @see #getRangeEnd
   1402 */
   1403 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
   1404    return list[index*2 + 1] - 1;
   1405 }
   1406 
   1407 const UnicodeString* UnicodeSet::getString(int32_t index) const {
   1408    return static_cast<const UnicodeString*>(strings_->elementAt(index));
   1409 }
   1410 
   1411 /**
   1412 * Reallocate this objects internal structures to take up the least
   1413 * possible space, without changing this object's value.
   1414 */
   1415 UnicodeSet& UnicodeSet::compact() {
   1416    if (isFrozen() || isBogus()) {
   1417        return *this;
   1418    }
   1419    // Delete buffer first to defragment memory less.
   1420    if (buffer != stackList) {
   1421        uprv_free(buffer);
   1422        buffer = nullptr;
   1423        bufferCapacity = 0;
   1424    }
   1425    if (list == stackList) {
   1426        // pass
   1427    } else if (len <= INITIAL_CAPACITY) {
   1428        uprv_memcpy(stackList, list, len * sizeof(UChar32));
   1429        uprv_free(list);
   1430        list = stackList;
   1431        capacity = INITIAL_CAPACITY;
   1432    } else if ((len + 7) < capacity) {
   1433        // If we have more than a little unused capacity, shrink it to len.
   1434        UChar32* temp = static_cast<UChar32*>(uprv_realloc(list, sizeof(UChar32) * len));
   1435        if (temp) {
   1436            list = temp;
   1437            capacity = len;
   1438        }
   1439        // else what the heck happened?! We allocated less memory!
   1440        // Oh well. We'll keep our original array.
   1441    }
   1442    if (strings_ != nullptr && strings_->isEmpty()) {
   1443        delete strings_;
   1444        strings_ = nullptr;
   1445    }
   1446    return *this;
   1447 }
   1448 
   1449 #ifdef DEBUG_SERIALIZE
   1450 #include <stdio.h>
   1451 #endif
   1452 
   1453 /**
   1454 * Deserialize constructor.
   1455 */
   1456 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
   1457                       UErrorCode &ec) {
   1458 
   1459  if(U_FAILURE(ec)) {
   1460    setToBogus();
   1461    return;
   1462  }
   1463 
   1464  if( (serialization != kSerialized)
   1465      || (data==nullptr)
   1466      || (dataLen < 1)) {
   1467    ec = U_ILLEGAL_ARGUMENT_ERROR;
   1468    setToBogus();
   1469    return;
   1470  }
   1471 
   1472  // bmp?
   1473  int32_t headerSize = ((data[0]&0x8000)) ?2:1;
   1474  int32_t bmpLength = (headerSize==1)?data[0]:data[1];
   1475 
   1476  int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
   1477 #ifdef DEBUG_SERIALIZE
   1478  printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
   1479 #endif
   1480  if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH
   1481    return;
   1482  }
   1483  // copy bmp
   1484  int32_t i;
   1485  for(i = 0; i< bmpLength;i++) {
   1486    list[i] = data[i+headerSize];
   1487 #ifdef DEBUG_SERIALIZE
   1488    printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
   1489 #endif
   1490  }
   1491  // copy smp
   1492  for(i=bmpLength;i<newLength;i++) {
   1493    list[i] = (static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 0]) << 16) +
   1494               static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 1]);
   1495 #ifdef DEBUG_SERIALIZE
   1496    printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
   1497 #endif
   1498  }
   1499  U_ASSERT(i == newLength);
   1500  if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
   1501    list[i++] = UNICODESET_HIGH;
   1502  }
   1503  len = i;
   1504 }
   1505 
   1506 
   1507 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
   1508    int32_t bmpLength, length, destLength;
   1509 
   1510    if (U_FAILURE(ec)) {
   1511        return 0;
   1512    }
   1513 
   1514    if (destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
   1515        ec=U_ILLEGAL_ARGUMENT_ERROR;
   1516        return 0;
   1517    }
   1518 
   1519    /* count necessary 16-bit units */
   1520    length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
   1521    // assert(length>=0);
   1522    if (length==0) {
   1523        /* empty set */
   1524        if (destCapacity>0) {
   1525            *dest=0;
   1526        } else {
   1527            ec=U_BUFFER_OVERFLOW_ERROR;
   1528        }
   1529        return 1;
   1530    }
   1531    /* now length>0 */
   1532 
   1533    if (this->list[length-1]<=0xffff) {
   1534        /* all BMP */
   1535        bmpLength=length;
   1536    } else if (this->list[0]>=0x10000) {
   1537        /* all supplementary */
   1538        bmpLength=0;
   1539        length*=2;
   1540    } else {
   1541        /* some BMP, some supplementary */
   1542        for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
   1543        length=bmpLength+2*(length-bmpLength);
   1544    }
   1545 #ifdef DEBUG_SERIALIZE
   1546    printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
   1547 #endif
   1548    /* length: number of 16-bit array units */
   1549    if (length>0x7fff) {
   1550        /* there are only 15 bits for the length in the first serialized word */
   1551        ec=U_INDEX_OUTOFBOUNDS_ERROR;
   1552        return 0;
   1553    }
   1554 
   1555    /*
   1556     * total serialized length:
   1557     * number of 16-bit array units (length) +
   1558     * 1 length unit (always) +
   1559     * 1 bmpLength unit (if there are supplementary values)
   1560     */
   1561    destLength=length+((length>bmpLength)?2:1);
   1562    if (destLength<=destCapacity) {
   1563        const UChar32 *p;
   1564        int32_t i;
   1565 
   1566 #ifdef DEBUG_SERIALIZE
   1567        printf("writeHdr\n");
   1568 #endif
   1569        *dest = static_cast<uint16_t>(length);
   1570        if (length>bmpLength) {
   1571            *dest|=0x8000;
   1572            *++dest = static_cast<uint16_t>(bmpLength);
   1573        }
   1574        ++dest;
   1575 
   1576        /* write the BMP part of the array */
   1577        p=this->list;
   1578        for (i=0; i<bmpLength; ++i) {
   1579 #ifdef DEBUG_SERIALIZE
   1580          printf("writebmp: %x\n", (int)*p);
   1581 #endif
   1582            *dest++ = static_cast<uint16_t>(*p++);
   1583        }
   1584 
   1585        /* write the supplementary part of the array */
   1586        for (; i<length; i+=2) {
   1587 #ifdef DEBUG_SERIALIZE
   1588          printf("write32: %x\n", (int)*p);
   1589 #endif
   1590            *dest++ = static_cast<uint16_t>(*p >> 16);
   1591            *dest++ = static_cast<uint16_t>(*p++);
   1592        }
   1593    } else {
   1594        ec=U_BUFFER_OVERFLOW_ERROR;
   1595    }
   1596    return destLength;
   1597 }
   1598 
   1599 //----------------------------------------------------------------
   1600 // Implementation: Utility methods
   1601 //----------------------------------------------------------------
   1602 
   1603 /**
   1604 * Allocate our strings vector and return true if successful.
   1605 */
   1606 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
   1607    if (U_FAILURE(status)) {
   1608        return false;
   1609    }
   1610    strings_ = new UVector(uprv_deleteUObject,
   1611                          uhash_compareUnicodeString, 1, status);
   1612    if (strings_ == nullptr) { // Check for memory allocation error.
   1613        status = U_MEMORY_ALLOCATION_ERROR;
   1614        return false;
   1615    }
   1616    if (U_FAILURE(status)) {
   1617        delete strings_;
   1618        strings_ = nullptr;
   1619        return false;
   1620    } 
   1621    return true;
   1622 }
   1623 
   1624 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
   1625    // Grow exponentially to reduce the frequency of allocations.
   1626    if (minCapacity < INITIAL_CAPACITY) {
   1627        return minCapacity + INITIAL_CAPACITY;
   1628    } else if (minCapacity <= 2500) {
   1629        return 5 * minCapacity;
   1630    } else {
   1631        int32_t newCapacity = 2 * minCapacity;
   1632        if (newCapacity > MAX_LENGTH) {
   1633            newCapacity = MAX_LENGTH;
   1634        }
   1635        return newCapacity;
   1636    }
   1637 }
   1638 
   1639 bool UnicodeSet::ensureCapacity(int32_t newLen) {
   1640    if (newLen > MAX_LENGTH) {
   1641        newLen = MAX_LENGTH;
   1642    }
   1643    if (newLen <= capacity) {
   1644        return true;
   1645    }
   1646    int32_t newCapacity = nextCapacity(newLen);
   1647    UChar32* temp = static_cast<UChar32*>(uprv_malloc(newCapacity * sizeof(UChar32)));
   1648    if (temp == nullptr) {
   1649        setToBogus(); // set the object to bogus state if an OOM failure occurred.
   1650        return false;
   1651    }
   1652    // Copy only the actual contents.
   1653    uprv_memcpy(temp, list, len * sizeof(UChar32));
   1654    if (list != stackList) {
   1655        uprv_free(list);
   1656    }
   1657    list = temp;
   1658    capacity = newCapacity;
   1659    return true;
   1660 }
   1661 
   1662 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
   1663    if (newLen > MAX_LENGTH) {
   1664        newLen = MAX_LENGTH;
   1665    }
   1666    if (newLen <= bufferCapacity) {
   1667        return true;
   1668    }
   1669    int32_t newCapacity = nextCapacity(newLen);
   1670    UChar32* temp = static_cast<UChar32*>(uprv_malloc(newCapacity * sizeof(UChar32)));
   1671    if (temp == nullptr) {
   1672        setToBogus();
   1673        return false;
   1674    }
   1675    // The buffer has no contents to be copied.
   1676    // It is always filled from scratch after this call.
   1677    if (buffer != stackList) {
   1678        uprv_free(buffer);
   1679    }
   1680    buffer = temp;
   1681    bufferCapacity = newCapacity;
   1682    return true;
   1683 }
   1684 
   1685 /**
   1686 * Swap list and buffer.
   1687 */
   1688 void UnicodeSet::swapBuffers() {
   1689    // swap list and buffer
   1690    UChar32* temp = list;
   1691    list = buffer;
   1692    buffer = temp;
   1693 
   1694    int32_t c = capacity;
   1695    capacity = bufferCapacity;
   1696    bufferCapacity = c;
   1697 }
   1698 
   1699 void UnicodeSet::setToBogus() {
   1700    clear(); // Remove everything in the set.
   1701    fFlags = kIsBogus;
   1702 }
   1703 
   1704 //----------------------------------------------------------------
   1705 // Implementation: Fundamental operators
   1706 //----------------------------------------------------------------
   1707 
   1708 static inline UChar32 max(UChar32 a, UChar32 b) {
   1709    return (a > b) ? a : b;
   1710 }
   1711 
   1712 // polarity = 0, 3 is normal: x xor y
   1713 // polarity = 1, 2: x xor ~y == x === y
   1714 
   1715 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
   1716    if (isFrozen() || isBogus()) {
   1717        return;
   1718    }
   1719    if (!ensureBufferCapacity(len + otherLen)) {
   1720        return;
   1721    }
   1722 
   1723    int32_t i = 0, j = 0, k = 0;
   1724    UChar32 a = list[i++];
   1725    UChar32 b;
   1726    if (polarity == 1 || polarity == 2) {
   1727        b = UNICODESET_LOW;
   1728        if (other[j] == UNICODESET_LOW) { // skip base if already LOW
   1729            ++j;
   1730            b = other[j];
   1731        }
   1732    } else {
   1733        b = other[j++];
   1734    }
   1735    // simplest of all the routines
   1736    // sort the values, discarding identicals!
   1737    for (;;) {
   1738        if (a < b) {
   1739            buffer[k++] = a;
   1740            a = list[i++];
   1741        } else if (b < a) {
   1742            buffer[k++] = b;
   1743            b = other[j++];
   1744        } else if (a != UNICODESET_HIGH) { // at this point, a == b
   1745            // discard both values!
   1746            a = list[i++];
   1747            b = other[j++];
   1748        } else { // DONE!
   1749            buffer[k++] = UNICODESET_HIGH;
   1750            len = k;
   1751            break;
   1752        }
   1753    }
   1754    swapBuffers();
   1755    releasePattern();
   1756 }
   1757 
   1758 // polarity = 0 is normal: x union y
   1759 // polarity = 2: x union ~y
   1760 // polarity = 1: ~x union y
   1761 // polarity = 3: ~x union ~y
   1762 
   1763 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
   1764    if (isFrozen() || isBogus() || other==nullptr) {
   1765        return;
   1766    }
   1767    if (!ensureBufferCapacity(len + otherLen)) {
   1768        return;
   1769    }
   1770 
   1771    int32_t i = 0, j = 0, k = 0;
   1772    UChar32 a = list[i++];
   1773    UChar32 b = other[j++];
   1774    // change from xor is that we have to check overlapping pairs
   1775    // polarity bit 1 means a is second, bit 2 means b is.
   1776    for (;;) {
   1777        switch (polarity) {
   1778          case 0: // both first; take lower if unequal
   1779            if (a < b) { // take a
   1780                // Back up over overlapping ranges in buffer[]
   1781                if (k > 0 && a <= buffer[k-1]) {
   1782                    // Pick latter end value in buffer[] vs. list[]
   1783                    a = max(list[i], buffer[--k]);
   1784                } else {
   1785                    // No overlap
   1786                    buffer[k++] = a;
   1787                    a = list[i];
   1788                }
   1789                i++; // Common if/else code factored out
   1790                polarity ^= 1;
   1791            } else if (b < a) { // take b
   1792                if (k > 0 && b <= buffer[k-1]) {
   1793                    b = max(other[j], buffer[--k]);
   1794                } else {
   1795                    buffer[k++] = b;
   1796                    b = other[j];
   1797                }
   1798                j++;
   1799                polarity ^= 2;
   1800            } else { // a == b, take a, drop b
   1801                if (a == UNICODESET_HIGH) goto loop_end;
   1802                // This is symmetrical; it doesn't matter if
   1803                // we backtrack with a or b. - liu
   1804                if (k > 0 && a <= buffer[k-1]) {
   1805                    a = max(list[i], buffer[--k]);
   1806                } else {
   1807                    // No overlap
   1808                    buffer[k++] = a;
   1809                    a = list[i];
   1810                }
   1811                i++;
   1812                polarity ^= 1;
   1813                b = other[j++];
   1814                polarity ^= 2;
   1815            }
   1816            break;
   1817          case 3: // both second; take higher if unequal, and drop other
   1818            if (b <= a) { // take a
   1819                if (a == UNICODESET_HIGH) goto loop_end;
   1820                buffer[k++] = a;
   1821            } else { // take b
   1822                if (b == UNICODESET_HIGH) goto loop_end;
   1823                buffer[k++] = b;
   1824            }
   1825            a = list[i++];
   1826            polarity ^= 1;   // factored common code
   1827            b = other[j++];
   1828            polarity ^= 2;
   1829            break;
   1830          case 1: // a second, b first; if b < a, overlap
   1831            if (a < b) { // no overlap, take a
   1832                buffer[k++] = a; a = list[i++]; polarity ^= 1;
   1833            } else if (b < a) { // OVERLAP, drop b
   1834                b = other[j++];
   1835                polarity ^= 2;
   1836            } else { // a == b, drop both!
   1837                if (a == UNICODESET_HIGH) goto loop_end;
   1838                a = list[i++];
   1839                polarity ^= 1;
   1840                b = other[j++];
   1841                polarity ^= 2;
   1842            }
   1843            break;
   1844          case 2: // a first, b second; if a < b, overlap
   1845            if (b < a) { // no overlap, take b
   1846                buffer[k++] = b;
   1847                b = other[j++];
   1848                polarity ^= 2;
   1849            } else  if (a < b) { // OVERLAP, drop a
   1850                a = list[i++];
   1851                polarity ^= 1;
   1852            } else { // a == b, drop both!
   1853                if (a == UNICODESET_HIGH) goto loop_end;
   1854                a = list[i++];
   1855                polarity ^= 1;
   1856                b = other[j++];
   1857                polarity ^= 2;
   1858            }
   1859            break;
   1860        }
   1861    }
   1862 loop_end:
   1863    buffer[k++] = UNICODESET_HIGH;    // terminate
   1864    len = k;
   1865    swapBuffers();
   1866    releasePattern();
   1867 }
   1868 
   1869 // polarity = 0 is normal: x intersect y
   1870 // polarity = 2: x intersect ~y == set-minus
   1871 // polarity = 1: ~x intersect y
   1872 // polarity = 3: ~x intersect ~y
   1873 
   1874 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
   1875    if (isFrozen() || isBogus()) {
   1876        return;
   1877    }
   1878    if (!ensureBufferCapacity(len + otherLen)) {
   1879        return;
   1880    }
   1881 
   1882    int32_t i = 0, j = 0, k = 0;
   1883    UChar32 a = list[i++];
   1884    UChar32 b = other[j++];
   1885    // change from xor is that we have to check overlapping pairs
   1886    // polarity bit 1 means a is second, bit 2 means b is.
   1887    for (;;) {
   1888        switch (polarity) {
   1889          case 0: // both first; drop the smaller
   1890            if (a < b) { // drop a
   1891                a = list[i++];
   1892                polarity ^= 1;
   1893            } else if (b < a) { // drop b
   1894                b = other[j++];
   1895                polarity ^= 2;
   1896            } else { // a == b, take one, drop other
   1897                if (a == UNICODESET_HIGH) goto loop_end;
   1898                buffer[k++] = a;
   1899                a = list[i++];
   1900                polarity ^= 1;
   1901                b = other[j++];
   1902                polarity ^= 2;
   1903            }
   1904            break;
   1905          case 3: // both second; take lower if unequal
   1906            if (a < b) { // take a
   1907                buffer[k++] = a;
   1908                a = list[i++];
   1909                polarity ^= 1;
   1910            } else if (b < a) { // take b
   1911                buffer[k++] = b;
   1912                b = other[j++];
   1913                polarity ^= 2;
   1914            } else { // a == b, take one, drop other
   1915                if (a == UNICODESET_HIGH) goto loop_end;
   1916                buffer[k++] = a;
   1917                a = list[i++];
   1918                polarity ^= 1;
   1919                b = other[j++];
   1920                polarity ^= 2;
   1921            }
   1922            break;
   1923          case 1: // a second, b first;
   1924            if (a < b) { // NO OVERLAP, drop a
   1925                a = list[i++];
   1926                polarity ^= 1;
   1927            } else if (b < a) { // OVERLAP, take b
   1928                buffer[k++] = b;
   1929                b = other[j++];
   1930                polarity ^= 2;
   1931            } else { // a == b, drop both!
   1932                if (a == UNICODESET_HIGH) goto loop_end;
   1933                a = list[i++];
   1934                polarity ^= 1;
   1935                b = other[j++];
   1936                polarity ^= 2;
   1937            }
   1938            break;
   1939          case 2: // a first, b second; if a < b, overlap
   1940            if (b < a) { // no overlap, drop b
   1941                b = other[j++];
   1942                polarity ^= 2;
   1943            } else  if (a < b) { // OVERLAP, take a
   1944                buffer[k++] = a;
   1945                a = list[i++];
   1946                polarity ^= 1;
   1947            } else { // a == b, drop both!
   1948                if (a == UNICODESET_HIGH) goto loop_end;
   1949                a = list[i++];
   1950                polarity ^= 1;
   1951                b = other[j++];
   1952                polarity ^= 2;
   1953            }
   1954            break;
   1955        }
   1956    }
   1957 loop_end:
   1958    buffer[k++] = UNICODESET_HIGH;    // terminate
   1959    len = k;
   1960    swapBuffers();
   1961    releasePattern();
   1962 }
   1963 
   1964 /**
   1965 * Append the <code>toPattern()</code> representation of a
   1966 * string to the given <code>StringBuffer</code>.
   1967 */
   1968 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
   1969    UChar32 cp;
   1970    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
   1971        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
   1972    }
   1973 }
   1974 
   1975 /**
   1976 * Append the <code>toPattern()</code> representation of a
   1977 * character to the given <code>StringBuffer</code>.
   1978 */
   1979 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
   1980    if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
   1981        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
   1982        // unprintable
   1983        ICU_Utility::escape(buf, c);
   1984        return;
   1985    }
   1986    // Okay to let ':' pass through
   1987    switch (c) {
   1988    case u'[':
   1989    case u']':
   1990    case u'-':
   1991    case u'^':
   1992    case u'&':
   1993    case u'\\':
   1994    case u'{':
   1995    case u'}':
   1996    case u':':
   1997    case SymbolTable::SYMBOL_REF:
   1998        buf.append(u'\\');
   1999        break;
   2000    default:
   2001        // Escape whitespace
   2002        if (PatternProps::isWhiteSpace(c)) {
   2003            buf.append(u'\\');
   2004        }
   2005        break;
   2006    }
   2007    buf.append(c);
   2008 }
   2009 
   2010 void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
   2011                              UBool escapeUnprintable) {
   2012    _appendToPat(result, start, escapeUnprintable);
   2013    if (start != end) {
   2014        if ((start+1) != end ||
   2015                // Avoid writing what looks like a lead+trail surrogate pair.
   2016                start == 0xdbff) {
   2017            result.append(u'-');
   2018        }
   2019        _appendToPat(result, end, escapeUnprintable);
   2020    }
   2021 }
   2022 
   2023 /**
   2024 * Append a string representation of this set to result.  This will be
   2025 * a cleaned version of the string passed to applyPattern(), if there
   2026 * is one.  Otherwise it will be generated.
   2027 */
   2028 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
   2029                                      UBool escapeUnprintable) const
   2030 {
   2031    if (pat != nullptr) {
   2032        int32_t i;
   2033        int32_t backslashCount = 0;
   2034        for (i=0; i<patLen; ) {
   2035            UChar32 c;
   2036            U16_NEXT(pat, i, patLen, c);
   2037            if (escapeUnprintable ?
   2038                    ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
   2039                // If the unprintable character is preceded by an odd
   2040                // number of backslashes, then it has been escaped.
   2041                // Before unescaping it, we delete the final
   2042                // backslash.
   2043                if ((backslashCount % 2) == 1) {
   2044                    result.truncate(result.length() - 1);
   2045                }
   2046                ICU_Utility::escape(result, c);
   2047                backslashCount = 0;
   2048            } else {
   2049                result.append(c);
   2050                if (c == u'\\') {
   2051                    ++backslashCount;
   2052                } else {
   2053                    backslashCount = 0;
   2054                }
   2055            }
   2056        }
   2057        return result;
   2058    }
   2059 
   2060    return _generatePattern(result, escapeUnprintable);
   2061 }
   2062 
   2063 /**
   2064 * Returns a string representation of this set.  If the result of
   2065 * calling this function is passed to a UnicodeSet constructor, it
   2066 * will produce another set that is equal to this one.
   2067 */
   2068 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
   2069                                     UBool escapeUnprintable) const
   2070 {
   2071    result.truncate(0);
   2072    return _toPattern(result, escapeUnprintable);
   2073 }
   2074 
   2075 /**
   2076 * Generate and append a string representation of this set to result.
   2077 * This does not use this.pat, the cleaned up copy of the string
   2078 * passed to applyPattern().
   2079 */
   2080 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
   2081                                            UBool escapeUnprintable) const
   2082 {
   2083    result.append(u'[');
   2084 
   2085    int32_t i = 0;
   2086    int32_t limit = len & ~1;  // = 2 * getRangeCount()
   2087 
   2088    // If the set contains at least 2 intervals and includes both
   2089    // MIN_VALUE and MAX_VALUE, then the inverse representation will
   2090    // be more economical.
   2091    //     if (getRangeCount() >= 2 &&
   2092    //             getRangeStart(0) == MIN_VALUE &&
   2093    //             getRangeEnd(last) == MAX_VALUE)
   2094    // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
   2095    // If limit == len then len is even and the last range ends with MAX_VALUE.
   2096    //
   2097    // *But* do not write the inverse (complement) if there are strings.
   2098    // Since ICU 70, the '^' performs a code point complement which removes all strings.
   2099    if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
   2100        // Emit the inverse
   2101        result.append(u'^');
   2102        // Offsetting the inversion list index by one lets us
   2103        // iterate over the ranges of the set complement.
   2104        i = 1;
   2105        --limit;
   2106    }
   2107 
   2108    // Emit the ranges as pairs.
   2109    while (i < limit) {
   2110        UChar32 start = list[i];  // getRangeStart()
   2111        UChar32 end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
   2112        if (!(0xd800 <= end && end <= 0xdbff)) {
   2113            _appendToPat(result, start, end, escapeUnprintable);
   2114            i += 2;
   2115        } else {
   2116            // The range ends with a lead surrogate.
   2117            // Avoid writing what looks like a lead+trail surrogate pair.
   2118            // 1. Postpone ranges that start with a lead surrogate code point.
   2119            int32_t firstLead = i;
   2120            while ((i += 2) < limit && list[i] <= 0xdbff) {}
   2121            int32_t firstAfterLead = i;
   2122            // 2. Write following ranges that start with a trail surrogate code point.
   2123            while (i < limit && (start = list[i]) <= 0xdfff) {
   2124                _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
   2125                i += 2;
   2126            }
   2127            // 3. Now write the postponed ranges.
   2128            for (int j = firstLead; j < firstAfterLead; j += 2) {
   2129                _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
   2130            }
   2131        }
   2132    }
   2133 
   2134    if (strings_ != nullptr) {
   2135        for (int32_t i = 0; i<strings_->size(); ++i) {
   2136            result.append(u'{');
   2137            _appendToPat(result,
   2138                         *static_cast<const UnicodeString*>(strings_->elementAt(i)),
   2139                         escapeUnprintable);
   2140            result.append(u'}');
   2141        }
   2142    }
   2143    return result.append(u']');
   2144 }
   2145 
   2146 /**
   2147 * Release existing cached pattern
   2148 */
   2149 void UnicodeSet::releasePattern() {
   2150    if (pat) {
   2151        uprv_free(pat);
   2152        pat = nullptr;
   2153        patLen = 0;
   2154    }
   2155 }
   2156 
   2157 /**
   2158 * Set the new pattern to cache.
   2159 */
   2160 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
   2161    releasePattern();
   2162    pat = static_cast<char16_t*>(uprv_malloc((newPatLen + 1) * sizeof(char16_t)));
   2163    if (pat) {
   2164        patLen = newPatLen;
   2165        u_memcpy(pat, newPat, patLen);
   2166        pat[patLen] = 0;
   2167    }
   2168    // else we don't care if malloc failed. This was just a nice cache.
   2169    // We can regenerate an equivalent pattern later when requested.
   2170 }
   2171 
   2172 UnicodeSet *UnicodeSet::freeze() {
   2173    if(!isFrozen() && !isBogus()) {
   2174        compact();
   2175 
   2176        // Optimize contains() and span() and similar functions.
   2177        if (hasStrings()) {
   2178            stringSpan = new UnicodeSetStringSpan(*this, *strings_, UnicodeSetStringSpan::ALL);
   2179            if (stringSpan == nullptr) {
   2180                setToBogus();
   2181                return this;
   2182            } else if (!stringSpan->needsStringSpanUTF16()) {
   2183                // All strings are irrelevant for span() etc. because
   2184                // all of each string's code points are contained in this set.
   2185                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
   2186                // many relevant strings as UTF-16.
   2187                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
   2188                delete stringSpan;
   2189                stringSpan = nullptr;
   2190            }
   2191        }
   2192        if (stringSpan == nullptr) {
   2193            // No span-relevant strings: Optimize for code point spans.
   2194            bmpSet=new BMPSet(list, len);
   2195            if (bmpSet == nullptr) { // Check for memory allocation error.
   2196                setToBogus();
   2197            }
   2198        }
   2199    }
   2200    return this;
   2201 }
   2202 
   2203 int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
   2204    if(length>0 && bmpSet!=nullptr) {
   2205        return static_cast<int32_t>(bmpSet->span(s, s + length, spanCondition) - s);
   2206    }
   2207    if(length<0) {
   2208        length=u_strlen(s);
   2209    }
   2210    if(length==0) {
   2211        return 0;
   2212    }
   2213    if(stringSpan!=nullptr) {
   2214        return stringSpan->span(s, length, spanCondition);
   2215    } else if(hasStrings()) {
   2216        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2217                            UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
   2218                            UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
   2219        UnicodeSetStringSpan strSpan(*this, *strings_, which);
   2220        if(strSpan.needsStringSpanUTF16()) {
   2221            return strSpan.span(s, length, spanCondition);
   2222        }
   2223    }
   2224 
   2225    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2226        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2227    }
   2228 
   2229    UChar32 c;
   2230    int32_t start=0, prev=0;
   2231    do {
   2232        U16_NEXT(s, start, length, c);
   2233        if(spanCondition!=contains(c)) {
   2234            break;
   2235        }
   2236    } while((prev=start)<length);
   2237    return prev;
   2238 }
   2239 
   2240 int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
   2241    if(length>0 && bmpSet!=nullptr) {
   2242        return static_cast<int32_t>(bmpSet->spanBack(s, s + length, spanCondition) - s);
   2243    }
   2244    if(length<0) {
   2245        length=u_strlen(s);
   2246    }
   2247    if(length==0) {
   2248        return 0;
   2249    }
   2250    if(stringSpan!=nullptr) {
   2251        return stringSpan->spanBack(s, length, spanCondition);
   2252    } else if(hasStrings()) {
   2253        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2254                            UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
   2255                            UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
   2256        UnicodeSetStringSpan strSpan(*this, *strings_, which);
   2257        if(strSpan.needsStringSpanUTF16()) {
   2258            return strSpan.spanBack(s, length, spanCondition);
   2259        }
   2260    }
   2261 
   2262    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2263        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2264    }
   2265 
   2266    UChar32 c;
   2267    int32_t prev=length;
   2268    do {
   2269        U16_PREV(s, 0, length, c);
   2270        if(spanCondition!=contains(c)) {
   2271            break;
   2272        }
   2273    } while((prev=length)>0);
   2274    return prev;
   2275 }
   2276 
   2277 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
   2278    if(length>0 && bmpSet!=nullptr) {
   2279        const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s);
   2280        return static_cast<int32_t>(bmpSet->spanUTF8(s0, length, spanCondition) - s0);
   2281    }
   2282    if(length<0) {
   2283        length = static_cast<int32_t>(uprv_strlen(s));
   2284    }
   2285    if(length==0) {
   2286        return 0;
   2287    }
   2288    if(stringSpan!=nullptr) {
   2289        return stringSpan->spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
   2290    } else if(hasStrings()) {
   2291        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2292                            UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
   2293                            UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
   2294        UnicodeSetStringSpan strSpan(*this, *strings_, which);
   2295        if(strSpan.needsStringSpanUTF8()) {
   2296            return strSpan.spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
   2297        }
   2298    }
   2299 
   2300    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2301        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2302    }
   2303 
   2304    UChar32 c;
   2305    int32_t start=0, prev=0;
   2306    do {
   2307        U8_NEXT_OR_FFFD(s, start, length, c);
   2308        if(spanCondition!=contains(c)) {
   2309            break;
   2310        }
   2311    } while((prev=start)<length);
   2312    return prev;
   2313 }
   2314 
   2315 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
   2316    if(length>0 && bmpSet!=nullptr) {
   2317        const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s);
   2318        return bmpSet->spanBackUTF8(s0, length, spanCondition);
   2319    }
   2320    if(length<0) {
   2321        length = static_cast<int32_t>(uprv_strlen(s));
   2322    }
   2323    if(length==0) {
   2324        return 0;
   2325    }
   2326    if(stringSpan!=nullptr) {
   2327        return stringSpan->spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
   2328    } else if(hasStrings()) {
   2329        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2330                            UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
   2331                            UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
   2332        UnicodeSetStringSpan strSpan(*this, *strings_, which);
   2333        if(strSpan.needsStringSpanUTF8()) {
   2334            return strSpan.spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
   2335        }
   2336    }
   2337 
   2338    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2339        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2340    }
   2341 
   2342    UChar32 c;
   2343    int32_t prev=length;
   2344    do {
   2345        U8_PREV_OR_FFFD(s, 0, length, c);
   2346        if(spanCondition!=contains(c)) {
   2347            break;
   2348        }
   2349    } while((prev=length)>0);
   2350    return prev;
   2351 }
   2352 
   2353 U_NAMESPACE_END
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE