tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

uniset_closure.cpp (12080B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  uniset_closure.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2011may30
     16 *   created by: Markus W. Scherer
     17 *
     18 *   UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp
     19 *   to simplify dependencies.
     20 *   In particular, this depends on the BreakIterator, but the BreakIterator
     21 *   code also builds UnicodeSets from patterns and needs uniset_props.
     22 */
     23 
     24 #include "unicode/brkiter.h"
     25 #include "unicode/locid.h"
     26 #include "unicode/parsepos.h"
     27 #include "unicode/uniset.h"
     28 #include "unicode/utf16.h"
     29 #include "cmemory.h"
     30 #include "ruleiter.h"
     31 #include "ucase.h"
     32 #include "uprops.h"
     33 #include "util.h"
     34 #include "uvector.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 // TODO memory debugging provided inside uniset.cpp
     39 // could be made available here but probably obsolete with use of modern
     40 // memory leak checker tools
     41 #define _dbgct(me)
     42 
     43 //----------------------------------------------------------------
     44 // Constructors &c
     45 //----------------------------------------------------------------
     46 
     47 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
     48                       uint32_t options,
     49                       const SymbolTable* symbols,
     50                       UErrorCode& status) {
     51    applyPattern(pattern, options, symbols, status);
     52    _dbgct(this);
     53 }
     54 
     55 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
     56                       uint32_t options,
     57                       const SymbolTable* symbols,
     58                       UErrorCode& status) {
     59    applyPattern(pattern, pos, options, symbols, status);
     60    _dbgct(this);
     61 }
     62 
     63 //----------------------------------------------------------------
     64 // Public API
     65 //----------------------------------------------------------------
     66 
     67 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
     68                                     uint32_t options,
     69                                     const SymbolTable* symbols,
     70                                     UErrorCode& status) {
     71    ParsePosition pos(0);
     72    applyPattern(pattern, pos, options, symbols, status);
     73    if (U_FAILURE(status)) return *this;
     74 
     75    int32_t i = pos.getIndex();
     76 
     77    if (options & USET_IGNORE_SPACE) {
     78        // Skip over trailing whitespace
     79        ICU_Utility::skipWhitespace(pattern, i, true);
     80    }
     81 
     82    if (i != pattern.length()) {
     83        status = U_ILLEGAL_ARGUMENT_ERROR;
     84    }
     85    return *this;
     86 }
     87 
     88 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
     89                              ParsePosition& pos,
     90                              uint32_t options,
     91                              const SymbolTable* symbols,
     92                              UErrorCode& status) {
     93    if (U_FAILURE(status)) {
     94        return *this;
     95    }
     96    if (isFrozen()) {
     97        status = U_NO_WRITE_PERMISSION;
     98        return *this;
     99    }
    100    // Need to build the pattern in a temporary string because
    101    // _applyPattern calls add() etc., which set pat to empty.
    102    UnicodeString rebuiltPat;
    103    RuleCharacterIterator chars(pattern, symbols, pos);
    104    applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status);
    105    if (U_FAILURE(status)) return *this;
    106    if (chars.inVariable()) {
    107        // syntaxError(chars, "Extra chars in variable value");
    108        status = U_MALFORMED_SET;
    109        return *this;
    110    }
    111    setPattern(rebuiltPat);
    112    return *this;
    113 }
    114 
    115 // USetAdder implementation
    116 // Does not use uset.h to reduce code dependencies
    117 static void U_CALLCONV
    118 _set_add(USet *set, UChar32 c) {
    119    reinterpret_cast<UnicodeSet*>(set)->add(c);
    120 }
    121 
    122 static void U_CALLCONV
    123 _set_addRange(USet *set, UChar32 start, UChar32 end) {
    124    reinterpret_cast<UnicodeSet*>(set)->add(start, end);
    125 }
    126 
    127 static void U_CALLCONV
    128 _set_addString(USet *set, const char16_t *str, int32_t length) {
    129    reinterpret_cast<UnicodeSet*>(set)->add(UnicodeString(static_cast<UBool>(length < 0), str, length));
    130 }
    131 
    132 //----------------------------------------------------------------
    133 // Case folding API
    134 //----------------------------------------------------------------
    135 
    136 // add the result of a full case mapping to the set
    137 // use str as a temporary string to avoid constructing one
    138 static inline void
    139 addCaseMapping(UnicodeSet &set, int32_t result, const char16_t *full, UnicodeString &str) {
    140    if(result >= 0) {
    141        if(result > UCASE_MAX_STRING_LENGTH) {
    142            // add a single-code point case mapping
    143            set.add(result);
    144        } else {
    145            // add a string case mapping from full with length result
    146            str.setTo(static_cast<UBool>(false), full, result);
    147            set.add(str);
    148        }
    149    }
    150    // result < 0: the code point mapped to itself, no need to add it
    151    // see ucase.h
    152 }
    153 
    154 namespace {
    155 
    156 /** For case closure on a large set, look only at code points with relevant properties. */
    157 const UnicodeSet &maybeOnlyCaseSensitive(const UnicodeSet &src, UnicodeSet &subset) {
    158    // The subset must have been constructed with all code points,
    159    // so that the retainAll() intersection effectively copies all single code points from src.
    160    U_ASSERT(subset.contains(0, 0x10ffff));
    161    if (src.size() < 30) {
    162        return src;
    163    }
    164    // Return the intersection of the src code points with Case_Sensitive ones.
    165    UErrorCode errorCode = U_ZERO_ERROR;
    166    const UnicodeSet *sensitive =
    167        CharacterProperties::getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode);
    168    if (U_FAILURE(errorCode)) {
    169        return src;
    170    }
    171    // Start by copying the "smaller" set.
    172    // (We "copy" by intersecting all Unicode *code points* with the first set,
    173    // which omits any strings.)
    174    if (src.getRangeCount() > sensitive->getRangeCount()) {
    175        subset.retainAll(*sensitive);
    176        subset.retainAll(src);
    177    } else {
    178        subset.retainAll(src);
    179        subset.retainAll(*sensitive);
    180    }
    181    return subset;
    182 }
    183 
    184 // Per-character scf = Simple_Case_Folding of a string.
    185 // (Normally when we case-fold a string we use full case foldings.)
    186 bool scfString(const UnicodeString &s, UnicodeString &scf) {
    187    // Iterate over the raw buffer for best performance.
    188    const char16_t *p = s.getBuffer();
    189    int32_t length = s.length();
    190    // Loop while not needing modification.
    191    for (int32_t i = 0; i < length;) {
    192        UChar32 c;
    193        U16_NEXT(p, i, length, c);  // post-increments i
    194        UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
    195        if (scfChar != c) {
    196            // Copy the characters before c.
    197            scf.setTo(p, i - U16_LENGTH(c));
    198            // Loop over the rest of the string and keep case-folding.
    199            for (;;) {
    200                scf.append(scfChar);
    201                if (i == length) {
    202                    return true;
    203                }
    204                U16_NEXT(p, i, length, c);  // post-increments i
    205                scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
    206            }
    207        }
    208    }
    209    return false;
    210 }
    211 
    212 }  // namespace
    213 
    214 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
    215    if (isFrozen() || isBogus()) {
    216        return *this;
    217    }
    218    switch (attribute & USET_CASE_MASK) {
    219    case 0:
    220        break;
    221    case USET_CASE_INSENSITIVE:
    222        closeOverCaseInsensitive(/* simple= */ false);
    223        break;
    224    case USET_ADD_CASE_MAPPINGS:
    225        closeOverAddCaseMappings();
    226        break;
    227    case USET_SIMPLE_CASE_INSENSITIVE:
    228        closeOverCaseInsensitive(/* simple= */ true);
    229        break;
    230    default:
    231        // bad option (unreachable)
    232        break;
    233    }
    234    return *this;
    235 }
    236 
    237 void UnicodeSet::closeOverCaseInsensitive(bool simple) {
    238    // Start with input set to guarantee inclusion.
    239    UnicodeSet foldSet(*this);
    240    // Full case mappings closure:
    241    // Remove strings because the strings will actually be reduced (folded);
    242    // therefore, start with no strings and add only those needed.
    243    // Do this before processing code points, because they may add strings.
    244    if (!simple && foldSet.hasStrings()) {
    245        foldSet.strings_->removeAllElements();
    246    }
    247 
    248    USetAdder sa = {
    249        foldSet.toUSet(),
    250        _set_add,
    251        _set_addRange,
    252        _set_addString,
    253        nullptr, // don't need remove()
    254        nullptr // don't need removeRange()
    255    };
    256 
    257    UnicodeSet subset(0, 0x10ffff);
    258    const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
    259 
    260    // Iterate over the ranges of single code points. Nested loop for each code point.
    261    int32_t n = codePoints.getRangeCount();
    262 
    263    for (int32_t i=0; i<n; ++i) {
    264        UChar32 start = codePoints.getRangeStart(i);
    265        UChar32 end   = codePoints.getRangeEnd(i);
    266 
    267        if (simple) {
    268            for (UChar32 cp=start; cp<=end; ++cp) {
    269                ucase_addSimpleCaseClosure(cp, &sa);
    270            }
    271        } else {
    272            for (UChar32 cp=start; cp<=end; ++cp) {
    273                ucase_addCaseClosure(cp, &sa);
    274            }
    275        }
    276    }
    277    if (hasStrings()) {
    278        UnicodeString str;
    279        for (int32_t j=0; j<strings_->size(); ++j) {
    280            const UnicodeString* pStr = static_cast<const UnicodeString*>(strings_->elementAt(j));
    281            if (simple) {
    282                if (scfString(*pStr, str)) {
    283                    foldSet.remove(*pStr).add(str);
    284                }
    285            } else {
    286                str = *pStr;
    287                str.foldCase();
    288                if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
    289                    foldSet.add(str); // does not map to code points: add the folded string itself
    290                }
    291            }
    292        }
    293    }
    294    *this = foldSet;
    295 }
    296 
    297 void UnicodeSet::closeOverAddCaseMappings() {
    298    // Start with input set to guarantee inclusion.
    299    UnicodeSet foldSet(*this);
    300 
    301    UnicodeSet subset(0, 0x10ffff);
    302    const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
    303 
    304    // Iterate over the ranges of single code points. Nested loop for each code point.
    305    int32_t n = codePoints.getRangeCount();
    306    UChar32 result;
    307    const char16_t *full;
    308    UnicodeString str;
    309 
    310    for (int32_t i=0; i<n; ++i) {
    311        UChar32 start = codePoints.getRangeStart(i);
    312        UChar32 end   = codePoints.getRangeEnd(i);
    313 
    314        // add case mappings
    315        // (does not add long s for regular s, or Kelvin for k, for example)
    316        for (UChar32 cp=start; cp<=end; ++cp) {
    317            result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
    318            addCaseMapping(foldSet, result, full, str);
    319 
    320            result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
    321            addCaseMapping(foldSet, result, full, str);
    322 
    323            result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
    324            addCaseMapping(foldSet, result, full, str);
    325 
    326            result = ucase_toFullFolding(cp, &full, 0);
    327            addCaseMapping(foldSet, result, full, str);
    328        }
    329    }
    330    if (hasStrings()) {
    331        Locale root("");
    332 #if !UCONFIG_NO_BREAK_ITERATION
    333        UErrorCode status = U_ZERO_ERROR;
    334        BreakIterator *bi = BreakIterator::createWordInstance(root, status);
    335        if (U_SUCCESS(status)) {
    336 #endif
    337            for (int32_t j=0; j<strings_->size(); ++j) {
    338                const UnicodeString* pStr = static_cast<const UnicodeString*>(strings_->elementAt(j));
    339                (str = *pStr).toLower(root);
    340                foldSet.add(str);
    341 #if !UCONFIG_NO_BREAK_ITERATION
    342                (str = *pStr).toTitle(bi, root);
    343                foldSet.add(str);
    344 #endif
    345                (str = *pStr).toUpper(root);
    346                foldSet.add(str);
    347                (str = *pStr).foldCase();
    348                foldSet.add(str);
    349            }
    350 #if !UCONFIG_NO_BREAK_ITERATION
    351        }
    352        delete bi;
    353 #endif
    354    }
    355    *this = foldSet;
    356 }
    357 
    358 U_NAMESPACE_END