[ tor-browser ].git.dasho

collationruleparser.cpp (31681B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationruleparser.cpp
      9 *
     10 * (replaced the former ucol_tok.cpp)
     11 *
     12 * created on: 2013apr10
     13 * created by: Markus W. Scherer
     14 */
     15 
     16 #include "unicode/utypes.h"
     17 
     18 #if !UCONFIG_NO_COLLATION
     19 
     20 #include "unicode/normalizer2.h"
     21 #include "unicode/parseerr.h"
     22 #include "unicode/uchar.h"
     23 #include "unicode/ucol.h"
     24 #include "unicode/uloc.h"
     25 #include "unicode/unistr.h"
     26 #include "unicode/utf16.h"
     27 #include "charstr.h"
     28 #include "cmemory.h"
     29 #include "collation.h"
     30 #include "collationdata.h"
     31 #include "collationruleparser.h"
     32 #include "collationsettings.h"
     33 #include "collationtailoring.h"
     34 #include "cstring.h"
     35 #include "patternprops.h"
     36 #include "uassert.h"
     37 #include "ulocimp.h"
     38 #include "uvectr32.h"
     39 
     40 U_NAMESPACE_BEGIN
     41 
     42 namespace {
     43 
     44 const char16_t BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
     45 const int32_t BEFORE_LENGTH = 7;
     46 
     47 }  // namespace
     48 
     49 CollationRuleParser::Sink::~Sink() {}
     50 
     51 void
     52 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
     53 
     54 void
     55 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
     56 
     57 CollationRuleParser::Importer::~Importer() {}
     58 
     59 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
     60        : nfd(*Normalizer2::getNFDInstance(errorCode)),
     61          nfc(*Normalizer2::getNFCInstance(errorCode)),
     62          rules(nullptr), baseData(base), settings(nullptr),
     63          parseError(nullptr), errorReason(nullptr),
     64          sink(nullptr), importer(nullptr),
     65          ruleIndex(0) {
     66 }
     67 
     68 CollationRuleParser::~CollationRuleParser() {
     69 }
     70 
     71 void
     72 CollationRuleParser::parse(const UnicodeString &ruleString,
     73                           CollationSettings &outSettings,
     74                           UParseError *outParseError,
     75                           UErrorCode &errorCode) {
     76    if(U_FAILURE(errorCode)) { return; }
     77    settings = &outSettings;
     78    parseError = outParseError;
     79    if(parseError != nullptr) {
     80        parseError->line = 0;
     81        parseError->offset = -1;
     82        parseError->preContext[0] = 0;
     83        parseError->postContext[0] = 0;
     84    }
     85    errorReason = nullptr;
     86    parse(ruleString, errorCode);
     87 }
     88 
     89 void
     90 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
     91    if(U_FAILURE(errorCode)) { return; }
     92    rules = &ruleString;
     93    ruleIndex = 0;
     94 
     95    while(ruleIndex < rules->length()) {
     96        char16_t c = rules->charAt(ruleIndex);
     97        if(PatternProps::isWhiteSpace(c)) {
     98            ++ruleIndex;
     99            continue;
    100        }
    101        switch(c) {
    102        case 0x26:  // '&'
    103            parseRuleChain(errorCode);
    104            break;
    105        case 0x5b:  // '['
    106            parseSetting(errorCode);
    107            break;
    108        case 0x23:  // '#' starts a comment, until the end of the line
    109            ruleIndex = skipComment(ruleIndex + 1);
    110            break;
    111        case 0x40:  // '@' is equivalent to [backwards 2]
    112            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
    113                              UCOL_ON, 0, errorCode);
    114            ++ruleIndex;
    115            break;
    116        case 0x21:  // '!' used to turn on Thai/Lao character reversal
    117            // Accept but ignore. The root collator has contractions
    118            // that are equivalent to the character reversal, where appropriate.
    119            ++ruleIndex;
    120            break;
    121        default:
    122            setParseError("expected a reset or setting or comment", errorCode);
    123            break;
    124        }
    125        if(U_FAILURE(errorCode)) { return; }
    126    }
    127 }
    128 
    129 void
    130 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
    131    int32_t resetStrength = parseResetAndPosition(errorCode);
    132    UBool isFirstRelation = true;
    133    for(;;) {
    134        int32_t result = parseRelationOperator(errorCode);
    135        if(U_FAILURE(errorCode)) { return; }
    136        if(result < 0) {
    137            if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
    138                // '#' starts a comment, until the end of the line
    139                ruleIndex = skipComment(ruleIndex + 1);
    140                continue;
    141            }
    142            if(isFirstRelation) {
    143                setParseError("reset not followed by a relation", errorCode);
    144            }
    145            return;
    146        }
    147        int32_t strength = result & STRENGTH_MASK;
    148        if(resetStrength < UCOL_IDENTICAL) {
    149            // reset-before rule chain
    150            if(isFirstRelation) {
    151                if(strength != resetStrength) {
    152                    setParseError("reset-before strength differs from its first relation", errorCode);
    153                    return;
    154                }
    155            } else {
    156                if(strength < resetStrength) {
    157                    setParseError("reset-before strength followed by a stronger relation", errorCode);
    158                    return;
    159                }
    160            }
    161        }
    162        int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
    163        if((result & STARRED_FLAG) == 0) {
    164            parseRelationStrings(strength, i, errorCode);
    165        } else {
    166            parseStarredCharacters(strength, i, errorCode);
    167        }
    168        if(U_FAILURE(errorCode)) { return; }
    169        isFirstRelation = false;
    170    }
    171 }
    172 
    173 int32_t
    174 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
    175    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    176    int32_t i = skipWhiteSpace(ruleIndex + 1);
    177    int32_t j;
    178    char16_t c;
    179    int32_t resetStrength;
    180    if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
    181            (j = i + BEFORE_LENGTH) < rules->length() &&
    182            PatternProps::isWhiteSpace(rules->charAt(j)) &&
    183            ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
    184            0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
    185            rules->charAt(j + 1) == 0x5d) {
    186        // &[before n] with n=1 or 2 or 3
    187        resetStrength = UCOL_PRIMARY + (c - 0x31);
    188        i = skipWhiteSpace(j + 2);
    189    } else {
    190        resetStrength = UCOL_IDENTICAL;
    191    }
    192    if(i >= rules->length()) {
    193        setParseError("reset without position", errorCode);
    194        return UCOL_DEFAULT;
    195    }
    196    UnicodeString str;
    197    if(rules->charAt(i) == 0x5b) {  // '['
    198        i = parseSpecialPosition(i, str, errorCode);
    199    } else {
    200        i = parseTailoringString(i, str, errorCode);
    201    }
    202    sink->addReset(resetStrength, str, errorReason, errorCode);
    203    if(U_FAILURE(errorCode)) { setErrorContext(); }
    204    ruleIndex = i;
    205    return resetStrength;
    206 }
    207 
    208 int32_t
    209 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
    210    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    211    ruleIndex = skipWhiteSpace(ruleIndex);
    212    if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
    213    int32_t strength;
    214    int32_t i = ruleIndex;
    215    char16_t c = rules->charAt(i++);
    216    switch(c) {
    217    case 0x3c:  // '<'
    218        if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
    219            ++i;
    220            if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
    221                ++i;
    222                if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
    223                    ++i;
    224                    strength = UCOL_QUATERNARY;
    225                } else {
    226                    strength = UCOL_TERTIARY;
    227                }
    228            } else {
    229                strength = UCOL_SECONDARY;
    230            }
    231        } else {
    232            strength = UCOL_PRIMARY;
    233        }
    234        if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
    235            ++i;
    236            strength |= STARRED_FLAG;
    237        }
    238        break;
    239    case 0x3b:  // ';' same as <<
    240        strength = UCOL_SECONDARY;
    241        break;
    242    case 0x2c:  // ',' same as <<<
    243        strength = UCOL_TERTIARY;
    244        break;
    245    case 0x3d:  // '='
    246        strength = UCOL_IDENTICAL;
    247        if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
    248            ++i;
    249            strength |= STARRED_FLAG;
    250        }
    251        break;
    252    default:
    253        return UCOL_DEFAULT;
    254    }
    255    return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
    256 }
    257 
    258 void
    259 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
    260    // Parse
    261    //     prefix | str / extension
    262    // where prefix and extension are optional.
    263    UnicodeString prefix, str, extension;
    264    i = parseTailoringString(i, str, errorCode);
    265    if(U_FAILURE(errorCode)) { return; }
    266    char16_t next = (i < rules->length()) ? rules->charAt(i) : 0;
    267    if(next == 0x7c) {  // '|' separates the context prefix from the string.
    268        prefix = str;
    269        i = parseTailoringString(i + 1, str, errorCode);
    270        if(U_FAILURE(errorCode)) { return; }
    271        next = (i < rules->length()) ? rules->charAt(i) : 0;
    272    }
    273    if(next == 0x2f) {  // '/' separates the string from the extension.
    274        i = parseTailoringString(i + 1, extension, errorCode);
    275    }
    276    if(!prefix.isEmpty()) {
    277        UChar32 prefix0 = prefix.char32At(0);
    278        UChar32 c = str.char32At(0);
    279        if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
    280            setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
    281                          errorCode);
    282            return;
    283        }
    284    }
    285    sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
    286    if(U_FAILURE(errorCode)) { setErrorContext(); }
    287    ruleIndex = i;
    288 }
    289 
    290 void
    291 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
    292    UnicodeString empty, raw;
    293    i = parseString(skipWhiteSpace(i), raw, errorCode);
    294    if(U_FAILURE(errorCode)) { return; }
    295    if(raw.isEmpty()) {
    296        setParseError("missing starred-relation string", errorCode);
    297        return;
    298    }
    299    UChar32 prev = -1;
    300    int32_t j = 0;
    301    for(;;) {
    302        while(j < raw.length()) {
    303            UChar32 c = raw.char32At(j);
    304            if(!nfd.isInert(c)) {
    305                setParseError("starred-relation string is not all NFD-inert", errorCode);
    306                return;
    307            }
    308            sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
    309            if(U_FAILURE(errorCode)) {
    310                setErrorContext();
    311                return;
    312            }
    313            j += U16_LENGTH(c);
    314            prev = c;
    315        }
    316        if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
    317            break;
    318        }
    319        if(prev < 0) {
    320            setParseError("range without start in starred-relation string", errorCode);
    321            return;
    322        }
    323        i = parseString(i + 1, raw, errorCode);
    324        if(U_FAILURE(errorCode)) { return; }
    325        if(raw.isEmpty()) {
    326            setParseError("range without end in starred-relation string", errorCode);
    327            return;
    328        }
    329        UChar32 c = raw.char32At(0);
    330        if(c < prev) {
    331            setParseError("range start greater than end in starred-relation string", errorCode);
    332            return;
    333        }
    334        // range prev-c
    335        UnicodeString s;
    336        while(++prev <= c) {
    337            if(!nfd.isInert(prev)) {
    338                setParseError("starred-relation string range is not all NFD-inert", errorCode);
    339                return;
    340            }
    341            if(U_IS_SURROGATE(prev)) {
    342                setParseError("starred-relation string range contains a surrogate", errorCode);
    343                return;
    344            }
    345            if(0xfffd <= prev && prev <= 0xffff) {
    346                setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
    347                return;
    348            }
    349            s.setTo(prev);
    350            sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
    351            if(U_FAILURE(errorCode)) {
    352                setErrorContext();
    353                return;
    354            }
    355        }
    356        prev = -1;
    357        j = U16_LENGTH(c);
    358    }
    359    ruleIndex = skipWhiteSpace(i);
    360 }
    361 
    362 int32_t
    363 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    364    i = parseString(skipWhiteSpace(i), raw, errorCode);
    365    if(U_SUCCESS(errorCode) && raw.isEmpty()) {
    366        setParseError("missing relation string", errorCode);
    367    }
    368    return skipWhiteSpace(i);
    369 }
    370 
    371 int32_t
    372 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    373    if(U_FAILURE(errorCode)) { return i; }
    374    raw.remove();
    375    while(i < rules->length()) {
    376        UChar32 c = rules->charAt(i++);
    377        if(isSyntaxChar(c)) {
    378            if(c == 0x27) {  // apostrophe
    379                if(i < rules->length() && rules->charAt(i) == 0x27) {
    380                    // Double apostrophe, encodes a single one.
    381                    raw.append(static_cast<char16_t>(0x27));
    382                    ++i;
    383                    continue;
    384                }
    385                // Quote literal text until the next single apostrophe.
    386                for(;;) {
    387                    if(i == rules->length()) {
    388                        setParseError("quoted literal text missing terminating apostrophe", errorCode);
    389                        return i;
    390                    }
    391                    c = rules->charAt(i++);
    392                    if(c == 0x27) {
    393                        if(i < rules->length() && rules->charAt(i) == 0x27) {
    394                            // Double apostrophe inside quoted literal text,
    395                            // still encodes a single apostrophe.
    396                            ++i;
    397                        } else {
    398                            break;
    399                        }
    400                    }
    401                    raw.append(static_cast<char16_t>(c));
    402                }
    403            } else if(c == 0x5c) {  // backslash
    404                if(i == rules->length()) {
    405                    setParseError("backslash escape at the end of the rule string", errorCode);
    406                    return i;
    407                }
    408                c = rules->char32At(i);
    409                raw.append(c);
    410                i += U16_LENGTH(c);
    411            } else {
    412                // Any other syntax character terminates a string.
    413                --i;
    414                break;
    415            }
    416        } else if(PatternProps::isWhiteSpace(c)) {
    417            // Unquoted white space terminates a string.
    418            --i;
    419            break;
    420        } else {
    421            raw.append(static_cast<char16_t>(c));
    422        }
    423    }
    424    for(int32_t j = 0; j < raw.length();) {
    425        UChar32 c = raw.char32At(j);
    426        if(U_IS_SURROGATE(c)) {
    427            setParseError("string contains an unpaired surrogate", errorCode);
    428            return i;
    429        }
    430        if(0xfffd <= c && c <= 0xffff) {
    431            setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
    432            return i;
    433        }
    434        j += U16_LENGTH(c);
    435    }
    436    return i;
    437 }
    438 
    439 namespace {
    440 
    441 const char* const positions[] = {
    442    "first tertiary ignorable",
    443    "last tertiary ignorable",
    444    "first secondary ignorable",
    445    "last secondary ignorable",
    446    "first primary ignorable",
    447    "last primary ignorable",
    448    "first variable",
    449    "last variable",
    450    "first regular",
    451    "last regular",
    452    "first implicit",
    453    "last implicit",
    454    "first trailing",
    455    "last trailing"
    456 };
    457 
    458 }  // namespace
    459 
    460 int32_t
    461 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
    462    if(U_FAILURE(errorCode)) { return 0; }
    463    UnicodeString raw;
    464    int32_t j = readWords(i + 1, raw);
    465    if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
    466        ++j;
    467        for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
    468            if(raw == UnicodeString(positions[pos], -1, US_INV)) {
    469                str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + pos));
    470                return j;
    471            }
    472        }
    473        if(raw == UNICODE_STRING_SIMPLE("top")) {
    474            str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_REGULAR));
    475            return j;
    476        }
    477        if(raw == UNICODE_STRING_SIMPLE("variable top")) {
    478            str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_VARIABLE));
    479            return j;
    480        }
    481    }
    482    setParseError("not a valid special reset position", errorCode);
    483    return i;
    484 }
    485 
    486 void
    487 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
    488    if(U_FAILURE(errorCode)) { return; }
    489    UnicodeString raw;
    490    int32_t i = ruleIndex + 1;
    491    int32_t j = readWords(i, raw);
    492    if(j <= i || raw.isEmpty()) {
    493        setParseError("expected a setting/option at '['", errorCode);
    494    }
    495    if(rules->charAt(j) == 0x5d) {  // words end with ]
    496        ++j;
    497        if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
    498                (raw.length() == 7 || raw.charAt(7) == 0x20)) {
    499            parseReordering(raw, errorCode);
    500            ruleIndex = j;
    501            return;
    502        }
    503        if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
    504            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
    505                              UCOL_ON, 0, errorCode);
    506            ruleIndex = j;
    507            return;
    508        }
    509        UnicodeString v;
    510        int32_t valueIndex = raw.lastIndexOf(static_cast<char16_t>(0x20));
    511        if(valueIndex >= 0) {
    512            v.setTo(raw, valueIndex + 1);
    513            raw.truncate(valueIndex);
    514        }
    515        if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
    516            int32_t value = UCOL_DEFAULT;
    517            char16_t c = v.charAt(0);
    518            if(0x31 <= c && c <= 0x34) {  // 1..4
    519                value = UCOL_PRIMARY + (c - 0x31);
    520            } else if(c == 0x49) {  // 'I'
    521                value = UCOL_IDENTICAL;
    522            }
    523            if(value != UCOL_DEFAULT) {
    524                settings->setStrength(value, 0, errorCode);
    525                ruleIndex = j;
    526                return;
    527            }
    528        } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
    529            UColAttributeValue value = UCOL_DEFAULT;
    530            if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
    531                value = UCOL_NON_IGNORABLE;
    532            } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
    533                value = UCOL_SHIFTED;
    534            }
    535            if(value != UCOL_DEFAULT) {
    536                settings->setAlternateHandling(value, 0, errorCode);
    537                ruleIndex = j;
    538                return;
    539            }
    540        } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
    541            int32_t value = UCOL_DEFAULT;
    542            if(v == UNICODE_STRING_SIMPLE("space")) {
    543                value = CollationSettings::MAX_VAR_SPACE;
    544            } else if(v == UNICODE_STRING_SIMPLE("punct")) {
    545                value = CollationSettings::MAX_VAR_PUNCT;
    546            } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
    547                value = CollationSettings::MAX_VAR_SYMBOL;
    548            } else if(v == UNICODE_STRING_SIMPLE("currency")) {
    549                value = CollationSettings::MAX_VAR_CURRENCY;
    550            }
    551            if(value != UCOL_DEFAULT) {
    552                settings->setMaxVariable(value, 0, errorCode);
    553                settings->variableTop = baseData->getLastPrimaryForGroup(
    554                    UCOL_REORDER_CODE_FIRST + value);
    555                U_ASSERT(settings->variableTop != 0);
    556                ruleIndex = j;
    557                return;
    558            }
    559        } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
    560            UColAttributeValue value = UCOL_DEFAULT;
    561            if(v == UNICODE_STRING_SIMPLE("off")) {
    562                value = UCOL_OFF;
    563            } else if(v == UNICODE_STRING_SIMPLE("lower")) {
    564                value = UCOL_LOWER_FIRST;
    565            } else if(v == UNICODE_STRING_SIMPLE("upper")) {
    566                value = UCOL_UPPER_FIRST;
    567            }
    568            if(value != UCOL_DEFAULT) {
    569                settings->setCaseFirst(value, 0, errorCode);
    570                ruleIndex = j;
    571                return;
    572            }
    573        } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
    574            UColAttributeValue value = getOnOffValue(v);
    575            if(value != UCOL_DEFAULT) {
    576                settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
    577                ruleIndex = j;
    578                return;
    579            }
    580        } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
    581            UColAttributeValue value = getOnOffValue(v);
    582            if(value != UCOL_DEFAULT) {
    583                settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
    584                ruleIndex = j;
    585                return;
    586            }
    587        } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
    588            UColAttributeValue value = getOnOffValue(v);
    589            if(value != UCOL_DEFAULT) {
    590                settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
    591                ruleIndex = j;
    592                return;
    593            }
    594        } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
    595            UColAttributeValue value = getOnOffValue(v);
    596            if(value != UCOL_DEFAULT) {
    597                if(value == UCOL_ON) {
    598                    setParseError("[hiraganaQ on] is not supported", errorCode);
    599                }
    600                ruleIndex = j;
    601                return;
    602            }
    603        } else if(raw == UNICODE_STRING_SIMPLE("import")) {
    604            CharString lang;
    605            lang.appendInvariantChars(v, errorCode);
    606            if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
    607            // BCP 47 language tag -> ICU locale ID
    608            int32_t parsedLength;
    609            CharString localeID = ulocimp_forLanguageTag(lang.data(), -1, &parsedLength, errorCode);
    610            if(U_FAILURE(errorCode) || parsedLength != lang.length()) {
    611                errorCode = U_ZERO_ERROR;
    612                setParseError("expected language tag in [import langTag]", errorCode);
    613                return;
    614            }
    615            // localeID minus all keywords
    616            CharString baseID = ulocimp_getBaseName(localeID.toStringPiece(), errorCode);
    617            if (U_FAILURE(errorCode)) {
    618                errorCode = U_ZERO_ERROR;
    619                setParseError("expected language tag in [import langTag]", errorCode);
    620                return;
    621            }
    622            if (baseID.isEmpty()) {
    623                baseID.copyFrom("root", errorCode);
    624            } else if (baseID[0] == '_') {
    625                // CharString doesn't have any insert() method, only append().
    626                constexpr char und[] = "und";
    627                constexpr int32_t length = sizeof und - 1;
    628                int32_t dummy;
    629                char* tail = baseID.getAppendBuffer(length, length, dummy, errorCode);
    630                char* head = baseID.data();
    631                uprv_memmove(head + length, head, baseID.length());
    632                uprv_memcpy(head, und, length);
    633                baseID.append(tail, length, errorCode);
    634            }
    635            // @collation=type, or length=0 if not specified
    636            CharString collationType = ulocimp_getKeywordValue(localeID.data(), "collation", errorCode);
    637            if(U_FAILURE(errorCode)) {
    638                errorCode = U_ZERO_ERROR;
    639                setParseError("expected language tag in [import langTag]", errorCode);
    640                return;
    641            }
    642            if(importer == nullptr) {
    643                setParseError("[import langTag] is not supported", errorCode);
    644            } else {
    645                UnicodeString importedRules;
    646                importer->getRules(baseID.data(),
    647                                   !collationType.isEmpty() ? collationType.data() : "standard",
    648                                   importedRules, errorReason, errorCode);
    649                if(U_FAILURE(errorCode)) {
    650                    if(errorReason == nullptr) {
    651                        errorReason = "[import langTag] failed";
    652                    }
    653                    setErrorContext();
    654                    return;
    655                }
    656                const UnicodeString *outerRules = rules;
    657                int32_t outerRuleIndex = ruleIndex;
    658                parse(importedRules, errorCode);
    659                if(U_FAILURE(errorCode)) {
    660                    if(parseError != nullptr) {
    661                        parseError->offset = outerRuleIndex;
    662                    }
    663                }
    664                rules = outerRules;
    665                ruleIndex = j;
    666            }
    667            return;
    668        }
    669    } else if(rules->charAt(j) == 0x5b) {  // words end with [
    670        UnicodeSet set;
    671        j = parseUnicodeSet(j, set, errorCode);
    672        if(U_FAILURE(errorCode)) { return; }
    673        if(raw == UNICODE_STRING_SIMPLE("optimize")) {
    674            sink->optimize(set, errorReason, errorCode);
    675            if(U_FAILURE(errorCode)) { setErrorContext(); }
    676            ruleIndex = j;
    677            return;
    678        } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
    679            sink->suppressContractions(set, errorReason, errorCode);
    680            if(U_FAILURE(errorCode)) { setErrorContext(); }
    681            ruleIndex = j;
    682            return;
    683        }
    684    }
    685    setParseError("not a valid setting/option", errorCode);
    686 }
    687 
    688 void
    689 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
    690    if(U_FAILURE(errorCode)) { return; }
    691    int32_t i = 7;  // after "reorder"
    692    if(i == raw.length()) {
    693        // empty [reorder] with no codes
    694        settings->resetReordering();
    695        return;
    696    }
    697    // Parse the codes in [reorder aa bb cc].
    698    UVector32 reorderCodes(errorCode);
    699    if(U_FAILURE(errorCode)) { return; }
    700    CharString word;
    701    while(i < raw.length()) {
    702        ++i;  // skip the word-separating space
    703        int32_t limit = raw.indexOf(static_cast<char16_t>(0x20), i);
    704        if(limit < 0) { limit = raw.length(); }
    705        word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
    706        if(U_FAILURE(errorCode)) { return; }
    707        int32_t code = getReorderCode(word.data());
    708        if(code < 0) {
    709            setParseError("unknown script or reorder code", errorCode);
    710            return;
    711        }
    712        reorderCodes.addElement(code, errorCode);
    713        if(U_FAILURE(errorCode)) { return; }
    714        i = limit;
    715    }
    716    settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
    717 }
    718 
    719 static const char *const gSpecialReorderCodes[] = {
    720    "space", "punct", "symbol", "currency", "digit"
    721 };
    722 
    723 int32_t
    724 CollationRuleParser::getReorderCode(const char *word) {
    725    for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
    726        if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
    727            return UCOL_REORDER_CODE_FIRST + i;
    728        }
    729    }
    730    int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
    731    if(script >= 0) {
    732        return script;
    733    }
    734    if(uprv_stricmp(word, "others") == 0) {
    735        return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
    736    }
    737    return -1;
    738 }
    739 
    740 UColAttributeValue
    741 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
    742    if(s == UNICODE_STRING_SIMPLE("on")) {
    743        return UCOL_ON;
    744    } else if(s == UNICODE_STRING_SIMPLE("off")) {
    745        return UCOL_OFF;
    746    } else {
    747        return UCOL_DEFAULT;
    748    }
    749 }
    750 
    751 int32_t
    752 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
    753    // Collect a UnicodeSet pattern between a balanced pair of [brackets].
    754    int32_t level = 0;
    755    int32_t j = i;
    756    for(;;) {
    757        if(j == rules->length()) {
    758            setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
    759            return j;
    760        }
    761        char16_t c = rules->charAt(j++);
    762        if(c == 0x5b) {  // '['
    763            ++level;
    764        } else if(c == 0x5d) {  // ']'
    765            if(--level == 0) { break; }
    766        }
    767    }
    768    set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
    769    if(U_FAILURE(errorCode)) {
    770        errorCode = U_ZERO_ERROR;
    771        setParseError("not a valid UnicodeSet pattern", errorCode);
    772        return j;
    773    }
    774    j = skipWhiteSpace(j);
    775    if(j == rules->length() || rules->charAt(j) != 0x5d) {
    776        setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
    777        return j;
    778    }
    779    return ++j;
    780 }
    781 
    782 int32_t
    783 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
    784    static const char16_t sp = 0x20;
    785    raw.remove();
    786    i = skipWhiteSpace(i);
    787    for(;;) {
    788        if(i >= rules->length()) { return 0; }
    789        char16_t c = rules->charAt(i);
    790        if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
    791            if(raw.isEmpty()) { return i; }
    792            if(raw.endsWith(&sp, 1)) {  // remove trailing space
    793                raw.truncate(raw.length() - 1);
    794            }
    795            return i;
    796        }
    797        if(PatternProps::isWhiteSpace(c)) {
    798            raw.append(sp);
    799            i = skipWhiteSpace(i + 1);
    800        } else {
    801            raw.append(c);
    802            ++i;
    803        }
    804    }
    805 }
    806 
    807 int32_t
    808 CollationRuleParser::skipComment(int32_t i) const {
    809    // skip to past the newline
    810    while(i < rules->length()) {
    811        char16_t c = rules->charAt(i++);
    812        // LF or FF or CR or NEL or LS or PS
    813        if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
    814            // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
    815            // NLF (new line function) = CR or LF or CR+LF or NEL.
    816            // No need to collect all of CR+LF because a following LF will be ignored anyway.
    817            break;
    818        }
    819    }
    820    return i;
    821 }
    822 
    823 void
    824 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
    825    if(U_FAILURE(errorCode)) { return; }
    826    // Error code consistent with the old parser (from ca. 2001),
    827    // rather than U_PARSE_ERROR;
    828    errorCode = U_INVALID_FORMAT_ERROR;
    829    errorReason = reason;
    830    if(parseError != nullptr) { setErrorContext(); }
    831 }
    832 
    833 void
    834 CollationRuleParser::setErrorContext() {
    835    if(parseError == nullptr) { return; }
    836 
    837    // Note: This relies on the calling code maintaining the ruleIndex
    838    // at a position that is useful for debugging.
    839    // For example, at the beginning of a reset or relation etc.
    840    parseError->offset = ruleIndex;
    841    parseError->line = 0;  // We are not counting line numbers.
    842 
    843    // before ruleIndex
    844    int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
    845    if(start < 0) {
    846        start = 0;
    847    } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
    848        ++start;
    849    }
    850    int32_t length = ruleIndex - start;
    851    rules->extract(start, length, parseError->preContext);
    852    parseError->preContext[length] = 0;
    853 
    854    // starting from ruleIndex
    855    length = rules->length() - ruleIndex;
    856    if(length >= U_PARSE_CONTEXT_LEN) {
    857        length = U_PARSE_CONTEXT_LEN - 1;
    858        if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
    859            --length;
    860        }
    861    }
    862    rules->extract(ruleIndex, length, parseError->postContext);
    863    parseError->postContext[length] = 0;
    864 }
    865 
    866 UBool
    867 CollationRuleParser::isSyntaxChar(UChar32 c) {
    868    return 0x21 <= c && c <= 0x7e &&
    869            (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
    870            (0x5b <= c && c <= 0x60) || (0x7b <= c));
    871 }
    872 
    873 int32_t
    874 CollationRuleParser::skipWhiteSpace(int32_t i) const {
    875    while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
    876        ++i;
    877    }
    878    return i;
    879 }
    880 
    881 U_NAMESPACE_END
    882 
    883 #endif  // !UCONFIG_NO_COLLATION
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE