[ tor-browser ].git.dasho

tridpars.cpp (30445B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2002-2014, International Business Machines Corporation
      6 *   and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   01/14/2002  aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "tridpars.h"
     18 #include "hash.h"
     19 #include "mutex.h"
     20 #include "transreg.h"
     21 #include "uassert.h"
     22 #include "ucln_in.h"
     23 #include "unicode/parsepos.h"
     24 #include "unicode/translit.h"
     25 #include "unicode/uchar.h"
     26 #include "unicode/uniset.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/utrans.h"
     29 #include "util.h"
     30 #include "uvector.h"
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 static const char16_t ID_DELIM    = 0x003B; // ;
     35 static const char16_t TARGET_SEP  = 0x002D; // -
     36 static const char16_t VARIANT_SEP = 0x002F; // /
     37 static const char16_t OPEN_REV    = 0x0028; // (
     38 static const char16_t CLOSE_REV   = 0x0029; // )
     39 
     40 //static const char16_t EMPTY[]     = {0}; // ""
     41 static const char16_t ANY[]       = {65,110,121,0}; // "Any"
     42 static const char16_t ANY_NULL[]  = {65,110,121,45,78,117,108,108,0}; // "Any-Null"
     43 
     44 static const int32_t FORWARD = UTRANS_FORWARD;
     45 static const int32_t REVERSE = UTRANS_REVERSE;
     46 
     47 static Hashtable* SPECIAL_INVERSES = nullptr;
     48 static UInitOnce gSpecialInversesInitOnce {};
     49 
     50 /**
     51 * The mutex controlling access to SPECIAL_INVERSES
     52 */
     53 static UMutex LOCK;
     54 
     55 TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t,
     56                                     const UnicodeString& v, UBool sawS,
     57                                     const UnicodeString& f) {
     58    source = s;
     59    target = t;
     60    variant = v;
     61    sawSource = sawS;
     62    filter = f;
     63 }
     64 
     65 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b,
     66                                           const UnicodeString& f) {
     67    canonID = c;
     68    basicID = b;
     69    filter = f;
     70 }
     71 
     72 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) {
     73    canonID = c;
     74    basicID = b;
     75 }
     76 
     77 Transliterator* TransliteratorIDParser::SingleID::createInstance() {
     78    Transliterator* t;
     79    if (basicID.length() == 0) {
     80        t = createBasicInstance(UnicodeString(true, ANY_NULL, 8), &canonID);
     81    } else {
     82        t = createBasicInstance(basicID, &canonID);
     83    }
     84    if (t != nullptr) {
     85        if (filter.length() != 0) {
     86            UErrorCode ec = U_ZERO_ERROR;
     87            UnicodeSet *set = new UnicodeSet(filter, ec);
     88            if (U_FAILURE(ec)) {
     89                delete set;
     90            } else {
     91                t->adoptFilter(set);
     92            }
     93        }
     94    }
     95    return t;
     96 }
     97 
     98 
     99 /**
    100 * Parse a single ID, that is, an ID of the general form
    101 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
    102 * optional, the filters optional, and the variants optional.
    103 * @param id the id to be parsed
    104 * @param pos INPUT-OUTPUT parameter.  On input, the position of
    105 * the first character to parse.  On output, the position after
    106 * the last character parsed.
    107 * @param dir the direction.  If the direction is REVERSE then the
    108 * SingleID is constructed for the reverse direction.
    109 * @return a SingleID object or nullptr
    110 */
    111 TransliteratorIDParser::SingleID*
    112 TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos,
    113                                      int32_t dir, UErrorCode& status) {
    114 
    115    int32_t start = pos;
    116 
    117    // The ID will be of the form A, A(), A(B), or (B), where
    118    // A and B are filter IDs.
    119    Specs* specsA = nullptr;
    120    Specs* specsB = nullptr;
    121    UBool sawParen = false;
    122 
    123    // On the first pass, look for (B) or ().  If this fails, then
    124    // on the second pass, look for A, A(B), or A().
    125    for (int32_t pass=1; pass<=2; ++pass) {
    126        if (pass == 2) {
    127            specsA = parseFilterID(id, pos, true);
    128            if (specsA == nullptr) {
    129                pos = start;
    130                return nullptr;
    131            }
    132        }
    133        if (ICU_Utility::parseChar(id, pos, OPEN_REV)) {
    134            sawParen = true;
    135            if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
    136                specsB = parseFilterID(id, pos, true);
    137                // Must close with a ')'
    138                if (specsB == nullptr || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
    139                    delete specsA;
    140                    pos = start;
    141                    return nullptr;
    142                }
    143            }
    144            break;
    145        }
    146    }
    147 
    148    // Assemble return results
    149    SingleID* single;
    150    if (sawParen) {
    151        if (dir == FORWARD) {
    152            SingleID* b = specsToID(specsB, FORWARD);
    153            single = specsToID(specsA, FORWARD);
    154            // Null pointers check
    155            if (b == nullptr || single == nullptr) {
    156            	delete b;
    157            	delete single;
    158            	status = U_MEMORY_ALLOCATION_ERROR;
    159            	return nullptr;
    160            }
    161            single->canonID.append(OPEN_REV)
    162                .append(b->canonID).append(CLOSE_REV);
    163            if (specsA != nullptr) {
    164                single->filter = specsA->filter;
    165            }
    166            delete b;
    167        } else {
    168            SingleID* a = specsToID(specsA, FORWARD);
    169            single = specsToID(specsB, FORWARD);
    170            // Check for null pointer.
    171            if (a == nullptr || single == nullptr) {
    172            	delete a;
    173            	delete single;
    174            	status = U_MEMORY_ALLOCATION_ERROR;
    175            	return nullptr;
    176            }
    177            single->canonID.append(OPEN_REV)
    178                .append(a->canonID).append(CLOSE_REV);
    179            if (specsB != nullptr) {
    180                single->filter = specsB->filter;
    181            }
    182            delete a;
    183        }
    184    } else {
    185        // assert(specsA != nullptr);
    186        if (dir == FORWARD) {
    187            single = specsToID(specsA, FORWARD);
    188        } else {
    189            single = specsToSpecialInverse(*specsA, status);
    190            if (single == nullptr) {
    191                single = specsToID(specsA, REVERSE);
    192            }
    193        }
    194        // Check for nullptr pointer
    195        if (single == nullptr) {
    196        	status = U_MEMORY_ALLOCATION_ERROR;
    197        	return nullptr;
    198        }
    199        single->filter = specsA->filter;
    200    }
    201 
    202    delete specsA;
    203    delete specsB;
    204 
    205    return single;
    206 }
    207 
    208 /**
    209 * Parse a filter ID, that is, an ID of the general form
    210 * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
    211 * @param id the id to be parsed
    212 * @param pos INPUT-OUTPUT parameter.  On input, the position of
    213 * the first character to parse.  On output, the position after
    214 * the last character parsed.
    215 * @return a SingleID object or null if the parse fails
    216 */
    217 TransliteratorIDParser::SingleID*
    218 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) {
    219 
    220    int32_t start = pos;
    221 
    222    Specs* specs = parseFilterID(id, pos, true);
    223    if (specs == nullptr) {
    224        pos = start;
    225        return nullptr;
    226    }
    227 
    228    // Assemble return results
    229    SingleID* single = specsToID(specs, FORWARD);
    230    if (single != nullptr) {
    231        single->filter = specs->filter;
    232    }
    233    delete specs;
    234    return single;
    235 }
    236 
    237 /**
    238 * Parse a global filter of the form "[f]" or "([f])", depending
    239 * on 'withParens'.
    240 * @param id the pattern the parse
    241 * @param pos INPUT-OUTPUT parameter.  On input, the position of
    242 * the first character to parse.  On output, the position after
    243 * the last character parsed.
    244 * @param dir the direction.
    245 * @param withParens INPUT-OUTPUT parameter.  On entry, if
    246 * withParens is 0, then parens are disallowed.  If it is 1,
    247 * then parens are requires.  If it is -1, then parens are
    248 * optional, and the return result will be set to 0 or 1.
    249 * @param canonID OUTPUT parameter.  The pattern for the filter
    250 * added to the canonID, either at the end, if dir is FORWARD, or
    251 * at the start, if dir is REVERSE.  The pattern will be enclosed
    252 * in parentheses if appropriate, and will be suffixed with an
    253 * ID_DELIM character.  May be nullptr.
    254 * @return a UnicodeSet object or nullptr.  A non-nullptr results
    255 * indicates a successful parse, regardless of whether the filter
    256 * applies to the given direction.  The caller should discard it
    257 * if withParens != (dir == REVERSE).
    258 */
    259 UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos,
    260                                                      int32_t dir,
    261                                                      int32_t& withParens,
    262                                                      UnicodeString* canonID) {
    263    UnicodeSet* filter = nullptr;
    264    int32_t start = pos;
    265 
    266    if (withParens == -1) {
    267        withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0;
    268    } else if (withParens == 1) {
    269        if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) {
    270            pos = start;
    271            return nullptr;
    272        }
    273    }
    274 
    275    ICU_Utility::skipWhitespace(id, pos, true);
    276 
    277    if (UnicodeSet::resemblesPattern(id, pos)) {
    278        ParsePosition ppos(pos);
    279        UErrorCode ec = U_ZERO_ERROR;
    280        filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, nullptr, ec);
    281        /* test for nullptr */
    282        if (filter == nullptr) {
    283            pos = start;
    284            return nullptr;
    285        }
    286        if (U_FAILURE(ec)) {
    287            delete filter;
    288            pos = start;
    289            return nullptr;
    290        }
    291 
    292        UnicodeString pattern;
    293        id.extractBetween(pos, ppos.getIndex(), pattern);
    294        pos = ppos.getIndex();
    295 
    296        if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
    297            delete filter;
    298            pos = start;
    299            return nullptr;
    300        }
    301 
    302        // In the forward direction, append the pattern to the
    303        // canonID.  In the reverse, insert it at zero, and invert
    304        // the presence of parens ("A" <-> "(A)").
    305        if (canonID != nullptr) {
    306            if (dir == FORWARD) {
    307                if (withParens == 1) {
    308                    pattern.insert(0, OPEN_REV);
    309                    pattern.append(CLOSE_REV);
    310                }
    311                canonID->append(pattern).append(ID_DELIM);
    312            } else {
    313                if (withParens == 0) {
    314                    pattern.insert(0, OPEN_REV);
    315                    pattern.append(CLOSE_REV);
    316                }
    317                canonID->insert(0, pattern);
    318                canonID->insert(pattern.length(), ID_DELIM);
    319            }
    320        }
    321    }
    322 
    323    return filter;
    324 }
    325 
    326 U_CDECL_BEGIN
    327 static void U_CALLCONV _deleteSingleID(void* obj) {
    328    delete (TransliteratorIDParser::SingleID*) obj;
    329 }
    330 
    331 static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) {
    332    delete (Transliterator*) obj;
    333 }
    334 U_CDECL_END
    335 
    336 /**
    337 * Parse a compound ID, consisting of an optional forward global
    338 * filter, a separator, one or more single IDs delimited by
    339 * separators, an an optional reverse global filter.  The
    340 * separator is a semicolon.  The global filters are UnicodeSet
    341 * patterns.  The reverse global filter must be enclosed in
    342 * parentheses.
    343 * @param id the pattern the parse
    344 * @param dir the direction.
    345 * @param canonID OUTPUT parameter that receives the canonical ID,
    346 * consisting of canonical IDs for all elements, as returned by
    347 * parseSingleID(), separated by semicolons.  Previous contents
    348 * are discarded.
    349 * @param list OUTPUT parameter that receives a list of SingleID
    350 * objects representing the parsed IDs.  Previous contents are
    351 * discarded.
    352 * @param globalFilter OUTPUT parameter that receives a pointer to
    353 * a newly created global filter for this ID in this direction, or
    354 * nullptr if there is none.
    355 * @return true if the parse succeeds, that is, if the entire
    356 * id is consumed without syntax error.
    357 */
    358 UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir,
    359                                              UnicodeString& canonID,
    360                                              UVector& list,
    361                                              UnicodeSet*& globalFilter) {
    362    UErrorCode ec = U_ZERO_ERROR;
    363    int32_t i;
    364    int32_t pos = 0;
    365    int32_t withParens = 1;
    366    list.removeAllElements();
    367    UObjectDeleter *save = list.setDeleter(_deleteSingleID);
    368 
    369    UnicodeSet* filter;
    370    globalFilter = nullptr;
    371    canonID.truncate(0);
    372 
    373    // Parse leading global filter, if any
    374    withParens = 0; // parens disallowed
    375    filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
    376    if (filter != nullptr) {
    377        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
    378            // Not a global filter; backup and resume
    379            canonID.truncate(0);
    380            pos = 0;
    381        }
    382        if (dir == FORWARD) {
    383            globalFilter = filter;
    384        } else {
    385            delete filter;
    386        }
    387        filter = nullptr;
    388    }
    389 
    390    UBool sawDelimiter = true;
    391    for (;;) {
    392        SingleID* single = parseSingleID(id, pos, dir, ec);
    393        if (single == nullptr) {
    394            break;
    395        }
    396        if (dir == FORWARD) {
    397            list.adoptElement(single, ec);
    398        } else {
    399            list.insertElementAt(single, 0, ec);
    400        }
    401        if (U_FAILURE(ec)) {
    402            goto FAIL;
    403        }
    404        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
    405            sawDelimiter = false;
    406            break;
    407        }
    408    }
    409 
    410    if (list.size() == 0) {
    411        goto FAIL;
    412    }
    413 
    414    // Construct canonical ID
    415    for (i=0; i<list.size(); ++i) {
    416        SingleID* single = static_cast<SingleID*>(list.elementAt(i));
    417        canonID.append(single->canonID);
    418        if (i != (list.size()-1)) {
    419            canonID.append(ID_DELIM);
    420        }
    421    }
    422 
    423    // Parse trailing global filter, if any, and only if we saw
    424    // a trailing delimiter after the IDs.
    425    if (sawDelimiter) {
    426        withParens = 1; // parens required
    427        filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
    428        if (filter != nullptr) {
    429            // Don't require trailing ';', but parse it if present
    430            ICU_Utility::parseChar(id, pos, ID_DELIM);
    431 
    432            if (dir == REVERSE) {
    433                globalFilter = filter;
    434            } else {
    435                delete filter;
    436            }
    437            filter = nullptr;
    438        }
    439    }
    440 
    441    // Trailing unparsed text is a syntax error
    442    ICU_Utility::skipWhitespace(id, pos, true);
    443    if (pos != id.length()) {
    444        goto FAIL;
    445    }
    446 
    447    list.setDeleter(save);
    448    return true;
    449 
    450 FAIL:
    451    list.removeAllElements();
    452    list.setDeleter(save);
    453    delete globalFilter;
    454    globalFilter = nullptr;
    455    return false;
    456 }
    457 
    458 /**
    459 * Convert the elements of the 'list' vector, which are SingleID
    460 * objects, into actual Transliterator objects.  In the course of
    461 * this, some (or all) entries may be removed.  If all entries
    462 * are removed, the nullptr transliterator will be added.
    463 *
    464 * Delete entries with empty basicIDs; these are generated by
    465 * elements like "(A)" in the forward direction, or "A()" in
    466 * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
    467 * SingleID entries to actual transliterators.
    468 *
    469 * @param list vector of SingleID objects.  On exit, vector
    470 * of one or more Transliterators.
    471 * @return new value of insertIndex.  The index will shift if
    472 * there are empty items, like "(Lower)", with indices less than
    473 * insertIndex.
    474 */
    475 void TransliteratorIDParser::instantiateList(UVector& list,
    476                                                UErrorCode& ec) {
    477    UVector tlist(ec);
    478    if (U_FAILURE(ec)) {
    479        goto RETURN;
    480    }
    481    tlist.setDeleter(_deleteTransliteratorTrIDPars);
    482 
    483    Transliterator* t;
    484    int32_t i;
    485    for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size()
    486        // We run the loop too long by one, so we can
    487        // do an insert after the last element
    488        if (i==list.size()) {
    489            break;
    490        }
    491 
    492        SingleID* single = static_cast<SingleID*>(list.elementAt(i));
    493        if (single->basicID.length() != 0) {
    494            t = single->createInstance();
    495            if (t == nullptr) {
    496                ec = U_INVALID_ID;
    497                goto RETURN;
    498            }
    499            tlist.adoptElement(t, ec);
    500            if (U_FAILURE(ec)) {
    501                goto RETURN;
    502            }
    503        }
    504    }
    505 
    506    // An empty list is equivalent to a nullptr transliterator.
    507    if (tlist.size() == 0) {
    508        t = createBasicInstance(UnicodeString(true, ANY_NULL, 8), nullptr);
    509        if (t == nullptr) {
    510            // Should never happen
    511            ec = U_INTERNAL_TRANSLITERATOR_ERROR;
    512        }
    513        tlist.adoptElement(t, ec);
    514    }
    515 
    516 RETURN:
    517 
    518    UObjectDeleter *save = list.setDeleter(_deleteSingleID);
    519    list.removeAllElements();
    520 
    521    if (U_SUCCESS(ec)) {
    522        list.setDeleter(_deleteTransliteratorTrIDPars);
    523 
    524        while (tlist.size() > 0) {
    525            t = static_cast<Transliterator*>(tlist.orphanElementAt(0));
    526            list.adoptElement(t, ec);
    527            if (U_FAILURE(ec)) {
    528                list.removeAllElements();
    529                break;
    530            }
    531        }
    532    }
    533 
    534    list.setDeleter(save);
    535 }
    536 
    537 /**
    538 * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
    539 * S-T/V, or S/V-T.  If the source is missing, return a source of
    540 * ANY.
    541 * @param id the id string, in any of several forms
    542 * @return an array of 4 strings: source, target, variant, and
    543 * isSourcePresent.  If the source is not present, ANY will be
    544 * given as the source, and isSourcePresent will be nullptr.  Otherwise
    545 * isSourcePresent will be non-nullptr.  The target may be empty if the
    546 * id is not well-formed.  The variant may be empty.
    547 */
    548 void TransliteratorIDParser::IDtoSTV(const UnicodeString& id,
    549                                     UnicodeString& source,
    550                                     UnicodeString& target,
    551                                     UnicodeString& variant,
    552                                     UBool& isSourcePresent) {
    553    source.setTo(ANY, 3);
    554    target.truncate(0);
    555    variant.truncate(0);
    556 
    557    int32_t sep = id.indexOf(TARGET_SEP);
    558    int32_t var = id.indexOf(VARIANT_SEP);
    559    if (var < 0) {
    560        var = id.length();
    561    }
    562    isSourcePresent = false;
    563 
    564    if (sep < 0) {
    565        // Form: T/V or T (or /V)
    566        id.extractBetween(0, var, target);
    567        id.extractBetween(var, id.length(), variant);
    568    } else if (sep < var) {
    569        // Form: S-T/V or S-T (or -T/V or -T)
    570        if (sep > 0) {
    571            id.extractBetween(0, sep, source);
    572            isSourcePresent = true;
    573        }
    574        id.extractBetween(++sep, var, target);
    575        id.extractBetween(var, id.length(), variant);
    576    } else {
    577        // Form: (S/V-T or /V-T)
    578        if (var > 0) {
    579            id.extractBetween(0, var, source);
    580            isSourcePresent = true;
    581        }
    582        id.extractBetween(var, sep++, variant);
    583        id.extractBetween(sep, id.length(), target);
    584    }
    585 
    586    if (variant.length() > 0) {
    587        variant.remove(0, 1);
    588    }
    589 }
    590 
    591 /**
    592 * Given source, target, and variant strings, concatenate them into a
    593 * full ID.  If the source is empty, then "Any" will be used for the
    594 * source, so the ID will always be of the form s-t/v or s-t.
    595 */
    596 void TransliteratorIDParser::STVtoID(const UnicodeString& source,
    597                                     const UnicodeString& target,
    598                                     const UnicodeString& variant,
    599                                     UnicodeString& id) {
    600    id = source;
    601    if (id.length() == 0) {
    602        id.setTo(ANY, 3);
    603    }
    604    id.append(TARGET_SEP).append(target);
    605    if (variant.length() != 0) {
    606        id.append(VARIANT_SEP).append(variant);
    607    }
    608    // NUL-terminate the ID string for getTerminatedBuffer.
    609    // This prevents valgrind and Purify warnings.
    610    id.append(static_cast<char16_t>(0));
    611    id.truncate(id.length()-1);
    612 }
    613 
    614 /**
    615 * Register two targets as being inverses of one another.  For
    616 * example, calling registerSpecialInverse("NFC", "NFD", true) causes
    617 * Transliterator to form the following inverse relationships:
    618 *
    619 * <pre>NFC => NFD
    620 * Any-NFC => Any-NFD
    621 * NFD => NFC
    622 * Any-NFD => Any-NFC</pre>
    623 *
    624 * (Without the special inverse registration, the inverse of NFC
    625 * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
    626 * that the presence or absence of "Any-" is preserved.
    627 *
    628 * <p>The relationship is symmetrical; registering (a, b) is
    629 * equivalent to registering (b, a).
    630 *
    631 * <p>The relevant IDs must still be registered separately as
    632 * factories or classes.
    633 *
    634 * <p>Only the targets are specified.  Special inverses always
    635 * have the form Any-Target1 <=> Any-Target2.  The target should
    636 * have canonical casing (the casing desired to be produced when
    637 * an inverse is formed) and should contain no whitespace or other
    638 * extraneous characters.
    639 *
    640 * @param target the target against which to register the inverse
    641 * @param inverseTarget the inverse of target, that is
    642 * Any-target.getInverse() => Any-inverseTarget
    643 * @param bidirectional if true, register the reverse relation
    644 * as well, that is, Any-inverseTarget.getInverse() => Any-target
    645 */
    646 void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target,
    647                                                    const UnicodeString& inverseTarget,
    648                                                    UBool bidirectional,
    649                                                    UErrorCode &status) {
    650    umtx_initOnce(gSpecialInversesInitOnce, init, status);
    651    if (U_FAILURE(status)) {
    652        return;
    653    }
    654 
    655    // If target == inverseTarget then force bidirectional => false
    656    if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) {
    657        bidirectional = false;
    658    }
    659 
    660    Mutex lock(&LOCK);
    661 
    662    UnicodeString *tempus = new UnicodeString(inverseTarget);  // Used for null pointer check before usage.
    663    if (tempus == nullptr) {
    664    	status = U_MEMORY_ALLOCATION_ERROR;
    665    	return;
    666    }
    667    SPECIAL_INVERSES->put(target, tempus, status);
    668    if (bidirectional) {
    669    	tempus = new UnicodeString(target);
    670    	if (tempus == nullptr) {
    671    		status = U_MEMORY_ALLOCATION_ERROR;
    672    		return;
    673    	}
    674        SPECIAL_INVERSES->put(inverseTarget, tempus, status);
    675    }
    676 }
    677 
    678 //----------------------------------------------------------------
    679 // Private implementation
    680 //----------------------------------------------------------------
    681 
    682 /**
    683 * Parse an ID into component pieces.  Take IDs of the form T,
    684 * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
    685 * source of ANY.
    686 * @param id the id string, in any of several forms
    687 * @param pos INPUT-OUTPUT parameter.  On input, pos is the
    688 * offset of the first character to parse in id.  On output,
    689 * pos is the offset after the last parsed character.  If the
    690 * parse failed, pos will be unchanged.
    691 * @param allowFilter2 if true, a UnicodeSet pattern is allowed
    692 * at any location between specs or delimiters, and is returned
    693 * as the fifth string in the array.
    694 * @return a Specs object, or nullptr if the parse failed.  If
    695 * neither source nor target was seen in the parsed id, then the
    696 * parse fails.  If allowFilter is true, then the parsed filter
    697 * pattern is returned in the Specs object, otherwise the returned
    698 * filter reference is nullptr.  If the parse fails for any reason
    699 * nullptr is returned.
    700 */
    701 TransliteratorIDParser::Specs*
    702 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos,
    703                                      UBool allowFilter) {
    704    UnicodeString first;
    705    UnicodeString source;
    706    UnicodeString target;
    707    UnicodeString variant;
    708    UnicodeString filter;
    709    char16_t delimiter = 0;
    710    int32_t specCount = 0;
    711    int32_t start = pos;
    712 
    713    // This loop parses one of the following things with each
    714    // pass: a filter, a delimiter character (either '-' or '/'),
    715    // or a spec (source, target, or variant).
    716    for (;;) {
    717        ICU_Utility::skipWhitespace(id, pos, true);
    718        if (pos == id.length()) {
    719            break;
    720        }
    721 
    722        // Parse filters
    723        if (allowFilter && filter.length() == 0 &&
    724            UnicodeSet::resemblesPattern(id, pos)) {
    725 
    726            ParsePosition ppos(pos);
    727            UErrorCode ec = U_ZERO_ERROR;
    728            UnicodeSet set(id, ppos, USET_IGNORE_SPACE, nullptr, ec);
    729            if (U_FAILURE(ec)) {
    730                pos = start;
    731                return nullptr;
    732            }
    733            id.extractBetween(pos, ppos.getIndex(), filter);
    734            pos = ppos.getIndex();
    735            continue;
    736        }
    737 
    738        if (delimiter == 0) {
    739            char16_t c = id.charAt(pos);
    740            if ((c == TARGET_SEP && target.length() == 0) ||
    741                (c == VARIANT_SEP && variant.length() == 0)) {
    742                delimiter = c;
    743                ++pos;
    744                continue;
    745            }
    746        }
    747 
    748        // We are about to try to parse a spec with no delimiter
    749        // when we can no longer do so (we can only do so at the
    750        // start); break.
    751        if (delimiter == 0 && specCount > 0) {
    752            break;
    753        }
    754 
    755        UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos);
    756        if (spec.length() == 0) {
    757            // Note that if there was a trailing delimiter, we
    758            // consume it.  So Foo-, Foo/, Foo-Bar/, and Foo/Bar-
    759            // are legal.
    760            break;
    761        }
    762 
    763        switch (delimiter) {
    764        case 0:
    765            first = spec;
    766            break;
    767        case TARGET_SEP:
    768            target = spec;
    769            break;
    770        case VARIANT_SEP:
    771            variant = spec;
    772            break;
    773        }
    774        ++specCount;
    775        delimiter = 0;
    776    }
    777 
    778    // A spec with no prior character is either source or target,
    779    // depending on whether an explicit "-target" was seen.
    780    if (first.length() != 0) {
    781        if (target.length() == 0) {
    782            target = first;
    783        } else {
    784            source = first;
    785        }
    786    }
    787 
    788    // Must have either source or target
    789    if (source.length() == 0 && target.length() == 0) {
    790        pos = start;
    791        return nullptr;
    792    }
    793 
    794    // Empty source or target defaults to ANY
    795    UBool sawSource = true;
    796    if (source.length() == 0) {
    797        source.setTo(ANY, 3);
    798        sawSource = false;
    799    }
    800    if (target.length() == 0) {
    801        target.setTo(ANY, 3);
    802    }
    803 
    804    return new Specs(source, target, variant, sawSource, filter);
    805 }
    806 
    807 /**
    808 * Givens a Spec object, convert it to a SingleID object.  The
    809 * Spec object is a more unprocessed parse result.  The SingleID
    810 * object contains information about canonical and basic IDs.
    811 * @return a SingleID; never returns nullptr.  Returned object always
    812 * has 'filter' field of nullptr.
    813 */
    814 TransliteratorIDParser::SingleID*
    815 TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) {
    816    UnicodeString canonID;
    817    UnicodeString basicID;
    818    UnicodeString basicPrefix;
    819    if (specs != nullptr) {
    820        UnicodeString buf;
    821        if (dir == FORWARD) {
    822            if (specs->sawSource) {
    823                buf.append(specs->source).append(TARGET_SEP);
    824            } else {
    825                basicPrefix = specs->source;
    826                basicPrefix.append(TARGET_SEP);
    827            }
    828            buf.append(specs->target);
    829        } else {
    830            buf.append(specs->target).append(TARGET_SEP).append(specs->source);
    831        }
    832        if (specs->variant.length() != 0) {
    833            buf.append(VARIANT_SEP).append(specs->variant);
    834        }
    835        basicID = basicPrefix;
    836        basicID.append(buf);
    837        if (specs->filter.length() != 0) {
    838            buf.insert(0, specs->filter);
    839        }
    840        canonID = buf;
    841    }
    842    return new SingleID(canonID, basicID);
    843 }
    844 
    845 /**
    846 * Given a Specs object, return a SingleID representing the
    847 * special inverse of that ID.  If there is no special inverse
    848 * then return nullptr.
    849 * @return a SingleID or nullptr.  Returned object always has
    850 * 'filter' field of nullptr.
    851 */
    852 TransliteratorIDParser::SingleID*
    853 TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) {
    854    if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) {
    855        return nullptr;
    856    }
    857    umtx_initOnce(gSpecialInversesInitOnce, init, status);
    858    if (U_FAILURE(status)) {
    859        return nullptr;
    860    }
    861 
    862    UnicodeString* inverseTarget;
    863 
    864    umtx_lock(&LOCK);
    865    inverseTarget = static_cast<UnicodeString*>(SPECIAL_INVERSES->get(specs.target));
    866    umtx_unlock(&LOCK);
    867 
    868    if (inverseTarget != nullptr) {
    869        // If the original ID contained "Any-" then make the
    870        // special inverse "Any-Foo"; otherwise make it "Foo".
    871        // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
    872        UnicodeString buf;
    873        if (specs.filter.length() != 0) {
    874            buf.append(specs.filter);
    875        }
    876        if (specs.sawSource) {
    877            buf.append(ANY, 3).append(TARGET_SEP);
    878        }
    879        buf.append(*inverseTarget);
    880 
    881        UnicodeString basicID(true, ANY, 3);
    882        basicID.append(TARGET_SEP).append(*inverseTarget);
    883 
    884        if (specs.variant.length() != 0) {
    885            buf.append(VARIANT_SEP).append(specs.variant);
    886            basicID.append(VARIANT_SEP).append(specs.variant);
    887        }
    888        return new SingleID(buf, basicID);
    889    }
    890    return nullptr;
    891 }
    892 
    893 /**
    894 * Glue method to get around access problems in C++.  This would
    895 * ideally be inline but we want to avoid a circular header
    896 * dependency.
    897 */
    898 Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    899    return Transliterator::createBasicInstance(id, canonID);
    900 }
    901 
    902 /**
    903 * Initialize static memory. Called through umtx_initOnce only.
    904 */
    905 void U_CALLCONV TransliteratorIDParser::init(UErrorCode &status) {
    906    U_ASSERT(SPECIAL_INVERSES == nullptr);
    907    ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
    908 
    909    SPECIAL_INVERSES = new Hashtable(true, status);
    910    if (SPECIAL_INVERSES == nullptr) {
    911    	status = U_MEMORY_ALLOCATION_ERROR;
    912    	return;
    913    }
    914    SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject);
    915 }
    916 
    917 /**
    918 * Free static memory.
    919 */
    920 void TransliteratorIDParser::cleanup() {
    921    if (SPECIAL_INVERSES) {
    922        delete SPECIAL_INVERSES;
    923        SPECIAL_INVERSES = nullptr;
    924    }
    925    gSpecialInversesInitOnce.reset();
    926 }
    927 
    928 U_NAMESPACE_END
    929 
    930 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    931 
    932 //eof
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE