[ tor-browser ].git.dasho

translit.cpp (63071B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/17/99    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "utypeinfo.h"  // for 'typeid' to work
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_TRANSLITERATION
     18 
     19 #include "unicode/putil.h"
     20 #include "unicode/translit.h"
     21 #include "unicode/locid.h"
     22 #include "unicode/msgfmt.h"
     23 #include "unicode/rep.h"
     24 #include "unicode/resbund.h"
     25 #include "unicode/unifilt.h"
     26 #include "unicode/uniset.h"
     27 #include "unicode/uscript.h"
     28 #include "unicode/strenum.h"
     29 #include "unicode/utf16.h"
     30 #include "cpdtrans.h"
     31 #include "nultrans.h"
     32 #include "rbt_data.h"
     33 #include "rbt_pars.h"
     34 #include "rbt.h"
     35 #include "transreg.h"
     36 #include "name2uni.h"
     37 #include "nortrans.h"
     38 #include "remtrans.h"
     39 #include "titletrn.h"
     40 #include "tolowtrn.h"
     41 #include "toupptrn.h"
     42 #include "uni2name.h"
     43 #include "brktrans.h"
     44 #include "esctrn.h"
     45 #include "unesctrn.h"
     46 #include "tridpars.h"
     47 #include "anytrans.h"
     48 #include "util.h"
     49 #include "hash.h"
     50 #include "mutex.h"
     51 #include "ucln_in.h"
     52 #include "uassert.h"
     53 #include "cmemory.h"
     54 #include "cstring.h"
     55 #include "uinvchar.h"
     56 
// Separator code points; the character each value encodes is noted alongside.
static const char16_t TARGET_SEP  = 0x002D; // '-'
static const char16_t ID_DELIM    = 0x003B; // ';'
static const char16_t VARIANT_SEP = 0x002F; // '/'

/**
 * Prefix for resource bundle key for the display name for a
 * transliterator.  The ID is appended to this to form the key.
 * The resource bundle value should be a String.
 */
static const char RB_DISPLAY_NAME_PREFIX[] = "%Translit%%";

/**
 * Prefix for resource bundle key for the display name for a
 * transliterator SCRIPT.  The ID is appended to this to form the key.
 * The resource bundle value should be a String.
 */
static const char RB_SCRIPT_DISPLAY_NAME_PREFIX[] = "%Translit%";

/**
 * Resource bundle key for display name pattern.
 * The resource bundle value should be a String forming a
 * MessageFormat pattern, e.g.:
 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
 */
static const char RB_DISPLAY_NAME_PATTERN[] = "TransliteratorNamePattern";

/**
 * Resource bundle key for the list of RuleBasedTransliterator IDs.
 * The resource bundle value should be a String[] with each element
 * being a valid ID.  The ID will be appended to RB_RULE_BASED_PREFIX
 * to obtain the class name in which the RB_RULE key will be sought.
 */
static const char RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs";

/**
 * The mutex controlling access to the registry object below.
 */
static icu::UMutex registryMutex;

/**
 * System transliterator registry; non-null when initialized.
 */
static icu::TransliteratorRegistry* registry = nullptr;
    100 
    101 // Macro to check/initialize the registry. ONLY USE WITHIN
    102 // MUTEX. Avoids function call when registry is initialized.
    103 #define HAVE_REGISTRY(status) (registry!=0 || initializeRegistry(status))
    104 
    105 U_NAMESPACE_BEGIN
    106 
    107 UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Transliterator)
    108 
    109 /**
    110 * Return true if the given UTransPosition is valid for text of
    111 * the given length.
    112 */
    113 static inline UBool positionIsValid(UTransPosition& index, int32_t len) {
    114    return !(index.contextStart < 0 ||
    115             index.start < index.contextStart ||
    116             index.limit < index.start ||
    117             index.contextLimit < index.limit ||
    118             len < index.contextLimit);
    119 }
    120 
/**
 * Constructs a transliterator with the given ID and filter.
 * @param theID the string identifier for this transliterator; it is
 * copied into this object
 * @param adoptedFilter the filter, or <tt>nullptr</tt>.  The pointer is
 * stored as-is and deleted by the destructor, so the caller gives up
 * ownership.  Any character for which <tt>filter.contains()</tt>
 * returns <tt>false</tt> will not be altered by this transliterator.
 * If the filter is <tt>nullptr</tt> then no filtering is applied.
 */
Transliterator::Transliterator(const UnicodeString& theID,
                               UnicodeFilter* adoptedFilter) :
    UObject(), ID(theID), filter(adoptedFilter),
    maximumContextLength(0)
{
    // NUL-terminate the ID string, which is a non-aliased copy:
    // append a NUL, then shorten the string by one so the NUL sits
    // just past the logical end.
    ID.append(static_cast<char16_t>(0));
    ID.truncate(ID.length()-1);
}
    138 
/**
 * Destructor.  Deletes the filter held by this object (a no-op when
 * the filter is nullptr).
 */
Transliterator::~Transliterator() {
    delete filter;
}
    145 
    146 /**
    147 * Copy constructor.
    148 */
    149 Transliterator::Transliterator(const Transliterator& other) :
    150    UObject(other), ID(other.ID), filter(nullptr),
    151    maximumContextLength(other.maximumContextLength)
    152 {
    153    // NUL-terminate the ID string, which is a non-aliased copy.
    154    ID.append(static_cast<char16_t>(0));
    155    ID.truncate(ID.length()-1);
    156 
    157    if (other.filter != nullptr) {
    158        // We own the filter, so we must have our own copy
    159        filter = other.filter->clone();
    160    }
    161 }
    162 
/**
 * Base-class implementation of clone(): always returns nullptr.
 * NOTE(review): presumably concrete subclasses override this to return
 * a real copy -- confirm against the class declaration.
 */
Transliterator* Transliterator::clone() const {
    return nullptr;
}
    166 
    167 /**
    168 * Assignment operator.
    169 */
    170 Transliterator& Transliterator::operator=(const Transliterator& other) {
    171    if (this == &other) { return *this; }  // self-assignment: no-op
    172    ID = other.ID;
    173    // NUL-terminate the ID string
    174    ID.getTerminatedBuffer();
    175 
    176    maximumContextLength = other.maximumContextLength;
    177    adoptFilter(other.filter == nullptr ? nullptr : other.filter->clone());
    178    return *this;
    179 }
    180 
    181 /**
    182 * Transliterates a segment of a string.  <code>Transliterator</code> API.
    183 * @param text the string to be transliterated
    184 * @param start the beginning index, inclusive; <code>0 <= start
    185 * <= limit</code>.
    186 * @param limit the ending index, exclusive; <code>start <= limit
    187 * <= text.length()</code>.
    188 * @return the new limit index, or -1
    189 */
    190 int32_t Transliterator::transliterate(Replaceable& text,
    191                                      int32_t start, int32_t limit) const {
    192    if (start < 0 ||
    193        limit < start ||
    194        text.length() < limit) {
    195        return -1;
    196    }
    197 
    198    UTransPosition offsets;
    199    offsets.contextStart= start;
    200    offsets.contextLimit = limit;
    201    offsets.start = start;
    202    offsets.limit = limit;
    203    filteredTransliterate(text, offsets, false, true);
    204    return offsets.limit;
    205 }
    206 
    207 /**
    208 * Transliterates an entire string in place. Convenience method.
    209 * @param text the string to be transliterated
    210 */
    211 void Transliterator::transliterate(Replaceable& text) const {
    212    transliterate(text, 0, text.length());
    213 }
    214 
/**
 * Transliterates the portion of the text buffer that can be
 * transliterated unambiguously after new text has been inserted,
 * typically as a result of a keyboard event.  This overload forwards to
 * _transliterate(), which inserts <code>insertion</code> into
 * <code>text</code> at <code>index.limit</code>, advancing both
 * <code>index.limit</code> and <code>index.contextLimit</code> by
 * <code>insertion.length()</code>.
 * NOTE(review): earlier documentation named index.contextLimit as the
 * insertion point; the two coincide when limit == contextLimit --
 * confirm the intended contract.
 * Then the transliterator will try to transliterate characters of
 * <code>text</code> between <code>index.start</code> and
 * <code>index.limit</code>.  Characters before
 * <code>index.start</code> will not be changed.
 *
 * <p>If, after the insertion, the character just before
 * <code>index.limit</code> is an unpaired lead surrogate, nothing is
 * transliterated on this call; the text waits for more input.
 *
 * <p>Upon return, values in <code>index</code> will be updated.
 * <code>index.contextStart</code> will be advanced to the first
 * character that future calls to this method will read.
 * <code>index.start</code> and <code>index.contextLimit</code> will
 * be adjusted to delimit the range of text that future calls to
 * this method may change.
 *
 * <p>Typical usage of this method begins with an initial call
 * with <code>index.contextStart</code> and <code>index.contextLimit</code>
 * set to indicate the portion of <code>text</code> to be
 * transliterated, and <code>index.start == index.contextStart</code>.
 * Thereafter, <code>index</code> can be used without
 * modification in future calls, provided that all changes to
 * <code>text</code> are made via this method.
 *
 * <p>This method assumes that future calls may be made that will
 * insert new text into the buffer.  As a result, it only performs
 * unambiguous transliterations.  After the last call to this
 * method, there may be untransliterated text that is waiting for
 * more input to resolve an ambiguity.  In order to perform these
 * pending transliterations, clients should call
 * finishTransliteration() after the last call to this
 * method has been made.
 *
 * @param text the buffer holding transliterated and untransliterated text
 * @param index the position structure.  It must satisfy
 * <code>0 <= contextStart <= start <= limit <= contextLimit <=
 * text.length()</code>:
 *
 * <ul><li><code>index.contextStart</code>: the beginning index of the
 * context, inclusive.
 *
 * <li><code>index.start</code>: the next character to be
 * considered for transliteration.  Characters before
 * <code>index.start</code> will not be changed by future calls
 * to this method.
 *
 * <li><code>index.limit</code>: the end of the text to be
 * transliterated, exclusive; <code>insertion</code> is inserted here.
 *
 * <li><code>index.contextLimit</code>: the ending index of the context,
 * exclusive.</ul>
 *
 * @param insertion text to be inserted and possibly
 * transliterated into the translation buffer at
 * <code>index.limit</code>.
 * @param status if this is already a failure code on entry, nothing is
 * done.  Set to U_ILLEGAL_ARGUMENT_ERROR if <code>index</code>
 * is invalid for <code>text</code>.
 * @see #handleTransliterate
 */
void Transliterator::transliterate(Replaceable& text,
                                   UTransPosition& index,
                                   const UnicodeString& insertion,
                                   UErrorCode &status) const {
    _transliterate(text, index, &insertion, status);
}
    285 
    286 /**
    287 * Transliterates the portion of the text buffer that can be
    288 * transliterated unambiguosly after a new character has been
    289 * inserted, typically as a result of a keyboard event.  This is a
    290 * convenience method; see {@link
    291 * #transliterate(Replaceable, int[], String)} for details.
    292 * @param text the buffer holding transliterated and
    293 * untransliterated text
    294 * @param index an array of three integers.  See {@link
    295 * #transliterate(Replaceable, int[], String)}.
    296 * @param insertion text to be inserted and possibly
    297 * transliterated into the translation buffer at
    298 * <code>index.contextLimit</code>.
    299 * @see #transliterate(Replaceable, int[], String)
    300 */
    301 void Transliterator::transliterate(Replaceable& text,
    302                                   UTransPosition& index,
    303                                   UChar32 insertion,
    304                                   UErrorCode& status) const {
    305    UnicodeString str(insertion);
    306    _transliterate(text, index, &str, status);
    307 }
    308 
/**
 * Transliterates the portion of the text buffer that can be
 * transliterated unambiguously, without inserting any new text.  This
 * is a convenience method that forwards to _transliterate() with a
 * null insertion; see the overload taking a UnicodeString insertion
 * for details.
 * @param text the buffer holding transliterated and
 * untransliterated text
 * @param index the position structure; see the overload taking a
 * UnicodeString insertion.
 * @param status error code, passed through to _transliterate()
 */
void Transliterator::transliterate(Replaceable& text,
                                   UTransPosition& index,
                                   UErrorCode& status) const {
    _transliterate(text, index, nullptr, status);
}
    325 
    326 /**
    327 * Finishes any pending transliterations that were waiting for
    328 * more characters.  Clients should call this method as the last
    329 * call after a sequence of one or more calls to
    330 * <code>transliterate()</code>.
    331 * @param text the buffer holding transliterated and
    332 * untransliterated text.
    333 * @param index the array of indices previously passed to {@link
    334 * #transliterate}
    335 */
    336 void Transliterator::finishTransliteration(Replaceable& text,
    337                                           UTransPosition& index) const {
    338    if (!positionIsValid(index, text.length())) {
    339        return;
    340    }
    341 
    342    filteredTransliterate(text, index, false, true);
    343 }
    344 
/**
 * This internal method does keyboard (incremental) transliteration.
 * If 'insertion' is non-null then it is inserted into 'text' at
 * index.limit, and index.limit and index.contextLimit are advanced by
 * its length, before proceeding.  The actual work is done by
 * filteredTransliterate() in incremental mode with rollback enabled,
 * which in turn calls the framework method handleTransliterate().
 *
 * Nothing is done if 'status' is already a failure on entry.  If
 * 'index' is not valid for 'text', 'status' is set to
 * U_ILLEGAL_ARGUMENT_ERROR and the text is left untouched.
 */
void Transliterator::_transliterate(Replaceable& text,
                                    UTransPosition& index,
                                    const UnicodeString* insertion,
                                    UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return;
    }

    if (!positionIsValid(index, text.length())) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // (originalStart is only needed by the disabled code at the end)
//    int32_t originalStart = index.contextStart;
    if (insertion != nullptr) {
        // Zero-length replacement range at index.limit == pure insertion
        text.handleReplaceBetween(index.limit, index.limit, *insertion);
        index.limit += insertion->length();
        index.contextLimit += insertion->length();
    }

    if (index.limit > 0 &&
        U16_IS_LEAD(text.charAt(index.limit - 1))) {
        // Oops, there is a dangling lead surrogate in the buffer.
        // This will break most transliterators, since they will
        // assume it is part of a pair.  Don't transliterate until
        // more text comes in.
        return;
    }

    // incremental == true, rollback == true
    filteredTransliterate(text, index, true, true);

#if 0
    // TODO
    // I CAN'T DO what I'm attempting below now that the Kleene star
    // operator is supported.  For example, in the rule

    //   ([:Lu:]+) { x } > $1;

    // what is the maximum context length?  getMaximumContextLength()
    // will return 1, but this is just the length of the ante context
    // part of the pattern string -- 1 character, which is a standin
    // for a Quantifier, which contains a StringMatcher, which
    // contains a UnicodeSet.

    // There is a complicated way to make this work again, and that's
    // to add a "maximum left context" protocol into the
    // UnicodeMatcher hierarchy.  At present I'm not convinced this is
    // worth it.

    // ---

    // The purpose of the code below is to keep the context small
    // while doing incremental transliteration.  When part of the left
    // context (between contextStart and start) is no longer needed,
    // we try to advance contextStart past that portion.  We use the
    // maximum context length to do so.
    int32_t newCS = index.start;
    int32_t n = getMaximumContextLength();
    while (newCS > originalStart && n-- > 0) {
        --newCS;
        newCS -= U16_LENGTH(text.char32At(newCS)) - 1;
    }
    index.contextStart = uprv_max(newCS, originalStart);
#endif
}
    417 
/**
 * This method breaks up the input text into runs of unfiltered
 * characters.  It passes each such run to
 * <subclass>.handleTransliterate().  Subclasses that can handle the
 * filter logic more efficiently themselves may override this method.
 *
 * All transliteration calls in this class go through this method.
 *
 * @param text the text being transliterated; in rollback mode a
 * temporary copy of the current run is appended to it and removed
 * again before returning
 * @param index the position; start is advanced past processed text and
 * limit is adjusted for length changes on return
 * @param incremental true if more text may arrive later, so the last
 * run may be left partially transliterated
 * @param rollback true to process an incremental run in growing passes
 * and undo any pass that does not transliterate completely
 */
void Transliterator::filteredTransliterate(Replaceable& text,
                                           UTransPosition& index,
                                           UBool incremental,
                                           UBool rollback) const {
    // Short circuit path for transliterators with no filter when
    // rollback has not been requested: hand the whole range straight
    // to the subclass.
    if (filter == nullptr && !rollback) {
        handleTransliterate(text, index, incremental);
        return;
    }

    //----------------------------------------------------------------------
    // This method processes text in two groupings:
    //
    // RUNS -- A run is a contiguous group of characters which are contained
    // in the filter for this transliterator (filter.contains(ch) == true).
    // Text outside of runs may appear as context but it is not modified.
    // The start and limit Position values are narrowed to each run.
    //
    // PASSES (incremental only) -- To make incremental mode work correctly,
    // each run is broken up into n passes, where n is the length (in code
    // points) of the run.  Each pass contains the first n characters.  If a
    // pass is completely transliterated, it is committed, and further passes
    // include characters after the committed text.  If a pass is blocked,
    // and does not transliterate completely, then this method rolls back
    // the changes made during the pass, extends the pass by one code point,
    // and tries again.
    //----------------------------------------------------------------------

    // globalLimit is the limit value for the entire operation.  We
    // set index.limit to the end of each unfiltered run before
    // calling handleTransliterate(), so we need to maintain the real
    // value of index.limit here.  After each transliteration, we
    // update globalLimit for insertions or deletions that have
    // happened.
    int32_t globalLimit = index.limit;

    // If there is a non-null filter, then break the input text up.  Say the
    // input text has the form:
    //   xxxabcxxdefxx
    // where 'x' represents a filtered character (filter.contains('x') ==
    // false).  Then we break this up into:
    //   xxxabc xxdef xx
    // Each pass through the loop consumes a run of filtered
    // characters (which are ignored) and a subsequent run of
    // unfiltered characters (which are transliterated).

    for (;;) {

        if (filter != nullptr) {
            // Narrow the range to be transliterated to the first segment
            // of unfiltered characters at or after index.start.

            // Advance past filtered chars
            UChar32 c;
            while (index.start < globalLimit &&
                   !filter->contains(c=text.char32At(index.start))) {
                index.start += U16_LENGTH(c);
            }

            // Find the end of this run of unfiltered chars
            index.limit = index.start;
            while (index.limit < globalLimit &&
                   filter->contains(c=text.char32At(index.limit))) {
                index.limit += U16_LENGTH(c);
            }
        }

        // Check to see if the unfiltered run is empty.  This only
        // happens at the end of the string when all the remaining
        // characters are filtered.
        if (index.limit == index.start) {
            // assert(index.start == globalLimit);
            break;
        }

        // Is this run incremental?  If there is additional
        // filtered text (if limit < globalLimit) then we pass in
        // an incremental value of false to force the subclass to
        // complete the transliteration for this run.
        UBool isIncrementalRun =
            (index.limit < globalLimit ? false : incremental);

        // Change in text length caused by the most recent
        // handleTransliterate() call.
        int32_t delta;

        // Implement rollback.  To understand the need for rollback,
        // consider the following transliterator:
        //
        //  "t" is "a > A;"
        //  "u" is "A > b;"
        //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
        //
        // Now apply "v" to the input text "a".  The result is "b".  But if
        // the transliteration is done incrementally, then the NFD holds
        // things up after "t" has already transformed "a" to "A".  When
        // finishTransliteration() is called, "A" is _not_ processed because
        // it gets excluded by the [:Ll:] filter, and the end result is "A"
        // -- incorrect.  The problem is that the filter is applied to a
        // partially-transliterated result, when we only want it to apply to
        // input text.  Although this example hinges on a compound
        // transliterator containing NFD and a specific filter, it can
        // actually happen with any transliterator which may do a partial
        // transformation in incremental mode into characters outside its
        // filter.
        //
        // To handle this, when in incremental mode we supply characters to
        // handleTransliterate() in several passes.  Each pass adds one more
        // input character to the input text.  That is, for input "ABCD", we
        // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
        // any point we block (upon return, start < limit) then we roll
        // back.  If at any point we complete the run (upon return start ==
        // limit) then we commit that run.

        if (rollback && isIncrementalRun) {

            int32_t runStart = index.start;
            int32_t runLimit = index.limit;
            int32_t runLength =  runLimit - runStart;

            // Make a rollback copy at the end of the string
            int32_t rollbackOrigin = text.length();
            text.copy(runStart, runLimit, rollbackOrigin);

            // Variables reflecting the commitment of completely
            // transliterated text.  passStart is the runStart, advanced
            // past committed text.  rollbackStart is the rollbackOrigin,
            // advanced past rollback text that corresponds to committed
            // text.
            int32_t passStart = runStart;
            int32_t rollbackStart = rollbackOrigin;

            // The limit for each pass; we advance by one code point with
            // each iteration.
            int32_t passLimit = index.start;

            // Total length, in 16-bit code units, of uncommitted text.
            // This is the length to be rolled back.
            int32_t uncommittedLength = 0;

            // Total delta (change in length) for all passes
            int32_t totalDelta = 0;

            // PASS MAIN LOOP -- Start with a single character, and extend
            // the text by one character at a time.  Roll back partial
            // transliterations and commit complete transliterations.
            for (;;) {
                // Length of additional code point, either one or two
                int32_t charLength = U16_LENGTH(text.char32At(passLimit));
                passLimit += charLength;
                // Stop once the pass would extend beyond the run.
                if (passLimit > runLimit) {
                    break;
                }
                uncommittedLength += charLength;

                index.limit = passLimit;

                // Delegate to subclass for actual transliteration.  Upon
                // return, start will be updated to point after the
                // transliterated text, and limit and contextLimit will be
                // adjusted for length changes.
                handleTransliterate(text, index, true);

                delta = index.limit - passLimit; // change in length

                // We failed to completely transliterate this pass.
                // Roll back the text.  Indices remain unchanged; reset
                // them where necessary.
                if (index.start != index.limit) {
                    // Find the rollbackStart, adjusted for length changes
                    // and the deletion of partially transliterated text.
                    int32_t rs = rollbackStart + delta - (index.limit - passStart);

                    // Delete the partially transliterated text
                    text.handleReplaceBetween(passStart, index.limit, UnicodeString());

                    // Copy the rollback text back
                    text.copy(rs, rs + uncommittedLength, passStart);

                    // Restore indices to their original values
                    index.start = passStart;
                    index.limit = passLimit;
                    index.contextLimit -= delta;
                }

                // We did completely transliterate this pass.  Update the
                // commit indices to record how far we got.  Adjust indices
                // for length change.
                else {
                    // Move the pass indices past the committed text.
                    passStart = passLimit = index.start;

                    // Adjust the rollbackStart for length changes and move
                    // it past the committed text.  All characters we've
                    // processed to this point are committed now, so zero
                    // out the uncommittedLength.
                    rollbackStart += delta + uncommittedLength;
                    uncommittedLength = 0;

                    // Adjust indices for length changes.
                    runLimit += delta;
                    totalDelta += delta;
                }
            }

            // Adjust overall limit and rollbackOrigin for insertions and
            // deletions.  Don't need to worry about contextLimit because
            // handleTransliterate() maintains that.
            rollbackOrigin += totalDelta;
            globalLimit += totalDelta;

            // Delete the rollback copy
            text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, UnicodeString());

            // Move start past committed text
            index.start = passStart;
        }

        else {
            // Delegate to subclass for actual transliteration.
            int32_t limit = index.limit;
            handleTransliterate(text, index, isIncrementalRun);
            delta = index.limit - limit; // change in length

            // In a properly written transliterator, start == limit after
            // handleTransliterate() returns when incremental is false.
            // Catch cases where the subclass doesn't do this.  (Just
            // pinning start to limit is a bad idea, because what's
            // probably happening is that the subclass isn't
            // transliterating all the way to the end, and it should in
            // non-incremental mode.)
            if (!incremental && index.start != index.limit) {
                // We can't throw an exception, so just fudge things
                index.start = index.limit;
            }

            // Adjust overall limit for insertions/deletions.  Don't need
            // to worry about contextLimit because handleTransliterate()
            // maintains that.
            globalLimit += delta;
        }

        // Without a filter there is only one run; an incremental run is
        // always the last one.  Either way we are done.
        if (filter == nullptr || isIncrementalRun) {
            break;
        }

        // If we did completely transliterate this
        // run, then repeat with the next unfiltered run.
    }

    // Start is valid where it is.  Limit needs to be put back where
    // it was, modulo adjustments for deletions/insertions.
    index.limit = globalLimit;
}
    679 
/**
 * Convenience overload: forwards to the four-argument
 * filteredTransliterate() with rollback set to false.
 * @param text the text to be transliterated
 * @param index the position to work on; updated on return
 * @param incremental passed through unchanged
 */
void Transliterator::filteredTransliterate(Replaceable& text,
                                           UTransPosition& index,
                                           UBool incremental) const {
    filteredTransliterate(text, index, incremental, false);
}
    685 
/**
 * Method for subclasses to use to set the maximum context length.
 * The value is stored as given; no range check is performed here.
 * @param maxContextLength the new maximum context length
 * @see #getMaximumContextLength
 */
void Transliterator::setMaximumContextLength(int32_t maxContextLength) {
    maximumContextLength = maxContextLength;
}
    693 
/**
 * Returns a programmatic identifier for this transliterator.
 * If this identifier is passed to <code>getInstance()</code>, it
 * will return this object, if it has been registered.
 * @return a reference to the ID string held by this object
 * @see #registerInstance
 * @see #getAvailableIDs
 */
const UnicodeString& Transliterator::getID() const {
    return ID;
}
    704 
    705 /**
    706 * Returns a name for the transliterator with the given ID that is
    707 * appropriate for display to the user in the default locale.  Simply
    708 * forwards to the overload taking a Locale, with Locale::getDefault().
    709 */
    710 UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& ID, UnicodeString& result) {
    711    const Locale& defaultLocale = Locale::getDefault();
    712    return getDisplayName(ID, defaultLocale, result);
    713 }
    714 
    715 /**
    716 * Returns a name for the transliterator with the given ID that is
    717 * appropriate for display to the user in the given locale.  The name
    718 * is looked up in the transliterator resource bundle for inLocale.
    719 *
    720 * <p>If no localized name exists in the resource bundle, a name is
    721 * synthesized using a localized <code>MessageFormat</code> pattern
    722 * from the resource data.  The arguments to this pattern are the
    723 * integer 2 followed by two strings: the source and the target of
    724 * the ID, each replaced by its script display name when the bundle
    725 * has one.  The variant, if any, is appended to the formatted name.
    726 * If that fails as well, the normalized ID itself is returned.
    727 *
    728 * @param id       the transliterator ID to describe
    729 * @param inLocale the Locale in which the display name should be localized
    730 * @param result   receives the display name; left empty if id has no target
    731 * @return result
    732 */
    733 UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& id,
    734                                              const Locale& inLocale,
    735                                              UnicodeString& result) {
    736    UErrorCode status = U_ZERO_ERROR;
    737
    738    ResourceBundle bundle(U_ICUDATA_TRANSLIT, inLocale, status);
    739
    740    // Suspend checking status until later...
    741
    742    result.truncate(0);
    743
    744    // Normalize the ID
    745    UnicodeString source, target, variant;
    746    UBool sawSource;
    747    TransliteratorIDParser::IDtoSTV(id, source, target, variant, sawSource);
    748    if (target.length() < 1) {
    749        // No target; malformed id
    750        return result;
    751    }
    752    if (variant.length() > 0) { // Change "Foo" to "/Foo"
    753        variant.insert(0, VARIANT_SEP);
    754    }
    755    UnicodeString ID(source);
    756    ID.append(TARGET_SEP).append(target).append(variant);
    757
    758    // Build the char* key.  At most (room - 1) characters are extracted so
    759    // that extract() can always NUL-terminate key, even for an overlong ID.
    760    if (uprv_isInvariantUString(ID.getBuffer(), ID.length())) {
    761        char key[200];
    762        uprv_strcpy(key, RB_DISPLAY_NAME_PREFIX);
    763        int32_t length = static_cast<int32_t>(uprv_strlen(RB_DISPLAY_NAME_PREFIX));
    764        ID.extract(0, static_cast<int32_t>(sizeof(key)) - length - 1, key + length, static_cast<int32_t>(sizeof(key)) - length, US_INV);
    765        // Try to retrieve a UnicodeString from the bundle.
    766        UnicodeString resString = bundle.getStringEx(key, status);
    767
    768        if (U_SUCCESS(status) && resString.length() != 0) {
    769            return result = resString; // [sic] assign & return
    770        }
    771
    772 #if !UCONFIG_NO_FORMATTING
    773        // We have failed to get a name from the locale data.  This is
    774        // typical, since most transliterators will not have localized
    775        // name data.  The next step is to retrieve the MessageFormat
    776        // pattern from the locale data and to use it to synthesize the
    777        // name from the ID.
    778
    779        status = U_ZERO_ERROR;
    780        resString = bundle.getStringEx(RB_DISPLAY_NAME_PATTERN, status);
    781
    782        if (U_SUCCESS(status) && resString.length() != 0) {
    783            MessageFormat msg(resString, inLocale, status);
    784            // Suspend checking status until later...
    785
    786            // We always pass 3 Formattable objects to msg: the number of
    787            // strings that follow (2), then the source and the target.
    788            Formattable args[3];
    789            const int32_t nargs = 3;
    790            args[0].setLong(2); // # of args to follow
    791            args[1].setString(source);
    792            args[2].setString(target);
    793
    794            // Use display names for the scripts, if they exist
    795            UnicodeString s;
    796            length = static_cast<int32_t>(uprv_strlen(RB_SCRIPT_DISPLAY_NAME_PREFIX));
    797            for (int j=1; j<=2; ++j) {
    798                status = U_ZERO_ERROR;
    799                uprv_strcpy(key, RB_SCRIPT_DISPLAY_NAME_PREFIX);
    800                args[j].getString(s);
    801                if (uprv_isInvariantUString(s.getBuffer(), s.length())) {
    802                    // Leave room for the terminating NUL here as well.
    803                    s.extract(0, static_cast<int32_t>(sizeof(key)) - length - 1, key + length, static_cast<int32_t>(sizeof(key)) - length, US_INV);
    804                    resString = bundle.getStringEx(key, status);
    805
    806                    if (U_SUCCESS(status)) {
    807                        args[j] = resString;
    808                    }
    809                }
    810            }
    811
    812            status = U_ZERO_ERROR;
    813            FieldPosition pos; // ignored by msg
    814            msg.format(args, nargs, result, pos, status);
    815            if (U_SUCCESS(status)) {
    816                result.append(variant);
    817                return result;
    818            }
    819        }
    820 #endif
    821    }
    822
    823    // We should not reach this point unless there is something
    824    // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
    825    // been deleted from the root RB_LOCALE_ELEMENTS resource.
    826    result = ID;
    827    return result;
    828 }
    829 
    830 /**
    831 * Gives read-only access to the filter used by this transliterator.
    832 * The result is <tt>null</tt> if this transliterator uses no filter.
    833 * Caller mustn't delete the result!
    834 */
    835 const UnicodeFilter* Transliterator::getFilter() const {
    836    return this->filter;
    837 }
    838 
    839 /**
    840 * Detaches and returns the filter used by this transliterator, or
    841 * <tt>nullptr</tt> if this transliterator uses no filter.  The
    842 * caller must eventually delete the result.  After this call,
    843 * this transliterator's filter is set to <tt>nullptr</tt>.
    844 */
    845 UnicodeFilter* Transliterator::orphanFilter() {
    846    UnicodeFilter* const released = filter;
    847    filter = nullptr;
    848    return released;
    849 }
    850 
    851 /**
    852 * Replaces the filter used by this transliterator, deleting the one
    853 * held so far.  If the filter is set to <tt>null</tt> then no filtering will occur.
    854 *
    855 * <p>Callers must take care if a transliterator is in use by
    856 * multiple threads.  The filter should not be changed by one
    857 * thread while another thread may be transliterating.
    858 */
    859 void Transliterator::adoptFilter(UnicodeFilter* filterToAdopt) {
    860    delete this->filter;
    861    this->filter = filterToAdopt;
    862 }
    863 
    864 /**
    865 * Returns this transliterator's inverse.  See the class
    866 * documentation for details.  This implementation passes this
    867 * object's ID to <code>createInstance()</code> together with the
    868 * <code>UTRANS_REVERSE</code> direction and returns the outcome.
    869 * That is, if <code>getID()</code> returns "A-B", the reverse of
    870 * "A-B" is requested.  Parse error details from that call are
    871 * discarded.
    872 *
    873 * <p>This method does not take filtering into account.  The
    874 * returned transliterator will have no filter.
    875 *
    876 * <p>Subclasses with knowledge of their inverse may wish to
    877 * override this method.
    878 *
    879 * @param status passed through to <code>createInstance()</code>
    880 * @return a transliterator that is an inverse, not necessarily
    881 * exact, of this transliterator, or <code>null</code> if no such
    882 * transliterator could be created.
    883 */
    884 Transliterator* Transliterator::createInverse(UErrorCode& status) const {
    885    UParseError ignoredParseError;
    886    return Transliterator::createInstance(this->ID, UTRANS_REVERSE, ignoredParseError, status);
    887 }
    888 
    889 Transliterator* U_EXPORT2
    890 Transliterator::createInstance(const UnicodeString& ID,
    891                                UTransDirection dir,
    892                                UErrorCode& status)
    893 {
    894    UParseError parseError;
    895    return createInstance(ID, dir, parseError, status);
    896 }
    897 
    898 /**
    899 * Returns a <code>Transliterator</code> object given its ID.
    900 * The ID must be either a system transliterator ID or an ID registered
    901 * using <code>registerInstance()</code>.
    902 *
    903 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
    904 * @param dir the direction passed to the compound ID parser
    905 * @param parseError passed to the CompoundTransliterator constructor
    906 * @param status set to U_INVALID_ID if the ID cannot be parsed
    907 * @return A <code>Transliterator</code> object with the given ID
    908 */
    909 Transliterator* U_EXPORT2
    910 Transliterator::createInstance(const UnicodeString& ID,
    911                                UTransDirection dir,
    912                                UParseError& parseError,
    913                                UErrorCode& status)
    914 {
    915    if (U_FAILURE(status)) {
    916        return nullptr;
    917    }
    918
    919    UnicodeString canonID;
    920    UVector list(status);
    921    if (U_FAILURE(status)) {
    922        return nullptr;
    923    }
    924
    925    // TODO add code for parseError...currently unused, but
    926    // later may be used by parsing code...
    927    UnicodeSet* globalFilter = nullptr;
    928    const UBool parsedOk = TransliteratorIDParser::parseCompoundID(ID, dir, canonID, list, globalFilter);
    929    // Owned from here on, so every exit path below releases the filter
    930    // unless it is handed over to the result.
    931    LocalPointer<UnicodeSet> lpGlobalFilter(globalFilter);
    932    if (!parsedOk) {
    933        status = U_INVALID_ID;
    934        return nullptr;
    935    }
    936    TransliteratorIDParser::instantiateList(list, status);
    937    if (U_FAILURE(status)) {
    938        return nullptr;
    939    }
    940    U_ASSERT(list.size() > 0);
    941
    942    // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
    943    // has one child transliterator.  This is so that toRules() will return the right thing
    944    // (without any inactive ID), but our main ID still comes out correct.  That is, if we
    945    // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
    946    // even though the ID is "(Lower);Latin-Greek;".]
    947    const UBool isCompound = list.size() > 1 || canonID.indexOf(ID_DELIM) >= 0;
    948    Transliterator* t = nullptr;
    949    if (isCompound) {
    950        t = new CompoundTransliterator(list, parseError, status);
    951    } else {
    952        t = static_cast<Transliterator*>(list.elementAt(0));
    953    }
    954    if (t == nullptr) {
    955        if (U_SUCCESS(status)) {
    956            status = U_MEMORY_ALLOCATION_ERROR;
    957        }
    958        return nullptr;
    959    }
    960    t->setID(canonID);
    961    if (lpGlobalFilter.isValid()) {
    962        t->adoptFilter(lpGlobalFilter.orphan());
    963    }
    964    return t;
    965 }
    966 
    967 /**
    968 * Create a transliterator from a basic ID.  This is an ID
    969 * containing only the forward direction source, target, and
    970 * variant.
    971 * @param id a basic ID of the form S-T or S-T/V.
    972 * @param canon if non-null, the ID assigned to the result via setID().
    973 * @return a newly created Transliterator, or null if none could be created.
    974 */
    975 Transliterator* Transliterator::createBasicInstance(const UnicodeString& id,
    976                                                    const UnicodeString* canon) {
    977    UParseError pe;
    978    UErrorCode ec = U_ZERO_ERROR;
    979    TransliteratorAlias* alias = nullptr;
    980    Transliterator* t = nullptr;
    981
    982    umtx_lock(&registryMutex); // the registry is only consulted with registryMutex held
    983    if (HAVE_REGISTRY(ec)) {
    984        t = registry->get(id, alias, ec);
    985    }
    986    umtx_unlock(&registryMutex);
    987
    988    if (U_FAILURE(ec)) {
    989        delete t;
    990        delete alias;
    991        return nullptr;
    992    }
    993
    994    // We may not have gotten a transliterator:  Because we can't
    995    // instantiate a transliterator from inside TransliteratorRegistry::
    996    // get() (that would deadlock), we sometimes pass back an alias.  This
    997    // contains the data we need to finish the instantiation outside the
    998    // registry mutex.  The alias may, in turn, generate another alias, so
    999    // we handle aliases in a loop.  The max times through the loop is two.
   1000    // [alan]
   1001    while (alias != nullptr) {
   1002        U_ASSERT(t==0); // while an alias is pending, no transliterator is expected yet
   1003        // Rule-based aliases are handled with TransliteratorAlias::
   1004        // parse(), followed by TransliteratorRegistry::reget().
   1005        // Other aliases are handled with TransliteratorAlias::create().
   1006        if (alias->isRuleBased()) {
   1007            // Step 1. parse (done without holding registryMutex)
   1008            TransliteratorParser parser(ec);
   1009            alias->parse(parser, pe, ec);
   1010            delete alias;
   1011            alias = nullptr;
   1012
   1013            // Step 2. reget (with registryMutex held again; alias is passed so it can be set anew)
   1014            umtx_lock(&registryMutex);
   1015            if (HAVE_REGISTRY(ec)) {
   1016                t = registry->reget(id, parser, alias, ec);
   1017            }
   1018            umtx_unlock(&registryMutex);
   1019
   1020            // Step 3. Loop back around!
   1021        } else {
   1022            t = alias->create(pe, ec);
   1023            delete alias;
   1024            alias = nullptr;
   1025            break; // leaves the loop without the failure check below
   1026        }
   1027        if (U_FAILURE(ec)) {
   1028            delete t;
   1029            delete alias;
   1030            t = nullptr;
   1031            break;
   1032        }
   1033    }
   1034
   1035    if (t != nullptr && canon != nullptr) {
   1036        t->setID(*canon);
   1037    }
   1038
   1039    return t;
   1040 }
   1041 
   1042 /**
   1043 * Returns a <code>Transliterator</code> object constructed from the
   1044 * given rule string: a RuleBasedTransliterator if the rule string
   1045 * contains only rules, a CompoundTransliterator if it contains ID
   1046 * blocks, or a NullTransliterator if it contains ID blocks which parse
   1047 * as empty for the given direction.  On the error exits taken inside
   1048 * this function, nullptr is returned and status reports the failure.
   1049 */
   1050 Transliterator* U_EXPORT2
   1051 Transliterator::createFromRules(const UnicodeString& ID,
   1052                                const UnicodeString& rules,
   1053                                UTransDirection dir,
   1054                                UParseError& parseError,
   1055                                UErrorCode& status)
   1056 {
   1057    Transliterator* t = nullptr;
   1058    TransliteratorParser parser(status);
   1059    parser.parse(rules, dir, parseError, status);
   1060
   1061    if (U_FAILURE(status)) {
   1062        return nullptr;
   1063    }
   1064    // NOTE: The logic here matches that in TransliteratorRegistry.
   1065    if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) {
   1066        t = new NullTransliterator();
   1067    }
   1068    else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
   1069        t = new RuleBasedTransliterator(ID, static_cast<TransliterationRuleData*>(parser.dataVector.orphanElementAt(0)), true);
   1070    }
   1071    else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
   1072        // idBlock, no data -- this is an alias.  The ID has
   1073        // been munged from reverse into forward mode, if
   1074        // necessary, so instantiate the ID in the forward
   1075        // direction.
   1076        if (parser.compoundFilter != nullptr) {
   1077            UnicodeString filterPattern;
   1078            parser.compoundFilter->toPattern(filterPattern, false);
   1079            t = createInstance(filterPattern + UnicodeString(ID_DELIM)
   1080                    + *static_cast<UnicodeString*>(parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status);
   1081        }
   1082        else {
   1083            t = createInstance(*static_cast<UnicodeString*>(parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status);
   1084        }
   1085
   1086        if (t != nullptr) {
   1087            t->setID(ID);
   1088        }
   1089    }
   1090    else {
   1091        UVector transliterators(status);
   1092        if (U_FAILURE(status)) {
   1093            return nullptr;
   1094        }
   1095        // The vector does not own its elements: error exits delete what was collected (plus one pending object).
   1096        auto abandon = [&transliterators](Transliterator* pending) -> Transliterator* {
   1097            delete pending;
   1098            while (!transliterators.isEmpty()) {
   1099                delete static_cast<Transliterator*>(transliterators.orphanElementAt(transliterators.size() - 1));
   1100            }
   1101            return nullptr;
   1102        };
   1103        int32_t passNumber = 1;
   1104        int32_t limit = parser.idBlockVector.size();
   1105        if (parser.dataVector.size() > limit) {
   1106            limit = parser.dataVector.size();
   1107        }
   1108        for (int32_t i = 0; i < limit; i++) {
   1109            if (i < parser.idBlockVector.size()) {
   1110                UnicodeString* idBlock = static_cast<UnicodeString*>(parser.idBlockVector.elementAt(i));
   1111                if (!idBlock->isEmpty()) {
   1112                    Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status);
   1113                    if (U_FAILURE(status)) {
   1114                        return abandon(temp);
   1115                    }
   1116                    if (temp != nullptr && typeid(*temp) != typeid(NullTransliterator)) {
   1117                        transliterators.addElement(temp, status);
   1118                        if (U_FAILURE(status)) {
   1119                            return abandon(temp);
   1120                        }
   1121                    } else {
   1122                        delete temp;
   1123                    }
   1124                }
   1125            }
   1126            if (!parser.dataVector.isEmpty()) {
   1127                TransliterationRuleData* data = static_cast<TransliterationRuleData*>(parser.dataVector.orphanElementAt(0));
   1128                // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")?
   1129                Transliterator* temprbt = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + UnicodeString(passNumber++),
   1130                        data, true);
   1131                if (temprbt == nullptr) {
   1132                    delete data; // allocation failed, so nothing adopted the orphaned rule data
   1133                    if (U_SUCCESS(status)) {
   1134                        status = U_MEMORY_ALLOCATION_ERROR;
   1135                    }
   1136                    return abandon(nullptr);
   1137                }
   1138                transliterators.addElement(temprbt, status);
   1139                if (U_FAILURE(status)) {
   1140                    return abandon(temprbt);
   1141                }
   1142            }
   1143        }
   1144        t = new CompoundTransliterator(transliterators, passNumber - 1, parseError, status);
   1145        if (t != nullptr) {
   1146            t->setID(ID);
   1147            t->adoptFilter(parser.orphanCompoundFilter());
   1148        } else {
   1149            abandon(nullptr); // never constructed, so nothing adopted the collected objects
   1150        }
   1151    }
   1152    if (U_SUCCESS(status) && t == nullptr) {
   1153        status = U_MEMORY_ALLOCATION_ERROR;
   1154    }
   1155    return t;
   1156 }
   1157 
   1158 UnicodeString& Transliterator::toRules(UnicodeString& rulesSource, UBool escapeUnprintable) const {
   1159    // The base class implementation of toRules munges the ID into
   1160    // the correct format.  That is: foo => ::foo followed by ID_DELIM.
   1161    if (!escapeUnprintable) {
   1162        rulesSource = getID();
   1163    } else {
   1164        rulesSource.truncate(0);
   1165        const UnicodeString id = getID();
   1166        int32_t pos = 0;
   1167        while (pos < id.length()) {
   1168            const UChar32 cp = id.char32At(pos);
   1169            pos += U16_LENGTH(cp);
   1170            if (!ICU_Utility::escapeUnprintable(rulesSource, cp)) {
   1171                rulesSource.append(cp);
   1172            }
   1173        }
   1174    }
   1175    // KEEP in sync with rbt_pars
   1176    rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
   1177    rulesSource.append(ID_DELIM);
   1178    return rulesSource;
   1179 }
   1180 
   1181 int32_t Transliterator::countElements() const {
   1182    const CompoundTransliterator* compound = dynamic_cast<const CompoundTransliterator*>(this);
   1183    return (compound == nullptr) ? 0 : compound->getCount(); // 0 unless this is a CompoundTransliterator
   1184 }
   1185 
   1186 const Transliterator& Transliterator::getElement(int32_t index, UErrorCode& ec) const {
   1187    if (U_FAILURE(ec)) {
   1188        return *this;
   1189    }
   1190    const CompoundTransliterator* compound = dynamic_cast<const CompoundTransliterator*>(this);
   1191    const int32_t count = (compound != nullptr) ? compound->getCount() : 1;
   1192    if (index < 0 || index >= count) {
   1193        ec = U_INDEX_OUTOFBOUNDS_ERROR;
   1194        return *this;
   1195    }
   1196    // A count of exactly one always resolves to this object itself.
   1197    return (count == 1) ? *this : compound->getTransliterator(index);
   1198 }
   1199 
   1200 UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const {
   1201    // Fills result with the set reported by handleGetSourceSet() and, when
   1202    // a filter is installed, narrows it via retainAll() to the filter's set.
   1203    handleGetSourceSet(result);
   1204    if (filter == nullptr) {
   1205        return result;
   1206    }
   1207    // Most, but not all filters will be UnicodeSets.  Optimize for
   1208    // the high-runner case.
   1209    UnicodeSet* filterSet = dynamic_cast<UnicodeSet*>(filter);
   1210    if (filterSet != nullptr) {
   1211        result.retainAll(*filterSet);
   1212        return result;
   1213    }
   1214
   1215    // Any other filter: collect its match set in a local object.  Using
   1216    // automatic storage means there is no allocation-failure path on
   1217    // which an unfiltered result could be returned.
   1218    UnicodeSet matchSet;
   1219    filter->addMatchSetTo(matchSet);
   1220    result.retainAll(matchSet);
   1221    return result;
   1222 }
   1223 
   1224 void Transliterator::handleGetSourceSet(UnicodeSet& result) const {
   1225    result.clear(); // this implementation reports an empty set
   1226 }
   1227 
   1228 UnicodeSet& Transliterator::getTargetSet(UnicodeSet& result) const {
   1229    return result.clear(); // this implementation empties result and returns what clear() returns
   1230 }
   1231 
   1232 // For public consumption: takes registryMutex, then registers the factory.
   1233 void U_EXPORT2 Transliterator::registerFactory(const UnicodeString& id,
   1234                                     Transliterator::Factory factory,
   1235                                     Transliterator::Token context) {
   1236    Mutex lock(&registryMutex);
   1237    UErrorCode ec = U_ZERO_ERROR;
   1238    const UBool haveRegistry = HAVE_REGISTRY(ec);
   1239    if (!haveRegistry) { return; } // nothing is registered without a registry
   1240    _registerFactory(id, factory, context);
   1241 }
   1242 
   1243 // To be called only by Transliterator subclasses that are called
   1244 // to register themselves by initializeRegistry().  Does not lock or
   1245 // check the registry; the status of put() is discarded.
   1246 void Transliterator::_registerFactory(const UnicodeString& id, Transliterator::Factory factory,
   1247                                      Transliterator::Token context) {
   1248    UErrorCode discarded = U_ZERO_ERROR;
   1249    registry->put(id, factory, context, true, discarded);
   1250 }
   1251 
   1252 // To be called only by Transliterator subclasses that are called
   1253 // to register themselves by initializeRegistry().  Forwards to
   1254 // TransliteratorIDParser; the resulting status is discarded.
   1255 void Transliterator::_registerSpecialInverse(const UnicodeString& target, const UnicodeString& inverseTarget,
   1256                                             UBool bidirectional) {
   1257    UErrorCode discarded = U_ZERO_ERROR;
   1258    TransliteratorIDParser::registerSpecialInverse(target, inverseTarget, bidirectional, discarded);
   1259 }
   1260 
   1261 /**
   1262 * Registers an instance <tt>adoptedPrototype</tt> of a subclass of
   1263 * <code>Transliterator</code> with the system.  This object must
   1264 * implement the <tt>clone()</tt> method.  When
   1265 * <tt>getInstance()</tt> is called with an ID string that is
   1266 * equal to <tt>adoptedPrototype->getID()</tt>, then a clone of
   1267 * the prototype is returned.
   1268 *
   1269 * @param adoptedPrototype an instance of subclass of
   1270 * <code>Transliterator</code> that defines <tt>clone()</tt>
   1271 * @see #getInstance
   1272 * @see #unregister
   1273 */
   1274 void U_EXPORT2 Transliterator::registerInstance(Transliterator* adoptedPrototype) {
   1275    Mutex lock(&registryMutex);
   1276    UErrorCode ec = U_ZERO_ERROR;
   1277    const UBool haveRegistry = HAVE_REGISTRY(ec);
   1278    if (!haveRegistry) { return; } // NOTE(review): adoptedPrototype is neither registered nor deleted on this path
   1279    _registerInstance(adoptedPrototype);
   1280 }
   1281 
   1282 void Transliterator::_registerInstance(Transliterator* adoptedPrototype) {
   1283    UErrorCode discarded = U_ZERO_ERROR; // a failure in put() is not reported to the caller
   1284    registry->put(adoptedPrototype, true, discarded);
   1285 }
   1286 
   1287 void U_EXPORT2 Transliterator::registerAlias(const UnicodeString& aliasID,
   1288                                             const UnicodeString& realID) {
   1289    Mutex lock(&registryMutex);
   1290    UErrorCode ec = U_ZERO_ERROR;
   1291    const UBool haveRegistry = HAVE_REGISTRY(ec);
   1292    if (!haveRegistry) { return; } // nothing is registered without a registry
   1293    _registerAlias(aliasID, realID);
   1294 }
   1295 
   1296 void Transliterator::_registerAlias(const UnicodeString& aliasID,
   1297                                    const UnicodeString& realID) {
   1298    UErrorCode discarded = U_ZERO_ERROR; // a failure in put() is not reported to the caller
   1299    registry->put(aliasID, realID, false, true, discarded);
   1300 }
   1301 
   1302 /**
   1303 * Unregisters a transliterator or class.  This may be either
   1304 * a system transliterator or a user transliterator or class.
   1305 * Nothing happens if no registry is available.
   1306 *
   1307 * @param ID the ID of the transliterator or class
   1308 * @see #registerInstance
   1309 */
   1310 void U_EXPORT2 Transliterator::unregister(const UnicodeString& ID) {
   1311    Mutex lock(&registryMutex);
   1312    UErrorCode ec = U_ZERO_ERROR;
   1313    const UBool haveRegistry = HAVE_REGISTRY(ec);
   1314    if (!haveRegistry) { return; }
   1315    registry->remove(ID);
   1316 }
   1317 
   1318 /**
   1319 * == OBSOLETE - remove in ICU 3.4 ==
   1320 * Return the number of IDs currently registered with the system.
   1321 * To retrieve the actual IDs, call getAvailableID(i) with
   1322 * i from 0 to countAvailableIDs() - 1.
   1323 */
   1324 int32_t U_EXPORT2 Transliterator::countAvailableIDs() {
   1325    Mutex lock(&registryMutex);
   1326    UErrorCode ec = U_ZERO_ERROR;
   1327    // Zero is reported when no registry is available.
   1328    if (HAVE_REGISTRY(ec)) {
   1329        return registry->countAvailableIDs();
   1330    }
   1331    return 0;
   1332 }
   1333 
   1334 /**
   1335 * == OBSOLETE - remove in ICU 3.4 ==
   1336 * Return the index-th available ID.  index must be between 0
   1337 * and countAvailableIDs() - 1, inclusive.  If index is out of
   1338 * range, the result of getAvailableID(0) is returned.
   1339 */
   1340 const UnicodeString& U_EXPORT2 Transliterator::getAvailableID(int32_t index) {
   1341    const UnicodeString* found = nullptr;
   1342    UErrorCode ec = U_ZERO_ERROR;
   1343    {
   1344        // registryMutex is held only for the duration of this scope.
   1345        Mutex lock(&registryMutex);
   1346        if (HAVE_REGISTRY(ec)) { found = &registry->getAvailableID(index); }
   1347    }
   1348    U_ASSERT(found != nullptr); // fail if no registry
   1349    return *found;
   1350 }
   1351 
   1352 StringEnumeration* U_EXPORT2 Transliterator::getAvailableIDs(UErrorCode& ec) {
   1353    // Returns the registry's enumeration of IDs, or nullptr with ec set.
   1354    if (U_FAILURE(ec)) {
   1355        return nullptr;
   1356    }
   1357    StringEnumeration* ids = nullptr;
   1358    {
   1359        Mutex lock(&registryMutex);
   1360        if (HAVE_REGISTRY(ec)) { ids = registry->getAvailableIDs(); }
   1361    }
   1362    if (ids == nullptr) { ec = U_INTERNAL_TRANSLITERATOR_ERROR; }
   1363    return ids;
   1364 }
   1365 
   1366 int32_t U_EXPORT2 Transliterator::countAvailableSources() {
   1367    UErrorCode status = U_ZERO_ERROR;
   1368    Mutex registryLock(&registryMutex);
   1369    return HAVE_REGISTRY(status) ? _countAvailableSources() : 0; // 0 when no registry is available
   1370 }
   1371 
   1372 UnicodeString& U_EXPORT2 Transliterator::getAvailableSource(int32_t index,
   1373                                                  UnicodeString& result) {
   1374    Mutex lock(&registryMutex);
   1375    UErrorCode ec = U_ZERO_ERROR;
   1376    const UBool haveRegistry = HAVE_REGISTRY(ec);
   1377    if (!haveRegistry) { return result; } // result is handed back untouched
   1378    _getAvailableSource(index, result);
   1379    return result;
   1380 }
   1381 
   1382 int32_t U_EXPORT2 Transliterator::countAvailableTargets(const UnicodeString& source) {
   1383    UErrorCode status = U_ZERO_ERROR;
   1384    Mutex registryLock(&registryMutex);
   1385    return HAVE_REGISTRY(status) ? _countAvailableTargets(source) : 0; // 0 when no registry is available
   1386 }
   1387 
   1388 UnicodeString& U_EXPORT2 Transliterator::getAvailableTarget(int32_t index,
   1389                                                  const UnicodeString& source,
   1390                                                  UnicodeString& result) {
   1391    Mutex lock(&registryMutex);
   1392    UErrorCode ec = U_ZERO_ERROR;
   1393    const UBool haveRegistry = HAVE_REGISTRY(ec);
   1394    if (!haveRegistry) { return result; } // result is handed back untouched
   1395    _getAvailableTarget(index, source, result);
   1396    return result;
   1397 }
   1398 
   1399 int32_t U_EXPORT2 Transliterator::countAvailableVariants(const UnicodeString& source,
   1400                                               const UnicodeString& target) {
   1401    UErrorCode status = U_ZERO_ERROR;
   1402    Mutex registryLock(&registryMutex);
   1403    return HAVE_REGISTRY(status) ? _countAvailableVariants(source, target) : 0; // 0 when no registry is available
   1404 }
   1405 
   1406 UnicodeString& U_EXPORT2 Transliterator::getAvailableVariant(int32_t index,
   1407                                                   const UnicodeString& source,
   1408                                                   const UnicodeString& target,
   1409                                                   UnicodeString& result) {
   1410    Mutex lock(&registryMutex);
   1411    UErrorCode ec = U_ZERO_ERROR;
   1412    const UBool haveRegistry = HAVE_REGISTRY(ec);
   1413    if (!haveRegistry) { return result; } // result is handed back untouched
   1414    _getAvailableVariant(index, source, target, result);
   1415    return result;
   1416 }
   1417 
   1418 int32_t Transliterator::_countAvailableSources() {
   1419    return registry->countAvailableSources(); // registry is dereferenced unchecked; countAvailableSources() locks and checks before calling
   1420 }
   1421 
   1422 UnicodeString& Transliterator::_getAvailableSource(int32_t index, UnicodeString& result) {
   1423    UnicodeString& filled = registry->getAvailableSource(index, result); // registry is used without a null check
   1424    return filled;
   1425 }
   1426 
   1427 int32_t Transliterator::_countAvailableTargets(const UnicodeString& source) {
   1428    return registry->countAvailableTargets(source); // registry is dereferenced unchecked; countAvailableTargets() locks and checks before calling
   1429 }
   1430 
   1431 UnicodeString& Transliterator::_getAvailableTarget(int32_t index, const UnicodeString& source,
   1432                                                  UnicodeString& result) {
   1433    UnicodeString& filled = registry->getAvailableTarget(index, source, result); // registry is used without a null check
   1434    return filled;
   1435 }
   1436 
   1437 int32_t Transliterator::_countAvailableVariants(const UnicodeString& source,
   1438                                               const UnicodeString& target) {
   1439    return registry->countAvailableVariants(source, target);
   1440 }
   1441 
   1442 UnicodeString& Transliterator::_getAvailableVariant(int32_t index,
   1443                                                   const UnicodeString& source,
   1444                                                   const UnicodeString& target,
   1445                                                   UnicodeString& result) {
   1446    return registry->getAvailableVariant(index, source, target, result);
   1447 }
   1448 
   1449 #ifdef U_USE_DEPRECATED_TRANSLITERATOR_API
   1450 
   1451 /**
   1452 * Method for subclasses to use to obtain a character in the given
   1453 * string, with filtering.
   1454 * @deprecated the new architecture provides filtering at the top
   1455 * level.  This method will be removed Dec 31 2001.
   1456 */
   1457 char16_t Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const {
   1458    char16_t c;
   1459    const UnicodeFilter* localFilter = getFilter();
   1460    return (localFilter == 0) ? text.charAt(i) :
   1461        (localFilter->contains(c = text.charAt(i)) ? c : (char16_t)0xFFFE);
   1462 }
   1463 
   1464 #endif
   1465 
   1466 /**
   1467 * If the registry is initialized, return true.  If not, initialize it
   1468 * and return true.  If the registry cannot be initialized, return
   1469 * false (rare).
   1470 *
   1471 * IMPORTANT: Upon entry, registryMutex must be LOCKED.  The entire
   1472 * initialization is done with the lock held.  There is NO REASON to
   1473 * unlock, since no other thread that is waiting on the registryMutex
   1474 * cannot itself proceed until the registry is initialized.
   1475 */
   1476 UBool Transliterator::initializeRegistry(UErrorCode &status) {
   1477    if (registry != nullptr) {
   1478        return true;
   1479    }
   1480 
   1481    registry = new TransliteratorRegistry(status);
   1482    if (registry == nullptr || U_FAILURE(status)) {
   1483        delete registry;
   1484        registry = nullptr;
   1485        return false; // can't create registry, no recovery
   1486    }
   1487 
   1488    /* The following code parses the index table located in
   1489     * icu/data/translit/root.txt.  The index is an n x 4 table
   1490     * that follows this format:
   1491     *  <id>{
   1492     *      file{
   1493     *          resource{"<resource>"}
   1494     *          direction{"<direction>"}
   1495     *      }
   1496     *  }
   1497     *  <id>{
   1498     *      internal{
   1499     *          resource{"<resource>"}
   1500     *          direction{"<direction"}
   1501     *       }
   1502     *  }
   1503     *  <id>{
   1504     *      alias{"<getInstanceArg"}
   1505     *  }
   1506     * <id> is the ID of the system transliterator being defined.  These
   1507     * are public IDs enumerated by Transliterator.getAvailableIDs(),
   1508     * unless the second field is "internal".
   1509     * 
   1510     * <resource> is a ResourceReader resource name.  Currently these refer
   1511     * to file names under com/ibm/text/resources.  This string is passed
   1512     * directly to ResourceReader, together with <encoding>.
   1513     * 
   1514     * <direction> is either "FORWARD" or "REVERSE".
   1515     * 
   1516     * <getInstanceArg> is a string to be passed directly to
   1517     * Transliterator.getInstance().  The returned Transliterator object
   1518     * then has its ID changed to <id> and is returned.
   1519     *
   1520     * The extra blank field on "alias" lines is to make the array square.
   1521     */
   1522    //static const char translit_index[] = "translit_index";
   1523 
   1524    UErrorCode lstatus = U_ZERO_ERROR;
   1525    UResourceBundle *bundle, *transIDs, *colBund;
   1526    bundle = ures_open(U_ICUDATA_TRANSLIT, nullptr/*open default locale*/, &lstatus);
   1527    transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, nullptr, &lstatus);
   1528    const UnicodeString T_PART = UNICODE_STRING_SIMPLE("-t-");
   1529 
   1530    int32_t row, maxRows;
   1531    if (lstatus == U_MEMORY_ALLOCATION_ERROR) {
   1532        delete registry;
   1533        registry = nullptr;
   1534        status = U_MEMORY_ALLOCATION_ERROR;
   1535        return false;
   1536    }
   1537    if (U_SUCCESS(lstatus)) {
   1538        maxRows = ures_getSize(transIDs);
   1539        for (row = 0; row < maxRows; row++) {
   1540            colBund = ures_getByIndex(transIDs, row, nullptr, &lstatus);
   1541            if (U_SUCCESS(lstatus)) {
   1542                UnicodeString id(ures_getKey(colBund), -1, US_INV);
   1543                if(id.indexOf(T_PART) != -1) {
   1544                    ures_close(colBund);
   1545                    continue;
   1546                }
   1547                UResourceBundle* res = ures_getNextResource(colBund, nullptr, &lstatus);
   1548                const char* typeStr = ures_getKey(res);
   1549                char16_t type;
   1550                u_charsToUChars(typeStr, &type, 1);
   1551 
   1552                if (U_SUCCESS(lstatus)) {
   1553                    int32_t len = 0;
   1554                    const char16_t *resString;
   1555                    switch (type) {
   1556                    case 0x66: // 'f'
   1557                    case 0x69: // 'i'
   1558                        // 'file' or 'internal';
   1559                        // row[2]=resource, row[3]=direction
   1560                        {
   1561                            
   1562                            resString = ures_getStringByKey(res, "resource", &len, &lstatus);
   1563                            UBool visible = (type == 0x0066 /*f*/);
   1564                            UTransDirection dir = 
   1565                                (ures_getUnicodeStringByKey(res, "direction", &lstatus).charAt(0) ==
   1566                                 0x0046 /*F*/) ?
   1567                                UTRANS_FORWARD : UTRANS_REVERSE;
   1568                            registry->put(id, UnicodeString(true, resString, len), dir, true, visible, lstatus);
   1569                        }
   1570                        break;
   1571                    case 0x61: // 'a'
   1572                        // 'alias'; row[2]=createInstance argument
   1573                        resString = ures_getString(res, &len, &lstatus);
   1574                        registry->put(id, UnicodeString(true, resString, len), true, true, lstatus);
   1575                        break;
   1576                    }
   1577                }
   1578                ures_close(res);
   1579            }
   1580            ures_close(colBund);
   1581        }
   1582    }
   1583 
   1584    ures_close(transIDs);
   1585    ures_close(bundle);
   1586 
   1587    // Manually add prototypes that the system knows about to the
   1588    // cache.  This is how new non-rule-based transliterators are
   1589    // added to the system.
   1590    
   1591    // This is to allow for null pointer check
   1592    NullTransliterator* tempNullTranslit = new NullTransliterator();
   1593    LowercaseTransliterator* tempLowercaseTranslit = new LowercaseTransliterator();
   1594    UppercaseTransliterator* tempUppercaseTranslit = new UppercaseTransliterator();
   1595    TitlecaseTransliterator* tempTitlecaseTranslit = new TitlecaseTransliterator();
   1596    UnicodeNameTransliterator* tempUnicodeTranslit = new UnicodeNameTransliterator();
   1597    NameUnicodeTransliterator* tempNameUnicodeTranslit = new NameUnicodeTransliterator();
   1598 #if !UCONFIG_NO_BREAK_ITERATION
   1599     // TODO: could or should these transliterators be referenced polymorphically once constructed?
   1600     BreakTransliterator* tempBreakTranslit         = new BreakTransliterator();
   1601 #endif
   1602    // Check for null pointers
   1603    if (tempNullTranslit == nullptr || tempLowercaseTranslit == nullptr || tempUppercaseTranslit == nullptr ||
   1604        tempTitlecaseTranslit == nullptr || tempUnicodeTranslit == nullptr || 
   1605 #if !UCONFIG_NO_BREAK_ITERATION
   1606        tempBreakTranslit == nullptr ||
   1607 #endif
   1608        tempNameUnicodeTranslit == nullptr )
   1609    {
   1610        delete tempNullTranslit;
   1611        delete tempLowercaseTranslit;
   1612        delete tempUppercaseTranslit;
   1613        delete tempTitlecaseTranslit;
   1614        delete tempUnicodeTranslit;
   1615        delete tempNameUnicodeTranslit;
   1616 #if !UCONFIG_NO_BREAK_ITERATION
   1617        delete tempBreakTranslit;
   1618 #endif
   1619        // Since there was an error, remove registry
   1620        delete registry;
   1621        registry = nullptr;
   1622 
   1623        status = U_MEMORY_ALLOCATION_ERROR;
   1624        return 0;
   1625    }
   1626 
   1627    registry->put(tempNullTranslit, true, status);
   1628    registry->put(tempLowercaseTranslit, true, status);
   1629    registry->put(tempUppercaseTranslit, true, status);
   1630    registry->put(tempTitlecaseTranslit, true, status);
   1631    registry->put(tempUnicodeTranslit, true, status);
   1632    registry->put(tempNameUnicodeTranslit, true, status);
   1633 #if !UCONFIG_NO_BREAK_ITERATION
   1634    registry->put(tempBreakTranslit, false, status);   // false means invisible.
   1635 #endif
   1636 
   1637    RemoveTransliterator::registerIDs(); // Must be within mutex
   1638    EscapeTransliterator::registerIDs();
   1639    UnescapeTransliterator::registerIDs();
   1640    NormalizationTransliterator::registerIDs();
   1641    AnyTransliterator::registerIDs();
   1642 
   1643    _registerSpecialInverse(UNICODE_STRING_SIMPLE("Null"),
   1644                            UNICODE_STRING_SIMPLE("Null"), false);
   1645    _registerSpecialInverse(UNICODE_STRING_SIMPLE("Upper"),
   1646                            UNICODE_STRING_SIMPLE("Lower"), true);
   1647    _registerSpecialInverse(UNICODE_STRING_SIMPLE("Title"),
   1648                            UNICODE_STRING_SIMPLE("Lower"), false);
   1649 
   1650    ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
   1651 
   1652    return true;
   1653 }
   1654 
   1655 U_NAMESPACE_END
   1656 
   1657 // Defined in transreg.h:
   1658 
   1659 /**
   1660 * Release all static memory held by transliterator.  This will
   1661 * necessarily invalidate any rule-based transliterators held by the
   1662 * user, because RBTs hold pointers to common data objects.
   1663 */
   1664 U_CFUNC UBool utrans_transliterator_cleanup() {
   1665    U_NAMESPACE_USE
   1666    TransliteratorIDParser::cleanup();
   1667    if (registry) {
   1668        delete registry;
   1669        registry = nullptr;
   1670    }
   1671    return true;
   1672 }
   1673 
   1674 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1675 
   1676 //eof
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE