tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cpdtrans.cpp (21638B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2011, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/17/99    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/unifilt.h"
     18 #include "unicode/uniset.h"
     19 #include "cpdtrans.h"
     20 #include "uvector.h"
     21 #include "tridpars.h"
     22 #include "cmemory.h"
     23 
     24 // keep in sync with Transliterator
     25 //static const char16_t ID_SEP   = 0x002D; /*-*/
     26 static const char16_t ID_DELIM = 0x003B; /*;*/
     27 static const char16_t NEWLINE  = 10;
     28 
     29 static const char16_t COLON_COLON[] = {0x3A, 0x3A, 0}; //"::"
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 const char16_t CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass"
     34 
     35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator)
     36 
     37 /**
     38 * Constructs a new compound transliterator given an array of
     39 * transliterators.  The array of transliterators may be of any
     40 * length, including zero or one, however, useful compound
     41 * transliterators have at least two components.
     42 * @param transliterators array of <code>Transliterator</code>
     43 * objects
     44 * @param transliteratorCount The number of
     45 * <code>Transliterator</code> objects in transliterators.
     46 * @param filter the filter.  Any character for which
     47 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
     48 * altered by this transliterator.  If <tt>filter</tt> is
     49 * <tt>null</tt> then no filtering is applied.
     50 */
     51 CompoundTransliterator::CompoundTransliterator(
     52                           Transliterator* const transliterators[],
     53                           int32_t transliteratorCount,
     54                           UnicodeFilter* adoptedFilter) :
     55    Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter),
     56    trans(nullptr), count(0), numAnonymousRBTs(0) {
     57    setTransliterators(transliterators, transliteratorCount);
     58 }
     59 
     60 /**
     61 * Splits an ID of the form "ID;ID;..." into a compound using each
     62 * of the IDs. 
     63 * @param id of above form
     64 * @param forward if false, does the list in reverse order, and
     65 * takes the inverse of each ID.
     66 */
     67 CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
     68                              UTransDirection direction,
     69                              UnicodeFilter* adoptedFilter,
     70                              UParseError& /*parseError*/,
     71                              UErrorCode& status) :
     72    Transliterator(id, adoptedFilter),
     73    trans(nullptr), numAnonymousRBTs(0) {
     74    // TODO add code for parseError...currently unused, but
     75    // later may be used by parsing code...
     76    init(id, direction, true, status);
     77 }
     78 
     79 CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
     80                              UParseError& /*parseError*/,
     81                              UErrorCode& status) :
     82    Transliterator(id, nullptr), // set filter to 0 here!
     83    trans(nullptr), numAnonymousRBTs(0) {
     84    // TODO add code for parseError...currently unused, but
     85    // later may be used by parsing code...
     86    init(id, UTRANS_FORWARD, true, status);
     87 }
     88 
     89 
     90 /**
     91 * Private constructor for use of TransliteratorAlias
     92 */
     93 CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,
     94                                              UVector& list,
     95                                              UnicodeFilter* adoptedFilter,
     96                                              int32_t anonymousRBTs,
     97                                              UParseError& /*parseError*/,
     98                                              UErrorCode& status) :
     99    Transliterator(newID, adoptedFilter),
    100    trans(nullptr), numAnonymousRBTs(anonymousRBTs)
    101 {
    102    init(list, UTRANS_FORWARD, false, status);
    103 }
    104 
    105 /**
    106 * Private constructor for Transliterator from a vector of
    107 * transliterators.  The caller is responsible for fixing up the
    108 * ID.
    109 */
    110 CompoundTransliterator::CompoundTransliterator(UVector& list,
    111                                               UParseError& /*parseError*/,
    112                                               UErrorCode& status) :
    113    Transliterator(UnicodeString(), nullptr),
    114    trans(nullptr), numAnonymousRBTs(0)
    115 {
    116    // TODO add code for parseError...currently unused, but
    117    // later may be used by parsing code...
    118    init(list, UTRANS_FORWARD, false, status);
    119    // assume caller will fixup ID
    120 }
    121 
    122 CompoundTransliterator::CompoundTransliterator(UVector& list,
    123                                               int32_t anonymousRBTs,
    124                                               UParseError& /*parseError*/,
    125                                               UErrorCode& status) :
    126    Transliterator(UnicodeString(), nullptr),
    127    trans(nullptr), numAnonymousRBTs(anonymousRBTs)
    128 {
    129    init(list, UTRANS_FORWARD, false, status);
    130 }
    131 
    132 /**
    133 * Finish constructing a transliterator: only to be called by
    134 * constructors.  Before calling init(), set trans and filter to nullptr.
    135 * @param id the id containing ';'-separated entries
    136 * @param direction either FORWARD or REVERSE
    137 * @param idSplitPoint the index into id at which the
    138 * adoptedSplitTransliterator should be inserted, if there is one, or
    139 * -1 if there is none.
    140 * @param adoptedSplitTransliterator a transliterator to be inserted
    141 * before the entry at offset idSplitPoint in the id string.  May be
    142 * nullptr to insert no entry.
    143 * @param fixReverseID if true, then reconstruct the ID of reverse
    144 * entries by calling getID() of component entries.  Some constructors
    145 * do not require this because they apply a facade ID anyway.
    146 * @param status the error code indicating success or failure
    147 */
    148 void CompoundTransliterator::init(const UnicodeString& id,
    149                                  UTransDirection direction,
    150                                  UBool fixReverseID,
    151                                  UErrorCode& status) {
    152    // assert(trans == 0);
    153 
    154    if (U_FAILURE(status)) {
    155        return;
    156    }
    157 
    158    UVector list(status);
    159    UnicodeSet* compoundFilter = nullptr;
    160    UnicodeString regenID;
    161    if (!TransliteratorIDParser::parseCompoundID(id, direction,
    162                                      regenID, list, compoundFilter)) {
    163        status = U_INVALID_ID;
    164        delete compoundFilter;
    165        return;
    166    }
    167 
    168    TransliteratorIDParser::instantiateList(list, status);
    169 
    170    init(list, direction, fixReverseID, status);
    171 
    172    if (compoundFilter != nullptr) {
    173        adoptFilter(compoundFilter);
    174    }
    175 }
    176 
    177 /**
    178 * Finish constructing a transliterator: only to be called by
    179 * constructors.  Before calling init(), set trans and filter to nullptr.
    180 * @param list a vector of transliterator objects to be adopted.  It
    181 * should NOT be empty.  The list should be in declared order.  That
    182 * is, it should be in the FORWARD order; if direction is REVERSE then
    183 * the list order will be reversed.
    184 * @param direction either FORWARD or REVERSE
    185 * @param fixReverseID if true, then reconstruct the ID of reverse
    186 * entries by calling getID() of component entries.  Some constructors
    187 * do not require this because they apply a facade ID anyway.
    188 * @param status the error code indicating success or failure
    189 */
    190 void CompoundTransliterator::init(UVector& list,
    191                                  UTransDirection direction,
    192                                  UBool fixReverseID,
    193                                  UErrorCode& status) {
    194    // assert(trans == 0);
    195 
    196    // Allocate array
    197    if (U_SUCCESS(status)) {
    198        count = list.size();
    199        trans = static_cast<Transliterator**>(uprv_malloc(count * sizeof(Transliterator*)));
    200        /* test for nullptr */
    201        if (trans == nullptr) {
    202            status = U_MEMORY_ALLOCATION_ERROR;
    203            return;
    204        }
    205    }
    206 
    207    if (U_FAILURE(status) || trans == nullptr) {
    208         // assert(trans == 0);
    209        return;
    210    }
    211 
    212    // Move the transliterators from the vector into an array.
    213    // Reverse the order if necessary.
    214    int32_t i;
    215    for (i=0; i<count; ++i) {
    216        int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i;
    217        trans[i] = static_cast<Transliterator*>(list.elementAt(j));
    218    }
    219 
    220    // If the direction is UTRANS_REVERSE then we may need to fix the
    221    // ID.
    222    if (direction == UTRANS_REVERSE && fixReverseID) {
    223        UnicodeString newID;
    224        for (i=0; i<count; ++i) {
    225            if (i > 0) {
    226                newID.append(ID_DELIM);
    227            }
    228            newID.append(trans[i]->getID());
    229        }
    230        setID(newID);
    231    }
    232 
    233    computeMaximumContextLength();
    234 }
    235 
    236 /**
    237 * Return the IDs of the given list of transliterators, concatenated
    238 * with ID_DELIM delimiting them.  Equivalent to the perlish expression
    239 * join(ID_DELIM, map($_.getID(), transliterators).
    240 */
    241 UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[],
    242                                              int32_t transCount) {
    243    UnicodeString id;
    244    for (int32_t i=0; i<transCount; ++i) {
    245        if (i > 0) {
    246            id.append(ID_DELIM);
    247        }
    248        id.append(transliterators[i]->getID());
    249    }
    250    return id; // Return temporary
    251 }
    252 
    253 /**
    254 * Copy constructor.
    255 */
    256 CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :
    257    Transliterator(t), trans(nullptr), count(0), numAnonymousRBTs(-1) {
    258    *this = t;
    259 }
    260 
    261 /**
    262 * Destructor
    263 */
    264 CompoundTransliterator::~CompoundTransliterator() {
    265    freeTransliterators();
    266 }
    267 
    268 void CompoundTransliterator::freeTransliterators() {
    269    if (trans != nullptr) {
    270        for (int32_t i=0; i<count; ++i) {
    271            delete trans[i];
    272        }
    273        uprv_free(trans);
    274    }
    275    trans = nullptr;
    276    count = 0;
    277 }
    278 
    279 /**
    280 * Assignment operator.
    281 */
    282 CompoundTransliterator& CompoundTransliterator::operator=(
    283                                             const CompoundTransliterator& t)
    284 {
    285    if (this == &t) { return *this; }  // self-assignment: no-op
    286    Transliterator::operator=(t);
    287    int32_t i = 0;
    288    UBool failed = false;
    289    if (trans != nullptr) {
    290        for (i=0; i<count; ++i) {
    291            delete trans[i];
    292            trans[i] = nullptr;
    293        }
    294    }
    295    if (t.count > count) {
    296        if (trans != nullptr) {
    297            uprv_free(trans);
    298        }
    299        trans = static_cast<Transliterator**>(uprv_malloc(t.count * sizeof(Transliterator*)));
    300    }
    301    count = t.count;
    302    if (trans != nullptr) {
    303        for (i=0; i<count; ++i) {
    304            trans[i] = t.trans[i]->clone();
    305            if (trans[i] == nullptr) {
    306                failed = true;
    307                break;
    308            }
    309        }
    310    }
    311 
    312    // if memory allocation failed delete backwards trans array
    313    if (failed && i > 0) {
    314        int32_t n;
    315        for (n = i-1; n >= 0; n--) {
    316            uprv_free(trans[n]);
    317            trans[n] = nullptr;
    318        }
    319    }
    320    numAnonymousRBTs = t.numAnonymousRBTs;
    321    return *this;
    322 }
    323 
    324 /**
    325 * Transliterator API.
    326 */
    327 CompoundTransliterator* CompoundTransliterator::clone() const {
    328    return new CompoundTransliterator(*this);
    329 }
    330 
    331 /**
    332 * Returns the number of transliterators in this chain.
    333 * @return number of transliterators in this chain.
    334 */
    335 int32_t CompoundTransliterator::getCount() const {
    336    return count;
    337 }
    338 
    339 /**
    340 * Returns the transliterator at the given index in this chain.
    341 * @param index index into chain, from 0 to <code>getCount() - 1</code>
    342 * @return transliterator at the given index
    343 */
    344 const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const {
    345    return *trans[index];
    346 }
    347 
    348 void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[],
    349                                                int32_t transCount) {
    350    Transliterator** a = static_cast<Transliterator**>(uprv_malloc(transCount * sizeof(Transliterator*)));
    351    if (a == nullptr) {
    352        return;
    353    }
    354    int32_t i = 0;
    355    UBool failed = false;
    356    for (i=0; i<transCount; ++i) {
    357        a[i] = transliterators[i]->clone();
    358        if (a[i] == nullptr) {
    359            failed = true;
    360            break;
    361        }
    362    }
    363    if (failed && i > 0) {
    364        int32_t n;
    365        for (n = i-1; n >= 0; n--) {
    366            uprv_free(a[n]);
    367            a[n] = nullptr;
    368        }
    369        return;
    370    }
    371    adoptTransliterators(a, transCount);
    372 }
    373 
    374 void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[],
    375                                                  int32_t transCount) {
    376    // First free trans[] and set count to zero.  Once this is done,
    377    // orphan the filter.  Set up the new trans[].
    378    freeTransliterators();
    379    trans = adoptedTransliterators;
    380    count = transCount;
    381    computeMaximumContextLength();
    382    setID(joinIDs(trans, count));
    383 }
    384 
    385 /**
    386 * Append c to buf, unless buf is empty or buf already ends in c.
    387 */
    388 static void _smartAppend(UnicodeString& buf, char16_t c) {
    389    if (buf.length() != 0 &&
    390        buf.charAt(buf.length() - 1) != c) {
    391        buf.append(c);
    392    }
    393 }
    394 
    395 UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
    396                                               UBool escapeUnprintable) const {
    397    // We do NOT call toRules() on our component transliterators, in
    398    // general.  If we have several rule-based transliterators, this
    399    // yields a concatenation of the rules -- not what we want.  We do
    400    // handle compound RBT transliterators specially -- those for which
    401    // compoundRBTIndex >= 0.  For the transliterator at compoundRBTIndex,
    402    // we do call toRules() recursively.
    403    rulesSource.truncate(0);
    404    if (numAnonymousRBTs >= 1 && getFilter() != nullptr) {
    405        // If we are a compound RBT and if we have a global
    406        // filter, then emit it at the top.
    407        UnicodeString pat;
    408        rulesSource.append(COLON_COLON, 2).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM);
    409    }
    410    for (int32_t i=0; i<count; ++i) {
    411        UnicodeString rule;
    412 
    413        // Anonymous RuleBasedTransliterators (inline rules and
    414        // ::BEGIN/::END blocks) are given IDs that begin with
    415        // "%Pass": use toRules() to write all the rules to the output
    416        // (and insert "::Null;" if we have two in a row)
    417        if (trans[i]->getID().startsWith(PASS_STRING, 5)) {
    418            trans[i]->toRules(rule, escapeUnprintable);
    419            if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING, 5))
    420                rule = UNICODE_STRING_SIMPLE("::Null;") + rule;
    421 
    422        // we also use toRules() on CompoundTransliterators (which we
    423        // check for by looking for a semicolon in the ID)-- this gets
    424        // the list of their child transliterators output in the right
    425        // format
    426        } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) {
    427            trans[i]->toRules(rule, escapeUnprintable);
    428 
    429        // for everything else, use Transliterator::toRules()
    430        } else {
    431            trans[i]->Transliterator::toRules(rule, escapeUnprintable);
    432        }
    433        _smartAppend(rulesSource, NEWLINE);
    434        rulesSource.append(rule);
    435        _smartAppend(rulesSource, ID_DELIM);
    436    }
    437    return rulesSource;
    438 }
    439 
    440 /**
    441 * Implement Transliterator framework
    442 */
    443 void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const {
    444    UnicodeSet set;
    445    result.clear();
    446    for (int32_t i=0; i<count; ++i) {
    447    result.addAll(trans[i]->getSourceSet(set));
    448    // Take the example of Hiragana-Latin.  This is really
    449    // Hiragana-Katakana; Katakana-Latin.  The source set of
    450    // these two is roughly [:Hiragana:] and [:Katakana:].
    451    // But the source set for the entire transliterator is
    452    // actually [:Hiragana:] ONLY -- that is, the first
    453    // non-empty source set.
    454 
    455    // This is a heuristic, and not 100% reliable.
    456    if (!result.isEmpty()) {
    457        break;
    458    }
    459    }
    460 }
    461 
    462 /**
    463 * Override Transliterator framework
    464 */
    465 UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const {
    466    UnicodeSet set;
    467    result.clear();
    468    for (int32_t i=0; i<count; ++i) {
    469    // This is a heuristic, and not 100% reliable.
    470    result.addAll(trans[i]->getTargetSet(set));
    471    }
    472    return result;
    473 }
    474 
    475 /**
    476 * Implements {@link Transliterator#handleTransliterate}.
    477 */
    478 void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
    479                                                 UBool incremental) const {
    480    /* Call each transliterator with the same contextStart and
    481     * start, but with the limit as modified
    482     * by preceding transliterators.  The start index must be
    483     * reset for each transliterator to give each a chance to
    484     * transliterate the text.  The initial contextStart index is known
    485     * to still point to the same place after each transliterator
    486     * is called because each transliterator will not change the
    487     * text between contextStart and the initial start index.
    488     *
    489     * IMPORTANT: After the first transliterator, each subsequent
    490     * transliterator only gets to transliterate text committed by
    491     * preceding transliterators; that is, the start (output
    492     * value) of transliterator i becomes the limit (input value)
    493     * of transliterator i+1.  Finally, the overall limit is fixed
    494     * up before we return.
    495     *
    496     * Assumptions we make here:
    497     * (1) contextStart <= start <= limit <= contextLimit <= text.length()
    498     * (2) start <= start' <= limit'  ;cursor doesn't move back
    499     * (3) start <= limit'            ;text before cursor unchanged
    500     * - start' is the value of start after calling handleKT
    501     * - limit' is the value of limit after calling handleKT
    502     */
    503    
    504    /**
    505     * Example: 3 transliterators.  This example illustrates the
    506     * mechanics we need to implement.  C, S, and L are the contextStart,
    507     * start, and limit.  gl is the globalLimit.  contextLimit is
    508     * equal to limit throughout.
    509     *
    510     * 1. h-u, changes hex to Unicode
    511     *
    512     *    4  7  a  d  0      4  7  a
    513     *    abc/u0061/u    =>  abca/u    
    514     *    C  S       L       C   S L   gl=f->a
    515     *
    516     * 2. upup, changes "x" to "XX"
    517     *
    518     *    4  7  a       4  7  a
    519     *    abca/u    =>  abcAA/u    
    520     *    C  SL         C    S   
    521     *                       L    gl=a->b
    522     * 3. u-h, changes Unicode to hex
    523     *
    524     *    4  7  a        4  7  a  d  0  3
    525     *    abcAA/u    =>  abc/u0041/u0041/u    
    526     *    C  S L         C              S
    527     *                                  L   gl=b->15
    528     * 4. return
    529     *
    530     *    4  7  a  d  0  3
    531     *    abc/u0041/u0041/u    
    532     *    C S L
    533     */
    534 
    535    if (count < 1) {
    536        index.start = index.limit;
    537        return; // Short circuit for empty compound transliterators
    538    }
    539 
    540    // compoundLimit is the limit value for the entire compound
    541    // operation.  We overwrite index.limit with the previous
    542    // index.start.  After each transliteration, we update
    543    // compoundLimit for insertions or deletions that have happened.
    544    int32_t compoundLimit = index.limit;
    545 
    546    // compoundStart is the start for the entire compound
    547    // operation.
    548    int32_t compoundStart = index.start;
    549    
    550    int32_t delta = 0; // delta in length
    551 
    552    // Give each transliterator a crack at the run of characters.
    553    // See comments at the top of the method for more detail.
    554    for (int32_t i=0; i<count; ++i) {
    555        index.start = compoundStart; // Reset start
    556        int32_t limit = index.limit;
    557        
    558        if (index.start == index.limit) {
    559            // Short circuit for empty range
    560            break;
    561        }
    562 
    563        trans[i]->filteredTransliterate(text, index, incremental);
    564        
    565        // In a properly written transliterator, start == limit after
    566        // handleTransliterate() returns when incremental is false.
    567        // Catch cases where the subclass doesn't do this, and throw
    568        // an exception.  (Just pinning start to limit is a bad idea,
    569        // because what's probably happening is that the subclass
    570        // isn't transliterating all the way to the end, and it should
    571        // in non-incremental mode.)
    572        if (!incremental && index.start != index.limit) {
    573            // We can't throw an exception, so just fudge things
    574            index.start = index.limit;
    575        }
    576 
    577        // Cumulative delta for insertions/deletions
    578        delta += index.limit - limit;
    579        
    580        if (incremental) {
    581            // In the incremental case, only allow subsequent
    582            // transliterators to modify what has already been
    583            // completely processed by prior transliterators.  In the
    584            // non-incrmental case, allow each transliterator to
    585            // process the entire text.
    586            index.limit = index.start;
    587        }
    588    }
    589 
    590    compoundLimit += delta;
    591 
    592    // Start is good where it is -- where the last transliterator left
    593    // it.  Limit needs to be put back where it was, modulo
    594    // adjustments for deletions/insertions.
    595    index.limit = compoundLimit;
    596 }
    597 
    598 /**
    599 * Sets the length of the longest context required by this transliterator.
    600 * This is <em>preceding</em> context.
    601 */
    602 void CompoundTransliterator::computeMaximumContextLength() {
    603    int32_t max = 0;
    604    for (int32_t i=0; i<count; ++i) {
    605        int32_t len = trans[i]->getMaximumContextLength();
    606        if (len > max) {
    607            max = len;
    608        }
    609    }
    610    setMaximumContextLength(max);
    611 }
    612 
    613 U_NAMESPACE_END
    614 
    615 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    616 
    617 /* eof */