tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

anytrans.cpp (13143B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *****************************************************************
      5 * Copyright (c) 2002-2014, International Business Machines Corporation
      6 * and others.  All Rights Reserved.
      7 *****************************************************************
      8 * Date        Name        Description
      9 * 06/06/2002  aliu        Creation.
     10 *****************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/uobject.h"
     18 #include "unicode/uscript.h"
     19 
     20 #include "anytrans.h"
     21 #include "hash.h"
     22 #include "mutex.h"
     23 #include "nultrans.h"
     24 #include "putilimp.h"
     25 #include "tridpars.h"
     26 #include "uinvchar.h"
     27 #include "uvector.h"
     28 
     29 //------------------------------------------------------------
     30 // Constants
     31 
     32 static const char16_t TARGET_SEP = 45; // '-'
     33 static const char16_t VARIANT_SEP = 47; // '/'
     34 static const char16_t ANY[] = {0x41,0x6E,0x79,0}; // "Any"
     35 static const char16_t NULL_ID[] = {78,117,108,108,0}; // "Null"
     36 static const char16_t LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"
     37 
     38 // initial size for an Any-XXXX transform's cache of script-XXXX transforms
     39 // (will grow as necessary, but we don't expect to have source text with more than 7 scripts)
     40 #define ANY_TRANS_CACHE_INIT_SIZE 7
     41 
     42 //------------------------------------------------------------
     43 
     44 U_CDECL_BEGIN
     45 /**
     46 * Deleter function for Transliterator*.
     47 */
     48 static void U_CALLCONV
     49 _deleteTransliterator(void *obj) {
     50    delete (icu::Transliterator*) obj;
     51 }
     52 U_CDECL_END
     53 
     54 //------------------------------------------------------------
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 //------------------------------------------------------------
     59 // ScriptRunIterator
     60 
     61 /**
     62 * Returns a series of ranges corresponding to scripts. They will be
     63 * of the form:
     64 *
     65 * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
     66 * |            |          - first run (start, limit)
     67 *          |           |  - second run (start, limit)
     68 *
     69 * That is, the runs will overlap. The reason for this is so that a
     70 * transliterator can consider common characters both before and after
     71 * the scripts.
     72 */
     73 class ScriptRunIterator : public UMemory {
     74 private:
     75    const Replaceable& text;
     76    int32_t textStart;
     77    int32_t textLimit;
     78 
     79 public:
     80    /**
     81     * The code of the current run, valid after next() returns.  May
     82     * be USCRIPT_INVALID_CODE if and only if the entire text is
     83     * COMMON/INHERITED.
     84     */
     85    UScriptCode scriptCode;
     86 
     87    /**
     88     * The start of the run, inclusive, valid after next() returns.
     89     */
     90    int32_t start;
     91 
     92    /**
     93     * The end of the run, exclusive, valid after next() returns.
     94     */
     95    int32_t limit;
     96 
     97    /**
     98     * Constructs a run iterator over the given text from start
     99     * (inclusive) to limit (exclusive).
    100     */
    101    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
    102 
    103    /**
    104     * Returns true if there are any more runs.  true is always
    105     * returned at least once.  Upon return, the caller should
    106     * examine scriptCode, start, and limit.
    107     */
    108    UBool next();
    109 
    110    /**
    111     * Adjusts internal indices for a change in the limit index of the
    112     * given delta.  A positive delta means the limit has increased.
    113     */
    114    void adjustLimit(int32_t delta);
    115 
    116 private:
    117    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    118    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
    119 };
    120 
    121 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
    122                                     int32_t myStart, int32_t myLimit) :
    123    text(theText)
    124 {
    125    textStart = myStart;
    126    textLimit = myLimit;
    127    limit = myStart;
    128 }
    129 
    130 UBool ScriptRunIterator::next() {
    131    UChar32 ch;
    132    UScriptCode s;
    133    UErrorCode ec = U_ZERO_ERROR;
    134 
    135    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
    136    start = limit;
    137 
    138    // Are we done?
    139    if (start == textLimit) {
    140        return false;
    141    }
    142 
    143    // Move start back to include adjacent COMMON or INHERITED
    144    // characters
    145    while (start > textStart) {
    146        ch = text.char32At(start - 1); // look back
    147        s = uscript_getScript(ch, &ec);
    148        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
    149            --start;
    150        } else {
    151            break;
    152        }
    153    }
    154 
    155    // Move limit ahead to include COMMON, INHERITED, and characters
    156    // of the current script.
    157    while (limit < textLimit) {
    158        ch = text.char32At(limit); // look ahead
    159        s = uscript_getScript(ch, &ec);
    160        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
    161            if (scriptCode == USCRIPT_INVALID_CODE) {
    162                scriptCode = s;
    163            } else if (s != scriptCode) {
    164                break;
    165            }
    166        }
    167        ++limit;
    168    }
    169 
    170    // Return true even if the entire text is COMMON / INHERITED, in
    171    // which case scriptCode will be USCRIPT_INVALID_CODE.
    172    return true;
    173 }
    174 
    175 void ScriptRunIterator::adjustLimit(int32_t delta) {
    176    limit += delta;
    177    textLimit += delta;
    178 }
    179 
    180 //------------------------------------------------------------
    181 // AnyTransliterator
    182 
    183 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
    184 
    185 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
    186                                     const UnicodeString& theTarget,
    187                                     const UnicodeString& theVariant,
    188                                     UScriptCode theTargetScript,
    189                                     UErrorCode& ec) :
    190    Transliterator(id, nullptr),
    191    targetScript(theTargetScript)
    192 {
    193    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, nullptr, ANY_TRANS_CACHE_INIT_SIZE, &ec);
    194    if (U_FAILURE(ec)) {
    195        return;
    196    }
    197    uhash_setValueDeleter(cache, _deleteTransliterator);
    198 
    199    target = theTarget;
    200    if (theVariant.length() > 0) {
    201        target.append(VARIANT_SEP).append(theVariant);
    202    }
    203 }
    204 
/**
 * Destructor.  Closes the cache of script-specific transliterators.
 * NOTE(review): presumably uhash_close() runs the value deleter installed
 * by the constructors on every cached transliterator and tolerates a null
 * cache -- confirm against uhash.
 */
AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}
    208 
    209 /**
    210 * Copy constructor.
    211 */
    212 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
    213    Transliterator(o),
    214    target(o.target),
    215    targetScript(o.targetScript)
    216 {
    217    // Don't copy the cache contents
    218    UErrorCode ec = U_ZERO_ERROR;
    219    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, nullptr, ANY_TRANS_CACHE_INIT_SIZE, &ec);
    220    if (U_FAILURE(ec)) {
    221        return;
    222    }
    223    uhash_setValueDeleter(cache, _deleteTransliterator);
    224 }
    225 
    226 /**
    227 * Transliterator API.
    228 */
    229 AnyTransliterator* AnyTransliterator::clone() const {
    230    return new AnyTransliterator(*this);
    231 }
    232 
    233 /**
    234 * Implements {@link Transliterator#handleTransliterate}.
    235 */
    236 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
    237                                            UBool isIncremental) const {
    238    int32_t allStart = pos.start;
    239    int32_t allLimit = pos.limit;
    240 
    241    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
    242 
    243    while (it.next()) {
    244        // Ignore runs in the ante context
    245        if (it.limit <= allStart) continue;
    246 
    247        // Try to instantiate transliterator from it.scriptCode to
    248        // our target or target/variant
    249        Transliterator* t = getTransliterator(it.scriptCode);
    250 
    251        if (t == nullptr) {
    252            // We have no transliterator.  Do nothing, but keep
    253            // pos.start up to date.
    254            pos.start = it.limit;
    255            continue;
    256        }
    257 
    258        // If the run end is before the transliteration limit, do
    259        // a non-incremental transliteration.  Otherwise do an
    260        // incremental one.
    261        UBool incremental = isIncremental && (it.limit >= allLimit);
    262 
    263        pos.start = uprv_max(allStart, it.start);
    264        pos.limit = uprv_min(allLimit, it.limit);
    265        int32_t limit = pos.limit;
    266        t->filteredTransliterate(text, pos, incremental);
    267        int32_t delta = pos.limit - limit;
    268        allLimit += delta;
    269        it.adjustLimit(delta);
    270 
    271        // We're done if we enter the post context
    272        if (it.limit >= allLimit) break;
    273    }
    274 
    275    // Restore limit.  pos.start is fine where the last transliterator
    276    // left it, or at the end of the last run.
    277    pos.limit = allLimit;
    278 }
    279 
    280 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
    281 
    282    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
    283        return nullptr;
    284    }
    285 
    286    Transliterator* t = nullptr;
    287    {
    288        Mutex m(nullptr);
    289        t = static_cast<Transliterator*>(uhash_iget(cache, static_cast<int32_t>(source)));
    290    }
    291    if (t == nullptr) {
    292        UErrorCode ec = U_ZERO_ERROR;
    293        UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
    294        UnicodeString id(sourceName);
    295        id.append(TARGET_SEP).append(target);
    296 
    297        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    298        if (U_FAILURE(ec) || t == nullptr) {
    299            delete t;
    300 
    301            // Try to pivot around Latin, our most common script
    302            id = sourceName;
    303            id.append(LATIN_PIVOT, -1).append(target);
    304            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    305            if (U_FAILURE(ec) || t == nullptr) {
    306                delete t;
    307                t = nullptr;
    308            }
    309        }
    310 
    311        if (t != nullptr) {
    312            Transliterator *rt = nullptr;
    313            {
    314                Mutex m(nullptr);
    315                rt = static_cast<Transliterator*>(uhash_iget(cache, static_cast<int32_t>(source)));
    316                if (rt == nullptr) {
    317                    // Common case, no race to cache this new transliterator.
    318                    uhash_iput(cache, static_cast<int32_t>(source), t, &ec);
    319                } else {
    320                    // Race case, some other thread beat us to caching this transliterator.
    321                    Transliterator *temp = rt;
    322                    rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
    323                    t  = temp; // The transliterator from the cache that we will return.
    324                }
    325            }
    326            delete rt;    // will be non-null only in case of races.
    327        }
    328    }
    329    return t;
    330 }
    331 
    332 /**
    333 * Return the script code for a given name, or -1 if not found.
    334 */
    335 static UScriptCode scriptNameToCode(const UnicodeString& name) {
    336    char buf[128];
    337    UScriptCode code;
    338    UErrorCode ec = U_ZERO_ERROR;
    339    int32_t nameLen = name.length();
    340    UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
    341 
    342    if (isInvariant) {
    343        name.extract(0, nameLen, buf, static_cast<int32_t>(sizeof(buf)), US_INV);
    344        buf[127] = 0;   // Make sure that we nullptr terminate the string.
    345    }
    346    if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
    347    {
    348        code = USCRIPT_INVALID_CODE;
    349    }
    350    return code;
    351 }
    352 
    353 /**
    354 * Registers standard transliterators with the system.  Called by
    355 * Transliterator during initialization.  Scan all current targets and
    356 * register those that are scripts T as Any-T/V.
    357 */
    358 void AnyTransliterator::registerIDs() {
    359 
    360    UErrorCode ec = U_ZERO_ERROR;
    361    Hashtable seen(true, ec);
    362 
    363    int32_t sourceCount = Transliterator::_countAvailableSources();
    364    for (int32_t s=0; s<sourceCount; ++s) {
    365        UnicodeString source;
    366        Transliterator::_getAvailableSource(s, source);
    367 
    368        // Ignore the "Any" source
    369        if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
    370 
    371        int32_t targetCount = Transliterator::_countAvailableTargets(source);
    372        for (int32_t t=0; t<targetCount; ++t) {
    373            UnicodeString target;
    374            Transliterator::_getAvailableTarget(t, source, target);
    375 
    376            // Only process each target once
    377            if (seen.geti(target) != 0) continue;
    378            ec = U_ZERO_ERROR;
    379            seen.puti(target, 1, ec);
    380 
    381            // Get the script code for the target.  If not a script, ignore.
    382            UScriptCode targetScript = scriptNameToCode(target);
    383            if (targetScript == USCRIPT_INVALID_CODE) continue;
    384 
    385            int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
    386            // assert(variantCount >= 1);
    387            for (int32_t v=0; v<variantCount; ++v) {
    388                UnicodeString variant;
    389                Transliterator::_getAvailableVariant(v, source, target, variant);
    390 
    391                UnicodeString id;
    392                TransliteratorIDParser::STVtoID(UnicodeString(true, ANY, 3), target, variant, id);
    393                ec = U_ZERO_ERROR;
    394                AnyTransliterator* tl = new AnyTransliterator(id, target, variant,
    395                                                             targetScript, ec);
    396                if (U_FAILURE(ec)) {
    397                    delete tl;
    398                } else {
    399                    Transliterator::_registerInstance(tl);
    400                    Transliterator::_registerSpecialInverse(target, UnicodeString(true, NULL_ID, 4), false);
    401                }
    402            }
    403        }
    404    }
    405 }
    406 
    407 U_NAMESPACE_END
    408 
    409 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    410 
    411 //eof