tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

brkeng.cpp (12200B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ************************************************************************************
      5 * Copyright (C) 2006-2016, International Business Machines Corporation
      6 * and others. All Rights Reserved.
      7 ************************************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/uchar.h"
     15 #include "unicode/uniset.h"
     16 #include "unicode/chariter.h"
     17 #include "unicode/ures.h"
     18 #include "unicode/udata.h"
     19 #include "unicode/putil.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/uscript.h"
     22 #include "unicode/ucharstrie.h"
     23 #include "unicode/bytestrie.h"
     24 #include "unicode/rbbi.h"
     25 
     26 #include "brkeng.h"
     27 #include "cmemory.h"
     28 #include "dictbe.h"
     29 #include "lstmbe.h"
     30 #include "charstr.h"
     31 #include "dictionarydata.h"
     32 #include "mutex.h"
     33 #include "uvector.h"
     34 #include "umutex.h"
     35 #include "uresimp.h"
     36 #include "ubrkimpl.h"
     37 
     38 U_NAMESPACE_BEGIN
     39 
     40 /*
     41 ******************************************************************
     42 */
     43 
     44 LanguageBreakEngine::LanguageBreakEngine() {
     45 }
     46 
     47 LanguageBreakEngine::~LanguageBreakEngine() {
     48 }
     49 
     50 /*
     51 ******************************************************************
     52 */
     53 
     54 LanguageBreakFactory::LanguageBreakFactory() {
     55 }
     56 
     57 LanguageBreakFactory::~LanguageBreakFactory() {
     58 }
     59 
     60 /*
     61 ******************************************************************
     62 */
     63 
     64 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
     65    (void)status;
     66 }
     67 
     68 UnhandledEngine::~UnhandledEngine() {
     69    delete fHandled;
     70    fHandled = nullptr;
     71 }
     72 
     73 UBool
     74 UnhandledEngine::handles(UChar32 c, const char* locale) const {
     75    (void)locale; // Unused
     76    return fHandled && fHandled->contains(c);
     77 }
     78 
     79 int32_t
     80 UnhandledEngine::findBreaks( UText *text,
     81                             int32_t startPos,
     82                             int32_t endPos,
     83                             UVector32 &/*foundBreaks*/,
     84                             UBool /* isPhraseBreaking */,
     85                             UErrorCode &status) const {
     86    if (U_FAILURE(status)) return 0;
     87    utext_setNativeIndex(text, startPos);
     88    UChar32 c = utext_current32(text);
     89    while (static_cast<int32_t>(utext_getNativeIndex(text)) < endPos && fHandled->contains(c)) {
     90        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
     91        c = utext_current32(text);
     92    }
     93    return 0;
     94 }
     95 
     96 void
     97 UnhandledEngine::handleCharacter(UChar32 c) {
     98    if (fHandled == nullptr) {
     99        fHandled = new UnicodeSet();
    100        if (fHandled == nullptr) {
    101            return;
    102        }
    103    }
    104    if (!fHandled->contains(c)) {
    105        UErrorCode status = U_ZERO_ERROR;
    106        // Apply the entire script of the character.
    107        int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
    108        fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
    109    }
    110 }
    111 
    112 /*
    113 ******************************************************************
    114 */
    115 
    116 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    117    fEngines = nullptr;
    118 }
    119 
// Destroys the engine stack, if one was ever created by ensureEngines().
// The stack is constructed there with uprv_deleteUObject as its element
// deleter, presumably so the stored engines are released along with it.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    delete fEngines;  // deleting a null pointer is a no-op
}
    123 
    124 void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
    125    static UMutex gBreakEngineMutex;
    126    Mutex m(&gBreakEngineMutex);
    127    if (fEngines == nullptr) {
    128        LocalPointer<UStack>  engines(new UStack(uprv_deleteUObject, nullptr, status), status);
    129        if (U_SUCCESS(status)) {
    130            fEngines = engines.orphan();
    131        }
    132    }
    133 }
    134 
    135 const LanguageBreakEngine *
    136 ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
    137    const LanguageBreakEngine *lbe = nullptr;
    138    UErrorCode  status = U_ZERO_ERROR;
    139    ensureEngines(status);
    140    if (U_FAILURE(status) ) {
    141        // Note: no way to return error code to caller.
    142        return nullptr;
    143    }
    144 
    145    static UMutex gBreakEngineMutex;
    146    Mutex m(&gBreakEngineMutex);
    147    int32_t i = fEngines->size();
    148    while (--i >= 0) {
    149        lbe = static_cast<const LanguageBreakEngine*>(fEngines->elementAt(i));
    150        if (lbe != nullptr && lbe->handles(c, locale)) {
    151            return lbe;
    152        }
    153    }
    154 
    155    // We didn't find an engine. Create one.
    156    lbe = loadEngineFor(c, locale);
    157    if (lbe != nullptr) {
    158        fEngines->push((void *)lbe, status);
    159    }
    160    return U_SUCCESS(status) ? lbe : nullptr;
    161 }
    162 
    163 const LanguageBreakEngine *
    164 ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
    165    UErrorCode status = U_ZERO_ERROR;
    166    UScriptCode code = uscript_getScript(c, &status);
    167    if (U_SUCCESS(status)) {
    168        const LanguageBreakEngine *engine = nullptr;
    169        // Try to use LSTM first
    170        const LSTMData *data = CreateLSTMDataForScript(code, status);
    171        if (U_SUCCESS(status)) {
    172            if (data != nullptr) {
    173                engine = CreateLSTMBreakEngine(code, data, status);
    174                if (U_SUCCESS(status) && engine != nullptr) {
    175                    return engine;
    176                }
    177                if (engine != nullptr) {
    178                    delete engine;
    179                    engine = nullptr;
    180                } else {
    181                    DeleteLSTMData(data);
    182                }
    183            }
    184        }
    185        status = U_ZERO_ERROR;  // fallback to dictionary based
    186        DictionaryMatcher *m = loadDictionaryMatcherFor(code);
    187        if (m != nullptr) {
    188            switch(code) {
    189            case USCRIPT_THAI:
    190                engine = new ThaiBreakEngine(m, status);
    191                break;
    192            case USCRIPT_LAO:
    193                engine = new LaoBreakEngine(m, status);
    194                break;
    195            case USCRIPT_MYANMAR:
    196                engine = new BurmeseBreakEngine(m, status);
    197                break;
    198            case USCRIPT_KHMER:
    199                engine = new KhmerBreakEngine(m, status);
    200                break;
    201 
    202 #if !UCONFIG_NO_NORMALIZATION
    203                // CJK not available w/o normalization
    204            case USCRIPT_HANGUL:
    205                engine = new CjkBreakEngine(m, kKorean, status);
    206                break;
    207 
    208            // use same BreakEngine and dictionary for both Chinese and Japanese
    209            case USCRIPT_HIRAGANA:
    210            case USCRIPT_KATAKANA:
    211            case USCRIPT_HAN:
    212                engine = new CjkBreakEngine(m, kChineseJapanese, status);
    213                break;
    214 #if 0
    215            // TODO: Have to get some characters with script=common handled
    216            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    217            // them to CjkBreakEngine does not work. The engine has to
    218            // special-case them.
    219            case USCRIPT_COMMON:
    220            {
    221                UBlockCode block = ublock_getCode(code);
    222                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
    223                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
    224                break;
    225            }
    226 #endif
    227 #endif
    228 
    229            default:
    230                break;
    231            }
    232            if (engine == nullptr) {
    233                delete m;
    234            }
    235            else if (U_FAILURE(status)) {
    236                delete engine;
    237                engine = nullptr;
    238            }
    239            return engine;
    240        }
    241    }
    242    return nullptr;
    243 }
    244 
    245 DictionaryMatcher *
    246 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { 
    247    UErrorCode status = U_ZERO_ERROR;
    248    // open root from brkitr tree.
    249    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
    250    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
    251    int32_t dictnlength = 0;
    252    const char16_t *dictfname =
    253        ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
    254    if (U_FAILURE(status)) {
    255        ures_close(b);
    256        return nullptr;
    257    }
    258    CharString dictnbuf;
    259    CharString ext;
    260    const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
    261    if (extStart != nullptr) {
    262        int32_t len = static_cast<int32_t>(extStart - dictfname);
    263        ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status);
    264        dictnlength = len;
    265    }
    266    dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status);
    267    ures_close(b);
    268 
    269    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
    270    if (U_SUCCESS(status)) {
    271        // build trie
    272        const uint8_t* data = static_cast<const uint8_t*>(udata_getMemory(file));
    273        const int32_t* indexes = reinterpret_cast<const int32_t*>(data);
    274        const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    275        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    276        DictionaryMatcher *m = nullptr;
    277        if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    278            const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
    279            const char* characters = reinterpret_cast<const char*>(data + offset);
    280            m = new BytesDictionaryMatcher(characters, transform, file);
    281        }
    282        else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    283            const char16_t* characters = reinterpret_cast<const char16_t*>(data + offset);
    284            m = new UCharsDictionaryMatcher(characters, file);
    285        }
    286        if (m == nullptr) {
    287            // no matcher exists to take ownership - either we are an invalid 
    288            // type or memory allocation failed
    289            udata_close(file);
    290        }
    291        return m;
    292    } else if (dictfname != nullptr) {
    293        // we don't have a dictionary matcher.
    294        // returning nullptr here will cause us to fail to find a dictionary break engine, as expected
    295        status = U_ZERO_ERROR;
    296        return nullptr;
    297    }
    298    return nullptr;
    299 }
    300 
    301 
    302 void ICULanguageBreakFactory::addExternalEngine(
    303        ExternalBreakEngine* external, UErrorCode& status) {
    304    LocalPointer<ExternalBreakEngine> engine(external, status);
    305    ensureEngines(status);
    306    LocalPointer<BreakEngineWrapper> wrapper(
    307        new BreakEngineWrapper(engine.orphan(), status), status);
    308    static UMutex gBreakEngineMutex;
    309    Mutex m(&gBreakEngineMutex);
    310    fEngines->push(wrapper.getAlias(), status);
    311    wrapper.orphan();
    312 }
    313 
    314 BreakEngineWrapper::BreakEngineWrapper(
    315    ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) {
    316 }
    317 
    318 BreakEngineWrapper::~BreakEngineWrapper() {
    319 }
    320 
    321 UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
    322    return delegate->isFor(c, locale);
    323 }
    324 
    325 int32_t BreakEngineWrapper::findBreaks(
    326    UText *text,
    327    int32_t startPos,
    328    int32_t endPos,
    329    UVector32 &foundBreaks,
    330    UBool /* isPhraseBreaking */,
    331    UErrorCode &status) const {
    332    if (U_FAILURE(status)) return 0;
    333    int32_t result = 0;
    334 
    335    // Find the span of characters included in the set.
    336    //   The span to break begins at the current position in the text, and
    337    //   extends towards the start or end of the text, depending on 'reverse'.
    338 
    339    utext_setNativeIndex(text, startPos);
    340    int32_t start = static_cast<int32_t>(utext_getNativeIndex(text));
    341    int32_t current;
    342    int32_t rangeStart;
    343    int32_t rangeEnd;
    344    UChar32 c = utext_current32(text);
    345    while ((current = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos && delegate->handles(c)) {
    346        utext_next32(text);         // TODO:  recast loop for postincrement
    347        c = utext_current32(text);
    348    }
    349    rangeStart = start;
    350    rangeEnd = current;
    351    int32_t beforeSize = foundBreaks.size();
    352    int32_t additionalCapacity = rangeEnd - rangeStart + 1;
    353    // enlarge to contains (rangeEnd-rangeStart+1) more items
    354    foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
    355    if (U_FAILURE(status)) return 0;
    356    foundBreaks.setSize(beforeSize + beforeSize+additionalCapacity);
    357    result = delegate->fillBreaks(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
    358                                  additionalCapacity, status);
    359    if (U_FAILURE(status)) return 0;
    360    foundBreaks.setSize(beforeSize + result);
    361    utext_setNativeIndex(text, current);
    362    return result;
    363 }
    364 
    365 U_NAMESPACE_END
    366 
    367 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */