dictbe.cpp (63993B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #include <utility> 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_BREAK_ITERATION 15 16 #include "brkeng.h" 17 #include "dictbe.h" 18 #include "unicode/uniset.h" 19 #include "unicode/chariter.h" 20 #include "unicode/resbund.h" 21 #include "unicode/ubrk.h" 22 #include "unicode/usetiter.h" 23 #include "ubrkimpl.h" 24 #include "utracimp.h" 25 #include "uvectr32.h" 26 #include "uvector.h" 27 #include "uassert.h" 28 #include "unicode/normlzr.h" 29 #include "cmemory.h" 30 #include "dictionarydata.h" 31 32 U_NAMESPACE_BEGIN 33 34 /* 35 ****************************************************************** 36 */ 37 38 DictionaryBreakEngine::DictionaryBreakEngine() { 39 } 40 41 DictionaryBreakEngine::~DictionaryBreakEngine() { 42 } 43 44 UBool 45 DictionaryBreakEngine::handles(UChar32 c, const char*) const { 46 return fSet.contains(c); 47 } 48 49 int32_t 50 DictionaryBreakEngine::findBreaks( UText *text, 51 int32_t startPos, 52 int32_t endPos, 53 UVector32 &foundBreaks, 54 UBool isPhraseBreaking, 55 UErrorCode& status) const { 56 if (U_FAILURE(status)) return 0; 57 int32_t result = 0; 58 59 // Find the span of characters included in the set. 60 // The span to break begins at the current position in the text, and 61 // extends towards the start or end of the text, depending on 'reverse'. 
62 63 utext_setNativeIndex(text, startPos); 64 int32_t start = static_cast<int32_t>(utext_getNativeIndex(text)); 65 int32_t current; 66 int32_t rangeStart; 67 int32_t rangeEnd; 68 UChar32 c = utext_current32(text); 69 while ((current = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos && fSet.contains(c)) { 70 utext_next32(text); // TODO: recast loop for postincrement 71 c = utext_current32(text); 72 } 73 rangeStart = start; 74 rangeEnd = current; 75 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status); 76 utext_setNativeIndex(text, current); 77 78 return result; 79 } 80 81 void 82 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 83 fSet = set; 84 // Compact for caching 85 fSet.compact(); 86 } 87 88 /* 89 ****************************************************************** 90 * PossibleWord 91 */ 92 93 // Helper class for improving readability of the Thai/Lao/Khmer word break 94 // algorithm. The implementation is completely inline. 95 96 // List size, limited by the maximum number of words in the dictionary 97 // that form a nested sequence. 98 static const int32_t POSSIBLE_WORD_LIST_MAX = 20; 99 100 class PossibleWord { 101 private: 102 // list of word candidate lengths, in increasing length order 103 // TODO: bytes would be sufficient for word lengths. 104 int32_t count; // Count of candidates 105 int32_t prefix; // The longest match with a dictionary word 106 int32_t offset; // Offset in the text of these candidates 107 int32_t mark; // The preferred candidate's offset 108 int32_t current; // The candidate we're currently looking at 109 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units. 110 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points. 
111 112 public: 113 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {} 114 ~PossibleWord() {} 115 116 // Fill the list of candidates if needed, select the longest, and return the number found 117 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 118 119 // Select the currently marked candidate, point after it in the text, and invalidate self 120 int32_t acceptMarked( UText *text ); 121 122 // Back up from the current candidate to the next shorter one; return true if that exists 123 // and point the text after it 124 UBool backUp( UText *text ); 125 126 // Return the longest prefix this candidate location shares with a dictionary word 127 // Return value is in code points. 128 int32_t longestPrefix() { return prefix; } 129 130 // Mark the current candidate as the one we like 131 void markCurrent() { mark = current; } 132 133 // Get length in code points of the marked word. 134 int32_t markedCPLength() { return cpLengths[mark]; } 135 }; 136 137 138 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { 139 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 140 int32_t start = static_cast<int32_t>(utext_getNativeIndex(text)); 141 if (start != offset) { 142 offset = start; 143 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix); 144 // Dictionary leaves text after longest prefix, not longest word. Back up. 
145 if (count <= 0) { 146 utext_setNativeIndex(text, start); 147 } 148 } 149 if (count > 0) { 150 utext_setNativeIndex(text, start+cuLengths[count-1]); 151 } 152 current = count-1; 153 mark = current; 154 return count; 155 } 156 157 int32_t 158 PossibleWord::acceptMarked( UText *text ) { 159 utext_setNativeIndex(text, offset + cuLengths[mark]); 160 return cuLengths[mark]; 161 } 162 163 164 UBool 165 PossibleWord::backUp( UText *text ) { 166 if (current > 0) { 167 utext_setNativeIndex(text, offset + cuLengths[--current]); 168 return true; 169 } 170 return false; 171 } 172 173 /* 174 ****************************************************************** 175 * ThaiBreakEngine 176 */ 177 178 // How many words in a row are "good enough"? 179 static const int32_t THAI_LOOKAHEAD = 3; 180 181 // Will not combine a non-word with a preceding dictionary word longer than this 182 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3; 183 184 // Will not combine a non-word that shares at least this much prefix with a 185 // dictionary word, with a preceding word 186 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3; 187 188 // Elision character 189 static const int32_t THAI_PAIYANNOI = 0x0E2F; 190 191 // Repeat character 192 static const int32_t THAI_MAIYAMOK = 0x0E46; 193 194 // Minimum word size 195 static const int32_t THAI_MIN_WORD = 2; 196 197 // Minimum number of characters for two words 198 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; 199 200 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 201 : DictionaryBreakEngine(), 202 fDictionary(adoptDictionary) 203 { 204 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 205 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai"); 206 UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status); 207 if (U_SUCCESS(status)) { 208 setCharacters(thaiWordSet); 209 } 210 fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 211 
fMarkSet.add(0x0020); 212 fEndWordSet = thaiWordSet; 213 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 214 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 215 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 216 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 217 fSuffixSet.add(THAI_PAIYANNOI); 218 fSuffixSet.add(THAI_MAIYAMOK); 219 220 // Compact for caching. 221 fMarkSet.compact(); 222 fEndWordSet.compact(); 223 fBeginWordSet.compact(); 224 fSuffixSet.compact(); 225 UTRACE_EXIT_STATUS(status); 226 } 227 228 ThaiBreakEngine::~ThaiBreakEngine() { 229 delete fDictionary; 230 } 231 232 int32_t 233 ThaiBreakEngine::divideUpDictionaryRange( UText *text, 234 int32_t rangeStart, 235 int32_t rangeEnd, 236 UVector32 &foundBreaks, 237 UBool /* isPhraseBreaking */, 238 UErrorCode& status) const { 239 if (U_FAILURE(status)) return 0; 240 utext_setNativeIndex(text, rangeStart); 241 utext_moveIndex32(text, THAI_MIN_WORD_SPAN); 242 if (utext_getNativeIndex(text) >= rangeEnd) { 243 return 0; // Not enough characters for two words 244 } 245 utext_setNativeIndex(text, rangeStart); 246 247 248 uint32_t wordsFound = 0; 249 int32_t cpWordLength = 0; // Word Length in Code Points. 
250 int32_t cuWordLength = 0; // Word length in code units (UText native indexing) 251 int32_t current; 252 PossibleWord words[THAI_LOOKAHEAD]; 253 254 utext_setNativeIndex(text, rangeStart); 255 256 while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) { 257 cpWordLength = 0; 258 cuWordLength = 0; 259 260 // Look for candidate words at the current position 261 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 262 263 // If we found exactly one, use that 264 if (candidates == 1) { 265 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 266 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 267 wordsFound += 1; 268 } 269 // If there was more than one, see which one can take us forward the most words 270 else if (candidates > 1) { 271 // If we're already at the end of the range, we're done 272 if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 273 goto foundBest; 274 } 275 do { 276 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 277 // Followed by another dictionary word; mark first word as a good candidate 278 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 279 280 // If we're already at the end of the range, we're done 281 if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 282 goto foundBest; 283 } 284 285 // See if any of the possible second words is followed by a third word 286 do { 287 // If we find a third word, stop right away 288 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 289 words[wordsFound % THAI_LOOKAHEAD].markCurrent(); 290 goto foundBest; 291 } 292 } 293 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text)); 294 } 295 } 296 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text)); 297 foundBest: 298 // Set UText position to after the accepted word. 
299 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 300 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 301 wordsFound += 1; 302 } 303 304 // We come here after having either found a word or not. We look ahead to the 305 // next word. If it's not a dictionary word, we will combine it with the word we 306 // just found (if there is one), but only if the preceding word does not exceed 307 // the threshold. 308 // The text iterator should now be positioned at the end of the word we found. 309 310 UChar32 uc = 0; 311 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) { 312 // if it is a dictionary word, do nothing. If it isn't, then if there is 313 // no preceding word, or the non-word shares less than the minimum threshold 314 // of characters with a dictionary word, then scan to resynchronize 315 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 316 && (cuWordLength == 0 317 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 318 // Look for a plausible word boundary 319 int32_t remaining = rangeEnd - (current+cuWordLength); 320 UChar32 pc; 321 int32_t chars = 0; 322 for (;;) { 323 int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 324 pc = utext_next32(text); 325 int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex; 326 chars += pcSize; 327 remaining -= pcSize; 328 if (remaining <= 0) { 329 break; 330 } 331 uc = utext_current32(text); 332 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 333 // Maybe. See if it's in the dictionary. 334 // NOTE: In the original Apple code, checked that the next 335 // two characters after uc were not 0x0E4C THANTHAKHAT before 336 // checking the dictionary. That is just a performance filter, 337 // but it's not clear it's faster than checking the trie. 
338 int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 339 utext_setNativeIndex(text, current + cuWordLength + chars); 340 if (num_candidates > 0) { 341 break; 342 } 343 } 344 } 345 346 // Bump the word count if there wasn't already one 347 if (cuWordLength <= 0) { 348 wordsFound += 1; 349 } 350 351 // Update the length with the passed-over characters 352 cuWordLength += chars; 353 } 354 else { 355 // Back up to where we were for next iteration 356 utext_setNativeIndex(text, current+cuWordLength); 357 } 358 } 359 360 // Never stop before a combining mark. 361 int32_t currPos; 362 while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 363 utext_next32(text); 364 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos; 365 } 366 367 // Look ahead for possible suffixes if a dictionary word does not follow. 368 // We do this in code rather than using a rule so that the heuristic 369 // resynch continues to function. For example, one of the suffix characters 370 // could be a typo in the middle of a word. 
371 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cuWordLength > 0) { 372 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 373 && fSuffixSet.contains(uc = utext_current32(text))) { 374 if (uc == THAI_PAIYANNOI) { 375 if (!fSuffixSet.contains(utext_previous32(text))) { 376 // Skip over previous end and PAIYANNOI 377 utext_next32(text); 378 int32_t paiyannoiIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 379 utext_next32(text); 380 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - paiyannoiIndex; // Add PAIYANNOI to word 381 uc = utext_current32(text); // Fetch next character 382 } 383 else { 384 // Restore prior position 385 utext_next32(text); 386 } 387 } 388 if (uc == THAI_MAIYAMOK) { 389 if (utext_previous32(text) != THAI_MAIYAMOK) { 390 // Skip over previous end and MAIYAMOK 391 utext_next32(text); 392 int32_t maiyamokIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 393 utext_next32(text); 394 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - maiyamokIndex; // Add MAIYAMOK to word 395 } 396 else { 397 // Restore prior position 398 utext_next32(text); 399 } 400 } 401 } 402 else { 403 utext_setNativeIndex(text, current+cuWordLength); 404 } 405 } 406 407 // Did we find a word on this iteration? If so, push it on the break stack 408 if (cuWordLength > 0) { 409 foundBreaks.push((current+cuWordLength), status); 410 } 411 } 412 413 // Don't return a break for the end of the dictionary range if there is one there. 414 if (foundBreaks.peeki() >= rangeEnd) { 415 (void) foundBreaks.popi(); 416 wordsFound -= 1; 417 } 418 419 return wordsFound; 420 } 421 422 /* 423 ****************************************************************** 424 * LaoBreakEngine 425 */ 426 427 // How many words in a row are "good enough"? 
428 static const int32_t LAO_LOOKAHEAD = 3; 429 430 // Will not combine a non-word with a preceding dictionary word longer than this 431 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3; 432 433 // Will not combine a non-word that shares at least this much prefix with a 434 // dictionary word, with a preceding word 435 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3; 436 437 // Minimum word size 438 static const int32_t LAO_MIN_WORD = 2; 439 440 // Minimum number of characters for two words 441 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2; 442 443 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 444 : DictionaryBreakEngine(), 445 fDictionary(adoptDictionary) 446 { 447 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 448 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo"); 449 UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status); 450 if (U_SUCCESS(status)) { 451 setCharacters(laoWordSet); 452 } 453 fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); 454 fMarkSet.add(0x0020); 455 fEndWordSet = laoWordSet; 456 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels 457 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) 458 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) 459 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels 460 461 // Compact for caching. 
462 fMarkSet.compact(); 463 fEndWordSet.compact(); 464 fBeginWordSet.compact(); 465 UTRACE_EXIT_STATUS(status); 466 } 467 468 LaoBreakEngine::~LaoBreakEngine() { 469 delete fDictionary; 470 } 471 472 int32_t 473 LaoBreakEngine::divideUpDictionaryRange( UText *text, 474 int32_t rangeStart, 475 int32_t rangeEnd, 476 UVector32 &foundBreaks, 477 UBool /* isPhraseBreaking */, 478 UErrorCode& status) const { 479 if (U_FAILURE(status)) return 0; 480 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { 481 return 0; // Not enough characters for two words 482 } 483 484 uint32_t wordsFound = 0; 485 int32_t cpWordLength = 0; 486 int32_t cuWordLength = 0; 487 int32_t current; 488 PossibleWord words[LAO_LOOKAHEAD]; 489 490 utext_setNativeIndex(text, rangeStart); 491 492 while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) { 493 cuWordLength = 0; 494 cpWordLength = 0; 495 496 // Look for candidate words at the current position 497 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 498 499 // If we found exactly one, use that 500 if (candidates == 1) { 501 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 502 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 503 wordsFound += 1; 504 } 505 // If there was more than one, see which one can take us forward the most words 506 else if (candidates > 1) { 507 // If we're already at the end of the range, we're done 508 if (utext_getNativeIndex(text) >= rangeEnd) { 509 goto foundBest; 510 } 511 do { 512 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 513 // Followed by another dictionary word; mark first word as a good candidate 514 words[wordsFound%LAO_LOOKAHEAD].markCurrent(); 515 516 // If we're already at the end of the range, we're done 517 if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 518 goto foundBest; 519 } 520 521 // See if any of the possible 
second words is followed by a third word 522 do { 523 // If we find a third word, stop right away 524 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 525 words[wordsFound % LAO_LOOKAHEAD].markCurrent(); 526 goto foundBest; 527 } 528 } 529 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)); 530 } 531 } 532 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text)); 533 foundBest: 534 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 535 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 536 wordsFound += 1; 537 } 538 539 // We come here after having either found a word or not. We look ahead to the 540 // next word. If it's not a dictionary word, we will combine it with the word we 541 // just found (if there is one), but only if the preceding word does not exceed 542 // the threshold. 543 // The text iterator should now be positioned at the end of the word we found. 544 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) { 545 // if it is a dictionary word, do nothing. 
If it isn't, then if there is 546 // no preceding word, or the non-word shares less than the minimum threshold 547 // of characters with a dictionary word, then scan to resynchronize 548 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 549 && (cuWordLength == 0 550 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) { 551 // Look for a plausible word boundary 552 int32_t remaining = rangeEnd - (current + cuWordLength); 553 UChar32 pc; 554 UChar32 uc; 555 int32_t chars = 0; 556 for (;;) { 557 int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 558 pc = utext_next32(text); 559 int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex; 560 chars += pcSize; 561 remaining -= pcSize; 562 if (remaining <= 0) { 563 break; 564 } 565 uc = utext_current32(text); 566 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 567 // Maybe. See if it's in the dictionary. 568 // TODO: this looks iffy; compare with old code. 569 int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 570 utext_setNativeIndex(text, current + cuWordLength + chars); 571 if (num_candidates > 0) { 572 break; 573 } 574 } 575 } 576 577 // Bump the word count if there wasn't already one 578 if (cuWordLength <= 0) { 579 wordsFound += 1; 580 } 581 582 // Update the length with the passed-over characters 583 cuWordLength += chars; 584 } 585 else { 586 // Back up to where we were for next iteration 587 utext_setNativeIndex(text, current + cuWordLength); 588 } 589 } 590 591 // Never stop before a combining mark. 592 int32_t currPos; 593 while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 594 utext_next32(text); 595 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos; 596 } 597 598 // Look ahead for possible suffixes if a dictionary word does not follow. 
599 // We do this in code rather than using a rule so that the heuristic 600 // resynch continues to function. For example, one of the suffix characters 601 // could be a typo in the middle of a word. 602 // NOT CURRENTLY APPLICABLE TO LAO 603 604 // Did we find a word on this iteration? If so, push it on the break stack 605 if (cuWordLength > 0) { 606 foundBreaks.push((current+cuWordLength), status); 607 } 608 } 609 610 // Don't return a break for the end of the dictionary range if there is one there. 611 if (foundBreaks.peeki() >= rangeEnd) { 612 (void) foundBreaks.popi(); 613 wordsFound -= 1; 614 } 615 616 return wordsFound; 617 } 618 619 /* 620 ****************************************************************** 621 * BurmeseBreakEngine 622 */ 623 624 // How many words in a row are "good enough"? 625 static const int32_t BURMESE_LOOKAHEAD = 3; 626 627 // Will not combine a non-word with a preceding dictionary word longer than this 628 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3; 629 630 // Will not combine a non-word that shares at least this much prefix with a 631 // dictionary word, with a preceding word 632 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3; 633 634 // Minimum word size 635 static const int32_t BURMESE_MIN_WORD = 2; 636 637 // Minimum number of characters for two words 638 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2; 639 640 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 641 : DictionaryBreakEngine(), 642 fDictionary(adoptDictionary) 643 { 644 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 645 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr"); 646 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels 647 fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status); 648 fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); 649 fMarkSet.add(0x0020); 650 if (U_SUCCESS(status)) { 651 
setCharacters(fEndWordSet); 652 } 653 654 // Compact for caching. 655 fMarkSet.compact(); 656 fEndWordSet.compact(); 657 fBeginWordSet.compact(); 658 UTRACE_EXIT_STATUS(status); 659 } 660 661 BurmeseBreakEngine::~BurmeseBreakEngine() { 662 delete fDictionary; 663 } 664 665 int32_t 666 BurmeseBreakEngine::divideUpDictionaryRange( UText *text, 667 int32_t rangeStart, 668 int32_t rangeEnd, 669 UVector32 &foundBreaks, 670 UBool /* isPhraseBreaking */, 671 UErrorCode& status ) const { 672 if (U_FAILURE(status)) return 0; 673 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { 674 return 0; // Not enough characters for two words 675 } 676 677 uint32_t wordsFound = 0; 678 int32_t cpWordLength = 0; 679 int32_t cuWordLength = 0; 680 int32_t current; 681 PossibleWord words[BURMESE_LOOKAHEAD]; 682 683 utext_setNativeIndex(text, rangeStart); 684 685 while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) { 686 cuWordLength = 0; 687 cpWordLength = 0; 688 689 // Look for candidate words at the current position 690 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 691 692 // If we found exactly one, use that 693 if (candidates == 1) { 694 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 695 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 696 wordsFound += 1; 697 } 698 // If there was more than one, see which one can take us forward the most words 699 else if (candidates > 1) { 700 // If we're already at the end of the range, we're done 701 if (utext_getNativeIndex(text) >= rangeEnd) { 702 goto foundBest; 703 } 704 do { 705 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 706 // Followed by another dictionary word; mark first word as a good candidate 707 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent(); 708 709 // If we're already at the end of the range, we're done 710 if 
(static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 711 goto foundBest; 712 } 713 714 // See if any of the possible second words is followed by a third word 715 do { 716 // If we find a third word, stop right away 717 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 718 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent(); 719 goto foundBest; 720 } 721 } 722 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text)); 723 } 724 } 725 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text)); 726 foundBest: 727 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 728 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 729 wordsFound += 1; 730 } 731 732 // We come here after having either found a word or not. We look ahead to the 733 // next word. If it's not a dictionary word, we will combine it with the word we 734 // just found (if there is one), but only if the preceding word does not exceed 735 // the threshold. 736 // The text iterator should now be positioned at the end of the word we found. 737 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) { 738 // if it is a dictionary word, do nothing. 
If it isn't, then if there is 739 // no preceding word, or the non-word shares less than the minimum threshold 740 // of characters with a dictionary word, then scan to resynchronize 741 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 742 && (cuWordLength == 0 743 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) { 744 // Look for a plausible word boundary 745 int32_t remaining = rangeEnd - (current + cuWordLength); 746 UChar32 pc; 747 UChar32 uc; 748 int32_t chars = 0; 749 for (;;) { 750 int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 751 pc = utext_next32(text); 752 int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex; 753 chars += pcSize; 754 remaining -= pcSize; 755 if (remaining <= 0) { 756 break; 757 } 758 uc = utext_current32(text); 759 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 760 // Maybe. See if it's in the dictionary. 761 // TODO: this looks iffy; compare with old code. 762 int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 763 utext_setNativeIndex(text, current + cuWordLength + chars); 764 if (num_candidates > 0) { 765 break; 766 } 767 } 768 } 769 770 // Bump the word count if there wasn't already one 771 if (cuWordLength <= 0) { 772 wordsFound += 1; 773 } 774 775 // Update the length with the passed-over characters 776 cuWordLength += chars; 777 } 778 else { 779 // Back up to where we were for next iteration 780 utext_setNativeIndex(text, current + cuWordLength); 781 } 782 } 783 784 // Never stop before a combining mark. 
785 int32_t currPos; 786 while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 787 utext_next32(text); 788 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos; 789 } 790 791 // Look ahead for possible suffixes if a dictionary word does not follow. 792 // We do this in code rather than using a rule so that the heuristic 793 // resynch continues to function. For example, one of the suffix characters 794 // could be a typo in the middle of a word. 795 // NOT CURRENTLY APPLICABLE TO BURMESE 796 797 // Did we find a word on this iteration? If so, push it on the break stack 798 if (cuWordLength > 0) { 799 foundBreaks.push((current+cuWordLength), status); 800 } 801 } 802 803 // Don't return a break for the end of the dictionary range if there is one there. 804 if (foundBreaks.peeki() >= rangeEnd) { 805 (void) foundBreaks.popi(); 806 wordsFound -= 1; 807 } 808 809 return wordsFound; 810 } 811 812 /* 813 ****************************************************************** 814 * KhmerBreakEngine 815 */ 816 817 // How many words in a row are "good enough"? 
818 static const int32_t KHMER_LOOKAHEAD = 3; 819 820 // Will not combine a non-word with a preceding dictionary word longer than this 821 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; 822 823 // Will not combine a non-word that shares at least this much prefix with a 824 // dictionary word, with a preceding word 825 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; 826 827 // Minimum word size 828 static const int32_t KHMER_MIN_WORD = 2; 829 830 // Minimum number of characters for two words 831 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; 832 833 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 834 : DictionaryBreakEngine(), 835 fDictionary(adoptDictionary) 836 { 837 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 838 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); 839 UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); 840 if (U_SUCCESS(status)) { 841 setCharacters(khmerWordSet); 842 } 843 fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 844 fMarkSet.add(0x0020); 845 fEndWordSet = khmerWordSet; 846 fBeginWordSet.add(0x1780, 0x17B3); 847 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 848 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 849 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 850 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 851 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 852 // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 853 // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 854 // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 855 // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 856 // fSuffixSet.add(THAI_PAIYANNOI); 857 // fSuffixSet.add(THAI_MAIYAMOK); 858 859 // Compact for caching. 
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
    UTRACE_EXIT_STATUS(status);
}

// Destructor: the engine owns the dictionary it was constructed with
// (adopted in the constructor) and must release it.
KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}

/**
 * Divide up a range of Khmer text using dictionary-based word lookahead.
 *
 * Greedy best-path heuristic: at each position, gather candidate dictionary
 * words; when more than one candidate exists, prefer the one that lets the
 * most subsequent words (up to two more) also match, using the ring buffer
 * of PossibleWord state (indexed modulo KHMER_LOOKAHEAD). Non-dictionary
 * runs are skipped by scanning for a plausible end-word/begin-word
 * character pair that is followed by a dictionary match.
 *
 * @param text        UText positioned over the text to be broken.
 * @param rangeStart  Native index of the first character of the range.
 * @param rangeEnd    Native index just past the last character of the range.
 * @param foundBreaks Receives the break positions, in ascending order.
 * @param status      ICU error code; no work is done on entry failure.
 * @return The number of word breaks found.
 */
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status ) const {
    if (U_FAILURE(status)) return 0;
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;   // length of the current word in code points
    int32_t cuWordLength = 0;   // length of the current word in code units (native index units)
    int32_t current;
    // Ring buffer of candidate-word state; index is (word ordinal) % KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {
                goto foundBest;
            }
            do {
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    // Followed by another dictionary word; mark first word as a good candidate
                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();

                    // If we're already at the end of the range, we're done
                    if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Accept the marked (preferred) first-word candidate.
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                UChar32 pc;   // previous character (just consumed)
                UChar32 uc;   // upcoming character (not yet consumed)
                int32_t chars = 0;   // code units passed over during the scan
                for (;;) {
                    int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text));
                    pc = utext_next32(text);
                    int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    // A word may plausibly end at a transition from an end-word
                    // character to a begin-word character.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (num_candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current + cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): foundBreaks may be empty here; this relies on peeki() being
    // benign for an empty vector — confirm against UVector32's contract.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

#if !UCONFIG_NO_NORMALIZATION
/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;

// Constructor: adopts the dictionary; selects the character set to handle
// (Hangul for Korean, Han/Hiragana/Katakana for Chinese/Japanese) and sets
// up either the ML-based phrase breaker or the Japanese phrase parameters.
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
    fMlBreakEngine = nullptr;
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
    fHangulWordSet.compact();
    // Digits, open punctuation and Alphabetic characters.
    fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
        UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
    fDigitOrOpenPunctuationOrAlphabetSet.compact();
    fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
    fClosePunctuationSet.compact();

    // handle Korean and Japanese/Chinese using different dictionaries
    if (type == kKorean) {
        if (U_SUCCESS(status)) {
            setCharacters(fHangulWordSet);
        }
    } else { // Chinese and Japanese
        UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
        isCj = true;
        if (U_SUCCESS(status)) {
            setCharacters(cjSet);
#if UCONFIG_USE_ML_PHRASE_BREAKING
            fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
                                              fClosePunctuationSet, status);
            if (fMlBreakEngine == nullptr) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
#else
            initJapanesePhraseParameter(status);
#endif
        }
    }

    UTRACE_EXIT_STATUS(status);
}

// Destructor: releases the adopted dictionary and the optional ML break engine.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
    delete fMlBreakEngine;
}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = 8;
static const int32_t kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Cost (in SNLP units) of treating a katakana run of the given length as a
// word. Runs longer than kMaxKatakanaLength get the worst (length-0) cost.
static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                           = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}

// True for katakana code points (standard block minus the middle dot U+30FB,
// plus the halfwidth katakana range).
static inline bool isKatakana(UChar32 value) {
    return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
            (value >= 0xFF66 && value <= 0xFF9f);
}

// Function for accessing internal utext flags.
//   Replicates an internal UText function.

static inline int32_t utext_i32_flag(int32_t bitIndex) {
    return static_cast<int32_t>(1) << bitIndex;
}

/*
 * Divide up a range of CJK text using dynamic-programming segmentation
 * over dictionary match costs (SNLP), with a katakana-run heuristic.
 *
 * @param inText A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks vector<int32> to receive the break positions
 * @param isPhraseBreaking true for phrase breaking (ja/ko)
 * @param status ICU error code
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
        int32_t rangeStart,
        int32_t rangeEnd,
        UVector32 &foundBreaks,
        UBool isPhraseBreaking,
        UErrorCode& status) const {
    if (U_FAILURE(status)) return 0;
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    // UnicodeString version of input UText, NFKC normalized if necessary.
    UnicodeString inString;

    // inputMap[inStringIndex] = corresponding native index from UText inText.
    //    If nullptr then mapping is 1:1
    LocalPointer<UVector32>     inputMap;

    // if UText has the input string as one contiguous UTF-16 chunk
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
         inText->chunkNativeStart <= rangeStart &&
         inText->chunkNativeLimit >= rangeEnd   &&
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {

        // Input UText is in one contiguous UTF-16 chunk.
        // Use Read-only aliasing UnicodeString.
        inString.setTo(false,
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
                       rangeEnd - rangeStart);
    } else {
        // Copy the text from the original inText (UText) to inString (UnicodeString).
        // Create a map from UnicodeString indices -> UText offsets.
        utext_setNativeIndex(inText, rangeStart);
        int32_t limit = rangeEnd;
        U_ASSERT(limit <= utext_nativeLength(inText));
        if (limit > utext_nativeLength(inText)) {
            limit = static_cast<int32_t>(utext_nativeLength(inText));
        }
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
        if (U_FAILURE(status)) {
            return 0;
        }
        while (utext_getNativeIndex(inText) < limit) {
            int32_t nativePosition = static_cast<int32_t>(utext_getNativeIndex(inText));
            UChar32 c = utext_next32(inText);
            U_ASSERT(c != U_SENTINEL);
            inString.append(c);
            // A supplementary character appends two UTF-16 units; both map back
            // to the same native position.
            while (inputMap->size() < inString.length()) {
                inputMap->addElement(nativePosition, status);
            }
        }
        inputMap->addElement(limit, status);
    }


    if (!nfkcNorm2->isNormalized(inString, status)) {
        UnicodeString normalizedInput;
        //  normalizedMap[normalizedInput position] ==  original UText position.
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
        if (U_FAILURE(status)) {
            return 0;
        }

        UnicodeString fragment;
        UnicodeString normalizedFragment;
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
            fragment.remove();
            int32_t fragmentStartI = srcI;
            UChar32 c = inString.char32At(srcI);
            // Accumulate one normalization chunk: extend until the next character
            // that has a normalization boundary before it.
            for (;;) {
                fragment.append(c);
                srcI = inString.moveIndex32(srcI, 1);
                if (srcI == inString.length()) {
                    break;
                }
                c = inString.char32At(srcI);
                if (nfkcNorm2->hasBoundaryBefore(c)) {
                    break;
                }
            }
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
            normalizedInput.append(normalizedFragment);

            // Map every position in the normalized chunk to the start of the chunk
            //   in the original input.
            int32_t fragmentOriginalStart = inputMap.isValid() ?
                inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
            while (normalizedMap->size() < normalizedInput.length()) {
                normalizedMap->addElement(fragmentOriginalStart, status);
                if (U_FAILURE(status)) {
                    break;
                }
            }
        }
        U_ASSERT(normalizedMap->size() == normalizedInput.length());
        int32_t nativeEnd = inputMap.isValid() ?
            inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
        normalizedMap->addElement(nativeEnd, status);

        // From here on, work on the normalized string and its map.
        inputMap = std::move(normalizedMap);
        inString = std::move(normalizedInput);
    }

    int32_t numCodePts = inString.countChar32();
    if (numCodePts != inString.length()) {
        // There are supplementary characters in the input.
        // The dictionary will produce boundary positions in terms of code point indexes,
        //   not in terms of code unit string indexes.
        // Use the inputMap mechanism to take care of this in addition to indexing differences
        //    from normalization and/or UTF-8 input.
        UBool hadExistingMap = inputMap.isValid();
        if (!hadExistingMap) {
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
            if (U_FAILURE(status)) {
                return 0;
            }
        }
        int32_t cpIdx = 0;
        // Compact the map in place: entry cpIdx gets the value previously at
        // the corresponding code unit index cuIdx (cuIdx >= cpIdx always).
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
            U_ASSERT(cuIdx >= cpIdx);
            if (hadExistingMap) {
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
            } else {
                inputMap->addElement(cuIdx+rangeStart, status);
            }
            cpIdx++;
            if (cuIdx == inString.length()) {
               break;
            }
        }
    }

#if UCONFIG_USE_ML_PHRASE_BREAKING
    // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
    if (isPhraseBreaking && isCj) {
        return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
                                             inputMap, status);
    }
#endif

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // code points in the range to be matched.
    UVector32 bestSnlp(numCodePts + 1, status);
    bestSnlp.addElement(0, status);
    for(int32_t i = 1; i <= numCodePts; i++) {
        bestSnlp.addElement(kuint32max, status);
    }


    // prev[i] is the index of the last CJK code point in the previous word in
    // the best segmentation of the first i characters.
    UVector32 prev(numCodePts + 1, status);
    for(int32_t i = 0; i <= numCodePts; i++){
        prev.addElement(-1, status);
    }

    const int32_t maxWordSize = 20;
    UVector32 values(numCodePts, status);
    values.setSize(numCodePts);
    UVector32 lengths(numCodePts, status);
    lengths.setSize(numCodePts);

    UText fu = UTEXT_INITIALIZER;
    utext_openUnicodeString(&fu, &inString, &status);

    // Dynamic programming to find the best segmentation.

    // In outer loop, i  is the code point index,
    //                ix is the corresponding string (code unit) index.
    //    They differ when the string contains supplementary characters.
    int32_t ix = 0;
    bool is_prev_katakana = false;
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
        // Position i is unreachable by any segmentation so far; skip it.
        if (static_cast<uint32_t>(bestSnlp.elementAti(i)) == kuint32max) {
            continue;
        }

        int32_t count;
        utext_setNativeIndex(&fu, ix);
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
                             nullptr, lengths.getBuffer(), values.getBuffer(), nullptr);
                             // Note: lengths is filled with code point lengths
                             //       The nullptr parameter is the ignored code unit lengths.

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if ((count == 0 || lengths.elementAti(0) != 1) &&
                !fHangulWordSet.contains(inString.char32At(ix))) {
            values.setElementAt(maxSnlp, count);   // 255
            lengths.setElementAt(1, count++);
        }

        // Relax: try extending the best path ending at i by each candidate word.
        for (int32_t j = 0; j < count; j++) {
            uint32_t newSnlp = static_cast<uint32_t>(bestSnlp.elementAti(i)) + static_cast<uint32_t>(values.elementAti(j));
            int32_t ln_j_i = lengths.elementAti(j) + i;
            if (newSnlp < static_cast<uint32_t>(bestSnlp.elementAti(ln_j_i))) {
                bestSnlp.setElementAt(newSnlp, ln_j_i);
                prev.setElementAt(i, ln_j_i);
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.

        bool is_katakana = isKatakana(inString.char32At(ix));
        int32_t katakanaRunLength = 1;
        if (!is_prev_katakana && is_katakana) {
            int32_t j = inString.moveIndex32(ix, 1);
            // Find the end of the continuous run of Katakana characters
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
                    isKatakana(inString.char32At(j))) {
                j = inString.moveIndex32(j, 1);
                katakanaRunLength++;
            }
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
                if (newSnlp < static_cast<uint32_t>(bestSnlp.elementAti(i + katakanaRunLength))) {
                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }
    utext_close(&fu);

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numCodePts] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numCodePts, and afterwards do a swap.
    UVector32 t_boundary(numCodePts+1, status);

    int32_t numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (static_cast<uint32_t>(bestSnlp.elementAti(numCodePts)) == kuint32max) {
        t_boundary.addElement(numCodePts, status);
        numBreaks++;
    } else if (isPhraseBreaking) {
        t_boundary.addElement(numCodePts, status);
        if(U_SUCCESS(status)) {
            numBreaks++;
            int32_t prevIdx = numCodePts;

            int32_t codeUnitIdx = -1;
            int32_t prevCodeUnitIdx = -1;
            int32_t length = -1;
            // Walk the best path backwards, filtering breakpoints for phrase mode.
            for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
                codeUnitIdx = inString.moveIndex32(0, i);
                prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
                // Calculate the length by using the code unit.
                length = prevCodeUnitIdx - codeUnitIdx;
                prevIdx = i;
                // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
                // characters don't occur.
                if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
                    && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
                        || !isKatakana(inString.char32At(codeUnitIdx)))) {
                    t_boundary.addElement(i, status);
                    numBreaks++;
                }
            }
        }
    } else {
        // Word mode: keep every boundary on the best path.
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
            t_boundary.addElement(i, status);
            numBreaks++;
        }
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
    }

    // Add a break for the start of the dictionary range if there is not one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary.addElement(0, status);
        numBreaks++;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the original input UText
    // while reversing t_boundary and pushing values to foundBreaks.
    int32_t prevCPPos = -1;
    int32_t prevUTextPos = -1;
    int32_t correctedNumBreaks = 0;
    for (int32_t i = numBreaks - 1; i >= 0; i--) {
        int32_t cpPos = t_boundary.elementAti(i);
        U_ASSERT(cpPos > prevCPPos);
        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        U_ASSERT(utextPos >= prevUTextPos);
        if (utextPos > prevUTextPos) {
            // Boundaries are added to foundBreaks output in ascending order.
            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
            // In phrase breaking, there has to be a breakpoint between Cj character and close
            // punctuation.
            // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
            if (utextPos != rangeStart
                || (isPhraseBreaking && utextPos > 0
                    && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
                foundBreaks.push(utextPos, status);
                correctedNumBreaks++;
            }
        } else {
            // Normalization expanded the input text, the dictionary found a boundary
            // within the expansion, giving two boundaries with the same index in the
            // original text. Ignore the second. See ticket #12918.
            --numBreaks;
        }
        prevCPPos = cpPos;
        prevUTextPos = utextPos;
    }
    (void)prevCPPos;   // suppress compiler warnings about unused variable

    UChar32 nextChar = utext_char32At(inText, rangeEnd);
    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
        // In phrase breaking, there has to be a breakpoint between Cj character and
        // the number/open punctuation.
        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
        // E.g. しかもロゴがUnicode!
        //      -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
        if (isPhraseBreaking) {
            if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
                foundBreaks.popi();
                correctedNumBreaks--;
            }
        } else {
            // Word mode: never report a break at the very end of the range.
            foundBreaks.popi();
            correctedNumBreaks--;
        }
    }

    // inString goes out of scope
    // inputMap goes out of scope
    return correctedNumBreaks;
}

// Load the data used to filter Japanese phrase breakpoints:
// the "extensions" resource list and all Hiragana code points.
void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
    loadJapaneseExtensions(error);
    loadHiragana(error);
}

// Populate fSkipSet with the strings in the "extensions" table of the
// "ja" break-iterator resource bundle. Breakpoints producing these
// patterns are suppressed during phrase breaking.
void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
    const char* tag = "extensions";
    ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
    if (U_SUCCESS(error)) {
        ResourceBundle bundle = ja.get(tag, error);
        while (U_SUCCESS(error) && bundle.hasNext()) {
            fSkipSet.puti(bundle.getNextString(error), 1, error);
        }
    }
}

// Add every Hiragana code point (as a single-character string) to fSkipSet,
// so single-Hiragana segments are not kept as phrase breakpoints.
void CjkBreakEngine::loadHiragana(UErrorCode& error) {
    UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);
    hiraganaWordSet.compact();
    UnicodeSetIterator iterator(hiraganaWordSet);
    while (iterator.next()) {
        fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);
    }
}
#endif

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */