[ tor-browser ].git.dasho

rbbi_cache.cpp (26912B)
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // file: rbbi_cache.cpp
      5 
      6 #include "unicode/utypes.h"
      7 
      8 #if !UCONFIG_NO_BREAK_ITERATION
      9 
     10 #include "unicode/ubrk.h"
     11 #include "unicode/rbbi.h"
     12 
     13 #include "rbbi_cache.h"
     14 
     15 #include "brkeng.h"
     16 #include "cmemory.h"
     17 #include "rbbidata.h"
     18 #include "rbbirb.h"
     19 #include "uassert.h"
     20 #include "uvectr32.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 /*
     25 * DictionaryCache implementation
     26 */
     27 
     28 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
     29        fBI(bi), fBreaks(status), fPositionInCache(-1),
     30        fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
     31 }
     32 
     33 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
     34 }
     35 
     36 void RuleBasedBreakIterator::DictionaryCache::reset() {
     37    fPositionInCache = -1;
     38    fStart = 0;
     39    fLimit = 0;
     40    fFirstRuleStatusIndex = 0;
     41    fOtherRuleStatusIndex = 0;
     42    fBreaks.removeAllElements();
     43 }
     44 
     45 UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
     46    if (fromPos >= fLimit || fromPos < fStart) {
     47        fPositionInCache = -1;
     48        return false;
     49    }
     50 
     51    // Sequential iteration, move from previous boundary to the following
     52 
     53    int32_t r = 0;
     54    if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
     55        ++fPositionInCache;
     56        if (fPositionInCache >= fBreaks.size()) {
     57            fPositionInCache = -1;
     58            return false;
     59        }
     60        r = fBreaks.elementAti(fPositionInCache);
     61        U_ASSERT(r > fromPos);
     62        *result = r;
     63        *statusIndex = fOtherRuleStatusIndex;
     64        return true;
     65    }
     66 
     67    // Random indexing. Linear search for the boundary following the given position.
     68 
     69    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
     70        r= fBreaks.elementAti(fPositionInCache);
     71        if (r > fromPos) {
     72            *result = r;
     73            *statusIndex = fOtherRuleStatusIndex;
     74            return true;
     75        }
     76    }
     77    UPRV_UNREACHABLE_EXIT;
     78 }
     79 
     80 
     81 UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
     82    if (fromPos <= fStart || fromPos > fLimit) {
     83        fPositionInCache = -1;
     84        return false;
     85    }
     86 
     87    if (fromPos == fLimit) {
     88        fPositionInCache = fBreaks.size() - 1;
     89        if (fPositionInCache >= 0) {
     90            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
     91        }
     92    }
     93 
     94    int32_t r;
     95    if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
     96        --fPositionInCache;
     97        r = fBreaks.elementAti(fPositionInCache);
     98        U_ASSERT(r < fromPos);
     99        *result = r;
    100        *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
    101        return true;
    102    }
    103 
    104    if (fPositionInCache == 0) {
    105        fPositionInCache = -1;
    106        return false;
    107    }
    108 
    109    for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
    110        r = fBreaks.elementAti(fPositionInCache);
    111        if (r < fromPos) {
    112            *result = r;
    113            *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
    114            return true;
    115        }
    116    }
    117    UPRV_UNREACHABLE_EXIT;
    118 }
    119 
    120 void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
    121                                       int32_t firstRuleStatus, int32_t otherRuleStatus) {
    122    if ((endPos - startPos) <= 1) {
    123        return;
    124    }
    125 
    126    reset();
    127    fFirstRuleStatusIndex = firstRuleStatus;
    128    fOtherRuleStatusIndex = otherRuleStatus;
    129 
    130    int32_t rangeStart = startPos;
    131    int32_t rangeEnd = endPos;
    132 
    133    uint16_t    category;
    134    int32_t     current;
    135    UErrorCode  status = U_ZERO_ERROR;
    136    int32_t     foundBreakCount = 0;
    137    UText      *text = &fBI->fText;
    138 
    139    // Loop through the text, looking for ranges of dictionary characters.
    140    // For each span, find the appropriate break engine, and ask it to find
    141    // any breaks within the span.
    142 
    143    utext_setNativeIndex(text, rangeStart);
    144    UChar32     c = utext_current32(text);
    145    category = ucptrie_get(fBI->fData->fTrie, c);
    146    uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
    147 
    148    while(U_SUCCESS(status)) {
    149        while ((current = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(text))) < rangeEnd
    150                && (category < dictStart)) {
    151            utext_next32(text);           // TODO: cleaner loop structure.
    152            c = utext_current32(text);
    153            category = ucptrie_get(fBI->fData->fTrie, c);
    154        }
    155        if (current >= rangeEnd) {
    156            break;
    157        }
    158 
    159        // We now have a dictionary character. Get the appropriate language object
    160        // to deal with it.
    161        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
    162            c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
    163 
    164        // Ask the language object if there are any breaks. It will add them to the cache and
    165        // leave the text pointer on the other side of its range, ready to search for the next one.
    166        if (lbe != nullptr) {
    167            foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
    168        }
    169 
    170        // Reload the loop variables for the next go-round
    171        c = utext_current32(text);
    172        category = ucptrie_get(fBI->fData->fTrie, c);
    173    }
    174 
    175    // If we found breaks, ensure that the first and last entries are
    176    // the original starting and ending position. And initialize the
    177    // cache iteration position to the first entry.
    178 
    179    // printf("foundBreakCount = %d\n", foundBreakCount);
    180    if (foundBreakCount > 0) {
    181        U_ASSERT(foundBreakCount == fBreaks.size());
    182        if (startPos < fBreaks.elementAti(0)) {
    183            // The dictionary did not place a boundary at the start of the segment of text.
    184            // Add one now. This should not commonly happen, but it would be easy for interactions
    185            // of the rules for dictionary segments and the break engine implementations to
    186            // inadvertently cause it. Cover it here, just in case.
    187            fBreaks.insertElementAt(startPos, 0, status);
    188        }
    189        if (endPos > fBreaks.peeki()) {
    190            fBreaks.push(endPos, status);
    191        }
    192        fPositionInCache = 0;
    193        // Note: Dictionary matching may extend beyond the original limit.
    194        fStart = fBreaks.elementAti(0);
    195        fLimit = fBreaks.peeki();
    196    } else {
    197        // there were no language-based breaks, even though the segment contained
    198        // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
    199        // for this range will fail, and the calling code will fall back to the rule based boundaries.
    200    }
    201 }
    202 
    203 
    204 /*
    205 *   BreakCache implementation
    206 */
    207 
    208 RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
    209        fBI(bi), fSideBuffer(status) {
    210    reset();
    211 }
    212 
    213 
    214 RuleBasedBreakIterator::BreakCache::~BreakCache() {
    215 }
    216 
    217 
    218 void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
    219    fStartBufIdx = 0;
    220    fEndBufIdx = 0;
    221    fTextIdx = pos;
    222    fBufIdx = 0;
    223    fBoundaries[0] = pos;
    224    fStatuses[0] = static_cast<uint16_t>(ruleStatus);
    225 }
    226 
    227 
    228 int32_t  RuleBasedBreakIterator::BreakCache::current() {
    229    fBI->fPosition = fTextIdx;
    230    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    231    fBI->fDone = false;
    232    return fTextIdx;
    233 }
    234 
    235 
    236 void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
    237    if (U_FAILURE(status)) {
    238        return;
    239    }
    240    if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
    241        // startPos is in the cache. Do a next() from that position.
    242        // TODO: an awkward set of interactions with bi->fDone
    243        //       seek() does not clear it; it can't because of interactions with populateNear().
    244        //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
    245        //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
    246        fBI->fDone = false;
    247        next();
    248    }
    249 }
    250 
    251 
    252 void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
    253    if (U_FAILURE(status)) {
    254        return;
    255    }
    256    if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
    257        if (startPos == fTextIdx) {
    258            previous(status);
    259        } else {
    260            // seek() leaves the BreakCache positioned at the preceding boundary
    261            //        if the requested position is between two boundaries.
    262            // current() pushes the BreakCache position out to the BreakIterator itself.
    263            U_ASSERT(startPos > fTextIdx);
    264            current();
    265        }
    266    }
    267 }
    268 
    269 
    270 /*
    271 * Out-of-line code for BreakCache::next().
    272 * Cache does not already contain the boundary
    273 */
    274 void RuleBasedBreakIterator::BreakCache::nextOL() {
    275    fBI->fDone = !populateFollowing();
    276    fBI->fPosition = fTextIdx;
    277    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    278 }
    279 
    280 
    281 void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    282    if (U_FAILURE(status)) {
    283        return;
    284    }
    285    int32_t initialBufIdx = fBufIdx;
    286    if (fBufIdx == fStartBufIdx) {
    287        // At start of cache. Prepend to it.
    288        populatePreceding(status);
    289    } else {
    290        // Cache already holds the next boundary
    291        fBufIdx = modChunkSize(fBufIdx - 1);
    292        fTextIdx = fBoundaries[fBufIdx];
    293    }
    294    fBI->fDone = (fBufIdx == initialBufIdx);
    295    fBI->fPosition = fTextIdx;
    296    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    297 }
    298 
    299 
    300 UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
    301    if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
    302        return false;
    303    }
    304    if (pos == fBoundaries[fStartBufIdx]) {
    305        // Common case: seek(0), from BreakIterator::first()
    306        fBufIdx = fStartBufIdx;
    307        fTextIdx = fBoundaries[fBufIdx];
    308        return true;
    309    }
    310    if (pos == fBoundaries[fEndBufIdx]) {
    311        fBufIdx = fEndBufIdx;
    312        fTextIdx = fBoundaries[fBufIdx];
    313        return true;
    314    }
    315 
    316    int32_t min = fStartBufIdx;
    317    int32_t max = fEndBufIdx;
    318    while (min != max) {
    319        int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
    320        probe = modChunkSize(probe);
    321        if (fBoundaries[probe] > pos) {
    322            max = probe;
    323        } else {
    324            min = modChunkSize(probe + 1);
    325        }
    326    }
    327    U_ASSERT(fBoundaries[max] > pos);
    328    fBufIdx = modChunkSize(max - 1);
    329    fTextIdx = fBoundaries[fBufIdx];
    330    U_ASSERT(fTextIdx <= pos);
    331    return true;
    332 }
    333 
    334 
    335 UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    336    if (U_FAILURE(status)) {
    337        return false;
    338    }
    339    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
    340 
    341    // Add boundaries to the cache near the specified position.
    342    // The given position need not be a boundary itself.
    343    // The input position must be within the range of the text, and
    344    // on a code point boundary.
    345    // If the requested position is a break boundary, leave the iteration
    346    // position on it.
    347    // If the requested position is not a boundary, leave the iteration
    348    // position on the preceding boundary and include both the
    349    // preceding and following boundaries in the cache.
    350    // Additional boundaries, either preceding or following, may be added
    351    // to the cache as a side effect.
    352 
    353    // If the requested position is not near already cached positions, clear the existing cache,
    354    // find a near-by boundary and begin new cache contents there.
    355 
    356    // Threshold for a text position to be considered near to existing cache contents.
    357    // TODO: See issue ICU-22024 "perf tuning of Cache needed."
    358    //       This value is subject to change. See the ticket for more details.
    359    static constexpr int32_t CACHE_NEAR = 15;
    360 
    361    int32_t aBoundary = -1;
    362    int32_t ruleStatusIndex = 0;
    363    bool retainCache = false;
    364    if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
    365        // Requested position is near the existing cache. Retain it.
    366        retainCache = true;
    367    } else if (position <= CACHE_NEAR) {
    368        // Requested position is near the start of the text. Fill cache from start, skipping
    369        // the need to find a safe point.
    370        retainCache = false;
    371        aBoundary = 0;
    372    } else {
    373        // Requested position is not near the existing cache.
    374        // Find a safe point to refill the cache from.
    375        int32_t backupPos = fBI->handleSafePrevious(position);
    376 
    377        if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
    378            // The requested position is beyond the end of the existing cache, but the
    379            // reverse rules produced a position near or before the cached region.
    380            // Retain the existing cache, and fill from the end of it.
    381            retainCache = true;
    382        } else if (backupPos < CACHE_NEAR) {
    383            // The safe reverse rules moved us to near the start of text.
    384            // Take that (index 0) as the backup boundary, avoiding the complication
    385            // (in the following block) of moving forward from the safe point to a known boundary.
    386            //
    387            // Retain the cache if it begins not too far from the requested position.
    388            aBoundary = 0;
    389            retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
    390        } else {
    391            // The safe reverse rules produced a position that is neither near the existing
    392            // cache, nor near the start of text.
    393            // Advance to the boundary following.
    394            // There is a complication: the safe reverse rules identify pairs of code points
    395            // that are safe. If advancing from the safe point moves forwards by less than
    396            // two code points, we need to advance one more time to ensure that the boundary
    397            // is good, including a correct rules status value.
    398            retainCache = false;
    399            fBI->fPosition = backupPos;
    400            aBoundary = fBI->handleNext();
    401            if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
    402                // +4 is a quick test for possibly having advanced only one codepoint.
    403                // Four being the length of the longest potential code point, a supplementary in UTF-8
    404                utext_setNativeIndex(&fBI->fText, aBoundary);
    405                if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
    406                    // The initial handleNext() only advanced by a single code point. Go again.
    407                    aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
    408                }
    409            }
    410            if (aBoundary == UBRK_DONE) {
    411                // Note (Andy Heninger): I don't think this condition can occur, but it's hard
    412                // to prove that it can't. We ran off the end of the string looking a boundary
    413                // following a safe point; choose the end of the string as that boundary.
    414                aBoundary = utext_nativeLength(&fBI->fText);
    415            }
    416            ruleStatusIndex = fBI->fRuleStatusIndex;
    417        }
    418    }
    419 
    420    if (!retainCache) {
    421        U_ASSERT(aBoundary != -1);
    422        reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
    423    }
    424 
    425    // Fill in boundaries between existing cache content and the new requested position.
    426 
    427    if (fBoundaries[fEndBufIdx] < position) {
    428        // The last position in the cache precedes the requested position.
    429        // Add following position(s) to the cache.
    430        while (fBoundaries[fEndBufIdx] < position) {
    431            if (!populateFollowing()) {
    432                UPRV_UNREACHABLE_EXIT;
    433            }
    434        }
    435        fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
    436        fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
    437        while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
    438            previous(status);
    439        }
    440        return true;
    441    }
    442 
    443    if (fBoundaries[fStartBufIdx] > position) {
    444        // The first position in the cache is beyond the requested position.
    445        // back up more until we get a boundary <= the requested position.
    446        while (fBoundaries[fStartBufIdx] > position) {
    447            populatePreceding(status);
    448        }
    449        fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
    450        fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
    451        while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
    452            next();
    453        }
    454        if (fTextIdx > position) {
    455            // If position is not itself a boundary, the next() loop above will overshoot.
    456            // Back up one, leaving cache position at the boundary preceding the requested position.
    457            previous(status);
    458        }
    459        return true;
    460    }
    461 
    462    U_ASSERT(fTextIdx == position);
    463    return true;
    464 }
    465 
    466 
    467 
    468 UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
    469    int32_t fromPosition = fBoundaries[fEndBufIdx];
    470    int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
    471    int32_t pos = 0;
    472    int32_t ruleStatusIdx = 0;
    473 
    474    if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
    475        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
    476        return true;
    477    }
    478 
    479    fBI->fPosition = fromPosition;
    480    pos = fBI->handleNext();
    481    if (pos == UBRK_DONE) {
    482        return false;
    483    }
    484 
    485    ruleStatusIdx = fBI->fRuleStatusIndex;
    486    if (fBI->fDictionaryCharCount > 0) {
    487        // The text segment obtained from the rules includes dictionary characters.
    488        // Subdivide it, with subdivided results going into the dictionary cache.
    489        fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
    490        if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
    491            addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
    492            return true;
    493            // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
    494            //       But be careful with interactions with populateNear().
    495        }
    496    }
    497 
    498    // Rule based segment did not include dictionary characters.
    499    // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
    500    //    meaning that we didn't take the return, above.
    501    // Add its end point to the cache.
    502    addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
    503 
    504    // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
    505    //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
    506    //
    507    for (int count=0; count<6; ++count) {
    508        pos = fBI->handleNext();
    509        if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
    510            break;
    511        }
    512        addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
    513    }
    514 
    515    return true;
    516 }
    517 
    518 
    519 UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
    520    if (U_FAILURE(status)) {
    521        return false;
    522    }
    523 
    524    int32_t fromPosition = fBoundaries[fStartBufIdx];
    525    if (fromPosition == 0) {
    526        return false;
    527    }
    528 
    529    int32_t position = 0;
    530    int32_t positionStatusIdx = 0;
    531 
    532    if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
    533        addPreceding(position, positionStatusIdx, UpdateCachePosition);
    534        return true;
    535    }
    536 
    537    int32_t backupPosition = fromPosition;
    538 
    539    // Find a boundary somewhere preceding the first already-cached boundary
    540    do {
    541        backupPosition = backupPosition - 30;
    542        if (backupPosition <= 0) {
    543            backupPosition = 0;
    544        } else {
    545            backupPosition = fBI->handleSafePrevious(backupPosition);
    546        }
    547        if (backupPosition == UBRK_DONE || backupPosition == 0) {
    548            position = 0;
    549            positionStatusIdx = 0;
    550        } else {
    551            // Advance to the boundary following the backup position.
    552            // There is a complication: the safe reverse rules identify pairs of code points
    553            // that are safe. If advancing from the safe point moves forwards by less than
    554            // two code points, we need to advance one more time to ensure that the boundary
    555            // is good, including a correct rules status value.
    556            //
    557            fBI->fPosition = backupPosition;
    558            position = fBI->handleNext();
    559            if (position <= backupPosition + 4) {
    560                // +4 is a quick test for possibly having advanced only one codepoint.
    561                // Four being the length of the longest potential code point, a supplementary in UTF-8
    562                utext_setNativeIndex(&fBI->fText, position);
    563                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
    564                    // The initial handleNext() only advanced by a single code point. Go again.
    565                    position = fBI->handleNext();   // Safe rules identify safe pairs.
    566                }
    567            }
    568            positionStatusIdx = fBI->fRuleStatusIndex;
    569        }
    570    } while (position >= fromPosition);
    571 
    572    // Find boundaries between the one we just located and the first already-cached boundary
    573    // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
    574 
    575    fSideBuffer.removeAllElements();
    576    fSideBuffer.addElement(position, status);
    577    fSideBuffer.addElement(positionStatusIdx, status);
    578 
    579    do {
    580        int32_t prevPosition = fBI->fPosition = position;
    581        int32_t prevStatusIdx = positionStatusIdx;
    582        position = fBI->handleNext();
    583        positionStatusIdx = fBI->fRuleStatusIndex;
    584        if (position == UBRK_DONE) {
    585            break;
    586        }
    587 
    588        UBool segmentHandledByDictionary = false;
    589        if (fBI->fDictionaryCharCount != 0) {
    590            // Segment from the rules includes dictionary characters.
    591            // Subdivide it, with subdivided results going into the dictionary cache.
    592            int32_t dictSegEndPosition = position;
    593            fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
    594            while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
    595                segmentHandledByDictionary = true;
    596                U_ASSERT(position > prevPosition);
    597                if (position >= fromPosition) {
    598                    break;
    599                }
    600                U_ASSERT(position <= dictSegEndPosition);
    601                fSideBuffer.addElement(position, status);
    602                fSideBuffer.addElement(positionStatusIdx, status);
    603                prevPosition = position;
    604            }
    605            U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
    606        }
    607 
    608        if (!segmentHandledByDictionary && position < fromPosition) {
    609            fSideBuffer.addElement(position, status);
    610            fSideBuffer.addElement(positionStatusIdx, status);
    611        }
    612    } while (position < fromPosition);
    613 
    614    // Move boundaries from the side buffer to the main circular buffer.
    615    UBool success = false;
    616    if (!fSideBuffer.isEmpty()) {
    617        positionStatusIdx = fSideBuffer.popi();
    618        position = fSideBuffer.popi();
    619        addPreceding(position, positionStatusIdx, UpdateCachePosition);
    620        success = true;
    621    }
    622 
    623    while (!fSideBuffer.isEmpty()) {
    624        positionStatusIdx = fSideBuffer.popi();
    625        position = fSideBuffer.popi();
    626        if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
    627            // No space in circular buffer to hold a new preceding result while
    628            // also retaining the current cache (iteration) position.
    629            // Bailing out is safe; the cache will refill again if needed.
    630            break;
    631        }
    632    }
    633 
    634    return success;
    635 }
    636 
    637 
    638 void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    639    U_ASSERT(position > fBoundaries[fEndBufIdx]);
    640    U_ASSERT(ruleStatusIdx <= UINT16_MAX);
    641    int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
    642    if (nextIdx == fStartBufIdx) {
    643        fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
    644    }
    645    fBoundaries[nextIdx] = position;
    646    fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
    647    fEndBufIdx = nextIdx;
    648    if (update == UpdateCachePosition) {
    649        // Set current position to the newly added boundary.
    650        fBufIdx = nextIdx;
    651        fTextIdx = position;
    652    } else {
    653        // Retaining the original cache position.
    654        // Check if the added boundary wraps around the buffer, and would over-write the original position.
    655        // It's the responsibility of callers of this function to not add too many.
    656        U_ASSERT(nextIdx != fBufIdx);
    657    }
    658 }
    659 
    660 bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    661    U_ASSERT(position < fBoundaries[fStartBufIdx]);
    662    U_ASSERT(ruleStatusIdx <= UINT16_MAX);
    663    int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
    664    if (nextIdx == fEndBufIdx) {
    665        if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
    666            // Failure. The insertion of the new boundary would claim the buffer position that is the
    667            // current iteration position. And we also want to retain the current iteration position.
    668            // (The buffer is already completely full of entries that precede the iteration position.)
    669            return false;
    670        }
    671        fEndBufIdx = modChunkSize(fEndBufIdx - 1);
    672    }
    673    fBoundaries[nextIdx] = position;
    674    fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
    675    fStartBufIdx = nextIdx;
    676    if (update == UpdateCachePosition) {
    677        fBufIdx = nextIdx;
    678        fTextIdx = position;
    679    }
    680    return true;
    681 }
    682 
    683 
    684 void RuleBasedBreakIterator::BreakCache::dumpCache() {
    685 #ifdef RBBI_DEBUG
    686    RBBIDebugPrintf("fTextIdx:%d   fBufIdx:%d\n", fTextIdx, fBufIdx);
    687    for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) {
    688        RBBIDebugPrintf("%d  %d\n", i, fBoundaries[i]);
    689        if (i == fEndBufIdx) {
    690            break;
    691        }
    692    }
    693 #endif
    694 }
    695 
    696 U_NAMESPACE_END
    697 
    698 #endif // #if !UCONFIG_NO_BREAK_ITERATION
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE