tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbbi_cache.h (7953B)


      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // file: rbbi_cache.h
      5 //
      6 #ifndef RBBI_CACHE_H
      7 #define RBBI_CACHE_H
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_BREAK_ITERATION
     12 
     13 #include "unicode/rbbi.h"
     14 #include "unicode/uobject.h"
     15 
     16 #include "uvectr32.h"
     17 
     18 U_NAMESPACE_BEGIN
     19 
     20 /* DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
     21 *                 Dictionary boundaries are moved first to this cache, then from here
     22 *                 to the main BreakCache, where they may inter-leave with non-dictionary
     23 *                 boundaries. The public BreakIterator API always fetches directly
     24 *                 from the main BreakCache, not from here.
     25 *
     26 *                 In common situations, the number of boundaries in a single dictionary run
     27 *                 should be quite small, it will be terminated by punctuation, spaces,
     28 *                 or any other non-dictionary characters. The main BreakCache may end
     29 *                 up with boundaries from multiple dictionary based runs.
     30 *
     31 *                 The boundaries are stored in a simple ArrayList (vector), with the
     32 *                 assumption that they will be accessed sequentially.
     33 */                 
     34 class RuleBasedBreakIterator::DictionaryCache: public UMemory {
     35  public:
     36     DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
     37     ~DictionaryCache();
     38 
     39     void reset();
     40 
     41     UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
     42     UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
     43 
     44    /**
     45     * Populate the cache with the dictionary based boundaries within a region of text.
     46     * @param startPos  The start position of a range of text
     47     * @param endPos    The end position of a range of text
     48     * @param firstRuleStatus The rule status index that applies to the break at startPos
     49     * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
     50     * @internal
     51     */
     52    void populateDictionary(int32_t startPos, int32_t endPos,
     53                         int32_t firstRuleStatus, int32_t otherRuleStatus);
     54 
     55 
     56 
     57    RuleBasedBreakIterator *fBI;
     58    
     59    UVector32           fBreaks;                // A vector containing the boundaries.
     60    int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
     61                                                //    or preceding(). Optimizes sequential access.
     62    int32_t             fStart;                 // Text position of first boundary in cache.
     63    int32_t             fLimit;                 // Last boundary in cache. Which is the limit of the
     64                                                //    text segment being handled by the dictionary.
     65    int32_t             fFirstRuleStatusIndex;  // Rule status info for first boundary.
     66    int32_t             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
     67 };
     68 
     69 
     70 /*
     71 * class BreakCache
     72 *
     73 * Cache of break boundary positions and rule status values.
     74 * Break iterator API functions, next(), previous(), etc., will use cached results
     75 * when possible, and otherwise cache new results as they are obtained.
     76 *
     77 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
     78 *
     79 * The cache is implemented as a single circular buffer.
     80 */
     81 
     82 /*
     83 * size of the circular cache buffer.
     84 */
     85 
     86 class RuleBasedBreakIterator::BreakCache: public UMemory {
     87  public:
     88                BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
     89    virtual     ~BreakCache();
     90    void        reset(int32_t pos = 0, int32_t ruleStatus = 0);
     91    void        next() {    if (fBufIdx == fEndBufIdx) {
     92                                nextOL();
     93                            } else {
     94                                fBufIdx = modChunkSize(fBufIdx + 1);
     95                                fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
     96                                fBI->fRuleStatusIndex = fStatuses[fBufIdx];
     97                            }
     98                }
     99 
    100 
    101    void        nextOL();
    102    void        previous(UErrorCode &status);
    103 
    104    // Move the iteration state to the position following the startPosition.
    105    // Input position must be pinned to the input length.
    106    void        following(int32_t startPosition, UErrorCode &status);
    107 
    108    void        preceding(int32_t startPosition, UErrorCode &status);
    109 
    110    /*
    111     * Update the state of the public BreakIterator (fBI) to reflect the
    112     * current state of the break iterator cache (this).
    113     */
    114    int32_t     current();
    115 
    116    /**
    117     * Add boundaries to the cache near the specified position.
    118     * The given position need not be a boundary itself.
    119     * The input position must be within the range of the text, and
    120     * on a code point boundary.
    121     * If the requested position is a break boundary, leave the iteration
    122     * position on it.
    123     * If the requested position is not a boundary, leave the iteration
    124     * position on the preceding boundary and include both the
    125     * preceding and following boundaries in the cache.
    126     * Additional boundaries, either preceding or following, may be added
    127     * to the cache as a side effect.
    128     *
    129     * Return false if the operation failed.
    130     */
    131    UBool populateNear(int32_t position, UErrorCode &status);
    132 
    133    /**
    134     *  Add boundary(s) to the cache following the current last boundary.
    135     *  Return false if at the end of the text, and no more boundaries can be added.
    136     *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
    137     */
    138    UBool populateFollowing();
    139 
    140    /**
    141     *  Add one or more boundaries to the cache preceding the first currently cached boundary.
    142     *  Leave the iteration position on the first added boundary.
    143     *  Return false if no boundaries could be added (if at the start of the text.)
    144     */
    145    UBool populatePreceding(UErrorCode &status);
    146 
    147    enum UpdatePositionValues {
    148        RetainCachePosition = 0,
    149        UpdateCachePosition = 1
    150    };
    151 
    152    /*
    153     * Add the boundary following the current position.
    154     * The current position can be left as it was, or changed to the newly added boundary,
    155     * as specified by the update parameter.
    156     */
    157    void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
    158 
    159 
    160    /*
    161     * Add the boundary preceding the current position.
    162     * The current position can be left as it was, or changed to the newly added boundary,
    163     * as specified by the update parameter.
    164     */
    165    bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
    166 
    167    /**
    168     *  Set the cache position to the specified position, or, if the position
    169     *  falls between to cached boundaries, to the preceding boundary.
    170     *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
    171     *  The startPosition must be on a code point boundary.
    172     *
    173     *  Return true if successful, false if the specified position is after
    174     *  the last cached boundary or before the first.
    175     */
    176    UBool                   seek(int32_t startPosition);
    177 
    178    void dumpCache();
    179 
    180  private:
    181    static inline int32_t   modChunkSize(int index) { return index & (CACHE_SIZE - 1); }
    182 
    183    static constexpr int32_t CACHE_SIZE = 128;
    184    static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
    185 
    186    RuleBasedBreakIterator *fBI;
    187    int32_t                 fStartBufIdx;
    188    int32_t                 fEndBufIdx;    // inclusive
    189 
    190    int32_t                 fTextIdx;
    191    int32_t                 fBufIdx;
    192 
    193    int32_t                 fBoundaries[CACHE_SIZE];
    194    uint16_t                fStatuses[CACHE_SIZE];
    195 
    196    UVector32               fSideBuffer;
    197 };
    198 
    199 U_NAMESPACE_END
    200 
    201 #endif // #if !UCONFIG_NO_BREAK_ITERATION
    202 
    203 #endif // RBBI_CACHE_H