tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationiterator.h (10536B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationiterator.h
      9 *
     10 * created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONITERATOR_H__
     15 #define __COLLATIONITERATOR_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "cmemory.h"
     22 #include "collation.h"
     23 #include "collationdata.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 class SkippedState;  // In this header: only used as the pointer member CollationIterator::skipped.
     28 class UCharsTrie;  // In this header: only used as a reference parameter of nextCE32FromDiscontiguousContraction().
     29 class UVector32;  // In this header: only used as a reference parameter of previousCE() and previousCEUnsafe().
     30 
     31 /* Large enough for CEs of most short strings. */
        /* Counted in CEs (int64_t elements); this value becomes CollationIterator::CEBuffer::INITIAL_CAPACITY. */
     32 #define CEBUFFER_INITIAL_CAPACITY 40
     33 
     34 /**
     35 * Collation element iterator and abstract character iterator.
     36 *
     37 * When a method returns a code point value, it must be in 0..10FFFF,
     38 * except it can be negative as a sentinel value.
        *
        * Collation elements (CEs) are 64-bit values kept in an internal CEBuffer;
        * cesIndex is the read position in that buffer. nextCE() returns buffered
        * CEs first and otherwise maps the next CE32 from handleNextCE32() to a CE.
        *
        * The class is abstract: text access is supplied by subclasses through the
        * pure virtual methods resetToOffset(), getOffset(), nextCodePoint(),
        * previousCodePoint(), forwardNumCodePoints() and backwardNumCodePoints().
     39 */
     40 class U_I18N_API_CLASS CollationIterator : public UObject {
     41 private:
           // Buffer of int64_t CEs with a public element count (length).
           // The inline fast paths treat the first INITIAL_CAPACITY slots as always
           // available and call ensureAppendCapacity() only once length has reached
           // INITIAL_CAPACITY.
     42    class CEBuffer {
     43    private:
     44        /** Large enough for CEs of most short strings. */
     45        static const int32_t INITIAL_CAPACITY = CEBUFFER_INITIAL_CAPACITY;
     46    public:
               // Starts out empty.
     47        CEBuffer() : length(0) {}
     48        ~CEBuffer();
     49 
               // Appends ce at index length and increments length.
               // Does nothing if length >= INITIAL_CAPACITY and ensureAppendCapacity() returns false.
     50        inline void append(int64_t ce, UErrorCode &errorCode) {
     51            if(length < INITIAL_CAPACITY || ensureAppendCapacity(1, errorCode)) {
     52                buffer[length++] = ce;
     53            }
     54        }
     55 
               // Appends ce without any capacity check; the caller is responsible for
               // making sure there is room for one more element.
     56        inline void appendUnsafe(int64_t ce) {
     57            buffer[length++] = ce;
     58        }
     59 
               // Defined out of line. Judging by its callers here, a true result means
               // that appCap more CEs can be stored -- NOTE(review): confirm the exact
               // contract (including errorCode handling) against the implementation.
     60        U_I18N_API UBool ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode);
     61 
               // Reserves one more slot by incrementing length without writing a value.
               // Returns false (length unchanged) if ensureAppendCapacity() fails.
     62        inline UBool incLength(UErrorCode &errorCode) {
     63            // Use INITIAL_CAPACITY for a very simple fastpath.
     64            // (Rather than buffer.getCapacity().)
     65            if(length < INITIAL_CAPACITY || ensureAppendCapacity(1, errorCode)) {
     66                ++length;
     67                return true;
     68            } else {
     69                return false;
     70            }
     71        }
     72 
               // Stores ce at index i and returns the stored value. No bounds check.
     73        inline int64_t set(int32_t i, int64_t ce) {
     74            return buffer[i] = ce;
     75        }
               // Returns the CE at index i. No bounds check.
     76        inline int64_t get(int32_t i) const { return buffer[i]; }
     77 
               // Returns the pointer that buffer.getAlias() yields for the CE storage.
     78        const int64_t *getCEs() const { return buffer.getAlias(); }
     79 
               // Number of CEs in the buffer. Public: CollationIterator reads it and
               // resets it directly (see nextCE() and clearCEs()).
     80        int32_t length;
     81 
     82    private:
               // Not copyable.
     83        CEBuffer(const CEBuffer &) = delete;
     84        void operator=(const CEBuffer &) = delete;
     85 
     86        MaybeStackArray<int64_t, INITIAL_CAPACITY> buffer;
     87    };
     88 
     89 public:
           /**
            * Creates an iterator over the given collation data.
            * d must not be null: d->trie is read in the initializer list.
            * Starts with an empty CE buffer (cesIndex 0), no skipped state,
            * and no forward code point limit (numCpFwd -1).
            *
            * @param d       collation data; also the source of the lookup trie
            * @param numeric stored as isNumeric
            */
     90    CollationIterator(const CollationData *d, UBool numeric)
     91            : trie(d->trie),
     92              data(d),
     93              cesIndex(0),
     94              skipped(nullptr),
     95              numCpFwd(-1),
     96              isNumeric(numeric) {}
     97 
     98    virtual ~CollationIterator();
     99 
    100    virtual bool operator==(const CollationIterator &other) const;
           // Always the negation of the (virtual) operator==.
    101    inline bool operator!=(const CollationIterator &other) const {
    102        return !operator==(other);
    103    }
    104 
    105    /**
    106     * Resets the iterator state and sets the position to the specified offset.
    107     * Subclasses must implement, and must call the parent class method,
    108     * or CollationIterator::reset().
    109     */
    110    virtual void resetToOffset(int32_t newOffset) = 0;
    111 
    112    virtual int32_t getOffset() const = 0;
    113 
    114    /**
    115     * Returns the next collation element.
            *
            * If there are buffered CEs that have not been returned yet
            * (cesIndex < ceBuffer.length), returns the next one of those.
            * Otherwise reserves one buffer slot, fetches a CE32 via handleNextCE32()
            * and converts it:
            * - Low byte below SPECIAL_CE32_LOW_BYTE (simple CE32): converted inline;
            *   ce32 bits 31..16 go to CE bits 63..48, bits 15..8 to CE bits 31..24,
            *   and bits 7..0 to CE bits 15..8.
            * - Low byte equal to SPECIAL_CE32_LOW_BYTE: with c<0 the result is
            *   Collation::NO_CE; otherwise the CE32 is looked up again in data->base.
            * - Low byte equal to LONG_PRIMARY_CE32_LOW_BYTE: converted inline;
            *   ce32 with its low byte cleared is shifted up by 32 bits and combined
            *   with Collation::COMMON_SEC_AND_TER_CE.
            * - Everything else is delegated to nextCEFromCE32().
            *
            * @return the CE, or Collation::NO_CE if the buffer slot could not be reserved
    116     */
    117    inline int64_t nextCE(UErrorCode &errorCode) {
    118        if(cesIndex < ceBuffer.length) {
    119            // Return the next buffered CE.
    120            return ceBuffer.get(cesIndex++);
    121        }
    122        // assert cesIndex == ceBuffer.length;
               // Reserve the slot at cesIndex now; the inline return paths below
               // fill it with set(cesIndex++, ...).
    123        if(!ceBuffer.incLength(errorCode)) {
    124            return Collation::NO_CE;
    125        }
    126        UChar32 c;
    127        uint32_t ce32 = handleNextCE32(c, errorCode);
               // t is the low byte of the CE32; it selects the conversion below.
    128        uint32_t t = ce32 & 0xff;
    129        if(t < Collation::SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
    130            // Normal CE from the main data.
    131            // Forced-inline of ceFromSimpleCE32(ce32).
    132            return ceBuffer.set(cesIndex++,
    133                    (static_cast<int64_t>(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (t << 8));
    134        }
               // Data object that the remaining (special) CE32 belongs to:
               // either this iterator's data or its base.
    135        const CollationData *d;
    136        // The compiler should be able to optimize the previous and the following
    137        // comparisons of t with the same constant.
    138        if(t == Collation::SPECIAL_CE32_LOW_BYTE) {
                   // A negative c is the sentinel for "no code point"
                   // (end of text, according to the handleNextCE32() documentation).
    139            if(c < 0) {
    140                return ceBuffer.set(cesIndex++, Collation::NO_CE);
    141            }
                   // Fallback: look up c in the base data and redo the simple-CE32 test.
    142            d = data->base;
    143            ce32 = d->getCE32(c);
    144            t = ce32 & 0xff;
    145            if(t < Collation::SPECIAL_CE32_LOW_BYTE) {
    146                // Normal CE from the base data.
    147                return ceBuffer.set(cesIndex++,
    148                        (static_cast<int64_t>(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (t << 8));
    149            }
    150        } else {
    151            d = data;
    152        }
    153        if(t == Collation::LONG_PRIMARY_CE32_LOW_BYTE) {
    154            // Forced-inline of ceFromLongPrimaryCE32(ce32).
    155            return ceBuffer.set(cesIndex++,
    156                    (static_cast<int64_t>(ce32 - t) << 32) | Collation::COMMON_SEC_AND_TER_CE);
    157        }
               // All other special CE32s take the out-of-line slow path.
    158        return nextCEFromCE32(d, c, ce32, errorCode);
    159    }
    160 
    161    /**
    162     * Fetches all CEs.
    163     * @return getCEsLength()
    164     */
    165    int32_t fetchCEs(UErrorCode &errorCode);
    166 
    167    /**
    168     * Overwrites the current CE (the last one returned by nextCE()).
            * Requires cesIndex > 0, that is, at least one CE must have been
            * returned; this is not checked.
    169     */
    170    void setCurrentCE(int64_t ce) {
    171        // assert cesIndex > 0;
    172        ceBuffer.set(cesIndex - 1, ce);
    173    }
    174 
    175    /**
    176     * Returns the previous collation element.
    177     */
    178    int64_t previousCE(UVector32 &offsets, UErrorCode &errorCode);
    179 
           // Number of CEs currently in the buffer.
    180    inline int32_t getCEsLength() const {
    181        return ceBuffer.length;
    182    }
    183 
           // Returns the buffered CE at index i. No bounds check.
    184    inline int64_t getCE(int32_t i) const {
    185        return ceBuffer.get(i);
    186    }
    187 
           // Returns a pointer to the buffered CEs.
    188    const int64_t *getCEs() const {
    189        return ceBuffer.getCEs();
    190    }
    191 
           // Empties the CE buffer and rewinds the read index to 0.
    192    void clearCEs() {
    193        cesIndex = ceBuffer.length = 0;
    194    }
    195 
           // Empties the CE buffer only if every buffered CE has been consumed.
    196    void clearCEsIfNoneRemaining() {
    197        if(cesIndex == ceBuffer.length) { clearCEs(); }
    198    }
    199 
    200    /**
    201     * Returns the next code point (with post-increment).
    202     * Public for identical-level comparison and for testing.
    203     */
    204    virtual UChar32 nextCodePoint(UErrorCode &errorCode) = 0;
    205 
    206    /**
    207     * Returns the previous code point (with pre-decrement).
    208     * Public for identical-level comparison and for testing.
    209     */
    210    virtual UChar32 previousCodePoint(UErrorCode &errorCode) = 0;
    211 
    212 protected:
           // Copy construction is available to subclasses only; defined out of line.
    213    CollationIterator(const CollationIterator &other);
    214 
           // Base-class part of resetting; see the resetToOffset() documentation.
    215    void reset();
    216 
    217    /**
    218     * Returns the next code point and its local CE32 value.
    219     * Returns Collation::FALLBACK_CE32 at the end of the text (c<0)
    220     * or when c's CE32 value is to be looked up in the base data (fallback).
    221     *
    222     * The code point is used for fallbacks, context and implicit weights.
    223     * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
    224     */
    225    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
    226 
    227    /**
    228     * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
    229     * Returns the trail surrogate in that case and advances past it,
    230     * if a trail surrogate follows the lead surrogate.
    231     * Otherwise returns any other code unit and does not advance.
    232     */
    233    virtual char16_t handleGetTrailSurrogate();
    234 
    235    /**
    236     * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
    237     * (Not needed in Java.)
    238     */
    239    virtual UBool foundNULTerminator();
    240 
    241    /**
    242     * @return false if surrogate code points U+D800..U+DFFF
    243     *         map to their own implicit primary weights (for UTF-16),
    244     *         or true if they map to CE(U+FFFD) (for UTF-8)
    245     */
    246    virtual UBool forbidSurrogateCodePoints() const;
    247 
    248    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) = 0;
    249 
    250    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) = 0;
    251 
    252    /**
    253     * Returns the CE32 from the data trie.
    254     * Normally the same as data->getCE32(), but overridden in the builder.
    255     * Call this only when the faster data->getCE32() cannot be used.
    256     */
    257    virtual uint32_t getDataCE32(UChar32 c) const;
    258 
    259    virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode);
    260 
    261    void appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
    262                           UBool forward, UErrorCode &errorCode);
    263 
    264    // Main lookup trie of the data object.
           // Set from d->trie in the public constructor.
    265    const UTrie2 *trie;
           // Collation data passed to the public constructor; data->base is used for fallbacks in nextCE().
    266    const CollationData *data;
    267 
    268 private:
           // Out-of-line slow path of nextCE() for CE32s that are not converted inline.
    269    U_I18N_API int64_t nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
    270                                      UErrorCode &errorCode);
    271 
    272    uint32_t getCE32FromPrefix(const CollationData *d, uint32_t ce32,
    273                               UErrorCode &errorCode);
    274 
    275    UChar32 nextSkippedCodePoint(UErrorCode &errorCode);
    276 
    277    void backwardNumSkipped(int32_t n, UErrorCode &errorCode);
    278 
    279    uint32_t nextCE32FromContraction(
    280            const CollationData *d, uint32_t contractionCE32,
    281            const char16_t *p, uint32_t ce32, UChar32 c,
    282            UErrorCode &errorCode);
    283 
    284    uint32_t nextCE32FromDiscontiguousContraction(
    285            const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,
    286            int32_t lookAhead, UChar32 c,
    287            UErrorCode &errorCode);
    288 
    289    /**
    290     * Returns the previous CE when data->isUnsafeBackward(c, isNumeric).
    291     */
    292    int64_t previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode);
    293 
    294    /**
    295     * Turns a string of digits (bytes 0..9)
    296     * into a sequence of CEs that will sort in numeric order.
    297     *
    298     * Starts from this ce32's digit value and consumes the following/preceding digits.
    299     * The digits string must not be empty and must not have leading zeros.
    300     */
    301    void appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode);
    302 
    303    /**
    304     * Turns 1..254 digits into a sequence of CEs.
    305     * Called by appendNumericCEs() for each segment of at most 254 digits.
    306     */
    307    void appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode);
    308 
           // Buffered CEs, and the index of the next one that nextCE() will return.
    309    CEBuffer ceBuffer;
    310    int32_t cesIndex;
    311 
           // nullptr after construction via the public constructor.
           // NOTE(review): allocation and ownership are handled in the out-of-line code; confirm there.
    312    SkippedState *skipped;
    313 
    314    // Number of code points to read forward, or -1.
    315    // Used as a forward iteration limit in previousCEUnsafe().
    316    int32_t numCpFwd;
    317    // Numeric collation (CollationSettings::NUMERIC).
    318    UBool isNumeric;
    319 };
    320 
    321 U_NAMESPACE_END
    322 
    323 #endif  // !UCONFIG_NO_COLLATION
    324 #endif  // __COLLATIONITERATOR_H__