tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf8collationiterator.h (5343B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2016, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * utf8collationiterator.h
      9 *
     10 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __UTF8COLLATIONITERATOR_H__
     15 #define __UTF8COLLATIONITERATOR_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "cmemory.h"
     22 #include "collation.h"
     23 #include "collationdata.h"
     24 #include "collationiterator.h"
     25 #include "normalizer2impl.h"
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 /**
     30 * UTF-8 collation element and character iterator.
     31 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
     32 * Unnormalized text is handled by a subclass.
     33 */
     34 class U_I18N_API UTF8CollationIterator : public CollationIterator {
     35 public:  // The constructor below only stores its arguments; every other member is defined out of line.
     36    UTF8CollationIterator(const CollationData *d, UBool numeric,
     37                          const uint8_t *s, int32_t p, int32_t len)
     38            : CollationIterator(d, numeric),  // d and numeric are forwarded to the base class
     39              u8(s), pos(p), length(len) {}  // s, p and len are copied as-is; nothing is validated here
     40    // Virtual destructor; declared here without a body.
     41    virtual ~UTF8CollationIterator();
     42    // Overrides the inherited virtual. newOffset is presumably an index into u8 -- confirm in the implementation.
     43    virtual void resetToOffset(int32_t newOffset) override;
     44    // Overrides the inherited virtual. Presumably reports pos -- confirm in the implementation.
     45    virtual int32_t getOffset() const override;
     46    // Overrides the inherited virtual. errorCode is an in/out status passed by non-const reference.
     47    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
     48    // Overrides the inherited virtual; backward counterpart of nextCodePoint() judging by the name.
     49    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
     50    // Protected hooks: all are overrides of inherited virtuals.
     51 protected:
     52    /**
     53     * For byte sequences that are illegal in UTF-8, an error value may be returned
     54     * together with a bogus code point. The caller will ignore that code point.
     55     *
     56     * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
     57     * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true.
     58     *
     59     * Valid lead surrogates are returned from inside a normalized text segment,
     60     * where handleGetTrailSurrogate() will return the matching trail surrogate.
     61     */
     62    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
     63    // Presumably relevant only when length<0 (NUL-terminated input) -- TODO confirm in the implementation.
     64    virtual UBool foundNULTerminator() override;
     65    // The handleNextCE32() comment above states that this returns true for this class.
     66    virtual UBool forbidSurrogateCodePoints() const override;
     67    // Presumably moves pos forward by num code points -- confirm in the implementation.
     68    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
     69    // Presumably moves pos backward by num code points -- confirm in the implementation.
     70    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
     71    // Iteration state; all three members are set by the constructor from s, p and len.
     72    const uint8_t *u8;  // pointer copied from constructor argument s; presumably not owned -- confirm
     73    int32_t pos;  // initialized from constructor argument p; presumably the current index into u8
     74    int32_t length;  // <0 for NUL-terminated strings
     75 };
     76 
     77 /**
     78 * Incrementally checks the input text for FCD and normalizes where necessary.
     79 */
     80 class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
     81 public:
     82    FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
     83                             const uint8_t *s, int32_t p, int32_t len)
     84            : UTF8CollationIterator(data, numeric, s, p, len),  // all arguments are forwarded to the base class
     85              state(CHECK_FWD), start(p),  // limit is not initialized here; CHECK_FWD documents it as undefined
     86              nfcImpl(data->nfcImpl) {}  // data is dereferenced unconditionally, so it must not be null
     87    // Virtual destructor; declared here without a body.
     88    virtual ~FCDUTF8CollationIterator();
     89    // Overrides the UTF8CollationIterator version. How state/start/limit are reset is not visible here.
     90    virtual void resetToOffset(int32_t newOffset) override;
     91    // Overrides the UTF8CollationIterator version. NOTE(review): in IN_NORMALIZED, pos indexes the normalized string, so this presumably maps back to an input offset -- confirm.
     92    virtual int32_t getOffset() const override;
     93    // Overrides the UTF8CollationIterator version. errorCode is an in/out status passed by non-const reference.
     94    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
     95    // Overrides the UTF8CollationIterator version; backward counterpart of nextCodePoint() judging by the name.
     96    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
     97    // Protected hooks: all are overrides of inherited virtuals.
     98 protected:
     99    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
    100    // Not declared in UTF8CollationIterator, so this overrides a virtual inherited from further up the hierarchy.
    101    virtual char16_t handleGetTrailSurrogate() override;
    102    // Overrides the UTF8CollationIterator version.
    103    virtual UBool foundNULTerminator() override;
    104    // Overrides the UTF8CollationIterator version.
    105    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
    106    // Overrides the UTF8CollationIterator version.
    107    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
    108    // Private helpers for the incremental FCD checking described on the class.
    109 private:
    110    UBool nextHasLccc() const;  // presumably: does the text at pos start with a non-zero lead combining class? -- confirm
    111    UBool previousHasTccc() const;  // presumably: does the text before pos end with a non-zero trail combining class? -- confirm
    112 
    113    /**
    114     * Switches to forward checking if possible.
    115     */
    116    void switchToForward();
    117 
    118    /**
    119     * Extends the FCD text segment forward or normalizes around pos.
    120     * @return true if success
    121     */
    122    UBool nextSegment(UErrorCode &errorCode);
    123 
    124    /**
    125     * Switches to backward checking.
    126     */
    127    void switchToBackward();
    128 
    129    /**
    130     * Extends the FCD text segment backward or normalizes around pos.
    131     * @return true if success
    132     */
    133    UBool previousSegment(UErrorCode &errorCode);
    134    // Presumably normalizes s into the `normalized` member and returns true on success -- confirm in the implementation.
    135    UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
    136    // Iteration states; each one defines which of start, pos and limit are meaningful.
    137    enum State {
    138        /**
    139         * The input text [start..pos[ passes the FCD check.
    140         * Moving forward checks incrementally.
    141         * limit is undefined.
    142         */
    143        CHECK_FWD,
    144        /**
    145         * The input text [pos..limit[ passes the FCD check.
    146         * Moving backward checks incrementally.
    147         * start is undefined.
    148         */
    149        CHECK_BWD,
    150        /**
    151         * The input text [start..limit[ passes the FCD check.
    152         * pos tracks the current text index.
    153         */
    154        IN_FCD_SEGMENT,
    155        /**
    156         * The input text [start..limit[ failed the FCD check and was normalized.
    157         * pos tracks the current index in the normalized string.
    158         */
    159        IN_NORMALIZED
    160    };
    161    // Current state; the constructor sets it to CHECK_FWD.
    162    State state;
    163    // Segment boundaries; which of them is defined depends on state (see the State comments).
    164    int32_t start;
    165    int32_t limit;
    166    // Bound by the constructor to data->nfcImpl; being a reference member, it cannot be reseated.
    167    const Normalizer2Impl &nfcImpl;
    168    UnicodeString normalized;  // default-constructed; presumably the string that pos indexes in IN_NORMALIZED -- confirm
    169 };
    170 
    171 U_NAMESPACE_END
    172 
    173 #endif  // !UCONFIG_NO_COLLATION
    174 #endif  // __UTF8COLLATIONITERATOR_H__