tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationruleparser.h (6409B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationruleparser.h
      9 *
     10 * created on: 2013apr10
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONRULEPARSER_H__
     15 #define __COLLATIONRULEPARSER_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/ucol.h"
     22 #include "unicode/uniset.h"
     23 #include "unicode/unistr.h"
     24 
     25 struct UParseError;  // Declared before U_NAMESPACE_BEGIN; this header uses it only as a pointer type (UParseError *).
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 struct CollationData;       // Used below only as const CollationData * (constructor parameter, baseData member).
     30 struct CollationTailoring;  // NOTE(review): not referenced anywhere else in the visible part of this header.
     31 
     32 class Locale;               // NOTE(review): not referenced anywhere else in the visible part of this header.
     33 class Normalizer2;          // Used below only for the reference members nfd and nfc.
     34 
     35 struct CollationSettings;   // Used below only by reference (public parse()) and by pointer (settings member).
     36 
        /**
         * Parser for collation rule strings.
         * This header only declares the class; every member function without an inline
         * body is defined outside this header.
         *
         * Usage, as far as this header documents it: construct with the base
         * CollationData, set a Sink (required before parsing), optionally set an
         * Importer (without one, [import locale] syntax is not supported), then call
         * the public parse().
         * NOTE(review): presumably parse() reports each reset and relation it finds to
         * the Sink; confirm against the implementation.
         *
         * The class has reference members (nfd, nfc) and a const pointer member
         * (baseData), so objects of this class cannot be copy-assigned.
         */
     37 class U_I18N_API CollationRuleParser : public UMemory {
     38 public:
     39    /**
            * Special reset positions.
            * The enumerators have no explicit initializers, so they take the values 0..13
            * in declaration order. The second contraction character is POS_BASE + Position
            * (see POS_LEAD), so reordering the enumerators changes the encoded characters.
            */
     40    enum Position {
     41        FIRST_TERTIARY_IGNORABLE,
     42        LAST_TERTIARY_IGNORABLE,
     43        FIRST_SECONDARY_IGNORABLE,
     44        LAST_SECONDARY_IGNORABLE,
     45        FIRST_PRIMARY_IGNORABLE,
     46        LAST_PRIMARY_IGNORABLE,
     47        FIRST_VARIABLE,
     48        LAST_VARIABLE,
     49        FIRST_REGULAR,
     50        LAST_REGULAR,
     51        FIRST_IMPLICIT,
     52        LAST_IMPLICIT,
     53        FIRST_TRAILING,
     54        LAST_TRAILING
     55    };
     56 
     57    /**
     58     * First character of contractions that encode special reset positions.
     59     * U+FFFE cannot be tailored via rule syntax.
     60     *
     61     * The second contraction character is POS_BASE + Position.
     62     */
     63    static const char16_t POS_LEAD = 0xfffe;
     64    /**
     65     * Base for the second character of contractions that encode special reset positions.
     66     * Braille characters U+28xx are printable and normalization-inert.
     67     * @see POS_LEAD
     68     */
     69    static const char16_t POS_BASE = 0x2800;
     70 
           /**
            * Callback interface that must be set on the parser before parsing (see setSink()).
            * addReset() and addRelation() are pure virtual and must be implemented by subclasses.
            * suppressContractions() and optimize() are virtual but not pure, so overriding them
            * is optional; their default definitions are not in this header.
            * Every method takes a const char *&errorReason and a UErrorCode &errorCode.
            * NOTE(review): presumably these carry a failure back to the parser; confirm.
            */
     71    class U_I18N_API Sink : public UObject {
     72    public:
     73        virtual ~Sink();
     74        /**
     75         * Adds a reset.
     76         * strength=UCOL_IDENTICAL for &str.
     77         * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
     78         */
     79        virtual void addReset(int32_t strength, const UnicodeString &str,
     80                              const char *&errorReason, UErrorCode &errorCode) = 0;
     81        /**
     82         * Adds a relation with strength and prefix | str / extension.
     83         */
     84        virtual void addRelation(int32_t strength, const UnicodeString &prefix,
     85                                 const UnicodeString &str, const UnicodeString &extension,
     86                                 const char *&errorReason, UErrorCode &errorCode) = 0;
     87 
               /**
                * NOTE(review): no contract is documented in this header. By its name this
                * receives the set of a "suppress contractions" rule; confirm in the implementation.
                */
     88        virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
     89                                          UErrorCode &errorCode);
     90 
               /**
                * NOTE(review): no contract is documented in this header. By its name this
                * receives the set of an "optimize" rule; confirm in the implementation.
                */
     91        virtual void optimize(const UnicodeSet &set, const char *&errorReason,
     92                              UErrorCode &errorCode);
     93    };
     94 
           /**
            * Optional callback interface (see setImporter()).
            * Per the constructor comment, [import locale] syntax is not supported
            * unless an Importer has been set.
            */
     95    class U_I18N_API Importer : public UObject {
     96    public:
     97        virtual ~Importer();
               /**
                * Pure virtual; must be implemented by subclasses.
                * NOTE(review): presumably writes the rule string for localeID/collationType
                * into rules; the contract is not documented in this header.
                */
     98        virtual void getRules(
     99                const char *localeID, const char *collationType,
    100                UnicodeString &rules,
    101                const char *&errorReason, UErrorCode &errorCode) = 0;
    102    };
    103 
    104    /**
    105     * Constructor.
    106     * The Sink must be set before parsing.
    107     * The Importer can be set, otherwise [import locale] syntax is not supported.
            * NOTE(review): base is presumably stored in the baseData member; the
            * constructor body is not visible in this header.
    108     */
    109    CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
    110    ~CollationRuleParser();
    111 
    112    /**
    113     * Sets the pointer to a Sink object.
    114     * The pointer is aliased: Pointer copy without cloning or taking ownership.
    115     */
    116    void setSink(Sink *sinkAlias) {
    117        sink = sinkAlias;
    118    }
    119 
    120    /**
    121     * Sets the pointer to an Importer object.
    122     * The pointer is aliased: Pointer copy without cloning or taking ownership.
    123     */
    124    void setImporter(Importer *importerAlias) {
    125        importer = importerAlias;
    126    }
    127 
           /**
            * Parses ruleString. Per the constructor comment, a Sink must have been set first.
            * NOTE(review): the definition is not in this header. Judging by the signature only:
            * outSettings is a non-const reference (presumably receives settings from the rules),
            * outParseError is a pointer (presumably receives error context; whether it may be
            * null is not stated here), and errorCode is a UErrorCode reference. Confirm each
            * against the implementation.
            */
    128    void parse(const UnicodeString &ruleString,
    129               CollationSettings &outSettings,
    130               UParseError *outParseError,
    131               UErrorCode &errorCode);
    132 
           /**
            * Returns the errorReason member as-is (the pointer is returned, nothing is copied).
            * NOTE(review): when it is set and how long the string stays valid is not
            * documented in this header.
            */
    133    const char *getErrorReason() const { return errorReason; }
    134 
    135    /**
    136     * Gets a script or reorder code from its string representation.
    137     * @return the script/reorder code, or
    138     * -1 if not recognized
    139     */
    140    static int32_t getReorderCode(const char *word);
    141 
    142 private:
    143    /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    144    static const int32_t STRENGTH_MASK = 0xf;
           /**
            * NOTE(review): STARRED_FLAG (bit 4) and OFFSET_SHIFT (8) do not overlap the low
            * four bits of STRENGTH_MASK, so these three presumably describe fields packed
            * into one int32_t (perhaps the result of parseRelationOperator()); confirm.
            */
    145    static const int32_t STARRED_FLAG = 0x10;
    146    static const int32_t OFFSET_SHIFT = 8;
    147 
           /**
            * Private overload without the settings and parse-error outputs.
            * NOTE(review): its relationship to the public parse() is not visible in this header.
            *
            * NOTE(review): in the helpers below, the int32_t i parameters and int32_t results
            * are presumably indices into the rule string (compare the @return of
            * parseSpecialPosition()); confirm against the implementation.
            */
    148    void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
    149    void parseRuleChain(UErrorCode &errorCode);
    150    int32_t parseResetAndPosition(UErrorCode &errorCode);
    151    int32_t parseRelationOperator(UErrorCode &errorCode);
    152    void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
    153    void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
    154    int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
    155    int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
    156 
    157    /**
    158     * Sets str to a contraction of U+FFFE and (U+2800 + Position).
    159     * @return rule index after the special reset position
    160     */
    161    int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
    162    void parseSetting(UErrorCode &errorCode);
    163    void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
    164    static UColAttributeValue getOnOffValue(const UnicodeString &s);
    165 
    166    int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
    167    int32_t readWords(int32_t i, UnicodeString &raw) const;
    168    int32_t skipComment(int32_t i) const;
    169 
    170    void setParseError(const char *reason, UErrorCode &errorCode);
    171    void setErrorContext();
    172 
    173    /**
    174     * ASCII [:P:] and [:S:]:
    175     * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
    176     */
    177    static UBool isSyntaxChar(UChar32 c);
    178    int32_t skipWhiteSpace(int32_t i) const;
    179 
           /** Reference members: they must be bound in the constructor's initializer list. */
    180    const Normalizer2 &nfd, &nfc;
    181 
           /**
            * NOTE(review): rules presumably points at the rule string being parsed, and
            * ruleIndex (declared last) at the current position in it; neither is
            * documented in this header.
            */
    182    const UnicodeString *rules;
           /** Const pointer to const data: it cannot be re-pointed after construction. */
    183    const CollationData *const baseData;
           /** NOTE(review): settings and parseError presumably alias the outputs of the public parse(); confirm. */
    184    CollationSettings *settings;
    185    UParseError *parseError;
           /** Returned unchanged by getErrorReason(). */
    186    const char *errorReason;
    187 
           /** Aliased, not owned: plain pointer copies made by setSink() and setImporter(). */
    188    Sink *sink;
    189    Importer *importer;
    190 
    191    int32_t ruleIndex;
    192 };
    193 
    194 U_NAMESPACE_END
    195 
    196 #endif  // !UCONFIG_NO_COLLATION
    197 #endif  // __COLLATIONRULEPARSER_H__