tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbt_pars.h (11853B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 * Copyright (C) 1999-2011, International Business Machines Corporation
      6 * and others. All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/17/99    aliu        Creation.
     10 **********************************************************************
     11 */
     12 #ifndef RBT_PARS_H
     13 #define RBT_PARS_H
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_TRANSLITERATION
     18 #ifdef __cplusplus
     19 
     20 #include "unicode/uobject.h"
     21 #include "unicode/parseerr.h"
     22 #include "unicode/unorm.h"
     23 #include "rbt.h"
     24 #include "hash.h"
     25 #include "uvector.h"
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 class TransliterationRuleData;
     30 class UnicodeFunctor;
     31 class ParseData;
     32 class RuleHalf;
     33 class ParsePosition;
     34 class StringMatcher;
     35 
     36 class TransliteratorParser : public UMemory {
     37 // Parses transliterator rules; after parse() returns, callers read the results from the public data members.
     38 public:
     39 
     40    /**
     41     * PUBLIC data member.  A Vector of TransliterationRuleData objects, one
     42     * for each discrete group of rules in the rule set.
     43     */
     44    UVector dataVector;
     45 
     46    /**
     47     * PUBLIC data member.
     48     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
     49     */
     50    UVector idBlockVector;
     51 
     52    /**
     53     * PUBLIC data member containing the parsed compound filter, if any.
     54     */
     55    UnicodeSet* compoundFilter;
     56 
     57 private:
     58 
     59    /**
     60     * The current data object for which we are parsing rules.
     61     */
     62    TransliterationRuleData* curData;
     63 
     64    UTransDirection direction;
     65 
     66    /**
     67     * Parse error information.
     68     */
     69    UParseError parseError;
     70 
     71    /**
     72     * Temporary symbol table used during parsing.
     73     */
     74    ParseData* parseData;
     75 
     76    /**
     77     * Temporary vector of matcher variables.  When parsing is complete, this
     78     * is copied into the array data.variables.  As with data.variables,
     79     * element 0 corresponds to character data.variablesBase.
     80     */
     81    UVector variablesVector;
     82 
     83    /**
     84     * Temporary table of variable names.  When parsing is complete, this is
     85     * copied into data.variableNames.
     86     */
     87    Hashtable variableNames;    
     88    
     89    /**
     90     * String of standins for segments.  Used during the parsing of a single
     91     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     92     * to StringMatcher object segmentObjects.elementAt(0), etc.
     93     */
     94    UnicodeString segmentStandins;
     95 
     96    /**
     97     * Vector of StringMatcher objects for segments.  Used during the
     98     * parsing of a single rule.
     99     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
    100     * to StringMatcher object segmentObjects.elementAt(0), etc.
    101     */
    102    UVector segmentObjects;
    103 
    104    /**
    105     * The next available stand-in for variables.  This starts at some point in
    106     * the private use area (discovered dynamically) and increments up toward
    107     * <code>variableLimit</code>.  At any point during parsing, available
    108     * variables are <code>variableNext..variableLimit-1</code>.
    109     */
    110    char16_t variableNext;
    111 
    112    /**
    113     * One past the last available stand-in for variables.  This is discovered
    114     * dynamically.  At any point during parsing, available variables are
    115     * <code>variableNext..variableLimit-1</code>.
    116     */
    117    char16_t variableLimit;
    118 
    119    /**
    120     * When we encounter an undefined variable, we do not immediately signal
    121     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
    122     * Instead, we save the name of the undefined variable, and substitute
    123     * in the placeholder char variableLimit - 1, and decrement
    124     * variableLimit.
    125     */
    126    UnicodeString undefinedVariableName;
    127 
    128    /**
    129     * The stand-in character for the 'dot' set, represented by '.' in
    130     * patterns.  This is allocated the first time it is needed, and
    131     * reused thereafter.
    132     */
    133    char16_t dotStandIn;
    134 
    135 public:
    136 
    137    /**
    138     * Constructor.
    139     */
    140    TransliteratorParser(UErrorCode &statusReturn);
    141 
    142    /**
    143     * Destructor.
    144     */
    145    ~TransliteratorParser();
    146 
    147    /**
    148     * Parse the given string as a sequence of rules, separated by newline
    149     * characters ('\n'), and cause this object to implement those rules.  Any
    150     * previous rules are discarded.  Typically this method is called exactly
    151     * once after construction.  NOTE(review): "newline" here conflicts with "';'" under @param rules -- confirm the separator.
    152     *
    153     * Parse the given rules, in the given direction.  After this call
    154     * returns, query the public data members for results.  The caller
    155     * owns the 'data' and 'compoundFilter' data members after this call returns.
    156     * NOTE(review): no member is named 'data'; presumably the contents of dataVector are meant -- confirm.
    157     * @param rules      rules, separated by ';'
    158     * @param direction  either FORWARD or REVERSE.
    159     * @param pe         Struct to receive information on position
    160     *                   of error if an error is encountered
    161     * @param ec         Output param set to success/failure code.
    162     */
    163    void parse(const UnicodeString& rules,
    164               UTransDirection direction,
    165               UParseError& pe,
    166               UErrorCode& ec);
    167 
    168    /**
    169     * Return the compound filter parsed by parse().  Caller owns result.
    170     * @return the compound filter parsed by parse().
    171     */ 
    172    UnicodeSet* orphanCompoundFilter();
    173 
    174 private:
    175 
    176    /**
    177     * Parse the given rules, in the given direction.  NOTE(review): presumably the worker behind parse() -- confirm.
    178     * @param rules      the rules to parse (input only; passed by const reference).
    179     * @param direction  either FORWARD or REVERSE.
    180     */
    181    void parseRules(const UnicodeString& rules,
    182                    UTransDirection direction,
    183                    UErrorCode& status);
    184 
    185    /**
    186     * MAIN PARSER.  Parse the next rule in the given rule string, starting
    187     * at pos.  Return the index after the last character parsed.  Do not
    188     * parse characters at or after limit.
    189     *
    190     * Important:  The character at pos must be a non-whitespace character
    191     * that is not the comment character.
    192     *
    193     * This method handles quoting, escaping, and whitespace removal.  It
    194     * parses the end-of-rule character.  It recognizes context and cursor
    195     * indicators.  Once it does a lexical breakdown of the rule at pos, it
    196     * creates a rule object and adds it to our rule list.
    197     * @param rule       the rule string to parse (input only; passed by const reference).
    198     * @param pos        the starting position.
    199     * @param limit      index past the last character of the rule.
    200     * @return           the index after the last character parsed.
    201     */
    202    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
    203 
    204    /**
    205     * Set the variable range to [start, end] (inclusive).
    206     * @param start    the start value of the range.
    207     * @param end      the end value of the range.
    208     */
    209    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
    210 
    211    /**
    212     * Assert that the given character is NOT within the variable range.
    213     * If it is, return false.  This is necessary to ensure that the
    214     * variable range does not overlap characters used in a rule.
    215     * @param ch     the given character.
    216     * @return       true, if the given character is NOT within the variable range.
    217     */
    218    UBool checkVariableRange(UChar32 ch) const;
    219 
    220    /**
    221     * Set the maximum backup to 'backup', in response to a pragma
    222     * statement.
    223     * @param backup    the new value to be set.
    224     */
    225    void pragmaMaximumBackup(int32_t backup);
    226 
    227    /**
    228     * Begin normalizing all rules using the given mode, in response
    229     * to a pragma statement.
    230     * @param mode    the given mode.
    231     */
    232    void pragmaNormalizeRules(UNormalizationMode mode);
    233 
    234    /**
    235     * Return true if the given rule looks like a pragma.
    236     * @param rule   the rule string to examine.
    237     * @param pos    offset to the first non-whitespace character of the rule.
    238     * @param limit  index past the last character of the rule.
    239     * @return true if the given rule looks like a pragma.
    240     */
    241    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
    242 
    243    /**
    244     * Parse a pragma.  This method assumes resemblesPragma() has
    245     * already returned true.
    246     * @param rule   the rule string containing the pragma.
    247     * @param pos    offset to the first non-whitespace character of the rule.
    248     * @param limit  index past the last character of the rule.
    249     * @return the position index after the final ';' of the pragma,
    250     * or -1 on failure.
    251     */
    252    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
    253 
    254    /**
    255     * Called by main parser upon syntax error.  Search the rule string
    256     * for the probable end of the rule.  Of course, if the error is that
    257     * the end of rule marker is missing, then the rule end will not be found.
    258     * In any case the rule start will be correctly reported.
    259     * @param parseErrorCode error code.
    260     * @param msg error description.  NOTE(review): the UnicodeString parameter is unnamed in this declaration -- confirm what it carries.
    261     * @param start position of first character of current rule.
    262     * @return start position of first character of current rule.
    263     */
    264    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
    265                        UErrorCode& status);
    266 
    267    /**
    268     * Parse a UnicodeSet out, store it, and return the stand-in character
    269     * used to represent it.
    270     *
    271     * @param rule    the rule for UnicodeSet.
    272     * @param pos     the position in pattern at which to start parsing.
    273     * @return        the stand-in character used to represent it.
    274     */
    275    char16_t parseSet(const UnicodeString& rule,
    276                      ParsePosition& pos,
    277                      UErrorCode& status);
    278 
    279    /**
    280     * Generate and return a stand-in for a new UnicodeFunctor.  Store
    281     * the matcher (adopt it).
    282     * @param adopted the UnicodeFunctor to be adopted.
    283     * @return        a stand-in for a new UnicodeFunctor.
    284     */
    285    char16_t generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
    286 
    287    /**
    288     * Return the standin for segment seg (1-based).
    289     * @param seg    the given segment.
    290     * @return       the standIn character for the given segment.
    291     */
    292    char16_t getSegmentStandin(int32_t seg, UErrorCode& status);
    293 
    294    /**
    295     * Set the object for segment seg (1-based).
    296     * @param seg      the given segment.
    297     * @param adopted  the StringMatcher to be adopted.
    298     */
    299    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
    300 
    301    /**
    302     * Return the stand-in for the dot set.  It is allocated the first
    303     * time and reused thereafter.
    304     * @return    the stand-in for the dot set.
    305     */
    306    char16_t getDotStandIn(UErrorCode& status);
    307 
    308    /**
    309     * Append the value of the given variable name to the given
    310     * UnicodeString.
    311     * @param name    the name of the variable whose value is appended.
    312     * @param buf     the given UnicodeString to append to.
    313     */
    314    void appendVariableDef(const UnicodeString& name,
    315                           UnicodeString& buf,
    316                           UErrorCode& status);
    317 
    318    /**
    319     * Glue method to get around access restrictions in C++.
    320     */
    321    /*static Transliterator* createBasicInstance(const UnicodeString& id,
    322                                               const UnicodeString* canonID);*/
    323 
    324    friend class RuleHalf;
    325 
    326    // Disallowed methods; no impl.
    327    /**
    328     * Copy constructor (disallowed: declared private, with no implementation).
    329     */
    330    TransliteratorParser(const TransliteratorParser&);
    331    
    332    /**
    333     * Assignment operator (disallowed: declared private, with no implementation).
    334     */
    335    TransliteratorParser& operator=(const TransliteratorParser&);
    336 };
    337 
    338 U_NAMESPACE_END
    339 
    340 #endif /* #ifdef __cplusplus */
    341 
    342 /**
    343 * Strip/convert the following from the transliterator rules:
    344 * comments
    345 * newlines
    346 * white space at the beginning and end of a line
    347 * unescape \u notation
    348 *
    349 * The target must be equal in size to the source.  NOTE(review): the int32_t result is presumably the length written to target -- confirm.
    350 * @internal  NOTE(review): status is presumably an in/out UErrorCode reporting failure -- confirm.
    351 */
    352 U_CAPI int32_t
    353 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
    354 
    355 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    356 
    357 #endif