tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbbiscan.h (7513B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  rbbiscan.h
      5 //
      6 //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains declarations for class RBBIRuleScanner
     10 //
     11 
     12 
     13 #ifndef RBBISCAN_H
     14 #define RBBISCAN_H
     15 
     16 #include "unicode/utypes.h"
     17 #include "unicode/uobject.h"
     18 #include "unicode/rbbi.h"
     19 #include "unicode/uniset.h"
     20 #include "unicode/parseerr.h"
     21 #include "uhash.h"
     22 #include "uvector.h"
     23 #include "unicode/symtable.h"// For UnicodeSet parsing; this is the interface that
     24                          //    looks up references to $variables within a set.
     25 #include "rbbinode.h"
     26 #include "rbbirpt.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 class   RBBIRuleBuilder;   // Forward declaration: used in this header only via pointer (constructor argument, fRB).
     31 class   RBBISymbolTable;   // Forward declaration: used in this header only via pointer (fSymbolTable).
     32 
     33 
     34 //--------------------------------------------------------------------------------
     35 //
     36 //  class RBBIRuleScanner does the lowest level, character-at-a-time
     37 //                        scanning of break iterator rules.  
     38 //
     39 //                        The output of the scanner is parse trees for
     40 //                        the rule expressions and a list of all Unicode Sets
     41 //                        encountered.
     42 //
     43 //--------------------------------------------------------------------------------
     44 
     45 class RBBIRuleScanner : public UMemory {   // Character-level scanner for break iterator rules (declaration only).
     46 public:
     47 
     48    enum {
     49        kStackSize = 100            // The size of the state stack for
     50    };                              //   rules parsing.  Corresponds roughly
     51                                    //   to the depth of parentheses nesting
     52                                    //   that is allowed in the rules.
     53                                    //   Also used as the size of the node stack (fNodeStack).
     54    struct RBBIRuleChar {           // One rule character together with its fEscaped flag.
     55        UChar32             fChar;      // The character, as a UChar32 value.
     56        UBool               fEscaped;   // NOTE(review): presumably true when the char came from an
                                           //   escape sequence - confirm in the implementation.
     57        RBBIRuleChar() : fChar(0), fEscaped(false) {}   // Default state: char 0, not escaped.
     58    };
     59    // rb: the rule builder; NOTE(review): presumably saved in fRB - confirm in the implementation.
     60    RBBIRuleScanner(RBBIRuleBuilder  *rb);
     61 
     62 
     63    virtual    ~RBBIRuleScanner();
     64 
     65    void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream.
     66                                                    //   The result is delivered through c; nothing is returned.
     67                                                    //   NOTE(review): end-of-input signalling is not visible in this header.
     68    UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
     69                                                    //   Only a single character may be pushed.
     70                                                    //   NOTE(review): meaning of the UBool result is not documented here.
     71    void        parse();                            // Parse the rules, generating two parse
     72                                                    //   trees, one each for the forward and
     73                                                    //   reverse rules,
     74                                                    //   and a list of UnicodeSets encountered.
     75 
     76    int32_t     numRules();                         // Return the number of rules that have been seen.
     77 
     78    /**
     79     * Return a rules string without unnecessary
     80     * characters.
     81     */
     82    static UnicodeString stripRules(const UnicodeString &rules);
     83 private:
     84 
     85    UBool       doParseActions(int32_t a);             // NOTE(review): a is presumably a parse action code from rbbirpt.h - confirm.
     86    void        error(UErrorCode e);                   // error reporting convenience function.
     87    void        fixOpStack(RBBINode::OpPrecedence p);
     88                                                       //   NOTE(review): presumably adjusts the node stack for precedence p - confirm.
     89    void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr);
     90                                                       //   findSetFor: setToAdopt is optional and defaults to nullptr.
     91    UChar32     nextCharLL();                          // NOTE(review): presumably the low-level fetch behind nextChar() - confirm.
     92 #ifdef RBBI_DEBUG
     93    void        printNodeStack(const char *title);     // Only declared when RBBI_DEBUG is defined.
     94 #endif
     95    RBBINode    *pushNewNode(RBBINode::NodeType  t);
     96    void        scanSet();
     97 
     98 
     99    RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
    100 
    101    int32_t                       fScanIndex;        // Index of current character being processed
    102                                                     //   in the rule input string.
    103    int32_t                       fNextIndex;        // Index of the next character, which
    104                                                     //   is the first character not yet scanned.
    105    UBool                         fQuoteMode;        // Scan is in a 'quoted region'
    106    int32_t                       fLineNum;          // Line number in input file.
    107    int32_t                       fCharNum;          // Char position within the line.
    108    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
    109                                                     //   as a single line, not two.
    110 
    111    RBBIRuleChar                  fC;                // Current char for parse state machine
    112                                                     //   processing.
    113    UnicodeString                 fVarName;          // $variableName, valid when we've just
    114                                                     //   scanned one.
    115 
    116    RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
    117                                                     //   parsing.  index by p[state][char-class]
    118 
    119    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
    120    int32_t                       fStackPtr;           //  and pops as specified in the state
    121                                                       //  transition rules.
    122 
    123    RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
    124                                                           //  during the parse of a rule
    125    int32_t                        fNodeStackPtr;          // NOTE(review): presumably the index of the top of fNodeStack - confirm.
    126 
    127 
    128    UBool                          fReverseRule;     // True if the rule currently being scanned
    129                                                     //  is a reverse direction rule (if it
    130                                                     //  starts with a '!')
    131 
    132    UBool                          fLookAheadRule;   // True if the rule includes a '/'
    133                                                     //   somewhere within it.
    134 
    135    UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.
    136 
    137    RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
    138                                                     //   $variable symbols.
    139 
    140    UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to
    141                                                     //   the sets created while parsing rules.
    142                                                     //   The key is the string used for creating
    143                                                     //   the set.
    144 
    145    UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
    146                                                     //  the scanning of RBBI rules.  The
    147                                                     //  indices for these are assigned by the
    148                                                     //  perl script that builds the state tables.
    149                                                     //  See rbbirpt.h.
    150 
    151    int32_t                        fRuleNum;         // Counts each rule as it is scanned.
    152 
    153    int32_t                        fOptionStart;     // Input index of start of a !!option
    154                                                     //   keyword, while being scanned.
    155    // Despite the g prefix, these are per-instance (non-static) members.  NOTE(review): judging by the names they hold rule-syntax character classes; ownership is not visible here.
    156    UnicodeSet *gRuleSet_rule_char;
    157    UnicodeSet *gRuleSet_white_space;
    158    UnicodeSet *gRuleSet_name_char;
    159    UnicodeSet *gRuleSet_name_start_char;
    160 
    161    RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class
    162    RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class
    163 };
    164 
    165 U_NAMESPACE_END
    166 
    167 #endif