tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

regexcmp.h (12200B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  regexcmp.h
      5 //
      6 //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains declarations for the class RegexCompile
     10 //
     11 //  This class is internal to the regular expression implementation.
     12 //  For the public Regular Expression API, see the file "unicode/regex.h"
     13 //
     14 
     15 
     16 #ifndef REGEXCMP_H
     17 #define REGEXCMP_H
     18 
     19 #include "unicode/utypes.h"
     20 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     21 
     22 #include "unicode/parseerr.h"
     23 #include "unicode/uniset.h"
     24 #include "unicode/uobject.h"
     25 #include "unicode/utext.h"
     26 #include "uhash.h"
     27 #include "uvector.h"
     28 #include "uvectr32.h"
     29 
     30 
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 
     35 //--------------------------------------------------------------------------------
     36 //
     37 //  class RegexCompile    Contains the regular expression compiler.
     38 //
     39 //--------------------------------------------------------------------------------
     40 class   RegexPattern;
     41 
     42 
     43 class U_I18N_API RegexCompile : public UMemory {
     44 public:
     45    // Internal regular expression compiler; the public regex API is in "unicode/regex.h".
     46    enum {
     47        kStackSize = 100            // The size of the state stack for
     48    };                              //   pattern parsing.  Corresponds roughly
     49                                    //   to the depth of parentheses nesting
     50                                    //   that is allowed in the rules.
     51    // A pattern character, as filled in by nextChar().
     52    struct RegexPatternChar {
     53        UChar32             fChar;      // The character value (UChar32).
     54        UBool               fQuoted;    // NOTE(review): presumably set when the char was escaped or quoted - confirm in nextChar().
     55    };
     56    // NOTE(review): rp is presumably the pattern object that receives the compiled code (see fRXPat) - confirm.
     57    RegexCompile(RegexPattern *rp, UErrorCode &e);
     58    // Compile a pattern supplied either as a UnicodeString or as a UText; both overloads take a UParseError and a UErrorCode.
     59    void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
     60    void       compile(UText *pat, UParseError &pp, UErrorCode &e);
     61    
     62 
     63    virtual    ~RegexCompile();
     64 
     65    void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream.
     66 
     67 
     68    // Categories of parentheses in pattern.
     69    //   The category is saved in the compile-time parentheses stack frame, and
     70    //   determines the code to be generated when the matching close ) is encountered.
     71    enum EParenClass {                   // All values are negative; each fParenStack frame ends with one of them.
     72        plain        = -1,               // No special handling
     73        capturing    = -2,
     74        atomic       = -3,
     75        lookAhead    = -4,
     76        negLookAhead = -5,
     77        flags        = -6,
     78        lookBehind   = -7,
     79        lookBehindN  = -8
     80    };
     81 
     82 private:
     83 
     84 
     85    UBool       doParseActions(int32_t a);             // NOTE(review): 'a' is presumably a parse-action code; meaning of the UBool result is not visible here.
     86    void        error(UErrorCode e);                   // error reporting convenience function.
     87    // NOTE(review): "LL" presumably means low-level character scanning (cf. fScanIndex, fPeekChar below) - confirm.
     88    UChar32     nextCharLL();
     89    UChar32     peekCharLL();
     90    UnicodeSet  *scanProp();                         // Returns a UnicodeSet pointer; ownership is not visible in this header - confirm.
     91    UnicodeSet  *scanPosixProp();                    // Returns a UnicodeSet pointer; ownership is not visible in this header - confirm.
     92    void        handleCloseParen();
     93    int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
     94                                                     //  at the top of the just completed block
     95                                                     //  or operation, and optionally ensure that
     96                                                     //  there is space to add an opcode there.
     97    void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
     98                                                     //   a reference to a UnicodeSet.
     99    void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier.
    100                               int32_t LoopOp);
    101    UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier
    102    void        literalChar(UChar32 c);              // Compile a literal char
    103    void        fixLiterals(UBool split=false);      // Generate code for pending literal characters.
    104    void        insertOp(int32_t where);             // Open up a slot for a new op in the
    105                                                     //   generated code at the specified location.
    106    void        appendOp(int32_t op);                // Append a new op to the compiled pattern.
    107    void        appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
    108    int32_t     buildOp(int32_t type, int32_t val);  // Construct a new pcode instruction.
    109    int32_t     allocateData(int32_t size);          // Allocate space in the matcher data area.
    110                                                     //   Return index of the newly allocated data.
    111    int32_t     allocateStackData(int32_t size);     // Allocate space in the match back-track stack frame.
    112                                                     //   Return offset index in the frame.
    113    int32_t     minMatchLength(int32_t start,        // NOTE(review): start/end are presumably locations in the
    114                               int32_t end);         //   compiled pattern bounding the range measured - confirm.
    115    int32_t     maxMatchLength(int32_t start,        // NOTE(review): same parameter convention as minMatchLength,
    116                               int32_t end);         //   presumably - confirm.
    117    void        matchStartType();
    118    void        stripNOPs();
    119    // NOTE(review): setEval/setPushOp presumably work with fSetOpStack and the SetOperations codes - confirm.
    120    void        setEval(int32_t op);
    121    void        setPushOp(int32_t op);
    122    UChar32     scanNamedChar();
    123    UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
    124 
    125 public:   // Public for testing only.
    126    static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
    127 private:
    128 
    129 
    130    UErrorCode                    *fStatus;          // NOTE(review): presumably points at the UErrorCode given to the constructor.
    131    RegexPattern                  *fRXPat;           // NOTE(review): presumably the RegexPattern given to the constructor.
    132    UParseError                   *fParseErr;        // NOTE(review): presumably the UParseError given to compile().
    133 
    134    //
    135    //  Data associated with low level character scanning
    136    //
    137    int64_t                       fScanIndex;        // Index of current character being processed
    138                                                     //   in the rule input string.
    139    UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
    140    UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
    141    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
    142                                                     //   end of line comments, in favor of (?#...) comments.
    143    int64_t                       fLineNum;          // Line number in input file.
    144    int64_t                       fCharNum;          // Char position within the line.
    145    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
    146                                                     //   as a single line, not two.
    147    UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.
    148 
    149 
    150    RegexPatternChar              fC;                // Current char for parse state machine
    151                                                     //   processing.
    152 
    153    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
    154    int32_t                       fStackPtr;           //  and pops as specified in the state
    155                                                       //  transition rules.
    156 
    157    //
    158    //  Data associated with the generation of the pcode for the match engine
    159    //
    160    int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.)
    161                                                     //   Always has high bit (31) set so that flag values
    162                                                     //   on the paren stack are distinguished from relocatable
    163                                                     //   pcode addresses.
    164    int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state
    165                                                     //   until last flag is scanned.
    166    UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx
    167 
    168    UnicodeString                 fLiteralChars;     // Literal chars or strings from the pattern are accumulated here.
    169                                                     //   Once completed, meaning that some non-literal pattern
    170                                                     //   construct is encountered, the appropriate opcodes
    171                                                     //   to match the literal will be generated, and this
    172                                                     //   string will be cleared.
    173 
    174    int64_t                       fPatternLength;    // Length of the input pattern string.
    175    
    176    UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
    177                                                     //   the positions of compiled pattern operations
    178                                                     //   needing fixup, followed by a negative value.  The
    179                                                     //   first entry in each frame is the position of the
    180                                                     //   spot reserved for use when a quantifier
    181                                                     //   needs to add a SAVE at the start of a (block).
    182                                                     //   The negative value (-1, -2,...) is an EParenClass and indicates
    183                                                     //   the kind of paren that opened the frame.  Some
    184                                                     //   need special handling on close.
    185 
    186 
    187    int32_t                       fMatchOpenParen;   // The position in the compiled pattern
    188                                                     //   of the slot reserved for a state save
    189                                                     //   at the start of the most recently processed
    190                                                     //   parenthesized block. Updated when processing
    191                                                     //   a close to the location for the corresponding open.
    192 
    193    int32_t                       fMatchCloseParen;  // The position in the pattern of the first
    194                                                     //   location after the most recently processed
    195                                                     //   parenthesized block.
    196 
    197    int32_t                       fIntervalLow;      // {lower, upper} interval quantifier values.
    198    int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is
    199                                                     //   initially scanned.  Each new interval
    200                                                     //   encountered overwrites these values.
    201                                                     //   -1 for the upper interval value means none
    202                                                     //   was specified (unlimited occurrences.)
    203 
    204    UStack                        fSetStack;         // Stack of UnicodeSets, used while evaluating
    205                                                     //   (at compile time) set expressions within
    206                                                     //   the pattern.
    207    UStack                        fSetOpStack;       // Stack of pending set operators (&&, --, union)
    208 
    209    UChar32                       fLastSetLiteral;   // The last single code point added to a set.
    210                                                     //   needed when "-y" is scanned, and we need
    211                                                     //   to turn "x-y" into a range.
    212 
    213    UnicodeString                *fCaptureName;      // Named Capture, the group name is built up
    214                                                     //   in this string while being scanned.
    215 };
    216 
    217 // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
    218 //   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
    219 //   Each value is written as  precedence << 16 | code ;  '<<' binds tighter than '|', so no parentheses are needed.
    220 enum SetOperations {
    221    setStart         = 0 << 16 | 1,    // precedence 0, code 1
    222    setEnd           = 1 << 16 | 2,    // precedence 1, code 2
    223    setNegation      = 2 << 16 | 3,    // precedence 2, code 3
    224    setCaseClose     = 2 << 16 | 9,    // precedence 2, code 9
    225    setDifference2   = 3 << 16 | 4,    // precedence 3, code 4: '--' set difference operator
    226    setIntersection2 = 3 << 16 | 5,    // precedence 3, code 5: '&&' set intersection operator
    227    setUnion         = 4 << 16 | 6,    // precedence 4, code 6: implicit union of adjacent items
    228    setDifference1   = 4 << 16 | 7,    // precedence 4, code 7: '-', single dash difference op, for compatibility with old UnicodeSet.
    229    setIntersection1 = 4 << 16 | 8     // precedence 4, code 8: '&', single amp intersection op, for compatibility with old UnicodeSet.
    230    };
    231 
    232 U_NAMESPACE_END
    233 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
    234 #endif   // REGEXCMP_H