[ tor-browser ].git.dasho

rbbirpt.txt (15664B)
      1 #*****************************************************************************
      2 #
      3 #   Copyright (C) 2016 and later: Unicode, Inc. and others.
      4 #   License & terms of use: http://www.unicode.org/copyright.html
      5 #
      6 #*****************************************************************************
      7 #*****************************************************************************
      8 #
      9 #   Copyright (C) 2002-2016, International Business Machines Corporation and others.
     10 #   All Rights Reserved.
     11 #
     12 #*****************************************************************************
     13 #
     14 #  file:  rbbirpt.txt
     15 #  ICU Break Iterator Rule Parser State Table
     16 #
     17 #     This state table is used when reading and parsing a set of RBBI rules.
     18 #     The rule parser uses a state machine; the data in this file define the
     19 #     state transitions that occur for each input character.
     20 #
     21 #     *** This file defines the RBBI rule grammar.   This is it.
     22 #     *** The determination of what is accepted is here.
     23 #
     24 #     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
     25 #     that are then built with the rule parser.
     26 #
     27 #    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
     28 
     29 #
     30 # Here is the syntax of the state definitions in this file:
     31 #
     32 #
     33 #StateName:
     34 #   input-char           n next-state           ^push-state     action    
     35 #   input-char           n next-state           ^push-state     action    
     36 #       |                |   |                      |             |
     37 #       |                |   |                      |             |--- action to be performed by state machine
     38 #       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
     39 #       |                |   |                      |
     40 #       |                |   |                      |--- Push this named state onto the state stack.
     41 #       |                |   |                           Later, when next state is specified as "pop",
     42 #       |                |   |                           the pushed state will become the current state.
     43 #       |                |   |
     44 #       |                |   |--- Transition to this state if the current input character matches the input
     45 #       |                |        character or char class in the left hand column.  "pop" causes the next
     46 #       |                |        state to be popped from the state stack.
     47 #       |                |
     48 #       |                |--- When making the state transition specified on this line, advance to the next
     49 #       |                     character from the input only if 'n' appears here.
     50 #       |
     51 #       |--- Character or named character classes to test for.  If the current character being scanned
     52 #            matches, perform the actions and go to the state specified on this line.
     53 #            The input character is tested sequentially, in the order written.  The characters and
     54 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
     55 #            
     56 
     57 
     58 
     59 
     60 #
     61 #  start state, scan position is at the beginning of the rules file, or in between two rules.
     62 #
     63 start:
     64    escaped                term                  ^break-rule-end    doExprStart                       # rule begins with an escaped char; it is left for 'term' to consume
     65    white_space          n start                     # skip white space between rules
     66    '^'                  n start-after-caret     ^break-rule-end    doNoChain    # return state is pushed here, so start-after-caret does not push it again
     67    '$'                    scan-var-name         ^assign-or-rule    doExprStart    # '$' is left for scan-var-name; assign-or-rule then decides assignment vs. rule
     68    '!'                  n rev-option                             # '!' consumed; rev-option tells "!!" from a single '!'
     69    ';'                  n start                                                  # ignore empty rules.
     70    eof                    exit              # end of input between rules; 'exit' has no entry in this table (presumably built into the scanner - verify)
     71    default                term                  ^break-rule-end    doExprStart    # anything else starts a rule expression; the char is not consumed here
     72    
     73 #
     74 #  break-rule-end:  Returned from doing a break-rule expression.
     75 #
     76 break-rule-end:
     77    ';'	                 n start                                    doEndOfRule    # ';' terminates the rule; go back to scanning for the next one
     78    white_space          n break-rule-end                                          # skip white space before the terminating ';'
     79    default                errorDeath                               doRuleError    # anything but ';' after a complete expression is an error
     80     
     81 #
     82 # start of a rule, after having seen a '^' (inhibits rule chain in).
     83 #     Similar to the main 'start' state in most respects, except
     84 #          - empty rule is an error.
     85 #          - A second '^' is an error.
     86 #
     87 start-after-caret:
     88    escaped                term                                     doExprStart    # no push here: break-rule-end was already pushed by 'start' on '^'
     89    white_space          n start-after-caret                                       # skip white space after the '^'
     90    '^'                    errorDeath                               doRuleError    # two '^'s
     91    '$'                    scan-var-name         ^term-var-ref      doExprStart    # returns to term-var-ref, not assign-or-rule: no assignment is accepted after '^'
     92    ';'                    errorDeath                               doRuleError    # ^ ;
     93    eof                    errorDeath                               doRuleError    # input ended right after the '^'
     94    default                term                                     doExprStart    # start of the rule expression; the char is not consumed here
     95 
     96 #
     97 #   !               We've just scanned a '!', indicating either a !!key word flag or a
     98 #                   !Reverse rule.
     99 #
    100 rev-option:
    101    '!'                  n option-scan1   # second '!' consumed: an option name is expected next
    102    default                reverse-rule           ^break-rule-end   doReverseDir    # single '!': reverse rule; the current char is not consumed
    103    
    104 option-scan1:
    105    name_start_char      n option-scan2                             doOptionStart    # first character of the option name following "!!"
    106    default                errorDeath                               doRuleError      # "!!" was not followed by a name
    107    
    108 option-scan2:
    109    name_char            n option-scan2                                              # consume the remaining characters of the option name
    110    default                option-scan3                             doOptionEnd      # first non-name char ends the option; it is not consumed
    111    
    112 option-scan3:
    113    ';'                  n start # ';' ends the option statement
    114    white_space          n option-scan3 # white space is allowed between the option name and ';'
    115    default                errorDeath                               doRuleError # anything else after the option name is an error
    116    
    117 
    118 reverse-rule:
    119    default                term                   ^break-rule-end   doExprStart    # unconditional: begin the rule expression.  NOTE(review): rev-option also pushes break-rule-end on its way here, so it appears to be pushed twice - confirm this is intended
    120    
    121    
    122 #
    123 #  term.  Eat through a single rule character, or a composite thing, which
    124 #         could be a parenthesized expression, a variable name, or a Unicode Set.
    125 #
    126 term:
    127    escaped              n expr-mod                                 doRuleChar    # escaped character taken as a literal
    128    white_space          n term                                                    # skip white space before the term
    129    rule_char            n expr-mod                                 doRuleChar    # ordinary literal character
    130    '['                    scan-unicode-set      ^expr-mod                         # '[' is left for scan-unicode-set; resume at expr-mod afterwards
    131    '('                  n term                  ^expr-mod          doLParen      # nested expression; expr-mod is the return state once it has been popped
    132    '$'                    scan-var-name         ^term-var-ref                     # '$' is left for scan-var-name; term-var-ref runs once the name is scanned
    133    '.'                  n expr-mod                                 doDotAny
    134    default                errorDeath                               doRuleError   # a term was required but the current char cannot start one
    135    
    136    
    137 
    138 #
    139 #  term-var-ref   We've just finished scanning a reference to a $variable.
    140 #                 Check that the variable was defined.
    141 #                 The variable name scanning is in common with assignment statements,
    142 #                 so the check can't be done there.
    143 term-var-ref:
    144    default                expr-mod                                 doCheckVarDef    # unconditional, consumes nothing; continue as after any other term
    145    
    146    
    147 #
    148 #   expr-mod      We've just finished scanning a term, now look for the optional
    149 #                 trailing '*', '?', '+'
    150 #
    151 expr-mod:
    152    white_space          n  expr-mod                                                # white space is allowed between a term and its operator
    153    '*'                  n  expr-cont                               doUnaryOpStar
    154    '+'                  n  expr-cont                               doUnaryOpPlus
    155    '?'                  n  expr-cont                               doUnaryOpQuestion
    156    default                 expr-cont # no postfix operator; the char is left for expr-cont
    157    
    158    
    159 #
    160 #  expr-cont      Expression, continuation.  At a point where additional terms are
    161 #                                            allowed, but not required.
    162 #
    163 expr-cont:
    164    escaped                 term                                    doExprCatOperator    # another term follows: concatenation; the char is re-read by 'term'
    165    white_space          n  expr-cont                                                    # skip white space between terms
    166    rule_char               term                                    doExprCatOperator
    167    '['                     term                                    doExprCatOperator
    168    '('                     term                                    doExprCatOperator
    169    '$'                     term                                    doExprCatOperator
    170    '.'                     term                                    doExprCatOperator
    171    '/'                     look-ahead                              doExprCatOperator    # '/' is not consumed here; look-ahead consumes it
    172    '{'                  n  tag-open                                doExprCatOperator    # '{' consumed; tag-open scans the tag contents
    173    '|'                  n  term                                    doExprOrOperator     # alternation; another term must follow
    174    ')'                  n  pop                                     doExprRParen         # return to the pushed state (expr-mod when pushed by '(' in 'term')
    175    default                 pop                                     doExprFinished       # char cannot continue the expression; return to the pushed state without consuming it
    176    
    177 
    178 #
    179 #   look-ahead    Scanning a '/', which identifies a break point, assuming that the
    180 #                 remainder of the expression matches.
    181 #
    182 #                 Generate a parse tree as if this was a special kind of input symbol
    183 #                 appearing in an otherwise normal concatenation expression.
    184 #
    185 look-ahead:
    186    '/'                   n expr-cont-no-slash                      doSlash    # consume the '/' that the previous state left in the input
    187    default                 errorDeath                                          # no action given; the visible entries to this state are all on '/', so this row looks unreachable - verify
    188 
    189 
    190 #
    191 #  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
    192 #                                            allowed, but not required.  Just like
    193 #                                            expr-cont, above, except that no '/'
    194 #                                            look-ahead symbol is permitted.
    195 #
    196 expr-cont-no-slash:
    197    escaped                 term                                    doExprCatOperator    # another term follows: concatenation; the char is re-read by 'term'
    198    white_space          n  expr-cont-no-slash                                           # stay in this state so that '/' is still rejected after white space
    199    rule_char               term                                    doExprCatOperator
    200    '['                     term                                    doExprCatOperator
    201    '('                     term                                    doExprCatOperator
    202    '$'                     term                                    doExprCatOperator
    203    '.'                     term                                    doExprCatOperator
    204    '|'                  n  term                                    doExprOrOperator     # alternation; another term must follow
    205    ')'                  n  pop                                     doExprRParen         # return to the pushed state
    206    default                 pop                                     doExprFinished       # also reached for '/' and '{', which have no row in this state
    207 
    208 
    209 #
    210 #   tags             scanning a '{', the opening delimiter for a tag that identifies
    211 #                    the kind of match.  Scan the whole {dddd} tag, where d=digit
    212 #
    213 tag-open:
    214    white_space          n  tag-open                                                     # white space is allowed after the '{'
    215    digit_char              tag-value                               doStartTagValue      # first digit is not consumed here; tag-value consumes it
    216    default                 errorDeath                              doTagExpectedError   # '{' was not followed by a digit
    217    
    218 tag-value:
    219    white_space          n  tag-close                                                    # white space ends the digits; only white space or '}' may follow
    220    '}'                     tag-close                                                    # '}' is left for tag-close to consume
    221    digit_char           n  tag-value                               doTagDigit           # one digit consumed per transition
    222    default                 errorDeath                              doTagExpectedError
    223    
    224 tag-close:
    225    white_space          n  tag-close                                                    # white space is allowed before the closing '}'
    226    '}'                  n  expr-cont-no-tag                        doTagValue           # tag complete; continue the expression with no further tag allowed
    227    default                 errorDeath                              doTagExpectedError
    228    
    229    
    230    
    231 #
    232 #  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
    233 #                                            allowed, but not required.  Just like
    234 #                                            expr-cont, above, except that no "{ddd}"
    235 #                                            tagging is permitted.
    236 #
    237 expr-cont-no-tag:
    238    escaped                 term                                    doExprCatOperator    # another term follows: concatenation; the char is re-read by 'term'
    239    white_space          n  expr-cont-no-tag                                             # stays in this state, so '{' is still not accepted after white space
    240    rule_char               term                                    doExprCatOperator
    241    '['                     term                                    doExprCatOperator
    242    '('                     term                                    doExprCatOperator
    243    '$'                     term                                    doExprCatOperator
    244    '.'                     term                                    doExprCatOperator
    245    '/'                     look-ahead                              doExprCatOperator    # '/' is not consumed here; look-ahead consumes it
    246    '|'                  n  term                                    doExprOrOperator     # alternation; another term must follow
    247    ')'                  n  pop                                     doExprRParen         # return to the pushed state
    248    default                 pop                                     doExprFinished       # also reached for '{', which has no row in this state
    249    
    250    
    251 
    252 
    253 #
    254 #   Variable Name Scanning.
    255 #
    256 #                    The state that branched to here must have pushed a return state
    257 #                    to go to after completion of the variable name scanning.
    258 #
    259 #                    The current input character must be the $ that introduces the name.
    260 #                    The $ is consumed here rather than in the state that first detected it
    261 #                    so that the doStartVariableName action only needs to happen in one
    262 #                    place (here), and the other states don't need to worry about it.
    263 #
    264 scan-var-name:
    265   '$'                  n scan-var-start                            doStartVariableName    # consume the '$' left in the input by the branching state
    266   default                errorDeath                                                        # no action given; every visible entry to this state is on '$', so this row looks unreachable - verify
    267 
    268 
    269 scan-var-start:
    270    name_start_char      n scan-var-body                                                     # first character of the variable name
    271    default                errorDeath                               doVariableNameExpectedErr   # '$' was not followed by a name
    272    
    273 scan-var-body:
    274    name_char            n scan-var-body                                                  # consume the remaining characters of the name
    275    default                pop                                      doEndVariableName    # first non-name char ends the name, unconsumed; return to the state pushed by the caller
    276    
    277    
    278    
    279 #
    280 #  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
    281 #                     Within the RBBI parser, after finding the first character
    282 #                     of a Unicode Set, we just hand the rule input at that
    283 #                     point off to the Unicode Set constructor, then pick
    284 #                     up parsing after the close of the set.
    285 #
    286 #                     The action for this state invokes the UnicodeSet parser.
    287 #
    288 scan-unicode-set:
    289    '['                   n pop                                      doScanUnicodeSet    # then return to the state pushed by the caller
    290    'p'                   n pop                                      doScanUnicodeSet    # NOTE(review): the only visible entry to this state is on '[' (from 'term'); confirm how 'p' / 'P' can be seen here
    291    'P'                   n pop                                      doScanUnicodeSet
    292    default		    errorDeath # no action is given on this row
    293    
    294    
    295 
    296 
    297 
    298 
    299 
    300 #
    301 #  assign-or-rule.   A $variable was encountered at the start of something, could be
    302 #                    either an assignment statement or a rule, depending on whether an '='
    303 #                    follows the variable name.  We get to this state when the variable name
    304 #                    scanning does a return.
    305 #
    306 assign-or-rule:
    307    white_space          n assign-or-rule                                             # white space is allowed between the variable name and '='
    308    '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
    309    default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
    310 
    311 
    312 
    313 #
    314 #  assign-end        This state is entered when the end of the expression on the
    315 #                    right hand side of an assignment is found.  We get here via
    316 #                    a pop; this state is pushed when the '=' in an assignment is found.
    317 #
    318 #                    The only thing allowed at this point is a ';'.  The RHS of an
    319 #                    assignment must look like a rule expression, and we come here
    320 #                    when what is being scanned no longer looks like an expression.
    321 #
    322 assign-end:
    323    ';'                  n start                                    doEndAssign              # assignment complete; back to scanning for the next statement
    324    default                errorDeath                               doRuleErrorAssignExpr    # right-hand side was not followed by ';'
    325    
    326    
    327    
    328 #
    329 # errorDeath.   This state is specified as the next state whenever a syntax error
    330 #               in the source rules is detected.  Barring bugs, the state machine will never
    331 #               actually get here, but will stop because of the action associated with the error.
    332 #               But, just in case, this state asks the state machine to exit.
    333 errorDeath:
    334    default              n errorDeath                               doExit    # matches any input, consumes it and stays here; doExit is the request to stop
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE