rbbirpt.txt (15664B)
#*****************************************************************************
#
#   Copyright (C) 2016 and later: Unicode, Inc. and others.
#   License & terms of use: http://www.unicode.org/copyright.html
#
#*****************************************************************************
#*****************************************************************************
#
#   Copyright (C) 2002-2016, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#   file:  rbbirpt.txt
#   ICU Break Iterator Rule Parser State Table
#
#     This state table is used when reading and parsing a set of RBBI rules.
#     The rule parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the RBBI rule grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#
#    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h

#
#  Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#




#
#  start state, scan position is at the beginning of the rules file, or in between two rules.
#
start:
    escaped                term                  ^break-rule-end    doExprStart
    white_space          n start
    '^'                  n start-after-caret     ^break-rule-end    doNoChain
    '$'                    scan-var-name         ^assign-or-rule    doExprStart
    '!'                  n rev-option
    ';'                  n start                                                   # ignore empty rules.
    eof                    exit
    default                term                  ^break-rule-end    doExprStart

#
#  break-rule-end:  Returned from doing a break-rule expression.
#
break-rule-end:
    ';'                  n start                                    doEndOfRule
    white_space          n break-rule-end
    default                errorDeath                               doRuleError

#
#  start of a rule, after having seen a '^' (inhibits rule chain in).
#     Similar to the main 'start' state in most respects, except
#          - empty rule is an error.
#          - A second '^' is an error.
#
start-after-caret:
    escaped                term                                     doExprStart
    white_space          n start-after-caret
    '^'                    errorDeath                               doRuleError    # two '^'s
    '$'                    scan-var-name         ^term-var-ref      doExprStart
    ';'                    errorDeath                               doRuleError    # ^ ;
    eof                    errorDeath                               doRuleError
    default                term                                     doExprStart

#
#  !               We've just scanned a '!', indicating either a !!key word flag or a
#                  !Reverse rule.
#
rev-option:
    '!'                  n option-scan1
    default                reverse-rule          ^break-rule-end    doReverseDir

#
#  option-scan1    A second '!' was seen.  The option name must begin here
#                  with a name_start_char; anything else is an error.
#
option-scan1:
    name_start_char      n option-scan2                             doOptionStart
    default                errorDeath                               doRuleError

#
#  option-scan2    Consume the remaining name_chars of the option name.
#                  The first character that is not a name_char ends the name.
#
option-scan2:
    name_char            n option-scan2
    default                option-scan3                             doOptionEnd

#
#  option-scan3    After the option name.  Only white space and the terminating ';'
#                  are accepted; the ';' returns to the start state.
#
option-scan3:
    ';'                  n start
    white_space          n option-scan3
    default                errorDeath                               doRuleError


#
#  reverse-rule    Entered from rev-option when a single '!' was not followed by a second '!'.
#                  Whatever follows is handed to 'term' as the beginning of a rule expression.
#
#                  NOTE(review): rev-option already pushes break-rule-end on its way here and this
#                  row pushes it a second time, so two copies are pushed for one rule - confirm
#                  whether the extra push is intended.
#
reverse-rule:
    default                term                  ^break-rule-end    doExprStart


#
#  term.  Eat through a single rule character, or a composite thing, which
#         could be a parenthesized expression, a variable name, or a Unicode Set.
#
term:
    escaped              n expr-mod                                 doRuleChar
    white_space          n term
    rule_char            n expr-mod                                 doRuleChar
    '['                    scan-unicode-set      ^expr-mod
    '('                  n term                  ^expr-mod          doLParen
    '$'                    scan-var-name         ^term-var-ref
    '.'                  n expr-mod                                 doDotAny
    default                errorDeath                               doRuleError



#
#  term-var-ref   We've just finished scanning a reference to a $variable.
#                 Check that the variable was defined.
#                 The variable name scanning is in common with assignment statements,
#                 so the check can't be done there.
term-var-ref:
    default                expr-mod                                 doCheckVarDef


#
#   expr-mod      We've just finished scanning a term, now look for the optional
#                 trailing '*', '?', '+'
#
expr-mod:
    white_space          n expr-mod
    '*'                  n expr-cont                                doUnaryOpStar
    '+'                  n expr-cont                                doUnaryOpPlus
    '?'                  n expr-cont                                doUnaryOpQuestion
    default                expr-cont


#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.
#
expr-cont:
    escaped                term                                     doExprCatOperator
    white_space          n expr-cont
    rule_char              term                                     doExprCatOperator
    '['                    term                                     doExprCatOperator
    '('                    term                                     doExprCatOperator
    '$'                    term                                     doExprCatOperator
    '.'                    term                                     doExprCatOperator
    '/'                    look-ahead                               doExprCatOperator
    '{'                  n tag-open                                 doExprCatOperator
    '|'                  n term                                     doExprOrOperator
    ')'                  n pop                                      doExprRParen
    default                pop                                      doExprFinished


#
#  look-ahead    Scanning a '/', which identifies a break point, assuming that the
#                remainder of the expression matches.
#
#                Generate a parse tree as if this was a special kind of input symbol
#                appearing in an otherwise normal concatenation expression.
#
look-ahead:
    '/'                  n expr-cont-no-slash                       doSlash
    default                errorDeath


#
#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no '/'
#                                            look-ahead symbol is permitted.
#
#                        White space loops back to this state (not to expr-cont) so that
#                        the restriction still applies when blanks follow the '/'.
#
expr-cont-no-slash:
    escaped                term                                     doExprCatOperator
    white_space          n expr-cont-no-slash
    rule_char              term                                     doExprCatOperator
    '['                    term                                     doExprCatOperator
    '('                    term                                     doExprCatOperator
    '$'                    term                                     doExprCatOperator
    '.'                    term                                     doExprCatOperator
    '|'                  n term                                     doExprOrOperator
    ')'                  n pop                                      doExprRParen
    default                pop                                      doExprFinished


#
#   tags             scanning a '{', the opening delimiter for a tag that identifies
#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
#
tag-open:
    white_space          n tag-open
    digit_char             tag-value                                doStartTagValue
    default                errorDeath                               doTagExpectedError

#
#   tag-value        Accumulate the digits of the tag.  White space or '}' ends the
#                    digits; the '}' itself is left for tag-close to consume.
#
tag-value:
    white_space          n tag-close
    '}'                    tag-close
    digit_char           n tag-value                                doTagDigit
    default                errorDeath                               doTagExpectedError

#
#   tag-close        After the digits.  Skip white space, then require the closing '}'.
#
tag-close:
    white_space          n tag-close
    '}'                  n expr-cont-no-tag                         doTagValue
    default                errorDeath                               doTagExpectedError



#
#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no "{ddd}"
#                                            tagging is permitted.
#
expr-cont-no-tag:
    escaped                term                                     doExprCatOperator
    white_space          n expr-cont-no-tag
    rule_char              term                                     doExprCatOperator
    '['                    term                                     doExprCatOperator
    '('                    term                                     doExprCatOperator
    '$'                    term                                     doExprCatOperator
    '.'                    term                                     doExprCatOperator
    '/'                    look-ahead                               doExprCatOperator
    '|'                  n term                                     doExprOrOperator
    ')'                  n pop                                      doExprRParen
    default                pop                                      doExprFinished




#
#   Variable Name Scanning.
#
#                    The state that branched to here must have pushed a return state
#                    to go to after completion of the variable name scanning.
#
#                    The current input character must be the $ that introduces the name.
#                    The $ is consumed here rather than in the state that first detected it
#                    so that the doStartVariableName action only needs to happen in one
#                    place (here), and the other states don't need to worry about it.
#
scan-var-name:
    '$'                  n scan-var-start                           doStartVariableName
    default                errorDeath


#
#   scan-var-start   The character after the '$' must be a name_start_char.
#
scan-var-start:
    name_start_char      n scan-var-body
    default                errorDeath                               doVariableNameExpectedErr

#
#   scan-var-body    Consume name_chars; the first other character ends the name and
#                    returns (pop) to the state pushed by whoever branched to scan-var-name.
#
scan-var-body:
    name_char            n scan-var-body
    default                pop                                      doEndVariableName



#
#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
#                     Within the RBBI parser, after finding the first character
#                     of a Unicode Set, we just hand the rule input at that
#                     point off to the Unicode Set constructor, then pick
#                     up parsing after the close of the set.
#
#                     The action for this state invokes the UnicodeSet parser.
#
scan-unicode-set:
    '['                  n pop                                      doScanUnicodeSet
    'p'                  n pop                                      doScanUnicodeSet
    'P'                  n pop                                      doScanUnicodeSet
    default                errorDeath







#
#  assign-or-rule.   A $variable was encountered at the start of something, could be
#                    either an assignment statement or a rule, depending on whether an '='
#                    follows the variable name.  We get to this state when the variable name
#                    scanning does a return.
#
assign-or-rule:
    white_space          n assign-or-rule
    '='                  n term                  ^assign-end        doStartAssign  # variable was target of assignment
    default                term-var-ref          ^break-rule-end                   # variable was a term in a rule



#
#  assign-end        This state is entered when the end of the expression on the
#                    right hand side of an assignment is found.  We get here via
#                    a pop; this state is pushed when the '=' in an assignment is found.
#
#                    The only thing allowed at this point is a ';'.  The RHS of an
#                    assignment must look like a rule expression, and we come here
#                    when what is being scanned no longer looks like an expression.
#
assign-end:
    ';'                  n start                                    doEndAssign
    default                errorDeath                               doRuleErrorAssignExpr



#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default              n errorDeath                               doExit