rbbiscan.cpp (47842B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // file: rbbiscan.cpp 5 // 6 // Copyright (C) 2002-2016, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains the Rule Based Break Iterator Rule Builder functions for 10 // scanning the rules and assembling a parse tree. This is the first phase 11 // of compiling the rules. 12 // 13 // The overall of the rules is managed by class RBBIRuleBuilder, which will 14 // create and use an instance of this class as part of the process. 15 // 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include "unicode/unistr.h" 22 #include "unicode/uniset.h" 23 #include "unicode/uchar.h" 24 #include "unicode/uchriter.h" 25 #include "unicode/parsepos.h" 26 #include "unicode/parseerr.h" 27 #include "cmemory.h" 28 #include "cstring.h" 29 30 #include "rbbirpt.h" // Contains state table for the rbbi rules parser. 31 // generated by a Perl script. 32 #include "rbbirb.h" 33 #include "rbbinode.h" 34 #include "rbbiscan.h" 35 #include "rbbitblb.h" 36 37 #include "uassert.h" 38 39 //------------------------------------------------------------------------------ 40 // 41 // Unicode Set init strings for each of the character classes needed for parsing a rule file. 42 // (Initialized with hex values for portability to EBCDIC based machines. 43 // Really ugly, but there's no good way to avoid it.) 44 // 45 // The sets are referred to by name in the rbbirpt.txt, which is the 46 // source form of the state transition table for the RBBI rule parser. 47 // 48 //------------------------------------------------------------------------------ 49 static const char16_t gRuleSet_rule_char_pattern[] = { 50 // Characters that may appear as literals in patterns without escaping or quoting. 51 // [ ^ [ \ p { Z } \ u 0 0 2 0 52 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, 53 // - \ u 0 0 7 f ] - [ \ p 54 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 55 // { L } ] - [ \ p { N } ] ] 56 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; 57 58 static const char16_t gRuleSet_name_char_pattern[] = { 59 // [ _ \ p { L } \ p { N } ] 60 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; 61 62 static const char16_t gRuleSet_digit_char_pattern[] = { 63 // [ 0 - 9 ] 64 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; 65 66 static const char16_t gRuleSet_name_start_char_pattern[] = { 67 // [ _ \ p { L } ] 68 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; 69 70 static const char16_t kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" 71 72 73 U_CDECL_BEGIN 74 static void U_CALLCONV RBBISetTable_deleter(void *p) { 75 icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p; 76 delete px->key; 77 // Note: px->val is owned by the linked list "fSetsListHead" in scanner. 78 // Don't delete the value nodes here. 79 uprv_free(px); 80 } 81 U_CDECL_END 82 83 U_NAMESPACE_BEGIN 84 85 //------------------------------------------------------------------------------ 86 // 87 // Constructor. 88 // 89 //------------------------------------------------------------------------------ 90 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) 91 { 92 fRB = rb; 93 fScanIndex = 0; 94 fNextIndex = 0; 95 fQuoteMode = false; 96 fLineNum = 1; 97 fCharNum = 0; 98 fLastChar = 0; 99 100 fStateTable = nullptr; 101 fStack[0] = 0; 102 fStackPtr = 0; 103 fNodeStack[0] = nullptr; 104 fNodeStackPtr = 0; 105 106 fReverseRule = false; 107 fLookAheadRule = false; 108 fNoChainInRule = false; 109 110 fSymbolTable = nullptr; 111 fSetTable = nullptr; 112 fRuleNum = 0; 113 fOptionStart = 0; 114 115 // Do not check status until after all critical fields are sufficiently initialized 116 // that the destructor can run cleanly. 117 if (U_FAILURE(*rb->fStatus)) { 118 return; 119 } 120 121 // 122 // Set up the constant Unicode Sets. 123 // Note: These could be made static, lazily initialized, and shared among 124 // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, 125 // and the time to build these few sets should be small compared to a 126 // full break iterator build. 127 fRuleSets[kRuleSet_rule_char-128] 128 = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus); 129 // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:] 130 fRuleSets[kRuleSet_white_space-128]. 131 add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); 132 fRuleSets[kRuleSet_name_char-128] 133 = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus); 134 fRuleSets[kRuleSet_name_start_char-128] 135 = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus); 136 fRuleSets[kRuleSet_digit_char-128] 137 = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus); 138 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { 139 // This case happens if ICU's data is missing. UnicodeSet tries to look up property 140 // names from the init string, can't find them, and claims an illegal argument. 141 // Change the error so that the actual problem will be clearer to users. 142 *rb->fStatus = U_BRK_INIT_ERROR; 143 } 144 if (U_FAILURE(*rb->fStatus)) { 145 return; 146 } 147 148 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); 149 if (fSymbolTable == nullptr) { 150 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; 151 return; 152 } 153 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, rb->fStatus); 154 if (U_FAILURE(*rb->fStatus)) { 155 return; 156 } 157 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); 158 } 159 160 161 162 //------------------------------------------------------------------------------ 163 // 164 // Destructor 165 // 166 //------------------------------------------------------------------------------ 167 RBBIRuleScanner::~RBBIRuleScanner() { 168 delete fSymbolTable; 169 if (fSetTable != nullptr) { 170 uhash_close(fSetTable); 171 fSetTable = nullptr; 172 173 } 174 175 176 // Node Stack. 177 // Normally has one entry, which is the entire parse tree for the rules. 178 // If errors occurred, there may be additional subtrees left on the stack. 179 while (fNodeStackPtr > 0) { 180 delete fNodeStack[fNodeStackPtr]; 181 fNodeStackPtr--; 182 } 183 184 } 185 186 //------------------------------------------------------------------------------ 187 // 188 // doParseAction Do some action during rule parsing. 189 // Called by the parse state machine. 190 // Actions build the parse tree and Unicode Sets, 191 // and maintain the parse stack for nested expressions. 192 // 193 // TODO: unify EParseAction and RBBI_RuleParseAction enum types. 194 // They represent exactly the same thing. They're separate 195 // only to work around enum forward declaration restrictions 196 // in some compilers, while at the same time avoiding multiple 197 // definitions problems. I'm sure that there's a better way. 198 // 199 //------------------------------------------------------------------------------ 200 UBool RBBIRuleScanner::doParseActions(int32_t action) 201 { 202 RBBINode *n = nullptr; 203 204 UBool returnVal = true; 205 206 switch (action) { 207 208 case doExprStart: 209 pushNewNode(RBBINode::opStart); 210 fRuleNum++; 211 break; 212 213 214 case doNoChain: 215 // Scanned a '^' while on the rule start state. 216 fNoChainInRule = true; 217 break; 218 219 220 case doExprOrOperator: 221 { 222 fixOpStack(RBBINode::precOpCat); 223 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 224 RBBINode *orNode = pushNewNode(RBBINode::opOr); 225 if (U_FAILURE(*fRB->fStatus)) { 226 break; 227 } 228 orNode->fLeftChild = operandNode; 229 operandNode->fParent = orNode; 230 } 231 break; 232 233 case doExprCatOperator: 234 // concatenation operator. 235 // For the implicit concatenation of adjacent terms in an expression that are 236 // not separated by any other operator. Action is invoked between the 237 // actions for the two terms. 238 { 239 fixOpStack(RBBINode::precOpCat); 240 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 241 RBBINode *catNode = pushNewNode(RBBINode::opCat); 242 if (U_FAILURE(*fRB->fStatus)) { 243 break; 244 } 245 catNode->fLeftChild = operandNode; 246 operandNode->fParent = catNode; 247 } 248 break; 249 250 case doLParen: 251 // Open Paren. 252 // The openParen node is a dummy operation type with a low precedence, 253 // which has the affect of ensuring that any real binary op that 254 // follows within the parens binds more tightly to the operands than 255 // stuff outside of the parens. 256 pushNewNode(RBBINode::opLParen); 257 break; 258 259 case doExprRParen: 260 fixOpStack(RBBINode::precLParen); 261 break; 262 263 case doNOP: 264 break; 265 266 case doStartAssign: 267 // We've just scanned "$variable = " 268 // The top of the node stack has the $variable ref node. 269 270 // Save the start position of the RHS text in the StartExpression node 271 // that precedes the $variableReference node on the stack. 272 // This will eventually be used when saving the full $variable replacement 273 // text as a string. 274 n = fNodeStack[fNodeStackPtr-1]; 275 n->fFirstPos = fNextIndex; // move past the '=' 276 277 // Push a new start-of-expression node; needed to keep parse of the 278 // RHS expression happy. 279 pushNewNode(RBBINode::opStart); 280 break; 281 282 283 284 285 case doEndAssign: 286 { 287 // We have reached the end of an assignment statement. 288 // Current scan char is the ';' that terminates the assignment. 289 290 // Terminate expression, leaves expression parse tree rooted in TOS node. 291 fixOpStack(RBBINode::precStart); 292 if (U_FAILURE(*fRB->fStatus)) { 293 break; 294 } 295 296 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; 297 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; 298 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; 299 300 // Save original text of right side of assignment, excluding the terminating ';' 301 // in the root of the node for the right-hand-side expression. 302 RHSExprNode->fFirstPos = startExprNode->fFirstPos; 303 RHSExprNode->fLastPos = fScanIndex; 304 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); 305 306 // Expression parse tree becomes l. child of the $variable reference node. 307 varRefNode->fLeftChild = RHSExprNode; 308 RHSExprNode->fParent = varRefNode; 309 310 // Make a symbol table entry for the $variableRef node. 311 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); 312 if (U_FAILURE(*fRB->fStatus)) { 313 // This is a round-about way to get the parse position set 314 // so that duplicate symbols error messages include a line number. 315 UErrorCode t = *fRB->fStatus; 316 *fRB->fStatus = U_ZERO_ERROR; 317 error(t); 318 // When adding $variableRef to the symbol table fail, Delete 319 // both nodes because deleting varRefNode will not delete 320 // RHSExprNode internally. 321 delete RHSExprNode; 322 delete varRefNode; 323 } 324 325 // Clean up the stack. 326 delete startExprNode; 327 fNodeStackPtr-=3; 328 break; 329 } 330 331 case doEndOfRule: 332 { 333 fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression 334 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. 335 break; 336 } 337 #ifdef RBBI_DEBUG 338 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} 339 #endif 340 U_ASSERT(fNodeStackPtr == 1); 341 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 342 343 // If this rule includes a look-ahead '/', add a endMark node to the 344 // expression tree. 345 if (fLookAheadRule) { 346 RBBINode *endNode = pushNewNode(RBBINode::endMark); 347 RBBINode *catNode = pushNewNode(RBBINode::opCat); 348 if (U_FAILURE(*fRB->fStatus)) { 349 break; 350 } 351 fNodeStackPtr -= 2; 352 catNode->fLeftChild = thisRule; 353 catNode->fRightChild = endNode; 354 fNodeStack[fNodeStackPtr] = catNode; 355 endNode->fVal = fRuleNum; 356 endNode->fLookAheadEnd = true; 357 thisRule = catNode; 358 359 // TODO: Disable chaining out of look-ahead (hard break) rules. 360 // The break on rule match is forced, so there is no point in building up 361 // the state table to chain into another rule for a longer match. 362 } 363 364 // Mark this node as being the root of a rule. 365 thisRule->fRuleRoot = true; 366 367 // Flag if chaining into this rule is wanted. 368 // 369 if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain 370 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 371 thisRule->fChainIn = true; 372 } 373 374 375 // All rule expressions are ORed together. 376 // The ';' that terminates an expression really just functions as a '|' with 377 // a low operator prededence. 378 // 379 // Each of the four sets of rules are collected separately. 380 // (forward, reverse, safe_forward, safe_reverse) 381 // OR this rule into the appropriate group of them. 382 // 383 RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree); 384 385 if (*destRules != nullptr) { 386 // This is not the first rule encountered. 387 // OR previous stuff (from *destRules) 388 // with the current rule expression (on the Node Stack) 389 // with the resulting OR expression going to *destRules 390 // 391 thisRule = fNodeStack[fNodeStackPtr]; 392 RBBINode *prevRules = *destRules; 393 RBBINode *orNode = pushNewNode(RBBINode::opOr); 394 if (U_FAILURE(*fRB->fStatus)) { 395 break; 396 } 397 orNode->fLeftChild = prevRules; 398 prevRules->fParent = orNode; 399 orNode->fRightChild = thisRule; 400 thisRule->fParent = orNode; 401 *destRules = orNode; 402 } 403 else 404 { 405 // This is the first rule encountered (for this direction). 406 // Just move its parse tree from the stack to *destRules. 407 *destRules = fNodeStack[fNodeStackPtr]; 408 } 409 fReverseRule = false; // in preparation for the next rule. 410 fLookAheadRule = false; 411 fNoChainInRule = false; 412 fNodeStackPtr = 0; 413 } 414 break; 415 416 417 case doRuleError: 418 error(U_BRK_RULE_SYNTAX); 419 returnVal = false; 420 break; 421 422 423 case doVariableNameExpectedErr: 424 error(U_BRK_RULE_SYNTAX); 425 break; 426 427 428 // 429 // Unary operands + ? * 430 // These all appear after the operand to which they apply. 431 // When we hit one, the operand (may be a whole sub expression) 432 // will be on the top of the stack. 433 // Unary Operator becomes TOS, with the old TOS as its one child. 434 case doUnaryOpPlus: 435 { 436 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 437 RBBINode *plusNode = pushNewNode(RBBINode::opPlus); 438 if (U_FAILURE(*fRB->fStatus)) { 439 break; 440 } 441 plusNode->fLeftChild = operandNode; 442 operandNode->fParent = plusNode; 443 } 444 break; 445 446 case doUnaryOpQuestion: 447 { 448 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 449 RBBINode *qNode = pushNewNode(RBBINode::opQuestion); 450 if (U_FAILURE(*fRB->fStatus)) { 451 break; 452 } 453 qNode->fLeftChild = operandNode; 454 operandNode->fParent = qNode; 455 } 456 break; 457 458 case doUnaryOpStar: 459 { 460 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 461 RBBINode *starNode = pushNewNode(RBBINode::opStar); 462 if (U_FAILURE(*fRB->fStatus)) { 463 break; 464 } 465 starNode->fLeftChild = operandNode; 466 operandNode->fParent = starNode; 467 } 468 break; 469 470 case doRuleChar: 471 // A "Rule Character" is any single character that is a literal part 472 // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]" 473 // These are pretty uncommon in break rules; the terms are more commonly 474 // sets. To keep things uniform, treat these characters like as 475 // sets that just happen to contain only one character. 476 { 477 n = pushNewNode(RBBINode::setRef); 478 if (U_FAILURE(*fRB->fStatus)) { 479 break; 480 } 481 findSetFor(UnicodeString(fC.fChar), n); 482 n->fFirstPos = fScanIndex; 483 n->fLastPos = fNextIndex; 484 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 485 break; 486 } 487 488 case doDotAny: 489 // scanned a ".", meaning match any single character. 490 { 491 n = pushNewNode(RBBINode::setRef); 492 if (U_FAILURE(*fRB->fStatus)) { 493 break; 494 } 495 findSetFor(UnicodeString(true, kAny, 3), n); 496 n->fFirstPos = fScanIndex; 497 n->fLastPos = fNextIndex; 498 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 499 break; 500 } 501 502 case doSlash: 503 // Scanned a '/', which identifies a look-ahead break position in a rule. 504 n = pushNewNode(RBBINode::lookAhead); 505 if (U_FAILURE(*fRB->fStatus)) { 506 break; 507 } 508 n->fVal = fRuleNum; 509 n->fFirstPos = fScanIndex; 510 n->fLastPos = fNextIndex; 511 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 512 fLookAheadRule = true; 513 break; 514 515 516 case doStartTagValue: 517 // Scanned a '{', the opening delimiter for a tag value within a rule. 518 n = pushNewNode(RBBINode::tag); 519 if (U_FAILURE(*fRB->fStatus)) { 520 break; 521 } 522 n->fVal = 0; 523 n->fFirstPos = fScanIndex; 524 n->fLastPos = fNextIndex; 525 break; 526 527 case doTagDigit: 528 // Just scanned a decimal digit that's part of a tag value 529 { 530 n = fNodeStack[fNodeStackPtr]; 531 uint32_t v = u_charDigitValue(fC.fChar); 532 U_ASSERT(v < 10); 533 int64_t updated = static_cast<int64_t>(n->fVal)*10 + v; 534 // Avoid overflow n->fVal 535 if (updated > INT32_MAX) { 536 error(U_BRK_RULE_SYNTAX); 537 break; 538 } 539 n->fVal = static_cast<int32_t>(updated); 540 break; 541 } 542 543 case doTagValue: 544 n = fNodeStack[fNodeStackPtr]; 545 n->fLastPos = fNextIndex; 546 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 547 break; 548 549 case doTagExpectedError: 550 error(U_BRK_MALFORMED_RULE_TAG); 551 returnVal = false; 552 break; 553 554 case doOptionStart: 555 // Scanning a !!option. At the start of string. 556 fOptionStart = fScanIndex; 557 break; 558 559 case doOptionEnd: 560 { 561 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); 562 if (opt == UNICODE_STRING("chain", 5)) { 563 fRB->fChainRules = true; 564 } else if (opt == UNICODE_STRING("forward", 7)) { 565 fRB->fDefaultTree = &fRB->fForwardTree; 566 } else if (opt == UNICODE_STRING("reverse", 7)) { 567 fRB->fDefaultTree = &fRB->fReverseTree; 568 } else if (opt == UNICODE_STRING("safe_forward", 12)) { 569 fRB->fDefaultTree = &fRB->fSafeFwdTree; 570 } else if (opt == UNICODE_STRING("safe_reverse", 12)) { 571 fRB->fDefaultTree = &fRB->fSafeRevTree; 572 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { 573 fRB->fLookAheadHardBreak = true; 574 } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) { 575 fRuleSets[kRuleSet_rule_char-128].clear(); 576 } else if (opt == UNICODE_STRING("unquoted_literals", 17)) { 577 fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus); 578 } else { 579 error(U_BRK_UNRECOGNIZED_OPTION); 580 } 581 } 582 break; 583 584 case doReverseDir: 585 fReverseRule = true; 586 break; 587 588 case doStartVariableName: 589 n = pushNewNode(RBBINode::varRef); 590 if (U_FAILURE(*fRB->fStatus)) { 591 break; 592 } 593 n->fFirstPos = fScanIndex; 594 break; 595 596 case doEndVariableName: 597 n = fNodeStack[fNodeStackPtr]; 598 if (n==nullptr || n->fType != RBBINode::varRef) { 599 error(U_BRK_INTERNAL_ERROR); 600 break; 601 } 602 n->fLastPos = fScanIndex; 603 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); 604 // Look the newly scanned name up in the symbol table 605 // If there's an entry, set the l. child of the var ref to the replacement expression. 606 // (We also pass through here when scanning assignments, but no harm is done, other 607 // than a slight wasted effort that seems hard to avoid. Lookup will be null) 608 n->fLeftChild = fSymbolTable->lookupNode(n->fText); 609 break; 610 611 case doCheckVarDef: 612 n = fNodeStack[fNodeStackPtr]; 613 if (n->fLeftChild == nullptr) { 614 error(U_BRK_UNDEFINED_VARIABLE); 615 returnVal = false; 616 } 617 break; 618 619 case doExprFinished: 620 break; 621 622 case doRuleErrorAssignExpr: 623 error(U_BRK_ASSIGN_ERROR); 624 returnVal = false; 625 break; 626 627 case doExit: 628 returnVal = false; 629 break; 630 631 case doScanUnicodeSet: 632 scanSet(); 633 break; 634 635 default: 636 error(U_BRK_INTERNAL_ERROR); 637 returnVal = false; 638 break; 639 } 640 return returnVal && U_SUCCESS(*fRB->fStatus); 641 } 642 643 644 645 646 //------------------------------------------------------------------------------ 647 // 648 // Error Report a rule parse error. 649 // Only report it if no previous error has been recorded. 650 // 651 //------------------------------------------------------------------------------ 652 void RBBIRuleScanner::error(UErrorCode e) { 653 if (U_SUCCESS(*fRB->fStatus)) { 654 *fRB->fStatus = e; 655 if (fRB->fParseError) { 656 fRB->fParseError->line = fLineNum; 657 fRB->fParseError->offset = fCharNum; 658 fRB->fParseError->preContext[0] = 0; 659 fRB->fParseError->postContext[0] = 0; 660 } 661 } 662 } 663 664 665 666 667 //------------------------------------------------------------------------------ 668 // 669 // fixOpStack The parse stack holds partially assembled chunks of the parse tree. 670 // An entry on the stack may be as small as a single setRef node, 671 // or as large as the parse tree 672 // for an entire expression (this will be the one item left on the stack 673 // when the parsing of an RBBI rule completes. 674 // 675 // This function is called when a binary operator is encountered. 676 // It looks back up the stack for operators that are not yet associated 677 // with a right operand, and if the precedence of the stacked operator >= 678 // the precedence of the current operator, binds the operand left, 679 // to the previously encountered operator. 680 // 681 //------------------------------------------------------------------------------ 682 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { 683 RBBINode *n; 684 // printNodeStack("entering fixOpStack()"); 685 for (;;) { 686 n = fNodeStack[fNodeStackPtr-1]; // an operator node 687 if (n->fPrecedence == 0) { 688 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); 689 error(U_BRK_INTERNAL_ERROR); 690 return; 691 } 692 693 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { 694 // The most recent operand goes with the current operator, 695 // not with the previously stacked one. 696 break; 697 } 698 // Stack operator is a binary op ( '|' or concatenation) 699 // TOS operand becomes right child of this operator. 700 // Resulting subexpression becomes the TOS operand. 701 n->fRightChild = fNodeStack[fNodeStackPtr]; 702 fNodeStack[fNodeStackPtr]->fParent = n; 703 fNodeStackPtr--; 704 // printNodeStack("looping in fixOpStack() "); 705 } 706 707 if (p <= RBBINode::precLParen) { 708 // Scan is at a right paren or end of expression. 709 // The scanned item must match the stack, or else there was an error. 710 // Discard the left paren (or start expr) node from the stack, 711 // leaving the completed (sub)expression as TOS. 712 if (n->fPrecedence != p) { 713 // Right paren encountered matched start of expression node, or 714 // end of expression matched with a left paren node. 715 error(U_BRK_MISMATCHED_PAREN); 716 } 717 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; 718 fNodeStackPtr--; 719 // Delete the now-discarded LParen or Start node. 720 delete n; 721 } 722 // printNodeStack("leaving fixOpStack()"); 723 } 724 725 726 727 728 //------------------------------------------------------------------------------ 729 // 730 // findSetFor given a UnicodeString, 731 // - find the corresponding Unicode Set (uset node) 732 // (create one if necessary) 733 // - Set fLeftChild of the caller's node (should be a setRef node) 734 // to the uset node 735 // Maintain a hash table of uset nodes, so the same one is always used 736 // for the same string. 737 // If a "to adopt" set is provided and we haven't seen this key before, 738 // add the provided set to the hash table. 739 // If the string is one (32 bit) char in length, the set contains 740 // just one element which is the char in question. 741 // If the string is "any", return a set containing all chars. 742 // 743 //------------------------------------------------------------------------------ 744 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { 745 746 RBBISetTableEl *el; 747 748 // First check whether we've already cached a set for this string. 749 // If so, just use the cached set in the new node. 750 // delete any set provided by the caller, since we own it. 751 el = static_cast<RBBISetTableEl*>(uhash_get(fSetTable, &s)); 752 if (el != nullptr) { 753 delete setToAdopt; 754 node->fLeftChild = el->val; 755 U_ASSERT(node->fLeftChild->fType == RBBINode::uset); 756 return; 757 } 758 759 // Haven't seen this set before. 760 // If the caller didn't provide us with a prebuilt set, 761 // create a new UnicodeSet now. 762 if (setToAdopt == nullptr) { 763 if (s.compare(kAny, -1) == 0) { 764 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 765 } else { 766 UChar32 c; 767 c = s.char32At(0); 768 setToAdopt = new UnicodeSet(c, c); 769 } 770 if (setToAdopt == nullptr) { 771 error(U_MEMORY_ALLOCATION_ERROR); 772 return; 773 } 774 } 775 776 // 777 // Make a new uset node to refer to this UnicodeSet 778 // This new uset node becomes the child of the caller's setReference node. 779 // 780 UErrorCode localStatus = U_ZERO_ERROR; 781 RBBINode *usetNode = new RBBINode(RBBINode::uset, localStatus); 782 if (usetNode == nullptr) { 783 localStatus = U_MEMORY_ALLOCATION_ERROR; 784 } 785 if (U_FAILURE(localStatus)) { 786 delete usetNode; 787 error(localStatus); 788 delete setToAdopt; 789 return; 790 } 791 usetNode->fInputSet = setToAdopt; 792 usetNode->fParent = node; 793 node->fLeftChild = usetNode; 794 usetNode->fText = s; 795 796 797 // 798 // Add the new uset node to the list of all uset nodes. 799 // 800 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); 801 802 803 // 804 // Add the new set to the set hash table. 805 // 806 el = static_cast<RBBISetTableEl*>(uprv_malloc(sizeof(RBBISetTableEl))); 807 UnicodeString *tkey = new UnicodeString(s); 808 if (tkey == nullptr || el == nullptr || setToAdopt == nullptr) { 809 // Delete to avoid memory leak 810 delete tkey; 811 tkey = nullptr; 812 uprv_free(el); 813 el = nullptr; 814 delete setToAdopt; 815 setToAdopt = nullptr; 816 817 error(U_MEMORY_ALLOCATION_ERROR); 818 return; 819 } 820 el->key = tkey; 821 el->val = usetNode; 822 uhash_put(fSetTable, el->key, el, fRB->fStatus); 823 } 824 825 826 827 // 828 // Assorted Unicode character constants. 829 // Numeric because there is no portable way to enter them as literals. 830 // (Think EBCDIC). 831 // 832 static const char16_t chCR = 0x0d; // New lines, for terminating comments. 833 static const char16_t chLF = 0x0a; 834 static const char16_t chNEL = 0x85; // NEL newline variant 835 static const char16_t chLS = 0x2028; // Unicode Line Separator 836 static const char16_t chApos = 0x27; // single quote, for quoted chars. 837 static const char16_t chPound = 0x23; // '#', introduces a comment. 838 static const char16_t chBackSlash = 0x5c; // '\' introduces a char escape 839 static const char16_t chLParen = 0x28; 840 static const char16_t chRParen = 0x29; 841 842 843 //------------------------------------------------------------------------------ 844 // 845 // stripRules Return a rules string without extra spaces. 846 // (Comments are removed separately, during rule parsing.) 847 // 848 //------------------------------------------------------------------------------ 849 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { 850 UnicodeString strippedRules; 851 int32_t rulesLength = rules.length(); 852 853 for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) { 854 UChar32 cp = rules.char32At(idx); 855 bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE); 856 if (whiteSpace) { 857 continue; 858 } 859 strippedRules.append(cp); 860 } 861 return strippedRules; 862 } 863 864 865 //------------------------------------------------------------------------------ 866 // 867 // nextCharLL Low Level Next Char from rule input source. 868 // Get a char from the input character iterator, 869 // keep track of input position for error reporting. 870 // 871 //------------------------------------------------------------------------------ 872 UChar32 RBBIRuleScanner::nextCharLL() { 873 UChar32 ch; 874 875 if (fNextIndex >= fRB->fRules.length()) { 876 return static_cast<UChar32>(-1); 877 } 878 ch = fRB->fRules.char32At(fNextIndex); 879 if (U_IS_SURROGATE(ch)) { 880 error(U_ILLEGAL_CHAR_FOUND); 881 return U_SENTINEL; 882 } 883 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); 884 885 if (ch == chCR || 886 ch == chNEL || 887 ch == chLS || 888 (ch == chLF && fLastChar != chCR)) { 889 // Character is starting a new line. Bump up the line number, and 890 // reset the column to 0. 891 fLineNum++; 892 fCharNum=0; 893 if (fQuoteMode) { 894 error(U_BRK_NEW_LINE_IN_QUOTED_STRING); 895 fQuoteMode = false; 896 } 897 } 898 else { 899 // Character is not starting a new line. Except in the case of a 900 // LF following a CR, increment the column position. 901 if (ch != chLF) { 902 fCharNum++; 903 } 904 } 905 fLastChar = ch; 906 return ch; 907 } 908 909 910 //------------------------------------------------------------------------------ 911 // 912 // nextChar for rules scanning. At this level, we handle stripping 913 // out comments and processing backslash character escapes. 914 // The rest of the rules grammar is handled at the next level up. 915 // 916 //------------------------------------------------------------------------------ 917 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { 918 919 // Unicode Character constants needed for the processing done by nextChar(), 920 // in hex because literals wont work on EBCDIC machines. 921 922 fScanIndex = fNextIndex; 923 c.fChar = nextCharLL(); 924 c.fEscaped = false; 925 926 // 927 // check for '' sequence. 928 // These are recognized in all contexts, whether in quoted text or not. 929 // 930 if (c.fChar == chApos) { 931 if (fRB->fRules.char32At(fNextIndex) == chApos) { 932 c.fChar = nextCharLL(); // get nextChar officially so character counts 933 c.fEscaped = true; // stay correct. 934 } 935 else 936 { 937 // Single quote, by itself. 938 // Toggle quoting mode. 939 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 940 fQuoteMode = !fQuoteMode; 941 if (fQuoteMode) { 942 c.fChar = chLParen; 943 } else { 944 c.fChar = chRParen; 945 } 946 c.fEscaped = false; // The paren that we return is not escaped. 947 return; 948 } 949 } 950 951 if (c.fChar == static_cast<UChar32>(-1)) { 952 return; 953 } 954 if (fQuoteMode) { 955 c.fEscaped = true; 956 } 957 else 958 { 959 // We are not in a 'quoted region' of the source. 960 // 961 if (c.fChar == chPound) { 962 // Start of a comment. Consume the rest of it. 963 // The new-line char that terminates the comment is always returned. 964 // It will be treated as white-space, and serves to break up anything 965 // that might otherwise incorrectly clump together with a comment in 966 // the middle (a variable name, for example.) 967 int32_t commentStart = fScanIndex; 968 for (;;) { 969 c.fChar = nextCharLL(); 970 if (c.fChar == static_cast<UChar32>(-1) || // EOF 971 c.fChar == chCR || 972 c.fChar == chLF || 973 c.fChar == chNEL || 974 c.fChar == chLS) {break;} 975 } 976 for (int32_t i=commentStart; i<fNextIndex-1; ++i) { 977 fRB->fStrippedRules.setCharAt(i, u' '); 978 } 979 } 980 if (c.fChar == static_cast<UChar32>(-1)) { 981 return; 982 } 983 984 // 985 // check for backslash escaped characters. 986 // Use UnicodeString::unescapeAt() to handle them. 987 // 988 if (c.fChar == chBackSlash) { 989 c.fEscaped = true; 990 int32_t startX = fNextIndex; 991 c.fChar = fRB->fRules.unescapeAt(fNextIndex); 992 if (fNextIndex == startX) { 993 error(U_BRK_HEX_DIGITS_EXPECTED); 994 } 995 fCharNum += fNextIndex-startX; 996 } 997 } 998 // putc(c.fChar, stdout); 999 } 1000 1001 //------------------------------------------------------------------------------ 1002 // 1003 // Parse RBBI rules. The state machine for rules parsing is here. 1004 // The state tables are hand-written in the file rbbirpt.txt, 1005 // and converted to the form used here by a perl 1006 // script rbbicst.pl 1007 // 1008 //------------------------------------------------------------------------------ 1009 void RBBIRuleScanner::parse() { 1010 uint16_t state; 1011 const RBBIRuleTableEl *tableEl; 1012 1013 if (U_FAILURE(*fRB->fStatus)) { 1014 return; 1015 } 1016 1017 state = 1; 1018 nextChar(fC); 1019 // 1020 // Main loop for the rule parsing state machine. 1021 // Runs once per state transition. 1022 // Each time through optionally performs, depending on the state table, 1023 // - an advance to the next input char 1024 // - an action to be performed. 1025 // - pushing or popping a state to/from the local state return stack. 1026 // 1027 for (;;) { 1028 // Bail out if anything has gone wrong. 1029 // RBBI rule file parsing stops on the first error encountered. 1030 if (U_FAILURE(*fRB->fStatus)) { 1031 break; 1032 } 1033 1034 // Quit if state == 0. This is the normal way to exit the state machine. 1035 // 1036 if (state == 0) { 1037 break; 1038 } 1039 1040 // Find the state table element that matches the input char from the rule, or the 1041 // class of the input character. Start with the first table row for this 1042 // state, then linearly scan forward until we find a row that matches the 1043 // character. The last row for each state always matches all characters, so 1044 // the search will stop there, if not before. 1045 // 1046 tableEl = &gRuleParseStateTable[state]; 1047 #ifdef RBBI_DEBUG 1048 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { 1049 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", 1050 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); 1051 } 1052 #endif 1053 1054 for (;;) { 1055 #ifdef RBBI_DEBUG 1056 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);} 1057 #endif 1058 if (tableEl->fCharClass < 127 && fC.fEscaped == false && tableEl->fCharClass == fC.fChar) { 1059 // Table row specified an individual character, not a set, and 1060 // the input character is not escaped, and 1061 // the input character matched it. 1062 break; 1063 } 1064 if (tableEl->fCharClass == 255) { 1065 // Table row specified default, match anything character class. 1066 break; 1067 } 1068 if (tableEl->fCharClass == 254 && fC.fEscaped) { 1069 // Table row specified "escaped" and the char was escaped. 1070 break; 1071 } 1072 if (tableEl->fCharClass == 253 && fC.fEscaped && 1073 (fC.fChar == 0x50 || fC.fChar == 0x70 )) { 1074 // Table row specified "escaped P" and the char is either 'p' or 'P'. 1075 break; 1076 } 1077 if (tableEl->fCharClass == 252 && fC.fChar == static_cast<UChar32>(-1)) { 1078 // Table row specified eof and we hit eof on the input. 1079 break; 1080 } 1081 1082 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 1083 fC.fEscaped == false && // char is not escaped && 1084 fC.fChar != static_cast<UChar32>(-1)) { // char is not EOF 1085 U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets)); 1086 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 1087 // Table row specified a character class, or set of characters, 1088 // and the current char matches it. 1089 break; 1090 } 1091 } 1092 1093 // No match on this row, advance to the next row for this state, 1094 tableEl++; 1095 } 1096 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} 1097 1098 // 1099 // We've found the row of the state table that matches the current input 1100 // character from the rules string. 1101 // Perform any action specified by this row in the state table. 1102 if (doParseActions(static_cast<int32_t>(tableEl->fAction)) == false) { 1103 // Break out of the state machine loop if the 1104 // the action signalled some kind of error, or 1105 // the action was to exit, occurs on normal end-of-rules-input. 1106 break; 1107 } 1108 1109 if (tableEl->fPushState != 0) { 1110 fStackPtr++; 1111 if (fStackPtr >= kStackSize) { 1112 error(U_BRK_INTERNAL_ERROR); 1113 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); 1114 fStackPtr--; 1115 } 1116 fStack[fStackPtr] = tableEl->fPushState; 1117 } 1118 1119 if (tableEl->fNextChar) { 1120 nextChar(fC); 1121 } 1122 1123 // Get the next state from the table entry, or from the 1124 // state stack if the next state was specified as "pop". 1125 if (tableEl->fNextState != 255) { 1126 state = tableEl->fNextState; 1127 } else { 1128 state = fStack[fStackPtr]; 1129 fStackPtr--; 1130 if (fStackPtr < 0) { 1131 error(U_BRK_INTERNAL_ERROR); 1132 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); 1133 fStackPtr++; 1134 } 1135 } 1136 1137 } 1138 1139 if (U_FAILURE(*fRB->fStatus)) { 1140 return; 1141 } 1142 1143 // If there are no forward rules set an error. 1144 // 1145 if (fRB->fForwardTree == nullptr) { 1146 error(U_BRK_RULE_SYNTAX); 1147 return; 1148 } 1149 1150 // 1151 // Parsing of the input RBBI rules is complete. 1152 // We now have a parse tree for the rule expressions 1153 // and a list of all UnicodeSets that are referenced. 1154 // 1155 #ifdef RBBI_DEBUG 1156 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} 1157 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) { 1158 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); 1159 RBBINode::printTree(fRB->fForwardTree, true); 1160 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); 1161 RBBINode::printTree(fRB->fReverseTree, true); 1162 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); 1163 RBBINode::printTree(fRB->fSafeFwdTree, true); 1164 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); 1165 RBBINode::printTree(fRB->fSafeRevTree, true); 1166 } 1167 #endif 1168 } 1169 1170 1171 //------------------------------------------------------------------------------ 1172 // 1173 // printNodeStack for debugging... 1174 // 1175 //------------------------------------------------------------------------------ 1176 #ifdef RBBI_DEBUG 1177 void RBBIRuleScanner::printNodeStack(const char *title) { 1178 int i; 1179 RBBIDebugPrintf("%s. Dumping node stack...\n", title); 1180 for (i=fNodeStackPtr; i>0; i--) {RBBINode::printTree(fNodeStack[i], true);} 1181 } 1182 #endif 1183 1184 1185 1186 1187 //------------------------------------------------------------------------------ 1188 // 1189 // pushNewNode create a new RBBINode of the specified type and push it 1190 // onto the stack of nodes. 1191 // 1192 //------------------------------------------------------------------------------ 1193 RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { 1194 if (U_FAILURE(*fRB->fStatus)) { 1195 return nullptr; 1196 } 1197 if (fNodeStackPtr >= kStackSize - 1) { 1198 error(U_BRK_RULE_SYNTAX); 1199 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); 1200 return nullptr; 1201 } 1202 fNodeStackPtr++; 1203 fNodeStack[fNodeStackPtr] = new RBBINode(t, *fRB->fStatus); 1204 if (fNodeStack[fNodeStackPtr] == nullptr) { 1205 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; 1206 } 1207 return fNodeStack[fNodeStackPtr]; 1208 } 1209 1210 1211 1212 //------------------------------------------------------------------------------ 1213 // 1214 // scanSet Construct a UnicodeSet from the text at the current scan 1215 // position. Advance the scan position to the first character 1216 // after the set. 1217 // 1218 // A new RBBI setref node referring to the set is pushed onto the node 1219 // stack. 1220 // 1221 // The scan position is normally under the control of the state machine 1222 // that controls rule parsing. UnicodeSets, however, are parsed by 1223 // the UnicodeSet constructor, not by the RBBI rule parser. 1224 // 1225 //------------------------------------------------------------------------------ 1226 void RBBIRuleScanner::scanSet() { 1227 ParsePosition pos; 1228 int startPos; 1229 int i; 1230 1231 if (U_FAILURE(*fRB->fStatus)) { 1232 return; 1233 } 1234 1235 pos.setIndex(fScanIndex); 1236 startPos = fScanIndex; 1237 UErrorCode localStatus = U_ZERO_ERROR; 1238 LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus); 1239 if (U_FAILURE(localStatus)) { 1240 error(localStatus); 1241 return; 1242 } 1243 uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); 1244 if (U_FAILURE(localStatus)) { 1245 // TODO: Get more accurate position of the error from UnicodeSet's return info. 1246 // UnicodeSet appears to not be reporting correctly at this time. 1247 #ifdef RBBI_DEBUG 1248 RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex()); 1249 #endif 1250 error(localStatus); 1251 return; 1252 } 1253 1254 // Verify that the set contains at least one code point. 1255 // 1256 U_ASSERT(uset.isValid()); 1257 UnicodeSet tempSet(*uset); 1258 // Use tempSet to handle the case that the UnicodeSet contains 1259 // only string element, such as [{ab}] and treat it as empty set. 1260 tempSet.removeAllStrings(); 1261 if (tempSet.isEmpty()) { 1262 // This set is empty. 1263 // Make it an error, because it almost certainly is not what the user wanted. 1264 // Also, avoids having to think about corner cases in the tree manipulation code 1265 // that occurs later on. 1266 error(U_BRK_RULE_EMPTY_SET); 1267 return; 1268 } 1269 1270 1271 // Advance the RBBI parse position over the UnicodeSet pattern. 1272 // Don't just set fScanIndex because the line/char positions maintained 1273 // for error reporting would be thrown off. 1274 i = pos.getIndex(); 1275 for (;U_SUCCESS(*fRB->fStatus);) { 1276 if (fNextIndex >= i) { 1277 break; 1278 } 1279 nextCharLL(); 1280 } 1281 1282 if (U_SUCCESS(*fRB->fStatus)) { 1283 RBBINode *n; 1284 1285 n = pushNewNode(RBBINode::setRef); 1286 if (U_FAILURE(*fRB->fStatus)) { 1287 return; 1288 } 1289 n->fFirstPos = startPos; 1290 n->fLastPos = fNextIndex; 1291 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1292 // findSetFor() serves several purposes here: 1293 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1294 // - Maintains collection of all sets in use, needed later for establishing 1295 // character categories for run time engine. 1296 // - Eliminates mulitiple instances of the same set. 1297 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1298 findSetFor(n->fText, n, uset.orphan()); 1299 } 1300 1301 } 1302 1303 int32_t RBBIRuleScanner::numRules() { 1304 return fRuleNum; 1305 } 1306 1307 U_NAMESPACE_END 1308 1309 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */