regexcmp.cpp (183548B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // file: regexcmp.cpp 5 // 6 // Copyright (C) 2002-2016 International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains the ICU regular expression compiler, which is responsible 10 // for processing a regular expression pattern into the compiled form that 11 // is used by the match finding engine. 12 // 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 17 18 #include "unicode/ustring.h" 19 #include "unicode/unistr.h" 20 #include "unicode/uniset.h" 21 #include "unicode/uchar.h" 22 #include "unicode/uchriter.h" 23 #include "unicode/parsepos.h" 24 #include "unicode/parseerr.h" 25 #include "unicode/regex.h" 26 #include "unicode/utf.h" 27 #include "unicode/utf16.h" 28 #include "patternprops.h" 29 #include "putilimp.h" 30 #include "cmemory.h" 31 #include "cstr.h" 32 #include "cstring.h" 33 #include "uvectr32.h" 34 #include "uvectr64.h" 35 #include "uassert.h" 36 #include "uinvchar.h" 37 38 #include "regeximp.h" 39 #include "regexcst.h" // Contains state table for the regex pattern parser. 40 // generated by a Perl script. 41 #include "regexcmp.h" 42 #include "regexst.h" 43 #include "regextxt.h" 44 45 46 47 U_NAMESPACE_BEGIN 48 49 50 //------------------------------------------------------------------------------ 51 // 52 // Constructor. 53 // 54 //------------------------------------------------------------------------------ 55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 56 fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status) 57 { 58 // Lazy init of all shared global sets (needed for init()'s empty text) 59 RegexStaticSets::initGlobals(&status); 60 61 fStatus = &status; 62 63 fRXPat = rxp; 64 fScanIndex = 0; 65 fLastChar = -1; 66 fPeekChar = -1; 67 fLineNum = 1; 68 fCharNum = 0; 69 fQuoteMode = false; 70 fInBackslashQuote = false; 71 fModeFlags = fRXPat->fFlags | 0x80000000; 72 fEOLComments = true; 73 74 fMatchOpenParen = -1; 75 fMatchCloseParen = -1; 76 fCaptureName = nullptr; 77 fLastSetLiteral = U_SENTINEL; 78 79 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 80 status = rxp->fDeferredStatus; 81 } 82 } 83 84 static const char16_t chAmp = 0x26; // '&' 85 static const char16_t chDash = 0x2d; // '-' 86 87 88 //------------------------------------------------------------------------------ 89 // 90 // Destructor 91 // 92 //------------------------------------------------------------------------------ 93 RegexCompile::~RegexCompile() { 94 delete fCaptureName; // Normally will be nullptr, but can exist if pattern 95 // compilation stops with a syntax error. 96 } 97 98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 99 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 100 } 101 102 //------------------------------------------------------------------------------ 103 // 104 // Compile regex pattern. The state machine for rexexp pattern parsing is here. 105 // The state tables are hand-written in the file regexcst.txt, 106 // and converted to the form used here by a perl 107 // script regexcst.pl 108 // 109 //------------------------------------------------------------------------------ 110 void RegexCompile::compile( 111 const UnicodeString &pat, // Source pat to be compiled. 112 UParseError &pp, // Error position info 113 UErrorCode &e) // Error Code 114 { 115 fRXPat->fPatternString = new UnicodeString(pat); 116 UText patternText = UTEXT_INITIALIZER; 117 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 118 119 if (U_SUCCESS(e)) { 120 compile(&patternText, pp, e); 121 utext_close(&patternText); 122 } 123 } 124 125 // 126 // compile, UText mode 127 // All the work is actually done here. 128 // 129 void RegexCompile::compile( 130 UText *pat, // Source pat to be compiled. 131 UParseError &pp, // Error position info 132 UErrorCode &e) // Error Code 133 { 134 fStatus = &e; 135 fParseErr = &pp; 136 fStackPtr = 0; 137 fStack[fStackPtr] = 0; 138 139 if (U_FAILURE(*fStatus)) { 140 return; 141 } 142 143 // There should be no pattern stuff in the RegexPattern object. They can not be reused. 144 U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0); 145 146 // Prepare the RegexPattern object to receive the compiled pattern. 147 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, false, true, fStatus); 148 if (U_FAILURE(*fStatus)) { 149 return; 150 } 151 152 // Initialize the pattern scanning state machine 153 fPatternLength = utext_nativeLength(pat); 154 uint16_t state = 1; 155 const RegexTableEl *tableEl; 156 157 // UREGEX_LITERAL force entire pattern to be treated as a literal string. 158 if (fModeFlags & UREGEX_LITERAL) { 159 fQuoteMode = true; 160 } 161 162 nextChar(fC); // Fetch the first char from the pattern string. 163 164 // 165 // Main loop for the regex pattern parsing state machine. 166 // Runs once per state transition. 167 // Each time through optionally performs, depending on the state table, 168 // - an advance to the next pattern char 169 // - an action to be performed. 170 // - pushing or popping a state to/from the local state return stack. 171 // file regexcst.txt is the source for the state table. The logic behind 172 // recongizing the pattern syntax is there, not here. 173 // 174 for (;;) { 175 // Bail out if anything has gone wrong. 176 // Regex pattern parsing stops on the first error encountered. 177 if (U_FAILURE(*fStatus)) { 178 break; 179 } 180 181 U_ASSERT(state != 0); 182 183 // Find the state table element that matches the input char from the pattern, or the 184 // class of the input character. Start with the first table row for this 185 // state, then linearly scan forward until we find a row that matches the 186 // character. The last row for each state always matches all characters, so 187 // the search will stop there, if not before. 188 // 189 tableEl = &gRuleParseStateTable[state]; 190 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 191 fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 192 193 for (;;) { // loop through table rows belonging to this state, looking for one 194 // that matches the current input char. 195 REGEX_SCAN_DEBUG_PRINTF((".")); 196 if (tableEl->fCharClass < 127 && fC.fQuoted == false && tableEl->fCharClass == fC.fChar) { 197 // Table row specified an individual character, not a set, and 198 // the input character is not quoted, and 199 // the input character matched it. 200 break; 201 } 202 if (tableEl->fCharClass == 255) { 203 // Table row specified default, match anything character class. 204 break; 205 } 206 if (tableEl->fCharClass == 254 && fC.fQuoted) { 207 // Table row specified "quoted" and the char was quoted. 208 break; 209 } 210 if (tableEl->fCharClass == 253 && fC.fChar == static_cast<UChar32>(-1)) { 211 // Table row specified eof and we hit eof on the input. 212 break; 213 } 214 215 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 216 fC.fQuoted == false && // char is not escaped && 217 fC.fChar != static_cast<UChar32>(-1)) { // char is not EOF 218 U_ASSERT(tableEl->fCharClass <= 137); 219 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 220 // Table row specified a character class, or set of characters, 221 // and the current char matches it. 222 break; 223 } 224 } 225 226 // No match on this row, advance to the next row for this state, 227 tableEl++; 228 } 229 REGEX_SCAN_DEBUG_PRINTF(("\n")); 230 231 // 232 // We've found the row of the state table that matches the current input 233 // character from the rules string. 234 // Perform any action specified by this row in the state table. 235 if (doParseActions(tableEl->fAction) == false) { 236 // Break out of the state machine loop if the 237 // the action signalled some kind of error, or 238 // the action was to exit, occurs on normal end-of-rules-input. 239 break; 240 } 241 242 if (tableEl->fPushState != 0) { 243 fStackPtr++; 244 if (fStackPtr >= kStackSize) { 245 error(U_REGEX_INTERNAL_ERROR); 246 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 247 fStackPtr--; 248 } 249 fStack[fStackPtr] = tableEl->fPushState; 250 } 251 252 // 253 // NextChar. This is where characters are actually fetched from the pattern. 254 // Happens under control of the 'n' tag in the state table. 255 // 256 if (tableEl->fNextChar) { 257 nextChar(fC); 258 } 259 260 // Get the next state from the table entry, or from the 261 // state stack if the next state was specified as "pop". 262 if (tableEl->fNextState != 255) { 263 state = tableEl->fNextState; 264 } else { 265 state = fStack[fStackPtr]; 266 fStackPtr--; 267 if (fStackPtr < 0) { 268 // state stack underflow 269 // This will occur if the user pattern has mis-matched parentheses, 270 // with extra close parens. 271 // 272 fStackPtr++; 273 error(U_REGEX_MISMATCHED_PAREN); 274 } 275 } 276 277 } 278 279 if (U_FAILURE(*fStatus)) { 280 // Bail out if the pattern had errors. 281 return; 282 } 283 284 // 285 // The pattern has now been read and processed, and the compiled code generated. 286 // 287 288 // 289 // The pattern's fFrameSize so far has accumulated the requirements for 290 // storage for capture parentheses, counters, etc. that are encountered 291 // in the pattern. Add space for the two variables that are always 292 // present in the saved state: the input string position (int64_t) and 293 // the position in the compiled pattern. 294 // 295 allocateStackData(RESTACKFRAME_HDRCOUNT); 296 297 // 298 // Optimization pass 1: NOPs, back-references, and case-folding 299 // 300 stripNOPs(); 301 302 // 303 // Get bounds for the minimum and maximum length of a string that this 304 // pattern can match. Used to avoid looking for matches in strings that 305 // are too short. 306 // 307 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 308 309 // 310 // Optimization pass 2: match start type 311 // 312 matchStartType(); 313 314 // 315 // Set up fast latin-1 range sets 316 // 317 int32_t numSets = fRXPat->fSets->size(); 318 fRXPat->fSets8 = new Regex8BitSet[numSets]; 319 // Null pointer check. 320 if (fRXPat->fSets8 == nullptr) { 321 e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 322 return; 323 } 324 int32_t i; 325 for (i=0; i<numSets; i++) { 326 UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(i)); 327 fRXPat->fSets8[i].init(s); 328 } 329 330 } 331 332 333 334 335 336 //------------------------------------------------------------------------------ 337 // 338 // doParseAction Do some action during regex pattern parsing. 339 // Called by the parse state machine. 340 // 341 // Generation of the match engine PCode happens here, or 342 // in functions called from the parse actions defined here. 343 // 344 // 345 //------------------------------------------------------------------------------ 346 UBool RegexCompile::doParseActions(int32_t action) 347 { 348 UBool returnVal = true; 349 350 switch (static_cast<Regex_PatternParseAction>(action)) { 351 352 case doPatStart: 353 // Start of pattern compiles to: 354 //0 SAVE 2 Fall back to position of FAIL 355 //1 jmp 3 356 //2 FAIL Stop if we ever reach here. 357 //3 NOP Dummy, so start of pattern looks the same as 358 // the start of an ( grouping. 359 //4 NOP Resreved, will be replaced by a save if there are 360 // OR | operators at the top level 361 appendOp(URX_STATE_SAVE, 2); 362 appendOp(URX_JMP, 3); 363 appendOp(URX_FAIL, 0); 364 365 // Standard open nonCapture paren action emits the two NOPs and 366 // sets up the paren stack frame. 367 doParseActions(doOpenNonCaptureParen); 368 break; 369 370 case doPatFinish: 371 // We've scanned to the end of the pattern 372 // The end of pattern compiles to: 373 // URX_END 374 // which will stop the runtime match engine. 375 // Encountering end of pattern also behaves like a close paren, 376 // and forces fixups of the State Save at the beginning of the compiled pattern 377 // and of any OR operations at the top level. 378 // 379 handleCloseParen(); 380 if (fParenStack.size() > 0) { 381 // Missing close paren in pattern. 382 error(U_REGEX_MISMATCHED_PAREN); 383 } 384 385 // add the END operation to the compiled pattern. 386 appendOp(URX_END, 0); 387 388 // Terminate the pattern compilation state machine. 389 returnVal = false; 390 break; 391 392 393 394 case doOrOperator: 395 // Scanning a '|', as in (A|B) 396 { 397 // Generate code for any pending literals preceding the '|' 398 fixLiterals(false); 399 400 // Insert a SAVE operation at the start of the pattern section preceding 401 // this OR at this level. This SAVE will branch the match forward 402 // to the right hand side of the OR in the event that the left hand 403 // side fails to match and backtracks. Locate the position for the 404 // save from the location on the top of the parentheses stack. 405 int32_t savePosition = fParenStack.popi(); 406 int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(savePosition)); 407 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 408 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 409 fRXPat->fCompiledPat->setElementAt(op, savePosition); 410 411 // Append an JMP operation into the compiled pattern. The operand for 412 // the JMP will eventually be the location following the ')' for the 413 // group. This will be patched in later, when the ')' is encountered. 414 appendOp(URX_JMP, 0); 415 416 // Push the position of the newly added JMP op onto the parentheses stack. 417 // This registers if for fixup when this block's close paren is encountered. 418 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 419 420 // Append a NOP to the compiled pattern. This is the slot reserved 421 // for a SAVE in the event that there is yet another '|' following 422 // this one. 423 appendOp(URX_NOP, 0); 424 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 425 } 426 break; 427 428 429 case doBeginNamedCapture: 430 // Scanning (?<letter. 431 // The first letter of the name will come through again under doConinueNamedCapture. 432 fCaptureName = new UnicodeString(); 433 if (fCaptureName == nullptr) { 434 error(U_MEMORY_ALLOCATION_ERROR); 435 } 436 break; 437 438 case doContinueNamedCapture: 439 fCaptureName->append(fC.fChar); 440 break; 441 442 case doBadNamedCapture: 443 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 444 break; 445 446 case doOpenCaptureParen: 447 // Open Capturing Paren, possibly named. 448 // Compile to a 449 // - NOP, which later may be replaced by a save-state if the 450 // parenthesized group gets a * quantifier, followed by 451 // - START_CAPTURE n where n is stack frame offset to the capture group variables. 452 // - NOP, which may later be replaced by a save-state if there 453 // is an '|' alternation within the parens. 454 // 455 // Each capture group gets three slots in the save stack frame: 456 // 0: Capture Group start position (in input string being matched.) 457 // 1: Capture Group end position. 458 // 2: Start of Match-in-progress. 459 // The first two locations are for a completed capture group, and are 460 // referred to by back references and the like. 461 // The third location stores the capture start position when an START_CAPTURE is 462 // encountered. This will be promoted to a completed capture when (and if) the corresponding 463 // END_CAPTURE is encountered. 464 { 465 fixLiterals(); 466 appendOp(URX_NOP, 0); 467 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame. 468 appendOp(URX_START_CAPTURE, varsLoc); 469 appendOp(URX_NOP, 0); 470 471 // On the Parentheses stack, start a new frame and add the positions 472 // of the two NOPs. Depending on what follows in the pattern, the 473 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 474 // address of the end of the parenthesized group. 475 fParenStack.push(fModeFlags, *fStatus); // Match mode state 476 fParenStack.push(capturing, *fStatus); // Frame type. 477 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 478 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 479 480 // Save the mapping from group number to stack frame variable position. 481 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 482 483 // If this is a named capture group, add the name->group number mapping. 484 if (fCaptureName != nullptr) { 485 if (!fRXPat->initNamedCaptureMap()) { 486 if (U_SUCCESS(*fStatus)) { 487 error(fRXPat->fDeferredStatus); 488 } 489 break; 490 } 491 int32_t groupNumber = fRXPat->fGroupMap->size(); 492 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus); 493 fCaptureName = nullptr; // hash table takes ownership of the name (key) string. 494 if (previousMapping > 0 && U_SUCCESS(*fStatus)) { 495 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 496 } 497 } 498 } 499 break; 500 501 case doOpenNonCaptureParen: 502 // Open non-caputuring (grouping only) Paren. 503 // Compile to a 504 // - NOP, which later may be replaced by a save-state if the 505 // parenthesized group gets a * quantifier, followed by 506 // - NOP, which may later be replaced by a save-state if there 507 // is an '|' alternation within the parens. 508 { 509 fixLiterals(); 510 appendOp(URX_NOP, 0); 511 appendOp(URX_NOP, 0); 512 513 // On the Parentheses stack, start a new frame and add the positions 514 // of the two NOPs. 515 fParenStack.push(fModeFlags, *fStatus); // Match mode state 516 fParenStack.push(plain, *fStatus); // Begin a new frame. 517 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 518 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 519 } 520 break; 521 522 523 case doOpenAtomicParen: 524 // Open Atomic Paren. (?> 525 // Compile to a 526 // - NOP, which later may be replaced if the parenthesized group 527 // has a quantifier, followed by 528 // - STO_SP save state stack position, so it can be restored at the ")" 529 // - NOP, which may later be replaced by a save-state if there 530 // is an '|' alternation within the parens. 531 { 532 fixLiterals(); 533 appendOp(URX_NOP, 0); 534 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr. 535 appendOp(URX_STO_SP, varLoc); 536 appendOp(URX_NOP, 0); 537 538 // On the Parentheses stack, start a new frame and add the positions 539 // of the two NOPs. Depending on what follows in the pattern, the 540 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 541 // address of the end of the parenthesized group. 542 fParenStack.push(fModeFlags, *fStatus); // Match mode state 543 fParenStack.push(atomic, *fStatus); // Frame type. 544 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 545 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 546 } 547 break; 548 549 550 case doOpenLookAhead: 551 // Positive Look-ahead (?= stuff ) 552 // 553 // Note: Addition of transparent input regions, with the need to 554 // restore the original regions when failing out of a lookahead 555 // block, complicated this sequence. Some combined opcodes 556 // might make sense - or might not, lookahead aren't that common. 557 // 558 // Caution: min match length optimization knows about this 559 // sequence; don't change without making updates there too. 560 // 561 // Compiles to 562 // 1 LA_START dataLoc Saves SP, Input Pos, Active input region. 563 // 2. STATE_SAVE 4 on failure of lookahead, goto 4 564 // 3 JMP 6 continue ... 565 // 566 // 4. LA_END Look Ahead failed. Restore regions. 567 // 5. BACKTRACK and back track again. 568 // 569 // 6. NOP reserved for use by quantifiers on the block. 570 // Look-ahead can't have quantifiers, but paren stack 571 // compile time conventions require the slot anyhow. 572 // 7. NOP may be replaced if there is are '|' ops in the block. 573 // 8. code for parenthesized stuff. 574 // 9. LA_END 575 // 576 // Four data slots are reserved, for saving state on entry to the look-around 577 // 0: stack pointer on entry. 578 // 1: input position on entry. 579 // 2: fActiveStart, the active bounds start on entry. 580 // 3: fActiveLimit, the active bounds limit on entry. 581 { 582 fixLiterals(); 583 int32_t dataLoc = allocateData(4); 584 appendOp(URX_LA_START, dataLoc); 585 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 586 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 587 appendOp(URX_LA_END, dataLoc); 588 appendOp(URX_BACKTRACK, 0); 589 appendOp(URX_NOP, 0); 590 appendOp(URX_NOP, 0); 591 592 // On the Parentheses stack, start a new frame and add the positions 593 // of the NOPs. 594 fParenStack.push(fModeFlags, *fStatus); // Match mode state 595 fParenStack.push(lookAhead, *fStatus); // Frame type. 596 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 597 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 598 } 599 break; 600 601 case doOpenLookAheadNeg: 602 // Negated Lookahead. (?! stuff ) 603 // Compiles to 604 // 1. LA_START dataloc 605 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 606 // // which continues with the match. 607 // 3. NOP // Std. Open Paren sequence, for possible '|' 608 // 4. code for parenthesized stuff. 609 // 5. LA_END // Cut back stack, remove saved state from step 2. 610 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 611 // 7. END_LA // Restore match region, in case look-ahead was using 612 // an alternate (transparent) region. 613 // Four data slots are reserved, for saving state on entry to the look-around 614 // 0: stack pointer on entry. 615 // 1: input position on entry. 616 // 2: fActiveStart, the active bounds start on entry. 617 // 3: fActiveLimit, the active bounds limit on entry. 618 { 619 fixLiterals(); 620 int32_t dataLoc = allocateData(4); 621 appendOp(URX_LA_START, dataLoc); 622 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later. 623 appendOp(URX_NOP, 0); 624 625 // On the Parentheses stack, start a new frame and add the positions 626 // of the StateSave and NOP. 627 fParenStack.push(fModeFlags, *fStatus); // Match mode state 628 fParenStack.push(negLookAhead, *fStatus); // Frame type 629 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 630 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 631 632 // Instructions #5 - #7 will be added when the ')' is encountered. 633 } 634 break; 635 636 case doOpenLookBehind: 637 { 638 // Compile a (?<= look-behind open paren. 639 // 640 // Compiles to 641 // 0 URX_LB_START dataLoc 642 // 1 URX_LB_CONT dataLoc 643 // 2 MinMatchLen 644 // 3 MaxMatchLen 645 // 4 URX_NOP Standard '(' boilerplate. 646 // 5 URX_NOP Reserved slot for use with '|' ops within (block). 647 // 6 <code for LookBehind expression> 648 // 7 URX_LB_END dataLoc # Check match len, restore input len 649 // 8 URX_LA_END dataLoc # Restore stack, input pos 650 // 651 // Allocate a block of matcher data, to contain (when running a match) 652 // 0: Stack ptr on entry 653 // 1: Input Index on entry 654 // 2: fActiveStart, the active bounds start on entry. 655 // 3: fActiveLimit, the active bounds limit on entry. 656 // 4: Start index of match current match attempt. 657 // The first four items must match the layout of data for LA_START / LA_END 658 659 // Generate match code for any pending literals. 660 fixLiterals(); 661 662 // Allocate data space 663 int32_t dataLoc = allocateData(5); 664 665 // Emit URX_LB_START 666 appendOp(URX_LB_START, dataLoc); 667 668 // Emit URX_LB_CONT 669 appendOp(URX_LB_CONT, dataLoc); 670 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. 671 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. 672 673 // Emit the NOPs 674 appendOp(URX_NOP, 0); 675 appendOp(URX_NOP, 0); 676 677 // On the Parentheses stack, start a new frame and add the positions 678 // of the URX_LB_CONT and the NOP. 679 fParenStack.push(fModeFlags, *fStatus); // Match mode state 680 fParenStack.push(lookBehind, *fStatus); // Frame type 681 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 682 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 683 684 // The final two instructions will be added when the ')' is encountered. 685 } 686 687 break; 688 689 case doOpenLookBehindNeg: 690 { 691 // Compile a (?<! negated look-behind open paren. 692 // 693 // Compiles to 694 // 0 URX_LB_START dataLoc # Save entry stack, input len 695 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 696 // 2 MinMatchLen 697 // 3 MaxMatchLen 698 // 4 continueLoc (9) 699 // 5 URX_NOP Standard '(' boilerplate. 700 // 6 URX_NOP Reserved slot for use with '|' ops within (block). 701 // 7 <code for LookBehind expression> 702 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 703 // 9 ... 704 // 705 // Allocate a block of matcher data, to contain (when running a match) 706 // 0: Stack ptr on entry 707 // 1: Input Index on entry 708 // 2: fActiveStart, the active bounds start on entry. 709 // 3: fActiveLimit, the active bounds limit on entry. 710 // 4: Start index of match current match attempt. 711 // The first four items must match the layout of data for LA_START / LA_END 712 713 // Generate match code for any pending literals. 714 fixLiterals(); 715 716 // Allocate data space 717 int32_t dataLoc = allocateData(5); 718 719 // Emit URX_LB_START 720 appendOp(URX_LB_START, dataLoc); 721 722 // Emit URX_LBN_CONT 723 appendOp(URX_LBN_CONT, dataLoc); 724 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. 725 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. 726 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later. 727 728 // Emit the NOPs 729 appendOp(URX_NOP, 0); 730 appendOp(URX_NOP, 0); 731 732 // On the Parentheses stack, start a new frame and add the positions 733 // of the URX_LB_CONT and the NOP. 734 fParenStack.push(fModeFlags, *fStatus); // Match mode state 735 fParenStack.push(lookBehindN, *fStatus); // Frame type 736 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 737 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 738 739 // The final two instructions will be added when the ')' is encountered. 740 } 741 break; 742 743 case doConditionalExpr: 744 // Conditionals such as (?(1)a:b) 745 case doPerlInline: 746 // Perl inline-conditionals. (?{perl code}a|b) We're not perl, no way to do them. 747 error(U_REGEX_UNIMPLEMENTED); 748 break; 749 750 751 case doCloseParen: 752 handleCloseParen(); 753 if (fParenStack.size() <= 0) { 754 // Extra close paren, or missing open paren. 755 error(U_REGEX_MISMATCHED_PAREN); 756 } 757 break; 758 759 case doNOP: 760 break; 761 762 763 case doBadOpenParenType: 764 case doRuleError: 765 error(U_REGEX_RULE_SYNTAX); 766 break; 767 768 769 case doMismatchedParenErr: 770 error(U_REGEX_MISMATCHED_PAREN); 771 break; 772 773 case doPlus: 774 // Normal '+' compiles to 775 // 1. stuff to be repeated (already built) 776 // 2. jmp-sav 1 777 // 3. ... 778 // 779 // Or, if the item to be repeated can match a zero length string, 780 // 1. STO_INP_LOC data-loc 781 // 2. body of stuff to be repeated 782 // 3. JMP_SAV_X 2 783 // 4. ... 784 785 // 786 // Or, if the item to be repeated is simple 787 // 1. Item to be repeated. 788 // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 789 // 3. LOOP_C stack location 790 { 791 int32_t topLoc = blockTopLoc(false); // location of item #1 792 int32_t frameLoc; 793 794 // Check for simple constructs, which may get special optimized code. 795 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 796 int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc)); 797 798 if (URX_TYPE(repeatedOp) == URX_SETREF) { 799 // Emit optimized code for [char set]+ 800 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 801 frameLoc = allocateStackData(1); 802 appendOp(URX_LOOP_C, frameLoc); 803 break; 804 } 805 806 if (URX_TYPE(repeatedOp) == URX_DOTANY || 807 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 808 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 809 // Emit Optimized code for .+ operations. 810 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); 811 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 812 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 813 loopOpI |= 1; 814 } 815 if (fModeFlags & UREGEX_UNIX_LINES) { 816 loopOpI |= 2; 817 } 818 appendOp(loopOpI); 819 frameLoc = allocateStackData(1); 820 appendOp(URX_LOOP_C, frameLoc); 821 break; 822 } 823 824 } 825 826 // General case. 827 828 // Check for minimum match length of zero, which requires 829 // extra loop-breaking code. 830 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 831 // Zero length match is possible. 832 // Emit the code sequence that can handle it. 833 insertOp(topLoc); 834 frameLoc = allocateStackData(1); 835 836 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); 837 fRXPat->fCompiledPat->setElementAt(op, topLoc); 838 839 appendOp(URX_JMP_SAV_X, topLoc+1); 840 } else { 841 // Simpler code when the repeated body must match something non-empty 842 appendOp(URX_JMP_SAV, topLoc); 843 } 844 } 845 break; 846 847 case doNGPlus: 848 // Non-greedy '+?' compiles to 849 // 1. stuff to be repeated (already built) 850 // 2. state-save 1 851 // 3. ... 852 { 853 int32_t topLoc = blockTopLoc(false); 854 appendOp(URX_STATE_SAVE, topLoc); 855 } 856 break; 857 858 859 case doOpt: 860 // Normal (greedy) ? quantifier. 861 // Compiles to 862 // 1. state save 3 863 // 2. body of optional block 864 // 3. ... 865 // Insert the state save into the compiled pattern, and we're done. 866 { 867 int32_t saveStateLoc = blockTopLoc(true); 868 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 869 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 870 } 871 break; 872 873 case doNGOpt: 874 // Non-greedy ?? quantifier 875 // compiles to 876 // 1. jmp 4 877 // 2. body of optional block 878 // 3 jmp 5 879 // 4. state save 2 880 // 5 ... 881 // This code is less than ideal, with two jmps instead of one, because we can only 882 // insert one instruction at the top of the block being iterated. 883 { 884 int32_t jmp1_loc = blockTopLoc(true); 885 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 886 887 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); 888 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 889 890 appendOp(URX_JMP, jmp2_loc+2); 891 892 appendOp(URX_STATE_SAVE, jmp1_loc+1); 893 } 894 break; 895 896 897 case doStar: 898 // Normal (greedy) * quantifier. 899 // Compiles to 900 // 1. STATE_SAVE 4 901 // 2. body of stuff being iterated over 902 // 3. JMP_SAV 2 903 // 4. ... 904 // 905 // Or, if the body is a simple [Set], 906 // 1. LOOP_SR_I set number 907 // 2. LOOP_C stack location 908 // ... 909 // 910 // Or if this is a .* 911 // 1. LOOP_DOT_I (. matches all mode flag) 912 // 2. LOOP_C stack location 913 // 914 // Or, if the body can match a zero-length string, to inhibit infinite loops, 915 // 1. STATE_SAVE 5 916 // 2. STO_INP_LOC data-loc 917 // 3. body of stuff 918 // 4. JMP_SAV_X 2 919 // 5. ... 920 { 921 // location of item #1, the STATE_SAVE 922 int32_t topLoc = blockTopLoc(false); 923 int32_t dataLoc = -1; 924 925 // Check for simple *, where the construct being repeated 926 // compiled to single opcode, and might be optimizable. 927 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 928 int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc)); 929 930 if (URX_TYPE(repeatedOp) == URX_SETREF) { 931 // Emit optimized code for a [char set]* 932 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 933 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 934 dataLoc = allocateStackData(1); 935 appendOp(URX_LOOP_C, dataLoc); 936 break; 937 } 938 939 if (URX_TYPE(repeatedOp) == URX_DOTANY || 940 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 941 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 942 // Emit Optimized code for .* operations. 943 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); 944 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 945 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 946 loopOpI |= 1; 947 } 948 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 949 loopOpI |= 2; 950 } 951 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 952 dataLoc = allocateStackData(1); 953 appendOp(URX_LOOP_C, dataLoc); 954 break; 955 } 956 } 957 958 // Emit general case code for this * 959 // The optimizations did not apply. 960 961 int32_t saveStateLoc = blockTopLoc(true); 962 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); 963 964 // Check for minimum match length of zero, which requires 965 // extra loop-breaking code. 966 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 967 insertOp(saveStateLoc); 968 dataLoc = allocateStackData(1); 969 970 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); 971 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 972 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); 973 } 974 975 // Locate the position in the compiled pattern where the match will continue 976 // after completing the *. (4 or 5 in the comment above) 977 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 978 979 // Put together the save state op and store it into the compiled code. 980 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); 981 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 982 983 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 984 appendOp(jmpOp); 985 } 986 break; 987 988 case doNGStar: 989 // Non-greedy *? quantifier 990 // compiles to 991 // 1. JMP 3 992 // 2. body of stuff being iterated over 993 // 3. STATE_SAVE 2 994 // 4 ... 995 { 996 int32_t jmpLoc = blockTopLoc(true); // loc 1. 997 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 998 int32_t jmpOp = buildOp(URX_JMP, saveLoc); 999 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 1000 appendOp(URX_STATE_SAVE, jmpLoc+1); 1001 } 1002 break; 1003 1004 1005 case doIntervalInit: 1006 // The '{' opening an interval quantifier was just scanned. 1007 // Init the counter variables that will accumulate the values as the digits 1008 // are scanned. 1009 fIntervalLow = 0; 1010 fIntervalUpper = -1; 1011 break; 1012 1013 case doIntevalLowerDigit: 1014 // Scanned a digit from the lower value of an {lower,upper} interval 1015 { 1016 int32_t digitValue = u_charDigitValue(fC.fChar); 1017 U_ASSERT(digitValue >= 0); 1018 int64_t val = static_cast<int64_t>(fIntervalLow) * 10 + digitValue; 1019 if (val > INT32_MAX) { 1020 error(U_REGEX_NUMBER_TOO_BIG); 1021 } else { 1022 fIntervalLow = static_cast<int32_t>(val); 1023 } 1024 } 1025 break; 1026 1027 case doIntervalUpperDigit: 1028 // Scanned a digit from the upper value of an {lower,upper} interval 1029 { 1030 if (fIntervalUpper < 0) { 1031 fIntervalUpper = 0; 1032 } 1033 int32_t digitValue = u_charDigitValue(fC.fChar); 1034 U_ASSERT(digitValue >= 0); 1035 int64_t val = static_cast<int64_t>(fIntervalUpper) * 10 + digitValue; 1036 if (val > INT32_MAX) { 1037 error(U_REGEX_NUMBER_TOO_BIG); 1038 } else { 1039 fIntervalUpper = static_cast<int32_t>(val); 1040 } 1041 } 1042 break; 1043 1044 case doIntervalSame: 1045 // Scanned a single value interval like {27}. Upper = Lower. 1046 fIntervalUpper = fIntervalLow; 1047 break; 1048 1049 case doInterval: 1050 // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1051 if (compileInlineInterval() == false) { 1052 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1053 } 1054 break; 1055 1056 case doPossessiveInterval: 1057 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1058 { 1059 // Remember the loc for the top of the block being looped over. 1060 // (Can not reserve a slot in the compiled pattern at this time, because 1061 // compileInterval needs to reserve also, and blockTopLoc can only reserve 1062 // once per block.) 1063 int32_t topLoc = blockTopLoc(false); 1064 1065 // Produce normal looping code. 1066 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1067 1068 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1069 // just as if the loop was inclosed in atomic parentheses. 1070 1071 // First the STO_SP before the start of the loop 1072 insertOp(topLoc); 1073 1074 int32_t varLoc = allocateData(1); // Reserve a data location for saving the 1075 int32_t op = buildOp(URX_STO_SP, varLoc); 1076 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1077 1078 int32_t loopOp = static_cast<int32_t>(fRXPat->fCompiledPat->popi()); 1079 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1080 loopOp++; // point LoopOp after the just-inserted STO_SP 1081 fRXPat->fCompiledPat->push(loopOp, *fStatus); 1082 1083 // Then the LD_SP after the end of the loop 1084 appendOp(URX_LD_SP, varLoc); 1085 } 1086 1087 break; 1088 1089 case doNGInterval: 1090 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1091 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1092 break; 1093 1094 case doIntervalError: 1095 error(U_REGEX_BAD_INTERVAL); 1096 break; 1097 1098 case doLiteralChar: 1099 // We've just scanned a "normal" character from the pattern, 1100 literalChar(fC.fChar); 1101 break; 1102 1103 1104 case doEscapedLiteralChar: 1105 // We've just scanned an backslashed escaped character with no 1106 // special meaning. It represents itself. 1107 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1108 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1109 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1110 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1111 } 1112 literalChar(fC.fChar); 1113 break; 1114 1115 1116 case doDotAny: 1117 // scanned a ".", match any single character. 1118 { 1119 fixLiterals(false); 1120 if (fModeFlags & UREGEX_DOTALL) { 1121 appendOp(URX_DOTANY_ALL, 0); 1122 } else if (fModeFlags & UREGEX_UNIX_LINES) { 1123 appendOp(URX_DOTANY_UNIX, 0); 1124 } else { 1125 appendOp(URX_DOTANY, 0); 1126 } 1127 } 1128 break; 1129 1130 case doCaret: 1131 { 1132 fixLiterals(false); 1133 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1134 appendOp(URX_CARET, 0); 1135 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1136 appendOp(URX_CARET_M, 0); 1137 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1138 appendOp(URX_CARET, 0); // Only testing true start of input. 1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1140 appendOp(URX_CARET_M_UNIX, 0); 1141 } 1142 } 1143 break; 1144 1145 case doDollar: 1146 { 1147 fixLiterals(false); 1148 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1149 appendOp(URX_DOLLAR, 0); 1150 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1151 appendOp(URX_DOLLAR_M, 0); 1152 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1153 appendOp(URX_DOLLAR_D, 0); 1154 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1155 appendOp(URX_DOLLAR_MD, 0); 1156 } 1157 } 1158 break; 1159 1160 case doBackslashA: 1161 fixLiterals(false); 1162 appendOp(URX_CARET, 0); 1163 break; 1164 1165 case doBackslashB: 1166 { 1167 #if UCONFIG_NO_BREAK_ITERATION==1 1168 if (fModeFlags & UREGEX_UWORD) { 1169 error(U_UNSUPPORTED_ERROR); 1170 } 1171 #endif 1172 fixLiterals(false); 1173 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1174 appendOp(op, 1); 1175 } 1176 break; 1177 1178 case doBackslashb: 1179 { 1180 #if UCONFIG_NO_BREAK_ITERATION==1 1181 if (fModeFlags & UREGEX_UWORD) { 1182 error(U_UNSUPPORTED_ERROR); 1183 } 1184 #endif 1185 fixLiterals(false); 1186 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1187 appendOp(op, 0); 1188 } 1189 break; 1190 1191 case doBackslashD: 1192 fixLiterals(false); 1193 appendOp(URX_BACKSLASH_D, 1); 1194 break; 1195 1196 case doBackslashd: 1197 fixLiterals(false); 1198 appendOp(URX_BACKSLASH_D, 0); 1199 break; 1200 1201 case doBackslashG: 1202 fixLiterals(false); 1203 appendOp(URX_BACKSLASH_G, 0); 1204 break; 1205 1206 case doBackslashH: 1207 fixLiterals(false); 1208 appendOp(URX_BACKSLASH_H, 1); 1209 break; 1210 1211 case doBackslashh: 1212 fixLiterals(false); 1213 appendOp(URX_BACKSLASH_H, 0); 1214 break; 1215 1216 case doBackslashR: 1217 fixLiterals(false); 1218 appendOp(URX_BACKSLASH_R, 0); 1219 break; 1220 1221 case doBackslashS: 1222 fixLiterals(false); 1223 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); 1224 break; 1225 1226 case doBackslashs: 1227 fixLiterals(false); 1228 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); 1229 break; 1230 1231 case doBackslashV: 1232 fixLiterals(false); 1233 appendOp(URX_BACKSLASH_V, 1); 1234 break; 1235 1236 case doBackslashv: 1237 fixLiterals(false); 1238 appendOp(URX_BACKSLASH_V, 0); 1239 break; 1240 1241 case doBackslashW: 1242 fixLiterals(false); 1243 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); 1244 break; 1245 1246 case doBackslashw: 1247 fixLiterals(false); 1248 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); 1249 break; 1250 1251 case doBackslashX: 1252 #if UCONFIG_NO_BREAK_ITERATION==1 1253 // Grapheme Cluster Boundary requires ICU break iteration. 1254 error(U_UNSUPPORTED_ERROR); 1255 #endif 1256 fixLiterals(false); 1257 appendOp(URX_BACKSLASH_X, 0); 1258 break; 1259 1260 case doBackslashZ: 1261 fixLiterals(false); 1262 appendOp(URX_DOLLAR, 0); 1263 break; 1264 1265 case doBackslashz: 1266 fixLiterals(false); 1267 appendOp(URX_BACKSLASH_Z, 0); 1268 break; 1269 1270 case doEscapeError: 1271 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1272 break; 1273 1274 case doExit: 1275 fixLiterals(false); 1276 returnVal = false; 1277 break; 1278 1279 case doProperty: 1280 { 1281 fixLiterals(false); 1282 UnicodeSet *theSet = scanProp(); 1283 compileSet(theSet); 1284 } 1285 break; 1286 1287 case doNamedChar: 1288 { 1289 UChar32 c = scanNamedChar(); 1290 literalChar(c); 1291 } 1292 break; 1293 1294 1295 case doBackRef: 1296 // BackReference. Somewhat unusual in that the front-end can not completely parse 1297 // the regular expression, because the number of digits to be consumed 1298 // depends on the number of capture groups that have been defined. So 1299 // we have to do it here instead. 1300 { 1301 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1302 int32_t groupNum = 0; 1303 UChar32 c = fC.fChar; 1304 1305 for (;;) { 1306 // Loop once per digit, for max allowed number of digits in a back reference. 1307 int32_t digit = u_charDigitValue(c); 1308 groupNum = groupNum * 10 + digit; 1309 if (groupNum >= numCaptureGroups) { 1310 break; 1311 } 1312 c = peekCharLL(); 1313 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) { 1314 break; 1315 } 1316 nextCharLL(); 1317 } 1318 1319 // Scan of the back reference in the source regexp is complete. Now generate 1320 // the compiled code for it. 1321 // Because capture groups can be forward-referenced by back-references, 1322 // we fill the operand with the capture group number. At the end 1323 // of compilation, it will be changed to the variable's location. 1324 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence, 1325 // and shouldn't enter this code path at all. 1326 fixLiterals(false); 1327 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1328 appendOp(URX_BACKREF_I, groupNum); 1329 } else { 1330 appendOp(URX_BACKREF, groupNum); 1331 } 1332 } 1333 break; 1334 1335 case doBeginNamedBackRef: 1336 U_ASSERT(fCaptureName == nullptr); 1337 fCaptureName = new UnicodeString; 1338 if (fCaptureName == nullptr) { 1339 error(U_MEMORY_ALLOCATION_ERROR); 1340 } 1341 break; 1342 1343 case doContinueNamedBackRef: 1344 fCaptureName->append(fC.fChar); 1345 break; 1346 1347 case doCompleteNamedBackRef: 1348 { 1349 int32_t groupNumber = 1350 fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0; 1351 if (groupNumber == 0) { 1352 // Group name has not been defined. 1353 // Could be a forward reference. If we choose to support them at some 1354 // future time, extra mechanism will be required at this point. 1355 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 1356 } else { 1357 // Given the number, handle identically to a \n numbered back reference. 1358 // See comments above, under doBackRef 1359 fixLiterals(false); 1360 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1361 appendOp(URX_BACKREF_I, groupNumber); 1362 } else { 1363 appendOp(URX_BACKREF, groupNumber); 1364 } 1365 } 1366 delete fCaptureName; 1367 fCaptureName = nullptr; 1368 break; 1369 } 1370 1371 case doPossessivePlus: 1372 // Possessive ++ quantifier. 1373 // Compiles to 1374 // 1. STO_SP 1375 // 2. body of stuff being iterated over 1376 // 3. STATE_SAVE 5 1377 // 4. JMP 2 1378 // 5. LD_SP 1379 // 6. ... 1380 // 1381 // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1382 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1383 // 1384 { 1385 // Emit the STO_SP 1386 int32_t topLoc = blockTopLoc(true); 1387 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1388 int32_t op = buildOp(URX_STO_SP, stoLoc); 1389 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1390 1391 // Emit the STATE_SAVE 1392 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1393 1394 // Emit the JMP 1395 appendOp(URX_JMP, topLoc+1); 1396 1397 // Emit the LD_SP 1398 appendOp(URX_LD_SP, stoLoc); 1399 } 1400 break; 1401 1402 case doPossessiveStar: 1403 // Possessive *+ quantifier. 1404 // Compiles to 1405 // 1. STO_SP loc 1406 // 2. STATE_SAVE 5 1407 // 3. body of stuff being iterated over 1408 // 4. JMP 2 1409 // 5. LD_SP loc 1410 // 6 ... 1411 // TODO: do something to cut back the state stack each time through the loop. 1412 { 1413 // Reserve two slots at the top of the block. 1414 int32_t topLoc = blockTopLoc(true); 1415 insertOp(topLoc); 1416 1417 // emit STO_SP loc 1418 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1419 int32_t op = buildOp(URX_STO_SP, stoLoc); 1420 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1421 1422 // Emit the SAVE_STATE 5 1423 int32_t L7 = fRXPat->fCompiledPat->size()+1; 1424 op = buildOp(URX_STATE_SAVE, L7); 1425 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1426 1427 // Append the JMP operation. 1428 appendOp(URX_JMP, topLoc+1); 1429 1430 // Emit the LD_SP loc 1431 appendOp(URX_LD_SP, stoLoc); 1432 } 1433 break; 1434 1435 case doPossessiveOpt: 1436 // Possessive ?+ quantifier. 1437 // Compiles to 1438 // 1. STO_SP loc 1439 // 2. SAVE_STATE 5 1440 // 3. body of optional block 1441 // 4. LD_SP loc 1442 // 5. ... 1443 // 1444 { 1445 // Reserve two slots at the top of the block. 1446 int32_t topLoc = blockTopLoc(true); 1447 insertOp(topLoc); 1448 1449 // Emit the STO_SP 1450 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1451 int32_t op = buildOp(URX_STO_SP, stoLoc); 1452 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1453 1454 // Emit the SAVE_STATE 1455 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1456 op = buildOp(URX_STATE_SAVE, continueLoc); 1457 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1458 1459 // Emit the LD_SP 1460 appendOp(URX_LD_SP, stoLoc); 1461 } 1462 break; 1463 1464 1465 case doBeginMatchMode: 1466 fNewModeFlags = fModeFlags; 1467 fSetModeFlag = true; 1468 break; 1469 1470 case doMatchMode: // (?i) and similar 1471 { 1472 int32_t bit = 0; 1473 switch (fC.fChar) { 1474 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1475 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1476 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1477 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1478 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1479 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1480 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1481 case 0x2d: /* '-' */ fSetModeFlag = false; break; 1482 default: 1483 UPRV_UNREACHABLE_EXIT; // Should never happen. Other chars are filtered out 1484 // by the scanner. 1485 } 1486 if (fSetModeFlag) { 1487 fNewModeFlags |= bit; 1488 } else { 1489 fNewModeFlags &= ~bit; 1490 } 1491 } 1492 break; 1493 1494 case doSetMatchMode: 1495 // Emit code to match any pending literals, using the not-yet changed match mode. 1496 fixLiterals(); 1497 1498 // We've got a (?i) or similar. The match mode is being changed, but 1499 // the change is not scoped to a parenthesized block. 1500 U_ASSERT(fNewModeFlags < 0); 1501 fModeFlags = fNewModeFlags; 1502 1503 break; 1504 1505 1506 case doMatchModeParen: 1507 // We've got a (?i: or similar. Begin a parenthesized block, save old 1508 // mode flags so they can be restored at the close of the block. 1509 // 1510 // Compile to a 1511 // - NOP, which later may be replaced by a save-state if the 1512 // parenthesized group gets a * quantifier, followed by 1513 // - NOP, which may later be replaced by a save-state if there 1514 // is an '|' alternation within the parens. 1515 { 1516 fixLiterals(false); 1517 appendOp(URX_NOP, 0); 1518 appendOp(URX_NOP, 0); 1519 1520 // On the Parentheses stack, start a new frame and add the positions 1521 // of the two NOPs (a normal non-capturing () frame, except for the 1522 // saving of the original mode flags.) 1523 fParenStack.push(fModeFlags, *fStatus); 1524 fParenStack.push(flags, *fStatus); // Frame Marker 1525 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1527 1528 // Set the current mode flags to the new values. 1529 U_ASSERT(fNewModeFlags < 0); 1530 fModeFlags = fNewModeFlags; 1531 } 1532 break; 1533 1534 case doBadModeFlag: 1535 error(U_REGEX_INVALID_FLAG); 1536 break; 1537 1538 case doSuppressComments: 1539 // We have just scanned a '(?'. We now need to prevent the character scanner from 1540 // treating a '#' as a to-the-end-of-line comment. 1541 // (This Perl compatibility just gets uglier and uglier to do...) 1542 fEOLComments = false; 1543 break; 1544 1545 1546 case doSetAddAmp: 1547 { 1548 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1549 set->add(chAmp); 1550 } 1551 break; 1552 1553 case doSetAddDash: 1554 { 1555 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1556 set->add(chDash); 1557 } 1558 break; 1559 1560 case doSetBackslash_s: 1561 { 1562 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1563 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1564 break; 1565 } 1566 1567 case doSetBackslash_S: 1568 { 1569 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1570 UnicodeSet SSet; 1571 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement(); 1572 set->addAll(SSet); 1573 break; 1574 } 1575 1576 case doSetBackslash_d: 1577 { 1578 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1579 // TODO - make a static set, ticket 6058. 1580 addCategory(set, U_GC_ND_MASK, *fStatus); 1581 break; 1582 } 1583 1584 case doSetBackslash_D: 1585 { 1586 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1587 UnicodeSet digits; 1588 // TODO - make a static set, ticket 6058. 1589 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1590 digits.complement(); 1591 set->addAll(digits); 1592 break; 1593 } 1594 1595 case doSetBackslash_h: 1596 { 1597 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1598 UnicodeSet h; 1599 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 1600 h.add(static_cast<UChar32>(9)); // Tab 1601 set->addAll(h); 1602 break; 1603 } 1604 1605 case doSetBackslash_H: 1606 { 1607 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1608 UnicodeSet h; 1609 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 1610 h.add(static_cast<UChar32>(9)); // Tab 1611 h.complement(); 1612 set->addAll(h); 1613 break; 1614 } 1615 1616 case doSetBackslash_v: 1617 { 1618 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1619 set->add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range 1620 set->add(static_cast<UChar32>(0x85)); 1621 set->add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029)); 1622 break; 1623 } 1624 1625 case doSetBackslash_V: 1626 { 1627 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1628 UnicodeSet v; 1629 v.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range 1630 v.add(static_cast<UChar32>(0x85)); 1631 v.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029)); 1632 v.complement(); 1633 set->addAll(v); 1634 break; 1635 } 1636 1637 case doSetBackslash_w: 1638 { 1639 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1640 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1641 break; 1642 } 1643 1644 case doSetBackslash_W: 1645 { 1646 UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek()); 1647 UnicodeSet SSet; 1648 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement(); 1649 set->addAll(SSet); 1650 break; 1651 } 1652 1653 case doSetBegin: 1654 { 1655 fixLiterals(false); 1656 LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus); 1657 fSetStack.push(lpSet.orphan(), *fStatus); 1658 fSetOpStack.push(setStart, *fStatus); 1659 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1660 fSetOpStack.push(setCaseClose, *fStatus); 1661 } 1662 break; 1663 } 1664 1665 case doSetBeginDifference1: 1666 // We have scanned something like [[abc]-[ 1667 // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1668 // Push a Difference operator, which will cause the new set to be subtracted from what 1669 // went before once it is created. 1670 setPushOp(setDifference1); 1671 fSetOpStack.push(setStart, *fStatus); 1672 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1673 fSetOpStack.push(setCaseClose, *fStatus); 1674 } 1675 break; 1676 1677 case doSetBeginIntersection1: 1678 // We have scanned something like [[abc]&[ 1679 // Need both the '&' operator and the open '[' operator. 1680 setPushOp(setIntersection1); 1681 fSetOpStack.push(setStart, *fStatus); 1682 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1683 fSetOpStack.push(setCaseClose, *fStatus); 1684 } 1685 break; 1686 1687 case doSetBeginUnion: 1688 // We have scanned something like [[abc][ 1689 // Need to handle the union operation explicitly [[abc] | [ 1690 setPushOp(setUnion); 1691 fSetOpStack.push(setStart, *fStatus); 1692 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1693 fSetOpStack.push(setCaseClose, *fStatus); 1694 } 1695 break; 1696 1697 case doSetDifference2: 1698 // We have scanned something like [abc-- 1699 // Consider this to unambiguously be a set difference operator. 1700 setPushOp(setDifference2); 1701 break; 1702 1703 case doSetEnd: 1704 // Have encountered the ']' that closes a set. 1705 // Force the evaluation of any pending operations within this set, 1706 // leave the completed set on the top of the set stack. 1707 setEval(setEnd); 1708 U_ASSERT(fSetOpStack.peeki()==setStart); 1709 fSetOpStack.popi(); 1710 break; 1711 1712 case doSetFinish: 1713 { 1714 // Finished a complete set expression, including all nested sets. 1715 // The close bracket has already triggered clearing out pending set operators, 1716 // the operator stack should be empty and the operand stack should have just 1717 // one entry, the result set. 1718 U_ASSERT(fSetOpStack.empty()); 1719 UnicodeSet* theSet = static_cast<UnicodeSet*>(fSetStack.pop()); 1720 U_ASSERT(fSetStack.empty()); 1721 compileSet(theSet); 1722 break; 1723 } 1724 1725 case doSetIntersection2: 1726 // Have scanned something like [abc&& 1727 setPushOp(setIntersection2); 1728 break; 1729 1730 case doSetLiteral: 1731 // Union the just-scanned literal character into the set being built. 1732 // This operation is the highest precedence set operation, so we can always do 1733 // it immediately, without waiting to see what follows. It is necessary to perform 1734 // any pending '-' or '&' operation first, because these have the same precedence 1735 // as union-ing in a literal' 1736 { 1737 setEval(setUnion); 1738 UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek()); 1739 s->add(fC.fChar); 1740 fLastSetLiteral = fC.fChar; 1741 break; 1742 } 1743 1744 case doSetLiteralEscaped: 1745 // A back-slash escaped literal character was encountered. 1746 // Processing is the same as with setLiteral, above, with the addition of 1747 // the optional check for errors on escaped ASCII letters. 1748 { 1749 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1750 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1751 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1752 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1753 } 1754 setEval(setUnion); 1755 UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek()); 1756 s->add(fC.fChar); 1757 fLastSetLiteral = fC.fChar; 1758 break; 1759 } 1760 1761 case doSetNamedChar: 1762 // Scanning a \N{UNICODE CHARACTER NAME} 1763 // Aside from the source of the character, the processing is identical to doSetLiteral, 1764 // above. 1765 { 1766 UChar32 c = scanNamedChar(); 1767 setEval(setUnion); 1768 UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek()); 1769 s->add(c); 1770 fLastSetLiteral = c; 1771 break; 1772 } 1773 1774 case doSetNamedRange: 1775 // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1776 // The left character is already in the set, and is saved in fLastSetLiteral. 1777 // The right side needs to be picked up, the scan is at the 'N'. 1778 // Lower Limit > Upper limit being an error matches both Java 1779 // and ICU UnicodeSet behavior. 1780 { 1781 UChar32 c = scanNamedChar(); 1782 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) { 1783 error(U_REGEX_INVALID_RANGE); 1784 } 1785 UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek()); 1786 s->add(fLastSetLiteral, c); 1787 fLastSetLiteral = c; 1788 break; 1789 } 1790 1791 1792 case doSetNegate: 1793 // Scanned a '^' at the start of a set. 1794 // Push the negation operator onto the set op stack. 1795 // A twist for case-insensitive matching: 1796 // the case closure operation must happen _before_ negation. 1797 // But the case closure operation will already be on the stack if it's required. 1798 // This requires checking for case closure, and swapping the stack order 1799 // if it is present. 1800 { 1801 int32_t tosOp = fSetOpStack.peeki(); 1802 if (tosOp == setCaseClose) { 1803 fSetOpStack.popi(); 1804 fSetOpStack.push(setNegation, *fStatus); 1805 fSetOpStack.push(setCaseClose, *fStatus); 1806 } else { 1807 fSetOpStack.push(setNegation, *fStatus); 1808 } 1809 } 1810 break; 1811 1812 case doSetNoCloseError: 1813 error(U_REGEX_MISSING_CLOSE_BRACKET); 1814 break; 1815 1816 case doSetOpError: 1817 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 1818 break; 1819 1820 case doSetPosixProp: 1821 { 1822 UnicodeSet *s = scanPosixProp(); 1823 if (s != nullptr) { 1824 UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek()); 1825 tos->addAll(*s); 1826 delete s; 1827 } // else error. scanProp() reported the error status already. 1828 } 1829 break; 1830 1831 case doSetProp: 1832 // Scanned a \p \P within [brackets]. 1833 { 1834 UnicodeSet *s = scanProp(); 1835 if (s != nullptr) { 1836 UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek()); 1837 tos->addAll(*s); 1838 delete s; 1839 } // else error. scanProp() reported the error status already. 1840 } 1841 break; 1842 1843 1844 case doSetRange: 1845 // We have scanned literal-literal. Add the range to the set. 1846 // The left character is already in the set, and is saved in fLastSetLiteral. 1847 // The right side is the current character. 1848 // Lower Limit > Upper limit being an error matches both Java 1849 // and ICU UnicodeSet behavior. 1850 { 1851 1852 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) { 1853 error(U_REGEX_INVALID_RANGE); 1854 } 1855 UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek()); 1856 s->add(fLastSetLiteral, fC.fChar); 1857 break; 1858 } 1859 1860 default: 1861 UPRV_UNREACHABLE_EXIT; 1862 } 1863 1864 if (U_FAILURE(*fStatus)) { 1865 returnVal = false; 1866 } 1867 1868 return returnVal; 1869 } 1870 1871 1872 1873 //------------------------------------------------------------------------------ 1874 // 1875 // literalChar We've encountered a literal character from the pattern, 1876 // or an escape sequence that reduces to a character. 1877 // Add it to the string containing all literal chars/strings from 1878 // the pattern. 1879 // 1880 //------------------------------------------------------------------------------ 1881 void RegexCompile::literalChar(UChar32 c) { 1882 fLiteralChars.append(c); 1883 } 1884 1885 1886 //------------------------------------------------------------------------------ 1887 // 1888 // fixLiterals When compiling something that can follow a literal 1889 // string in a pattern, emit the code to match the 1890 // accumulated literal string. 1891 // 1892 // Optionally, split the last char of the string off into 1893 // a single "ONE_CHAR" operation, so that quantifiers can 1894 // apply to that char alone. Example: abc* 1895 // The * must apply to the 'c' only. 1896 // 1897 //------------------------------------------------------------------------------ 1898 void RegexCompile::fixLiterals(UBool split) { 1899 1900 // If no literal characters have been scanned but not yet had code generated 1901 // for them, nothing needs to be done. 1902 if (fLiteralChars.length() == 0) { 1903 return; 1904 } 1905 1906 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1907 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1908 1909 // Split: We need to ensure that the last item in the compiled pattern 1910 // refers only to the last literal scanned in the pattern, so that 1911 // quantifiers (*, +, etc.) affect only it, and not a longer string. 1912 // Split before case folding for case insensitive matches. 1913 1914 if (split) { 1915 fLiteralChars.truncate(indexOfLastCodePoint); 1916 fixLiterals(false); // Recursive call, emit code to match the first part of the string. 1917 // Note that the truncated literal string may be empty, in which case 1918 // nothing will be emitted. 1919 1920 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal. 1921 fixLiterals(false); // Second recursive call, code for the final code point. 1922 return; 1923 } 1924 1925 // If we are doing case-insensitive matching, case fold the string. This may expand 1926 // the string, e.g. the German sharp-s turns into "ss" 1927 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1928 fLiteralChars.foldCase(); 1929 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1930 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1931 } 1932 1933 if (indexOfLastCodePoint == 0) { 1934 // Single character, emit a URX_ONECHAR op to match it. 1935 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1936 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { 1937 appendOp(URX_ONECHAR_I, lastCodePoint); 1938 } else { 1939 appendOp(URX_ONECHAR, lastCodePoint); 1940 } 1941 } else { 1942 // Two or more chars, emit a URX_STRING to match them. 1943 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) { 1944 error(U_REGEX_PATTERN_TOO_BIG); 1945 } 1946 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1947 appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); 1948 } else { 1949 // TODO here: add optimization to split case sensitive strings of length two 1950 // into two single char ops, for efficiency. 1951 appendOp(URX_STRING, fRXPat->fLiteralText.length()); 1952 } 1953 appendOp(URX_STRING_LEN, fLiteralChars.length()); 1954 1955 // Add this string into the accumulated strings of the compiled pattern. 1956 fRXPat->fLiteralText.append(fLiteralChars); 1957 } 1958 1959 fLiteralChars.remove(); 1960 } 1961 1962 1963 int32_t RegexCompile::buildOp(int32_t type, int32_t val) { 1964 if (U_FAILURE(*fStatus)) { 1965 return 0; 1966 } 1967 if (type < 0 || type > 255) { 1968 UPRV_UNREACHABLE_EXIT; 1969 } 1970 if (val > 0x00ffffff) { 1971 UPRV_UNREACHABLE_EXIT; 1972 } 1973 if (val < 0) { 1974 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { 1975 UPRV_UNREACHABLE_EXIT; 1976 } 1977 if (URX_TYPE(val) != 0xff) { 1978 UPRV_UNREACHABLE_EXIT; 1979 } 1980 type = URX_RESERVED_OP_N; 1981 } 1982 return (type << 24) | val; 1983 } 1984 1985 1986 //------------------------------------------------------------------------------ 1987 // 1988 // appendOp() Append a new instruction onto the compiled pattern 1989 // Includes error checking, limiting the size of the 1990 // pattern to lengths that can be represented in the 1991 // 24 bit operand field of an instruction. 1992 // 1993 //------------------------------------------------------------------------------ 1994 void RegexCompile::appendOp(int32_t op) { 1995 if (U_FAILURE(*fStatus)) { 1996 return; 1997 } 1998 fRXPat->fCompiledPat->addElement(op, *fStatus); 1999 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { 2000 error(U_REGEX_PATTERN_TOO_BIG); 2001 } 2002 } 2003 2004 void RegexCompile::appendOp(int32_t type, int32_t val) { 2005 appendOp(buildOp(type, val)); 2006 } 2007 2008 2009 //------------------------------------------------------------------------------ 2010 // 2011 // insertOp() Insert a slot for a new opcode into the already 2012 // compiled pattern code. 2013 // 2014 // Fill the slot with a NOP. Our caller will replace it 2015 // with what they really wanted. 2016 // 2017 //------------------------------------------------------------------------------ 2018 void RegexCompile::insertOp(int32_t where) { 2019 UVector64 *code = fRXPat->fCompiledPat; 2020 U_ASSERT(where>0 && where < code->size()); 2021 2022 int32_t nop = buildOp(URX_NOP, 0); 2023 code->insertElementAt(nop, where, *fStatus); 2024 2025 // Walk through the pattern, looking for any ops with targets that 2026 // were moved down by the insert. Fix them. 2027 int32_t loc; 2028 for (loc=0; loc<code->size(); loc++) { 2029 int32_t op = static_cast<int32_t>(code->elementAti(loc)); 2030 int32_t opType = URX_TYPE(op); 2031 int32_t opValue = URX_VAL(op); 2032 if ((opType == URX_JMP || 2033 opType == URX_JMPX || 2034 opType == URX_STATE_SAVE || 2035 opType == URX_CTR_LOOP || 2036 opType == URX_CTR_LOOP_NG || 2037 opType == URX_JMP_SAV || 2038 opType == URX_JMP_SAV_X || 2039 opType == URX_RELOC_OPRND) && opValue > where) { 2040 // Target location for this opcode is after the insertion point and 2041 // needs to be incremented to adjust for the insertion. 2042 opValue++; 2043 op = buildOp(opType, opValue); 2044 code->setElementAt(op, loc); 2045 } 2046 } 2047 2048 // Now fix up the parentheses stack. All positive values in it are locations in 2049 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 2050 for (loc=0; loc<fParenStack.size(); loc++) { 2051 int32_t x = fParenStack.elementAti(loc); 2052 U_ASSERT(x < code->size()); 2053 if (x>where) { 2054 x++; 2055 fParenStack.setElementAt(x, loc); 2056 } 2057 } 2058 2059 if (fMatchCloseParen > where) { 2060 fMatchCloseParen++; 2061 } 2062 if (fMatchOpenParen > where) { 2063 fMatchOpenParen++; 2064 } 2065 } 2066 2067 2068 //------------------------------------------------------------------------------ 2069 // 2070 // allocateData() Allocate storage in the matcher's static data area. 2071 // Return the index for the newly allocated data. 2072 // The storage won't actually exist until we are running a match 2073 // operation, but the storage indexes are inserted into various 2074 // opcodes while compiling the pattern. 2075 // 2076 //------------------------------------------------------------------------------ 2077 int32_t RegexCompile::allocateData(int32_t size) { 2078 if (U_FAILURE(*fStatus)) { 2079 return 0; 2080 } 2081 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { 2082 error(U_REGEX_INTERNAL_ERROR); 2083 return 0; 2084 } 2085 int32_t dataIndex = fRXPat->fDataSize; 2086 fRXPat->fDataSize += size; 2087 if (fRXPat->fDataSize >= 0x00fffff0) { 2088 error(U_REGEX_INTERNAL_ERROR); 2089 } 2090 return dataIndex; 2091 } 2092 2093 2094 //------------------------------------------------------------------------------ 2095 // 2096 // allocateStackData() Allocate space in the back-tracking stack frame. 2097 // Return the index for the newly allocated data. 2098 // The frame indexes are inserted into various 2099 // opcodes while compiling the pattern, meaning that frame 2100 // size must be restricted to the size that will fit 2101 // as an operand (24 bits). 2102 // 2103 //------------------------------------------------------------------------------ 2104 int32_t RegexCompile::allocateStackData(int32_t size) { 2105 if (U_FAILURE(*fStatus)) { 2106 return 0; 2107 } 2108 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { 2109 error(U_REGEX_INTERNAL_ERROR); 2110 return 0; 2111 } 2112 int32_t dataIndex = fRXPat->fFrameSize; 2113 fRXPat->fFrameSize += size; 2114 if (fRXPat->fFrameSize >= 0x00fffff0) { 2115 error(U_REGEX_PATTERN_TOO_BIG); 2116 } 2117 return dataIndex; 2118 } 2119 2120 2121 //------------------------------------------------------------------------------ 2122 // 2123 // blockTopLoc() Find or create a location in the compiled pattern 2124 // at the start of the operation or block that has 2125 // just been compiled. Needed when a quantifier (* or 2126 // whatever) appears, and we need to add an operation 2127 // at the start of the thing being quantified. 2128 // 2129 // (Parenthesized Blocks) have a slot with a NOP that 2130 // is reserved for this purpose. .* or similar don't 2131 // and a slot needs to be added. 2132 // 2133 // parameter reserveLoc : true - ensure that there is space to add an opcode 2134 // at the returned location. 2135 // false - just return the address, 2136 // do not reserve a location there. 2137 // 2138 //------------------------------------------------------------------------------ 2139 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 2140 int32_t theLoc; 2141 fixLiterals(true); // Emit code for any pending literals. 2142 // If last item was a string, emit separate op for the its last char. 2143 if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 2144 { 2145 // The item just processed is a parenthesized block. 2146 theLoc = fMatchOpenParen; // A slot is already reserved for us. 2147 U_ASSERT(theLoc > 0); 2148 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 2149 } 2150 else { 2151 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference. 2152 // No slot for STATE_SAVE was pre-reserved in the compiled code. 2153 // We need to make space now. 2154 theLoc = fRXPat->fCompiledPat->size()-1; 2155 int32_t opAtTheLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(theLoc)); 2156 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { 2157 // Strings take two opcode, we want the position of the first one. 2158 // We can have a string at this point if a single character case-folded to two. 2159 theLoc--; 2160 } 2161 if (reserveLoc) { 2162 int32_t nop = buildOp(URX_NOP, 0); 2163 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 2164 } 2165 } 2166 return theLoc; 2167 } 2168 2169 2170 2171 //------------------------------------------------------------------------------ 2172 // 2173 // handleCloseParen When compiling a close paren, we need to go back 2174 // and fix up any JMP or SAVE operations within the 2175 // parenthesized block that need to target the end 2176 // of the block. The locations of these are kept on 2177 // the paretheses stack. 2178 // 2179 // This function is called both when encountering a 2180 // real ) and at the end of the pattern. 2181 // 2182 //------------------------------------------------------------------------------ 2183 void RegexCompile::handleCloseParen() { 2184 int32_t patIdx; 2185 int32_t patOp; 2186 if (fParenStack.size() <= 0) { 2187 error(U_REGEX_MISMATCHED_PAREN); 2188 return; 2189 } 2190 2191 // Emit code for any pending literals. 2192 fixLiterals(false); 2193 2194 // Fixup any operations within the just-closed parenthesized group 2195 // that need to reference the end of the (block). 2196 // (The first one popped from the stack is an unused slot for 2197 // alternation (OR) state save, but applying the fixup to it does no harm.) 2198 for (;;) { 2199 patIdx = fParenStack.popi(); 2200 if (patIdx < 0) { 2201 // value < 0 flags the start of the frame on the paren stack. 2202 break; 2203 } 2204 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 2205 patOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(patIdx)); 2206 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 2207 patOp |= fRXPat->fCompiledPat->size(); // Set it now. 2208 fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 2209 fMatchOpenParen = patIdx; 2210 } 2211 2212 // At the close of any parenthesized block, restore the match mode flags to 2213 // the value they had at the open paren. Saved value is 2214 // at the top of the paren stack. 2215 fModeFlags = fParenStack.popi(); 2216 U_ASSERT(fModeFlags < 0); 2217 2218 // DO any additional fixups, depending on the specific kind of 2219 // parentesized grouping this is 2220 2221 switch (patIdx) { 2222 case plain: 2223 case flags: 2224 // No additional fixups required. 2225 // (Grouping-only parentheses) 2226 break; 2227 case capturing: 2228 // Capturing Parentheses. 2229 // Insert a End Capture op into the pattern. 2230 // The frame offset of the variables for this cg is obtained from the 2231 // start capture op and put it into the end-capture op. 2232 { 2233 int32_t captureOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1)); 2234 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2235 2236 int32_t frameVarLocation = URX_VAL(captureOp); 2237 appendOp(URX_END_CAPTURE, frameVarLocation); 2238 } 2239 break; 2240 case atomic: 2241 // Atomic Parenthesis. 2242 // Insert a LD_SP operation to restore the state stack to the position 2243 // it was when the atomic parens were entered. 2244 { 2245 int32_t stoOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1)); 2246 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2247 int32_t stoLoc = URX_VAL(stoOp); 2248 appendOp(URX_LD_SP, stoLoc); 2249 } 2250 break; 2251 2252 case lookAhead: 2253 { 2254 int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5)); 2255 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2256 int32_t dataLoc = URX_VAL(startOp); 2257 appendOp(URX_LA_END, dataLoc); 2258 } 2259 break; 2260 2261 case negLookAhead: 2262 { 2263 // See comment at doOpenLookAheadNeg 2264 int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 1)); 2265 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2266 int32_t dataLoc = URX_VAL(startOp); 2267 appendOp(URX_LA_END, dataLoc); 2268 appendOp(URX_BACKTRACK, 0); 2269 appendOp(URX_LA_END, dataLoc); 2270 2271 // Patch the URX_SAVE near the top of the block. 2272 // The destination of the SAVE is the final LA_END that was just added. 2273 int32_t saveOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen)); 2274 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2275 int32_t dest = fRXPat->fCompiledPat->size()-1; 2276 saveOp = buildOp(URX_STATE_SAVE, dest); 2277 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2278 } 2279 break; 2280 2281 case lookBehind: 2282 { 2283 // See comment at doOpenLookBehind. 2284 2285 // Append the URX_LB_END and URX_LA_END to the compiled pattern. 2286 int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 4)); 2287 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2288 int32_t dataLoc = URX_VAL(startOp); 2289 appendOp(URX_LB_END, dataLoc); 2290 appendOp(URX_LA_END, dataLoc); 2291 2292 // Determine the min and max bounds for the length of the 2293 // string that the pattern can match. 2294 // An unbounded upper limit is an error. 2295 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2296 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2297 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2298 if (URX_TYPE(maxML) != 0) { 2299 error(U_REGEX_LOOK_BEHIND_LIMIT); 2300 break; 2301 } 2302 if (maxML == INT32_MAX) { 2303 error(U_REGEX_LOOK_BEHIND_LIMIT); 2304 break; 2305 } 2306 if (minML == INT32_MAX) { 2307 // This condition happens when no match is possible, such as with a 2308 // [set] expression containing no elements. 2309 // In principle, the generated code to evaluate the expression could be deleted, 2310 // but it's probably not worth the complication. 2311 minML = 0; 2312 } 2313 U_ASSERT(minML <= maxML); 2314 2315 // Insert the min and max match len bounds into the URX_LB_CONT op that 2316 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2317 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2318 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2319 2320 } 2321 break; 2322 2323 2324 2325 case lookBehindN: 2326 { 2327 // See comment at doOpenLookBehindNeg. 2328 2329 // Append the URX_LBN_END to the compiled pattern. 2330 int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5)); 2331 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2332 int32_t dataLoc = URX_VAL(startOp); 2333 appendOp(URX_LBN_END, dataLoc); 2334 2335 // Determine the min and max bounds for the length of the 2336 // string that the pattern can match. 2337 // An unbounded upper limit is an error. 2338 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2339 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2340 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2341 if (URX_TYPE(maxML) != 0) { 2342 error(U_REGEX_LOOK_BEHIND_LIMIT); 2343 break; 2344 } 2345 if (maxML == INT32_MAX) { 2346 error(U_REGEX_LOOK_BEHIND_LIMIT); 2347 break; 2348 } 2349 if (minML == INT32_MAX) { 2350 // This condition happens when no match is possible, such as with a 2351 // [set] expression containing no elements. 2352 // In principle, the generated code to evaluate the expression could be deleted, 2353 // but it's probably not worth the complication. 2354 minML = 0; 2355 } 2356 2357 U_ASSERT(minML <= maxML); 2358 2359 // Insert the min and max match len bounds into the URX_LB_CONT op that 2360 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2361 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2362 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2363 2364 // Insert the pattern location to continue at after a successful match 2365 // as the last operand of the URX_LBN_CONT 2366 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 2367 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 2368 } 2369 break; 2370 2371 2372 2373 default: 2374 UPRV_UNREACHABLE_EXIT; 2375 } 2376 2377 // remember the next location in the compiled pattern. 2378 // The compilation of Quantifiers will look at this to see whether its looping 2379 // over a parenthesized block or a single item 2380 fMatchCloseParen = fRXPat->fCompiledPat->size(); 2381 } 2382 2383 2384 2385 //------------------------------------------------------------------------------ 2386 // 2387 // compileSet Compile the pattern operations for a reference to a 2388 // UnicodeSet. 2389 // 2390 //------------------------------------------------------------------------------ 2391 void RegexCompile::compileSet(UnicodeSet *theSet) 2392 { 2393 if (theSet == nullptr) { 2394 return; 2395 } 2396 // Remove any strings from the set. 2397 // There shouldn't be any, but just in case. 2398 // (Case Closure can add them; if we had a simple case closure available that 2399 // ignored strings, that would be better.) 2400 theSet->removeAllStrings(); 2401 int32_t setSize = theSet->size(); 2402 2403 switch (setSize) { 2404 case 0: 2405 { 2406 // Set of no elements. Always fails to match. 2407 appendOp(URX_BACKTRACK, 0); 2408 delete theSet; 2409 } 2410 break; 2411 2412 case 1: 2413 { 2414 // The set contains only a single code point. Put it into 2415 // the compiled pattern as a single char operation rather 2416 // than a set, and discard the set itself. 2417 literalChar(theSet->charAt(0)); 2418 delete theSet; 2419 } 2420 break; 2421 2422 default: 2423 { 2424 // The set contains two or more chars. (the normal case) 2425 // Put it into the compiled pattern as a set. 2426 theSet->freeze(); 2427 int32_t setNumber = fRXPat->fSets->size(); 2428 fRXPat->fSets->addElement(theSet, *fStatus); 2429 if (U_SUCCESS(*fStatus)) { 2430 appendOp(URX_SETREF, setNumber); 2431 } else { 2432 delete theSet; 2433 } 2434 } 2435 } 2436 } 2437 2438 2439 //------------------------------------------------------------------------------ 2440 // 2441 // compileInterval Generate the code for a {min, max} style interval quantifier. 2442 // Except for the specific opcodes used, the code is the same 2443 // for all three types (greedy, non-greedy, possessive) of 2444 // intervals. The opcodes are supplied as parameters. 2445 // (There are two sets of opcodes - greedy & possessive use the 2446 // same ones, while non-greedy has it's own.) 2447 // 2448 // The code for interval loops has this form: 2449 // 0 CTR_INIT counter loc (in stack frame) 2450 // 1 5 patt address of CTR_LOOP at bottom of block 2451 // 2 min count 2452 // 3 max count (-1 for unbounded) 2453 // 4 ... block to be iterated over 2454 // 5 CTR_LOOP 2455 // 2456 // In 2457 //------------------------------------------------------------------------------ 2458 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 2459 { 2460 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 2461 // four slots in the compiled code. Reserve them. 2462 int32_t topOfBlock = blockTopLoc(true); 2463 insertOp(topOfBlock); 2464 insertOp(topOfBlock); 2465 insertOp(topOfBlock); 2466 2467 // The operands for the CTR_INIT opcode include the index in the matcher data 2468 // of the counter. Allocate it now. There are two data items 2469 // counterLoc --> Loop counter 2470 // +1 --> Input index (for breaking non-progressing loops) 2471 // (Only present if unbounded upper limit on loop) 2472 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; 2473 int32_t counterLoc = allocateStackData(dataSize); 2474 2475 int32_t op = buildOp(InitOp, counterLoc); 2476 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2477 2478 // The second operand of CTR_INIT is the location following the end of the loop. 2479 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2480 // compilation of something later on causes the code to grow and the target 2481 // position to move. 2482 int32_t loopEnd = fRXPat->fCompiledPat->size(); 2483 op = buildOp(URX_RELOC_OPRND, loopEnd); 2484 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2485 2486 // Followed by the min and max counts. 2487 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2488 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2489 2490 // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. 2491 // Goes at end of the block being looped over, so just append to the code so far. 2492 appendOp(LoopOp, topOfBlock); 2493 2494 if ((fIntervalLow & 0xff000000) != 0 || 2495 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 2496 error(U_REGEX_NUMBER_TOO_BIG); 2497 } 2498 2499 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2500 error(U_REGEX_MAX_LT_MIN); 2501 } 2502 } 2503 2504 2505 2506 UBool RegexCompile::compileInlineInterval() { 2507 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2508 // Too big to inline. Fail, which will cause looping code to be generated. 2509 // (Upper < Lower picks up unbounded upper and errors, both.) 2510 return false; 2511 } 2512 2513 int32_t topOfBlock = blockTopLoc(false); 2514 if (fIntervalUpper == 0) { 2515 // Pathological case. Attempt no matches, as if the block doesn't exist. 2516 // Discard the generated code for the block. 2517 // If the block included parens, discard the info pertaining to them as well. 2518 fRXPat->fCompiledPat->setSize(topOfBlock); 2519 if (fMatchOpenParen >= topOfBlock) { 2520 fMatchOpenParen = -1; 2521 } 2522 if (fMatchCloseParen >= topOfBlock) { 2523 fMatchCloseParen = -1; 2524 } 2525 return true; 2526 } 2527 2528 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2529 // The thing being repeated is not a single op, but some 2530 // more complex block. Do it as a loop, not inlines. 2531 // Note that things "repeated" a max of once are handled as inline, because 2532 // the one copy of the code already generated is just fine. 2533 return false; 2534 } 2535 2536 // Pick up the opcode that is to be repeated 2537 // 2538 int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topOfBlock)); 2539 2540 // Compute the pattern location where the inline sequence 2541 // will end, and set up the state save op that will be needed. 2542 // 2543 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 2544 + fIntervalUpper + (fIntervalUpper-fIntervalLow); 2545 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); 2546 if (fIntervalLow == 0) { 2547 insertOp(topOfBlock); 2548 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 2549 } 2550 2551 2552 2553 // Loop, emitting the op for the thing being repeated each time. 2554 // Loop starts at 1 because one instance of the op already exists in the pattern, 2555 // it was put there when it was originally encountered. 2556 int32_t i; 2557 for (i=1; i<fIntervalUpper; i++ ) { 2558 if (i >= fIntervalLow) { 2559 appendOp(saveOp); 2560 } 2561 appendOp(op); 2562 } 2563 return true; 2564 } 2565 2566 2567 2568 //------------------------------------------------------------------------------ 2569 // 2570 // caseInsensitiveStart given a single code point from a pattern string, determine the 2571 // set of characters that could potentially begin a case-insensitive 2572 // match of a string beginning with that character, using full Unicode 2573 // case insensitive matching. 2574 // 2575 // This is used in optimizing find(). 2576 // 2577 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but 2578 // misses cases like this: 2579 // A string from the pattern begins with 'ss' (although all we know 2580 // in this context is that it begins with 's') 2581 // The pattern could match a string beginning with a German sharp-s 2582 // 2583 // To the ordinary case closure for a character c, we add all other 2584 // characters cx where the case closure of cx includes a string form that begins 2585 // with the original character c. 2586 // 2587 // This function could be made smarter. The full pattern string is available 2588 // and it would be possible to verify that the extra characters being added 2589 // to the starting set fully match, rather than having just a first-char of the 2590 // folded form match. 2591 // 2592 //------------------------------------------------------------------------------ 2593 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) { 2594 2595 // Machine Generated below. 2596 // It may need updating with new versions of Unicode. 2597 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed. 2598 // The update tool is here: 2599 // https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing 2600 2601 // Machine Generated Data. Do not hand edit. 2602 static const UChar32 RECaseFixCodePoints[] = { 2603 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 2604 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 2605 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 2606 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 2607 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000}; 2608 2609 static const int16_t RECaseFixStringOffsets[] = { 2610 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 2611 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 2612 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 2613 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 2614 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0}; 2615 2616 static const int16_t RECaseFixCounts[] = { 2617 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 2618 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 2619 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 2620 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 2621 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0}; 2622 2623 static const char16_t RECaseFixData[] = { 2624 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 2625 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 2626 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 2627 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 2628 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 2629 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 2630 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 2631 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 2632 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 2633 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 2634 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0}; 2635 2636 // End of machine generated data. 2637 2638 if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) { 2639 // This function should never be called with an invalid input character. 2640 UPRV_UNREACHABLE_EXIT; 2641 } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2642 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT); 2643 starterChars->set(caseFoldedC, caseFoldedC); 2644 2645 int32_t i; 2646 for (i=0; RECaseFixCodePoints[i]<c ; i++) { 2647 // Simple linear search through the sorted list of interesting code points. 2648 } 2649 2650 if (RECaseFixCodePoints[i] == c) { 2651 int32_t dataIndex = RECaseFixStringOffsets[i]; 2652 int32_t numCharsToAdd = RECaseFixCounts[i]; 2653 UChar32 cpToAdd = 0; 2654 for (int32_t j=0; j<numCharsToAdd; j++) { 2655 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd); 2656 starterChars->add(cpToAdd); 2657 } 2658 } 2659 2660 starterChars->closeOver(USET_CASE_INSENSITIVE); 2661 starterChars->removeAllStrings(); 2662 } else { 2663 // Not a cased character. Just return it alone. 2664 starterChars->set(c, c); 2665 } 2666 } 2667 2668 2669 // Increment with overflow check. 2670 // val and delta will both be positive. 2671 2672 static int32_t safeIncrement(int32_t val, int32_t delta) { 2673 if (INT32_MAX - val > delta) { 2674 return val + delta; 2675 } else { 2676 return INT32_MAX; 2677 } 2678 } 2679 2680 2681 //------------------------------------------------------------------------------ 2682 // 2683 // matchStartType Determine how a match can start. 2684 // Used to optimize find() operations. 2685 // 2686 // Operation is very similar to minMatchLength(). Walk the compiled 2687 // pattern, keeping an on-going minimum-match-length. For any 2688 // op where the min match coming in is zero, add that ops possible 2689 // starting matches to the possible starts for the overall pattern. 2690 // 2691 //------------------------------------------------------------------------------ 2692 void RegexCompile::matchStartType() { 2693 if (U_FAILURE(*fStatus)) { 2694 return; 2695 } 2696 2697 2698 int32_t loc; // Location in the pattern of the current op being processed. 2699 int32_t op; // The op being processed 2700 int32_t opType; // The opcode type of the op 2701 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 2702 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 2703 2704 UBool atStart = true; // True if no part of the pattern yet encountered 2705 // could have advanced the position in a match. 2706 // (Maximum match length so far == 0) 2707 2708 // forwardedLength is a vector holding minimum-match-length values that 2709 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2710 // It must be one longer than the pattern being checked because some ops 2711 // will jmp to a end-of-block+1 location from within a block, and we must 2712 // count those when checking the block. 2713 int32_t end = fRXPat->fCompiledPat->size(); 2714 UVector32 forwardedLength(end+1, *fStatus); 2715 forwardedLength.setSize(end+1); 2716 for (loc=3; loc<end; loc++) { 2717 forwardedLength.setElementAt(INT32_MAX, loc); 2718 } 2719 2720 for (loc = 3; loc<end; loc++) { 2721 op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 2722 opType = URX_TYPE(op); 2723 2724 // The loop is advancing linearly through the pattern. 2725 // If the op we are now at was the destination of a branch in the pattern, 2726 // and that path has a shorter minimum length than the current accumulated value, 2727 // replace the current accumulated value. 2728 if (forwardedLength.elementAti(loc) < currentLen) { 2729 currentLen = forwardedLength.elementAti(loc); 2730 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2731 } 2732 2733 switch (opType) { 2734 // Ops that don't change the total length matched 2735 case URX_RESERVED_OP: 2736 case URX_END: 2737 case URX_FAIL: 2738 case URX_STRING_LEN: 2739 case URX_NOP: 2740 case URX_START_CAPTURE: 2741 case URX_END_CAPTURE: 2742 case URX_BACKSLASH_B: 2743 case URX_BACKSLASH_BU: 2744 case URX_BACKSLASH_G: 2745 case URX_BACKSLASH_Z: 2746 case URX_DOLLAR: 2747 case URX_DOLLAR_M: 2748 case URX_DOLLAR_D: 2749 case URX_DOLLAR_MD: 2750 case URX_RELOC_OPRND: 2751 case URX_STO_INP_LOC: 2752 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2753 case URX_BACKREF_I: 2754 2755 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2756 case URX_LD_SP: 2757 break; 2758 2759 case URX_CARET: 2760 if (atStart) { 2761 fRXPat->fStartType = START_START; 2762 } 2763 break; 2764 2765 case URX_CARET_M: 2766 case URX_CARET_M_UNIX: 2767 if (atStart) { 2768 fRXPat->fStartType = START_LINE; 2769 } 2770 break; 2771 2772 case URX_ONECHAR: 2773 if (currentLen == 0) { 2774 // This character could appear at the start of a match. 2775 // Add it to the set of possible starting characters. 2776 fRXPat->fInitialChars->add(URX_VAL(op)); 2777 numInitialStrings += 2; 2778 } 2779 currentLen = safeIncrement(currentLen, 1); 2780 atStart = false; 2781 break; 2782 2783 2784 case URX_SETREF: 2785 if (currentLen == 0) { 2786 int32_t sn = URX_VAL(op); 2787 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2788 const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn)); 2789 fRXPat->fInitialChars->addAll(*s); 2790 numInitialStrings += 2; 2791 } 2792 currentLen = safeIncrement(currentLen, 1); 2793 atStart = false; 2794 break; 2795 2796 case URX_LOOP_SR_I: 2797 // [Set]*, like a SETREF, above, in what it can match, 2798 // but may not match at all, so currentLen is not incremented. 2799 if (currentLen == 0) { 2800 int32_t sn = URX_VAL(op); 2801 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2802 const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn)); 2803 fRXPat->fInitialChars->addAll(*s); 2804 numInitialStrings += 2; 2805 } 2806 atStart = false; 2807 break; 2808 2809 case URX_LOOP_DOT_I: 2810 if (currentLen == 0) { 2811 // .* at the start of a pattern. 2812 // Any character can begin the match. 2813 fRXPat->fInitialChars->clear(); 2814 fRXPat->fInitialChars->complement(); 2815 numInitialStrings += 2; 2816 } 2817 atStart = false; 2818 break; 2819 2820 2821 case URX_STATIC_SETREF: 2822 if (currentLen == 0) { 2823 int32_t sn = URX_VAL(op); 2824 U_ASSERT(sn>0 && sn<URX_LAST_SET); 2825 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn]; 2826 fRXPat->fInitialChars->addAll(s); 2827 numInitialStrings += 2; 2828 } 2829 currentLen = safeIncrement(currentLen, 1); 2830 atStart = false; 2831 break; 2832 2833 2834 2835 case URX_STAT_SETREF_N: 2836 if (currentLen == 0) { 2837 int32_t sn = URX_VAL(op); 2838 UnicodeSet sc; 2839 sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement(); 2840 fRXPat->fInitialChars->addAll(sc); 2841 numInitialStrings += 2; 2842 } 2843 currentLen = safeIncrement(currentLen, 1); 2844 atStart = false; 2845 break; 2846 2847 2848 2849 case URX_BACKSLASH_D: 2850 // Digit Char 2851 if (currentLen == 0) { 2852 UnicodeSet s; 2853 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 2854 if (URX_VAL(op) != 0) { 2855 s.complement(); 2856 } 2857 fRXPat->fInitialChars->addAll(s); 2858 numInitialStrings += 2; 2859 } 2860 currentLen = safeIncrement(currentLen, 1); 2861 atStart = false; 2862 break; 2863 2864 2865 case URX_BACKSLASH_H: 2866 // Horiz white space 2867 if (currentLen == 0) { 2868 UnicodeSet s; 2869 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 2870 s.add(static_cast<UChar32>(9)); // Tab 2871 if (URX_VAL(op) != 0) { 2872 s.complement(); 2873 } 2874 fRXPat->fInitialChars->addAll(s); 2875 numInitialStrings += 2; 2876 } 2877 currentLen = safeIncrement(currentLen, 1); 2878 atStart = false; 2879 break; 2880 2881 2882 case URX_BACKSLASH_R: // Any line ending sequence 2883 case URX_BACKSLASH_V: // Any line ending code point, with optional negation 2884 if (currentLen == 0) { 2885 UnicodeSet s; 2886 s.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range 2887 s.add(static_cast<UChar32>(0x85)); 2888 s.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029)); 2889 if (URX_VAL(op) != 0) { 2890 // Complement option applies to URX_BACKSLASH_V only. 2891 s.complement(); 2892 } 2893 fRXPat->fInitialChars->addAll(s); 2894 numInitialStrings += 2; 2895 } 2896 currentLen = safeIncrement(currentLen, 1); 2897 atStart = false; 2898 break; 2899 2900 2901 2902 case URX_ONECHAR_I: 2903 // Case Insensitive Single Character. 2904 if (currentLen == 0) { 2905 UChar32 c = URX_VAL(op); 2906 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2907 UnicodeSet starters(c, c); 2908 starters.closeOver(USET_CASE_INSENSITIVE); 2909 // findCaseInsensitiveStarters(c, &starters); 2910 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings. 2911 // The expanded folding can't match the pattern. 2912 fRXPat->fInitialChars->addAll(starters); 2913 } else { 2914 // Char has no case variants. Just add it as-is to the 2915 // set of possible starting chars. 2916 fRXPat->fInitialChars->add(c); 2917 } 2918 numInitialStrings += 2; 2919 } 2920 currentLen = safeIncrement(currentLen, 1); 2921 atStart = false; 2922 break; 2923 2924 2925 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. 2926 case URX_DOTANY_ALL: // . matches one or two. 2927 case URX_DOTANY: 2928 case URX_DOTANY_UNIX: 2929 if (currentLen == 0) { 2930 // These constructs are all bad news when they appear at the start 2931 // of a match. Any character can begin the match. 2932 fRXPat->fInitialChars->clear(); 2933 fRXPat->fInitialChars->complement(); 2934 numInitialStrings += 2; 2935 } 2936 currentLen = safeIncrement(currentLen, 1); 2937 atStart = false; 2938 break; 2939 2940 2941 case URX_JMPX: 2942 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 2943 U_FALLTHROUGH; 2944 case URX_JMP: 2945 { 2946 int32_t jmpDest = URX_VAL(op); 2947 if (jmpDest < loc) { 2948 // Loop of some kind. Can safely ignore, the worst that will happen 2949 // is that we understate the true minimum length 2950 currentLen = forwardedLength.elementAti(loc+1); 2951 2952 } else { 2953 // Forward jump. Propagate the current min length to the target loc of the jump. 2954 U_ASSERT(jmpDest <= end+1); 2955 if (forwardedLength.elementAti(jmpDest) > currentLen) { 2956 forwardedLength.setElementAt(currentLen, jmpDest); 2957 } 2958 } 2959 } 2960 atStart = false; 2961 break; 2962 2963 case URX_JMP_SAV: 2964 case URX_JMP_SAV_X: 2965 // Combo of state save to the next loc, + jmp backwards. 2966 // Net effect on min. length computation is nothing. 2967 atStart = false; 2968 break; 2969 2970 case URX_BACKTRACK: 2971 // Fails are kind of like a branch, except that the min length was 2972 // propagated already, by the state save. 2973 currentLen = forwardedLength.elementAti(loc+1); 2974 atStart = false; 2975 break; 2976 2977 2978 case URX_STATE_SAVE: 2979 { 2980 // State Save, for forward jumps, propagate the current minimum. 2981 // of the state save. 2982 int32_t jmpDest = URX_VAL(op); 2983 if (jmpDest > loc) { 2984 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2985 forwardedLength.setElementAt(currentLen, jmpDest); 2986 } 2987 } 2988 } 2989 atStart = false; 2990 break; 2991 2992 2993 2994 2995 case URX_STRING: 2996 { 2997 loc++; 2998 int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 2999 int32_t stringLen = URX_VAL(stringLenOp); 3000 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 3001 U_ASSERT(stringLenOp >= 2); 3002 if (currentLen == 0) { 3003 // Add the starting character of this string to the set of possible starting 3004 // characters for this pattern. 3005 int32_t stringStartIdx = URX_VAL(op); 3006 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 3007 fRXPat->fInitialChars->add(c); 3008 3009 // Remember this string. After the entire pattern has been checked, 3010 // if nothing else is identified that can start a match, we'll use it. 3011 numInitialStrings++; 3012 fRXPat->fInitialStringIdx = stringStartIdx; 3013 fRXPat->fInitialStringLen = stringLen; 3014 } 3015 3016 currentLen = safeIncrement(currentLen, stringLen); 3017 atStart = false; 3018 } 3019 break; 3020 3021 case URX_STRING_I: 3022 { 3023 // Case-insensitive string. Unlike exact-match strings, we won't 3024 // attempt a string search for possible match positions. But we 3025 // do update the set of possible starting characters. 3026 loc++; 3027 int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3028 int32_t stringLen = URX_VAL(stringLenOp); 3029 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 3030 U_ASSERT(stringLenOp >= 2); 3031 if (currentLen == 0) { 3032 // Add the starting character of this string to the set of possible starting 3033 // characters for this pattern. 3034 int32_t stringStartIdx = URX_VAL(op); 3035 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 3036 UnicodeSet s; 3037 findCaseInsensitiveStarters(c, &s); 3038 fRXPat->fInitialChars->addAll(s); 3039 numInitialStrings += 2; // Matching on an initial string not possible. 3040 } 3041 currentLen = safeIncrement(currentLen, stringLen); 3042 atStart = false; 3043 } 3044 break; 3045 3046 case URX_CTR_INIT: 3047 case URX_CTR_INIT_NG: 3048 { 3049 // Loop Init Ops. These don't change the min length, but they are 4 word ops 3050 // so location must be updated accordingly. 3051 // Loop Init Ops. 3052 // If the min loop count == 0 3053 // move loc forwards to the end of the loop, skipping over the body. 3054 // If the min count is > 0, 3055 // continue normal processing of the body of the loop. 3056 int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1)); 3057 loopEndLoc = URX_VAL(loopEndLoc); 3058 int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2)); 3059 if (minLoopCount == 0) { 3060 // Min Loop Count of 0, treat like a forward branch and 3061 // move the current minimum length up to the target 3062 // (end of loop) location. 3063 U_ASSERT(loopEndLoc <= end+1); 3064 if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 3065 forwardedLength.setElementAt(currentLen, loopEndLoc); 3066 } 3067 } 3068 loc+=3; // Skips over operands of CTR_INIT 3069 } 3070 atStart = false; 3071 break; 3072 3073 3074 case URX_CTR_LOOP: 3075 case URX_CTR_LOOP_NG: 3076 // Loop ops. 3077 // The jump is conditional, backwards only. 3078 atStart = false; 3079 break; 3080 3081 case URX_LOOP_C: 3082 // More loop ops. These state-save to themselves. 3083 // don't change the minimum match 3084 atStart = false; 3085 break; 3086 3087 3088 case URX_LA_START: 3089 case URX_LB_START: 3090 { 3091 // Look-around. Scan forward until the matching look-ahead end, 3092 // without processing the look-around block. This is overly pessimistic. 3093 3094 // Keep track of the nesting depth of look-around blocks. Boilerplate code for 3095 // lookahead contains two LA_END instructions, so count goes up by two 3096 // for each LA_START. 3097 int32_t depth = (opType == URX_LA_START? 2: 1); 3098 for (;;) { 3099 loc++; 3100 op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3101 if (URX_TYPE(op) == URX_LA_START) { 3102 depth+=2; 3103 } 3104 if (URX_TYPE(op) == URX_LB_START) { 3105 depth++; 3106 } 3107 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3108 depth--; 3109 if (depth == 0) { 3110 break; 3111 } 3112 } 3113 if (URX_TYPE(op) == URX_STATE_SAVE) { 3114 // Need this because neg lookahead blocks will FAIL to outside 3115 // of the block. 3116 int32_t jmpDest = URX_VAL(op); 3117 if (jmpDest > loc) { 3118 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3119 forwardedLength.setElementAt(currentLen, jmpDest); 3120 } 3121 } 3122 } 3123 U_ASSERT(loc <= end); 3124 } 3125 } 3126 break; 3127 3128 case URX_LA_END: 3129 case URX_LB_CONT: 3130 case URX_LB_END: 3131 case URX_LBN_CONT: 3132 case URX_LBN_END: 3133 UPRV_UNREACHABLE_EXIT; // Shouldn't get here. These ops should be 3134 // consumed by the scan in URX_LA_START and LB_START 3135 default: 3136 UPRV_UNREACHABLE_EXIT; 3137 } 3138 3139 } 3140 3141 3142 // We have finished walking through the ops. Check whether some forward jump 3143 // propagated a shorter length to location end+1. 3144 if (forwardedLength.elementAti(end+1) < currentLen) { 3145 currentLen = forwardedLength.elementAti(end+1); 3146 } 3147 3148 3149 fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 3150 3151 3152 // Sort out what we should check for when looking for candidate match start positions. 3153 // In order of preference, 3154 // 1. Start of input text buffer. 3155 // 2. A literal string. 3156 // 3. Start of line in multi-line mode. 3157 // 4. A single literal character. 3158 // 5. A character from a set of characters. 3159 // 3160 if (fRXPat->fStartType == START_START) { 3161 // Match only at the start of an input text string. 3162 // start type is already set. We're done. 3163 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 3164 // Match beginning only with a literal string. 3165 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 3166 U_ASSERT(fRXPat->fInitialChars->contains(c)); 3167 fRXPat->fStartType = START_STRING; 3168 fRXPat->fInitialChar = c; 3169 } else if (fRXPat->fStartType == START_LINE) { 3170 // Match at start of line in Multi-Line mode. 3171 // Nothing to do here; everything is already set. 3172 } else if (fRXPat->fMinMatchLen == 0) { 3173 // Zero length match possible. We could start anywhere. 3174 fRXPat->fStartType = START_NO_INFO; 3175 } else if (fRXPat->fInitialChars->size() == 1) { 3176 // All matches begin with the same char. 3177 fRXPat->fStartType = START_CHAR; 3178 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 3179 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 3180 } else if (fRXPat->fInitialChars->contains(static_cast<UChar32>(0), static_cast<UChar32>(0x10ffff)) == false && 3181 fRXPat->fMinMatchLen > 0) { 3182 // Matches start with a set of character smaller than the set of all chars. 3183 fRXPat->fStartType = START_SET; 3184 } else { 3185 // Matches can start with anything 3186 fRXPat->fStartType = START_NO_INFO; 3187 } 3188 } 3189 3190 3191 3192 //------------------------------------------------------------------------------ 3193 // 3194 // minMatchLength Calculate the length of the shortest string that could 3195 // match the specified pattern. 3196 // Length is in 16 bit code units, not code points. 3197 // 3198 // The calculated length may not be exact. The returned 3199 // value may be shorter than the actual minimum; it must 3200 // never be longer. 3201 // 3202 // start and end are the range of p-code operations to be 3203 // examined. The endpoints are included in the range. 3204 // 3205 //------------------------------------------------------------------------------ 3206 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 3207 if (U_FAILURE(*fStatus)) { 3208 return 0; 3209 } 3210 3211 U_ASSERT(start <= end); 3212 U_ASSERT(end < fRXPat->fCompiledPat->size()); 3213 3214 3215 int32_t loc; 3216 int32_t op; 3217 int32_t opType; 3218 int32_t currentLen = 0; 3219 3220 3221 // forwardedLength is a vector holding minimum-match-length values that 3222 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 3223 // It must be one longer than the pattern being checked because some ops 3224 // will jmp to a end-of-block+1 location from within a block, and we must 3225 // count those when checking the block. 3226 UVector32 forwardedLength(end+2, *fStatus); 3227 forwardedLength.setSize(end+2); 3228 for (loc=start; loc<=end+1; loc++) { 3229 forwardedLength.setElementAt(INT32_MAX, loc); 3230 } 3231 3232 for (loc = start; loc<=end; loc++) { 3233 op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3234 opType = URX_TYPE(op); 3235 3236 // The loop is advancing linearly through the pattern. 3237 // If the op we are now at was the destination of a branch in the pattern, 3238 // and that path has a shorter minimum length than the current accumulated value, 3239 // replace the current accumulated value. 3240 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 3241 // no-match-possible cases. 3242 if (forwardedLength.elementAti(loc) < currentLen) { 3243 currentLen = forwardedLength.elementAti(loc); 3244 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3245 } 3246 3247 switch (opType) { 3248 // Ops that don't change the total length matched 3249 case URX_RESERVED_OP: 3250 case URX_END: 3251 case URX_STRING_LEN: 3252 case URX_NOP: 3253 case URX_START_CAPTURE: 3254 case URX_END_CAPTURE: 3255 case URX_BACKSLASH_B: 3256 case URX_BACKSLASH_BU: 3257 case URX_BACKSLASH_G: 3258 case URX_BACKSLASH_Z: 3259 case URX_CARET: 3260 case URX_DOLLAR: 3261 case URX_DOLLAR_M: 3262 case URX_DOLLAR_D: 3263 case URX_DOLLAR_MD: 3264 case URX_RELOC_OPRND: 3265 case URX_STO_INP_LOC: 3266 case URX_CARET_M: 3267 case URX_CARET_M_UNIX: 3268 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3269 case URX_BACKREF_I: 3270 3271 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3272 case URX_LD_SP: 3273 3274 case URX_JMP_SAV: 3275 case URX_JMP_SAV_X: 3276 break; 3277 3278 3279 // Ops that match a minimum of one character (one or two 16 bit code units.) 3280 // 3281 case URX_ONECHAR: 3282 case URX_STATIC_SETREF: 3283 case URX_STAT_SETREF_N: 3284 case URX_SETREF: 3285 case URX_BACKSLASH_D: 3286 case URX_BACKSLASH_H: 3287 case URX_BACKSLASH_R: 3288 case URX_BACKSLASH_V: 3289 case URX_ONECHAR_I: 3290 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. 3291 case URX_DOTANY_ALL: // . matches one or two. 3292 case URX_DOTANY: 3293 case URX_DOTANY_UNIX: 3294 currentLen = safeIncrement(currentLen, 1); 3295 break; 3296 3297 3298 case URX_JMPX: 3299 loc++; // URX_JMPX has an extra operand, ignored here, 3300 // otherwise processed identically to URX_JMP. 3301 U_FALLTHROUGH; 3302 case URX_JMP: 3303 { 3304 int32_t jmpDest = URX_VAL(op); 3305 if (jmpDest < loc) { 3306 // Loop of some kind. Can safely ignore, the worst that will happen 3307 // is that we understate the true minimum length 3308 currentLen = forwardedLength.elementAti(loc+1); 3309 } else { 3310 // Forward jump. Propagate the current min length to the target loc of the jump. 3311 U_ASSERT(jmpDest <= end+1); 3312 if (forwardedLength.elementAti(jmpDest) > currentLen) { 3313 forwardedLength.setElementAt(currentLen, jmpDest); 3314 } 3315 } 3316 } 3317 break; 3318 3319 case URX_BACKTRACK: 3320 { 3321 // Back-tracks are kind of like a branch, except that the min length was 3322 // propagated already, by the state save. 3323 currentLen = forwardedLength.elementAti(loc+1); 3324 } 3325 break; 3326 3327 3328 case URX_STATE_SAVE: 3329 { 3330 // State Save, for forward jumps, propagate the current minimum. 3331 // of the state save. 3332 int32_t jmpDest = URX_VAL(op); 3333 if (jmpDest > loc) { 3334 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3335 forwardedLength.setElementAt(currentLen, jmpDest); 3336 } 3337 } 3338 } 3339 break; 3340 3341 3342 case URX_STRING: 3343 { 3344 loc++; 3345 int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3346 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3347 } 3348 break; 3349 3350 3351 case URX_STRING_I: 3352 { 3353 loc++; 3354 // TODO: with full case folding, matching input text may be shorter than 3355 // the string we have here. More smarts could put some bounds on it. 3356 // Assume a min length of one for now. A min length of zero causes 3357 // optimization failures for a pattern like "string"+ 3358 // currentLen += URX_VAL(stringLenOp); 3359 currentLen = safeIncrement(currentLen, 1); 3360 } 3361 break; 3362 3363 case URX_CTR_INIT: 3364 case URX_CTR_INIT_NG: 3365 { 3366 // Loop Init Ops. 3367 // If the min loop count == 0 3368 // move loc forwards to the end of the loop, skipping over the body. 3369 // If the min count is > 0, 3370 // continue normal processing of the body of the loop. 3371 int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1)); 3372 loopEndLoc = URX_VAL(loopEndLoc); 3373 int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2)); 3374 if (minLoopCount == 0) { 3375 loc = loopEndLoc; 3376 } else { 3377 loc+=3; // Skips over operands of CTR_INIT 3378 } 3379 } 3380 break; 3381 3382 3383 case URX_CTR_LOOP: 3384 case URX_CTR_LOOP_NG: 3385 // Loop ops. 3386 // The jump is conditional, backwards only. 3387 break; 3388 3389 case URX_LOOP_SR_I: 3390 case URX_LOOP_DOT_I: 3391 case URX_LOOP_C: 3392 // More loop ops. These state-save to themselves. 3393 // don't change the minimum match - could match nothing at all. 3394 break; 3395 3396 3397 case URX_LA_START: 3398 case URX_LB_START: 3399 { 3400 // Look-around. Scan forward until the matching look-ahead end, 3401 // without processing the look-around block. This is overly pessimistic for look-ahead, 3402 // it assumes that the look-ahead match might be zero-length. 3403 // TODO: Positive lookahead could recursively do the block, then continue 3404 // with the longer of the block or the value coming in. Ticket 6060 3405 int32_t depth = (opType == URX_LA_START? 2: 1); 3406 for (;;) { 3407 loc++; 3408 op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3409 if (URX_TYPE(op) == URX_LA_START) { 3410 // The boilerplate for look-ahead includes two LA_END instructions, 3411 // Depth will be decremented by each one when it is seen. 3412 depth += 2; 3413 } 3414 if (URX_TYPE(op) == URX_LB_START) { 3415 depth++; 3416 } 3417 if (URX_TYPE(op) == URX_LA_END) { 3418 depth--; 3419 if (depth == 0) { 3420 break; 3421 } 3422 } 3423 if (URX_TYPE(op)==URX_LBN_END) { 3424 depth--; 3425 if (depth == 0) { 3426 break; 3427 } 3428 } 3429 if (URX_TYPE(op) == URX_STATE_SAVE) { 3430 // Need this because neg lookahead blocks will FAIL to outside 3431 // of the block. 3432 int32_t jmpDest = URX_VAL(op); 3433 if (jmpDest > loc) { 3434 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3435 forwardedLength.setElementAt(currentLen, jmpDest); 3436 } 3437 } 3438 } 3439 U_ASSERT(loc <= end); 3440 } 3441 } 3442 break; 3443 3444 case URX_LA_END: 3445 case URX_LB_CONT: 3446 case URX_LB_END: 3447 case URX_LBN_CONT: 3448 case URX_LBN_END: 3449 // Only come here if the matching URX_LA_START or URX_LB_START was not in the 3450 // range being sized, which happens when measuring size of look-behind blocks. 3451 break; 3452 3453 default: 3454 UPRV_UNREACHABLE_EXIT; 3455 } 3456 3457 } 3458 3459 // We have finished walking through the ops. Check whether some forward jump 3460 // propagated a shorter length to location end+1. 3461 if (forwardedLength.elementAti(end+1) < currentLen) { 3462 currentLen = forwardedLength.elementAti(end+1); 3463 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3464 } 3465 3466 return currentLen; 3467 } 3468 3469 //------------------------------------------------------------------------------ 3470 // 3471 // maxMatchLength Calculate the length of the longest string that could 3472 // match the specified pattern. 3473 // Length is in 16 bit code units, not code points. 3474 // 3475 // The calculated length may not be exact. The returned 3476 // value may be longer than the actual maximum; it must 3477 // never be shorter. 3478 // 3479 // start, end: the range of the pattern to check. 3480 // end is inclusive. 3481 // 3482 //------------------------------------------------------------------------------ 3483 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 3484 if (U_FAILURE(*fStatus)) { 3485 return 0; 3486 } 3487 U_ASSERT(start <= end); 3488 U_ASSERT(end < fRXPat->fCompiledPat->size()); 3489 3490 int32_t loc; 3491 int32_t op; 3492 int32_t opType; 3493 int32_t currentLen = 0; 3494 UVector32 forwardedLength(end+1, *fStatus); 3495 forwardedLength.setSize(end+1); 3496 3497 for (loc=start; loc<=end; loc++) { 3498 forwardedLength.setElementAt(0, loc); 3499 } 3500 3501 for (loc = start; loc<=end; loc++) { 3502 op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3503 opType = URX_TYPE(op); 3504 3505 // The loop is advancing linearly through the pattern. 3506 // If the op we are now at was the destination of a branch in the pattern, 3507 // and that path has a longer maximum length than the current accumulated value, 3508 // replace the current accumulated value. 3509 if (forwardedLength.elementAti(loc) > currentLen) { 3510 currentLen = forwardedLength.elementAti(loc); 3511 } 3512 3513 switch (opType) { 3514 // Ops that don't change the total length matched 3515 case URX_RESERVED_OP: 3516 case URX_END: 3517 case URX_STRING_LEN: 3518 case URX_NOP: 3519 case URX_START_CAPTURE: 3520 case URX_END_CAPTURE: 3521 case URX_BACKSLASH_B: 3522 case URX_BACKSLASH_BU: 3523 case URX_BACKSLASH_G: 3524 case URX_BACKSLASH_Z: 3525 case URX_CARET: 3526 case URX_DOLLAR: 3527 case URX_DOLLAR_M: 3528 case URX_DOLLAR_D: 3529 case URX_DOLLAR_MD: 3530 case URX_RELOC_OPRND: 3531 case URX_STO_INP_LOC: 3532 case URX_CARET_M: 3533 case URX_CARET_M_UNIX: 3534 3535 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3536 case URX_LD_SP: 3537 3538 case URX_LB_END: 3539 case URX_LB_CONT: 3540 case URX_LBN_CONT: 3541 case URX_LBN_END: 3542 break; 3543 3544 3545 // Ops that increase that cause an unbounded increase in the length 3546 // of a matched string, or that increase it a hard to characterize way. 3547 // Call the max length unbounded, and stop further checking. 3548 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3549 case URX_BACKREF_I: 3550 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. 3551 currentLen = INT32_MAX; 3552 break; 3553 3554 3555 // Ops that match a max of one character (possibly two 16 bit code units.) 3556 // 3557 case URX_STATIC_SETREF: 3558 case URX_STAT_SETREF_N: 3559 case URX_SETREF: 3560 case URX_BACKSLASH_D: 3561 case URX_BACKSLASH_H: 3562 case URX_BACKSLASH_R: 3563 case URX_BACKSLASH_V: 3564 case URX_ONECHAR_I: 3565 case URX_DOTANY_ALL: 3566 case URX_DOTANY: 3567 case URX_DOTANY_UNIX: 3568 currentLen = safeIncrement(currentLen, 2); 3569 break; 3570 3571 // Single literal character. Increase current max length by one or two, 3572 // depending on whether the char is in the supplementary range. 3573 case URX_ONECHAR: 3574 currentLen = safeIncrement(currentLen, 1); 3575 if (URX_VAL(op) > 0x10000) { 3576 currentLen = safeIncrement(currentLen, 1); 3577 } 3578 break; 3579 3580 // Jumps. 3581 // 3582 case URX_JMP: 3583 case URX_JMPX: 3584 case URX_JMP_SAV: 3585 case URX_JMP_SAV_X: 3586 { 3587 int32_t jmpDest = URX_VAL(op); 3588 if (jmpDest < loc) { 3589 // Loop of some kind. Max match length is unbounded. 3590 currentLen = INT32_MAX; 3591 } else { 3592 // Forward jump. Propagate the current min length to the target loc of the jump. 3593 if (forwardedLength.elementAti(jmpDest) < currentLen) { 3594 forwardedLength.setElementAt(currentLen, jmpDest); 3595 } 3596 currentLen = 0; 3597 } 3598 } 3599 break; 3600 3601 case URX_BACKTRACK: 3602 // back-tracks are kind of like a branch, except that the max length was 3603 // propagated already, by the state save. 3604 currentLen = forwardedLength.elementAti(loc+1); 3605 break; 3606 3607 3608 case URX_STATE_SAVE: 3609 { 3610 // State Save, for forward jumps, propagate the current minimum. 3611 // of the state save. 3612 // For backwards jumps, they create a loop, maximum 3613 // match length is unbounded. 3614 int32_t jmpDest = URX_VAL(op); 3615 if (jmpDest > loc) { 3616 if (currentLen > forwardedLength.elementAti(jmpDest)) { 3617 forwardedLength.setElementAt(currentLen, jmpDest); 3618 } 3619 } else { 3620 currentLen = INT32_MAX; 3621 } 3622 } 3623 break; 3624 3625 3626 3627 3628 case URX_STRING: 3629 { 3630 loc++; 3631 int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3632 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3633 break; 3634 } 3635 3636 case URX_STRING_I: 3637 // TODO: This code assumes that any user string that matches will be no longer 3638 // than our compiled string, with case insensitive matching. 3639 // Our compiled string has been case-folded already. 3640 // 3641 // Any matching user string will have no more code points than our 3642 // compiled (folded) string. Folding may add code points, but 3643 // not remove them. 3644 // 3645 // There is a potential problem if a supplemental code point 3646 // case-folds to a BMP code point. In this case our compiled string 3647 // could be shorter (in code units) than a matching user string. 3648 // 3649 // At this time (Unicode 6.1) there are no such characters, and this case 3650 // is not being handled. A test, intltest regex/Bug9283, will fail if 3651 // any problematic characters are added to Unicode. 3652 // 3653 // If this happens, we can make a set of the BMP chars that the 3654 // troublesome supplementals fold to, scan our string, and bump the 3655 // currentLen one extra for each that is found. 3656 // 3657 { 3658 loc++; 3659 int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3660 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3661 } 3662 break; 3663 3664 case URX_CTR_INIT: 3665 case URX_CTR_INIT_NG: 3666 // For Loops, recursively call this function on the pattern for the loop body, 3667 // then multiply the result by the maximum loop count. 3668 { 3669 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1)); 3670 if (loopEndLoc == loc+4) { 3671 // Loop has an empty body. No affect on max match length. 3672 // Continue processing with code after the loop end. 3673 loc = loopEndLoc; 3674 break; 3675 } 3676 3677 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3)); 3678 if (maxLoopCount == -1) { 3679 // Unbounded Loop. No upper bound on match length. 3680 currentLen = INT32_MAX; 3681 break; 3682 } 3683 3684 U_ASSERT(loopEndLoc >= loc+4); 3685 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. 3686 int64_t updatedLen = static_cast<int64_t>(currentLen) + blockLen * maxLoopCount; 3687 if (updatedLen >= INT32_MAX) { 3688 currentLen = INT32_MAX; 3689 break; 3690 } 3691 currentLen = static_cast<int32_t>(updatedLen); 3692 loc = loopEndLoc; 3693 break; 3694 } 3695 3696 case URX_CTR_LOOP: 3697 case URX_CTR_LOOP_NG: 3698 // These opcodes will be skipped over by code for URX_CTR_INIT. 3699 // We shouldn't encounter them here. 3700 UPRV_UNREACHABLE_EXIT; 3701 3702 case URX_LOOP_SR_I: 3703 case URX_LOOP_DOT_I: 3704 case URX_LOOP_C: 3705 // For anything to do with loops, make the match length unbounded. 3706 currentLen = INT32_MAX; 3707 break; 3708 3709 3710 3711 case URX_LA_START: 3712 case URX_LA_END: 3713 // Look-ahead. Just ignore, treat the look-ahead block as if 3714 // it were normal pattern. Gives a too-long match length, 3715 // but good enough for now. 3716 break; 3717 3718 // End of look-ahead ops should always be consumed by the processing at 3719 // the URX_LA_START op. 3720 // UPRV_UNREACHABLE_EXIT; 3721 3722 case URX_LB_START: 3723 { 3724 // Look-behind. Scan forward until the matching look-around end, 3725 // without processing the look-behind block. 3726 int32_t dataLoc = URX_VAL(op); 3727 for (loc = loc + 1; loc <= end; ++loc) { 3728 op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3729 int32_t opType = URX_TYPE(op); 3730 if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) { 3731 break; 3732 } 3733 } 3734 U_ASSERT(loc <= end); 3735 } 3736 break; 3737 3738 default: 3739 UPRV_UNREACHABLE_EXIT; 3740 } 3741 3742 3743 if (currentLen == INT32_MAX) { 3744 // The maximum length is unbounded. 3745 // Stop further processing of the pattern. 3746 break; 3747 } 3748 3749 } 3750 return currentLen; 3751 3752 } 3753 3754 3755 //------------------------------------------------------------------------------ 3756 // 3757 // stripNOPs Remove any NOP operations from the compiled pattern code. 3758 // Extra NOPs are inserted for some constructs during the initial 3759 // code generation to provide locations that may be patched later. 3760 // Many end up unneeded, and are removed by this function. 3761 // 3762 // In order to minimize the number of passes through the pattern, 3763 // back-reference fixup is also performed here (adjusting 3764 // back-reference operands to point to the correct frame offsets). 3765 // 3766 //------------------------------------------------------------------------------ 3767 void RegexCompile::stripNOPs() { 3768 3769 if (U_FAILURE(*fStatus)) { 3770 return; 3771 } 3772 3773 int32_t end = fRXPat->fCompiledPat->size(); 3774 UVector32 deltas(end, *fStatus); 3775 3776 // Make a first pass over the code, computing the amount that things 3777 // will be offset at each location in the original code. 3778 int32_t loc; 3779 int32_t d = 0; 3780 for (loc=0; loc<end; loc++) { 3781 deltas.addElement(d, *fStatus); 3782 int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc)); 3783 if (URX_TYPE(op) == URX_NOP) { 3784 d++; 3785 } 3786 } 3787 3788 UnicodeString caseStringBuffer; 3789 3790 // Make a second pass over the code, removing the NOPs by moving following 3791 // code up, and patching operands that refer to code locations that 3792 // are being moved. The array of offsets from the first step is used 3793 // to compute the new operand values. 3794 int32_t src; 3795 int32_t dst = 0; 3796 for (src=0; src<end; src++) { 3797 int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(src)); 3798 int32_t opType = URX_TYPE(op); 3799 switch (opType) { 3800 case URX_NOP: 3801 break; 3802 3803 case URX_STATE_SAVE: 3804 case URX_JMP: 3805 case URX_CTR_LOOP: 3806 case URX_CTR_LOOP_NG: 3807 case URX_RELOC_OPRND: 3808 case URX_JMPX: 3809 case URX_JMP_SAV: 3810 case URX_JMP_SAV_X: 3811 // These are instructions with operands that refer to code locations. 3812 { 3813 int32_t operandAddress = URX_VAL(op); 3814 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 3815 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 3816 op = buildOp(opType, fixedOperandAddress); 3817 fRXPat->fCompiledPat->setElementAt(op, dst); 3818 dst++; 3819 break; 3820 } 3821 3822 case URX_BACKREF: 3823 case URX_BACKREF_I: 3824 { 3825 int32_t where = URX_VAL(op); 3826 if (where > fRXPat->fGroupMap->size()) { 3827 error(U_REGEX_INVALID_BACK_REF); 3828 break; 3829 } 3830 where = fRXPat->fGroupMap->elementAti(where-1); 3831 op = buildOp(opType, where); 3832 fRXPat->fCompiledPat->setElementAt(op, dst); 3833 dst++; 3834 3835 fRXPat->fNeedsAltInput = true; 3836 break; 3837 } 3838 case URX_RESERVED_OP: 3839 case URX_RESERVED_OP_N: 3840 case URX_BACKTRACK: 3841 case URX_END: 3842 case URX_ONECHAR: 3843 case URX_STRING: 3844 case URX_STRING_LEN: 3845 case URX_START_CAPTURE: 3846 case URX_END_CAPTURE: 3847 case URX_STATIC_SETREF: 3848 case URX_STAT_SETREF_N: 3849 case URX_SETREF: 3850 case URX_DOTANY: 3851 case URX_FAIL: 3852 case URX_BACKSLASH_B: 3853 case URX_BACKSLASH_BU: 3854 case URX_BACKSLASH_G: 3855 case URX_BACKSLASH_X: 3856 case URX_BACKSLASH_Z: 3857 case URX_DOTANY_ALL: 3858 case URX_BACKSLASH_D: 3859 case URX_CARET: 3860 case URX_DOLLAR: 3861 case URX_CTR_INIT: 3862 case URX_CTR_INIT_NG: 3863 case URX_DOTANY_UNIX: 3864 case URX_STO_SP: 3865 case URX_LD_SP: 3866 case URX_STO_INP_LOC: 3867 case URX_LA_START: 3868 case URX_LA_END: 3869 case URX_ONECHAR_I: 3870 case URX_STRING_I: 3871 case URX_DOLLAR_M: 3872 case URX_CARET_M: 3873 case URX_CARET_M_UNIX: 3874 case URX_LB_START: 3875 case URX_LB_CONT: 3876 case URX_LB_END: 3877 case URX_LBN_CONT: 3878 case URX_LBN_END: 3879 case URX_LOOP_SR_I: 3880 case URX_LOOP_DOT_I: 3881 case URX_LOOP_C: 3882 case URX_DOLLAR_D: 3883 case URX_DOLLAR_MD: 3884 case URX_BACKSLASH_H: 3885 case URX_BACKSLASH_R: 3886 case URX_BACKSLASH_V: 3887 // These instructions are unaltered by the relocation. 3888 fRXPat->fCompiledPat->setElementAt(op, dst); 3889 dst++; 3890 break; 3891 3892 default: 3893 // Some op is unaccounted for. 3894 UPRV_UNREACHABLE_EXIT; 3895 } 3896 } 3897 3898 fRXPat->fCompiledPat->setSize(dst); 3899 } 3900 3901 3902 3903 3904 //------------------------------------------------------------------------------ 3905 // 3906 // Error Report a rule parse error. 3907 // Only report it if no previous error has been recorded. 3908 // 3909 //------------------------------------------------------------------------------ 3910 void RegexCompile::error(UErrorCode e) { 3911 if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) { 3912 *fStatus = e; 3913 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 3914 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 3915 // int64_t. If the values of the latter are out of range for the former, 3916 // set them to the appropriate "field not supported" values. 3917 if (fLineNum > 0x7FFFFFFF) { 3918 fParseErr->line = 0; 3919 fParseErr->offset = -1; 3920 } else if (fCharNum > 0x7FFFFFFF) { 3921 fParseErr->line = static_cast<int32_t>(fLineNum); 3922 fParseErr->offset = -1; 3923 } else { 3924 fParseErr->line = static_cast<int32_t>(fLineNum); 3925 fParseErr->offset = static_cast<int32_t>(fCharNum); 3926 } 3927 3928 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 3929 3930 // Fill in the context. 3931 // Note: extractBetween() pins supplied indices to the string bounds. 3932 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3933 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 3934 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 3935 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3936 } 3937 } 3938 3939 3940 // 3941 // Assorted Unicode character constants. 3942 // Numeric because there is no portable way to enter them as literals. 3943 // (Think EBCDIC). 3944 // 3945 static const char16_t chCR = 0x0d; // New lines, for terminating comments. 3946 static const char16_t chLF = 0x0a; // Line Feed 3947 static const char16_t chPound = 0x23; // '#', introduces a comment. 3948 static const char16_t chDigit0 = 0x30; // '0' 3949 static const char16_t chDigit7 = 0x37; // '9' 3950 static const char16_t chColon = 0x3A; // ':' 3951 static const char16_t chE = 0x45; // 'E' 3952 static const char16_t chQ = 0x51; // 'Q' 3953 //static const char16_t chN = 0x4E; // 'N' 3954 static const char16_t chP = 0x50; // 'P' 3955 static const char16_t chBackSlash = 0x5c; // '\' introduces a char escape 3956 //static const char16_t chLBracket = 0x5b; // '[' 3957 static const char16_t chRBracket = 0x5d; // ']' 3958 static const char16_t chUp = 0x5e; // '^' 3959 static const char16_t chLowerP = 0x70; 3960 static const char16_t chLBrace = 0x7b; // '{' 3961 static const char16_t chRBrace = 0x7d; // '}' 3962 static const char16_t chNEL = 0x85; // NEL newline variant 3963 static const char16_t chLS = 0x2028; // Unicode Line Separator 3964 3965 3966 //------------------------------------------------------------------------------ 3967 // 3968 // nextCharLL Low Level Next Char from the regex pattern. 3969 // Get a char from the string, keep track of input position 3970 // for error reporting. 3971 // 3972 //------------------------------------------------------------------------------ 3973 UChar32 RegexCompile::nextCharLL() { 3974 UChar32 ch; 3975 3976 if (fPeekChar != -1) { 3977 ch = fPeekChar; 3978 fPeekChar = -1; 3979 return ch; 3980 } 3981 3982 // assume we're already in the right place 3983 ch = UTEXT_NEXT32(fRXPat->fPattern); 3984 if (ch == U_SENTINEL) { 3985 return ch; 3986 } 3987 3988 if (ch == chCR || 3989 ch == chNEL || 3990 ch == chLS || 3991 (ch == chLF && fLastChar != chCR)) { 3992 // Character is starting a new line. Bump up the line number, and 3993 // reset the column to 0. 3994 fLineNum++; 3995 fCharNum=0; 3996 } 3997 else { 3998 // Character is not starting a new line. Except in the case of a 3999 // LF following a CR, increment the column position. 4000 if (ch != chLF) { 4001 fCharNum++; 4002 } 4003 } 4004 fLastChar = ch; 4005 return ch; 4006 } 4007 4008 //------------------------------------------------------------------------------ 4009 // 4010 // peekCharLL Low Level Character Scanning, sneak a peek at the next 4011 // character without actually getting it. 4012 // 4013 //------------------------------------------------------------------------------ 4014 UChar32 RegexCompile::peekCharLL() { 4015 if (fPeekChar == -1) { 4016 fPeekChar = nextCharLL(); 4017 } 4018 return fPeekChar; 4019 } 4020 4021 4022 //------------------------------------------------------------------------------ 4023 // 4024 // nextChar for pattern scanning. At this level, we handle stripping 4025 // out comments and processing some backslash character escapes. 4026 // The rest of the pattern grammar is handled at the next level up. 4027 // 4028 //------------------------------------------------------------------------------ 4029 void RegexCompile::nextChar(RegexPatternChar &c) { 4030 tailRecursion: 4031 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 4032 c.fChar = nextCharLL(); 4033 c.fQuoted = false; 4034 4035 if (fQuoteMode) { 4036 c.fQuoted = true; 4037 if ((c.fChar == chBackSlash && peekCharLL() == chE && ((fModeFlags & UREGEX_LITERAL) == 0)) || 4038 c.fChar == static_cast<UChar32>(-1)) { 4039 fQuoteMode = false; // Exit quote mode, 4040 nextCharLL(); // discard the E 4041 // nextChar(c); // recurse to get the real next char 4042 goto tailRecursion; // Note: fuzz testing produced testcases that 4043 // resulted in stack overflow here. 4044 } 4045 } 4046 else if (fInBackslashQuote) { 4047 // The current character immediately follows a '\' 4048 // Don't check for any further escapes, just return it as-is. 4049 // Don't set c.fQuoted, because that would prevent the state machine from 4050 // dispatching on the character. 4051 fInBackslashQuote = false; 4052 } 4053 else 4054 { 4055 // We are not in a \Q quoted region \E of the source. 4056 // 4057 if (fModeFlags & UREGEX_COMMENTS) { 4058 // 4059 // We are in free-spacing and comments mode. 4060 // Scan through any white space and comments, until we 4061 // reach a significant character or the end of input. 4062 for (;;) { 4063 if (c.fChar == static_cast<UChar32>(-1)) { 4064 break; // End of Input 4065 } 4066 if (c.fChar == chPound && fEOLComments) { 4067 // Start of a comment. Consume the rest of it, until EOF or a new line 4068 for (;;) { 4069 c.fChar = nextCharLL(); 4070 if (c.fChar == static_cast<UChar32>(-1) || // EOF 4071 c.fChar == chCR || 4072 c.fChar == chLF || 4073 c.fChar == chNEL || 4074 c.fChar == chLS) { 4075 break; 4076 } 4077 } 4078 } 4079 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. 4080 if (PatternProps::isWhiteSpace(c.fChar) == false) { 4081 break; 4082 } 4083 c.fChar = nextCharLL(); 4084 } 4085 } 4086 4087 // 4088 // check for backslash escaped characters. 4089 // 4090 if (c.fChar == chBackSlash) { 4091 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 4092 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { 4093 // 4094 // A '\' sequence that is handled by ICU's standard unescapeAt function. 4095 // Includes \uxxxx, \n, \r, many others. 4096 // Return the single equivalent character. 4097 // 4098 nextCharLL(); // get & discard the peeked char. 4099 c.fQuoted = true; 4100 4101 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 4102 int32_t endIndex = static_cast<int32_t>(pos); 4103 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, static_cast<int32_t>(fPatternLength), const_cast<char16_t*>(fRXPat->fPattern->chunkContents)); 4104 4105 if (endIndex == pos) { 4106 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 4107 } 4108 fCharNum += endIndex - pos; 4109 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 4110 } else { 4111 int32_t offset = 0; 4112 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); 4113 4114 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 4115 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 4116 4117 if (offset == 0) { 4118 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 4119 } else if (context.lastOffset == offset) { 4120 UTEXT_PREVIOUS32(fRXPat->fPattern); 4121 } else if (context.lastOffset != offset-1) { 4122 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); 4123 } 4124 fCharNum += offset; 4125 } 4126 } 4127 else if (peekCharLL() == chDigit0) { 4128 // Octal Escape, using Java Regexp Conventions 4129 // which are \0 followed by 1-3 octal digits. 4130 // Different from ICU Unescape handling of Octal, which does not 4131 // require the leading 0. 4132 // Java also has the convention of only consuming 2 octal digits if 4133 // the three digit number would be > 0xff 4134 // 4135 c.fChar = 0; 4136 nextCharLL(); // Consume the initial 0. 4137 int index; 4138 for (index=0; index<3; index++) { 4139 int32_t ch = peekCharLL(); 4140 if (ch<chDigit0 || ch>chDigit7) { 4141 if (index==0) { 4142 // \0 is not followed by any octal digits. 4143 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 4144 } 4145 break; 4146 } 4147 c.fChar <<= 3; 4148 c.fChar += ch&7; 4149 if (c.fChar <= 255) { 4150 nextCharLL(); 4151 } else { 4152 // The last digit made the number too big. Forget we saw it. 4153 c.fChar >>= 3; 4154 } 4155 } 4156 c.fQuoted = true; 4157 } 4158 else if (peekCharLL() == chQ) { 4159 // "\Q" enter quote mode, which will continue until "\E" 4160 fQuoteMode = true; 4161 nextCharLL(); // discard the 'Q'. 4162 // nextChar(c); // recurse to get the real next char. 4163 goto tailRecursion; // Note: fuzz testing produced test cases that 4164 // resulted in stack overflow here. 4165 } 4166 else 4167 { 4168 // We are in a '\' escape that will be handled by the state table scanner. 4169 // Just return the backslash, but remember that the following char is to 4170 // be taken literally. 4171 fInBackslashQuote = true; 4172 } 4173 } 4174 } 4175 4176 // re-enable # to end-of-line comments, in case they were disabled. 4177 // They are disabled by the parser upon seeing '(?', but this lasts for 4178 // the fetching of the next character only. 4179 fEOLComments = true; 4180 4181 // putc(c.fChar, stdout); 4182 } 4183 4184 4185 4186 //------------------------------------------------------------------------------ 4187 // 4188 // scanNamedChar 4189 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. 4190 // 4191 // The scan position will be at the 'N'. On return 4192 // the scan position should be just after the '}' 4193 // 4194 // Return the UChar32 4195 // 4196 //------------------------------------------------------------------------------ 4197 UChar32 RegexCompile::scanNamedChar() { 4198 if (U_FAILURE(*fStatus)) { 4199 return 0; 4200 } 4201 4202 nextChar(fC); 4203 if (fC.fChar != chLBrace) { 4204 error(U_REGEX_PROPERTY_SYNTAX); 4205 return 0; 4206 } 4207 4208 UnicodeString charName; 4209 for (;;) { 4210 nextChar(fC); 4211 if (fC.fChar == chRBrace) { 4212 break; 4213 } 4214 if (fC.fChar == -1) { 4215 error(U_REGEX_PROPERTY_SYNTAX); 4216 return 0; 4217 } 4218 charName.append(fC.fChar); 4219 } 4220 4221 char name[100]; 4222 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 4223 static_cast<uint32_t>(charName.length()) >= sizeof(name)) { 4224 // All Unicode character names have only invariant characters. 4225 // The API to get a character, given a name, accepts only char *, forcing us to convert, 4226 // which requires this error check 4227 error(U_REGEX_PROPERTY_SYNTAX); 4228 return 0; 4229 } 4230 charName.extract(0, charName.length(), name, sizeof(name), US_INV); 4231 4232 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 4233 if (U_FAILURE(*fStatus)) { 4234 error(U_REGEX_PROPERTY_SYNTAX); 4235 } 4236 4237 nextChar(fC); // Continue overall regex pattern processing with char after the '}' 4238 return theChar; 4239 } 4240 4241 //------------------------------------------------------------------------------ 4242 // 4243 // scanProp Construct a UnicodeSet from the text at the current scan 4244 // position, which will be of the form \p{whaterver} 4245 // 4246 // The scan position will be at the 'p' or 'P'. On return 4247 // the scan position should be just after the '}' 4248 // 4249 // Return a UnicodeSet, constructed from the \P pattern, 4250 // or nullptr if the pattern is invalid. 4251 // 4252 //------------------------------------------------------------------------------ 4253 UnicodeSet *RegexCompile::scanProp() { 4254 UnicodeSet *uset = nullptr; 4255 4256 if (U_FAILURE(*fStatus)) { 4257 return nullptr; 4258 } 4259 (void)chLowerP; // Suppress compiler unused variable warning. 4260 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 4261 UBool negated = (fC.fChar == chP); 4262 4263 UnicodeString propertyName; 4264 nextChar(fC); 4265 if (fC.fChar != chLBrace) { 4266 error(U_REGEX_PROPERTY_SYNTAX); 4267 return nullptr; 4268 } 4269 for (;;) { 4270 nextChar(fC); 4271 if (fC.fChar == chRBrace) { 4272 break; 4273 } 4274 if (fC.fChar == -1) { 4275 // Hit the end of the input string without finding the closing '}' 4276 error(U_REGEX_PROPERTY_SYNTAX); 4277 return nullptr; 4278 } 4279 propertyName.append(fC.fChar); 4280 } 4281 uset = createSetForProperty(propertyName, negated); 4282 nextChar(fC); // Move input scan to position following the closing '}' 4283 return uset; 4284 } 4285 4286 //------------------------------------------------------------------------------ 4287 // 4288 // scanPosixProp Construct a UnicodeSet from the text at the current scan 4289 // position, which is expected be of the form [:property expression:] 4290 // 4291 // The scan position will be at the opening ':'. On return 4292 // the scan position must be on the closing ']' 4293 // 4294 // Return a UnicodeSet constructed from the pattern, 4295 // or nullptr if this is not a valid POSIX-style set expression. 4296 // If not a property expression, restore the initial scan position 4297 // (to the opening ':') 4298 // 4299 // Note: the opening '[:' is not sufficient to guarantee that 4300 // this is a [:property:] expression. 4301 // [:'+=,] is a perfectly good ordinary set expression that 4302 // happens to include ':' as one of its characters. 4303 // 4304 //------------------------------------------------------------------------------ 4305 UnicodeSet *RegexCompile::scanPosixProp() { 4306 UnicodeSet *uset = nullptr; 4307 4308 if (U_FAILURE(*fStatus)) { 4309 return nullptr; 4310 } 4311 4312 U_ASSERT(fC.fChar == chColon); 4313 4314 // Save the scanner state. 4315 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 4316 int64_t savedScanIndex = fScanIndex; 4317 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 4318 UBool savedQuoteMode = fQuoteMode; 4319 UBool savedInBackslashQuote = fInBackslashQuote; 4320 UBool savedEOLComments = fEOLComments; 4321 int64_t savedLineNum = fLineNum; 4322 int64_t savedCharNum = fCharNum; 4323 UChar32 savedLastChar = fLastChar; 4324 UChar32 savedPeekChar = fPeekChar; 4325 RegexPatternChar savedfC = fC; 4326 4327 // Scan for a closing ]. A little tricky because there are some perverse 4328 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 4329 // ending on the second closing ]. 4330 4331 UnicodeString propName; 4332 UBool negated = false; 4333 4334 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 4335 nextChar(fC); 4336 if (fC.fChar == chUp) { 4337 negated = true; 4338 nextChar(fC); 4339 } 4340 4341 // Scan for the closing ":]", collecting the property name along the way. 4342 UBool sawPropSetTerminator = false; 4343 for (;;) { 4344 propName.append(fC.fChar); 4345 nextChar(fC); 4346 if (fC.fQuoted || fC.fChar == -1) { 4347 // Escaped characters or end of input - either says this isn't a [:Property:] 4348 break; 4349 } 4350 if (fC.fChar == chColon) { 4351 nextChar(fC); 4352 if (fC.fChar == chRBracket) { 4353 sawPropSetTerminator = true; 4354 } 4355 break; 4356 } 4357 } 4358 4359 if (sawPropSetTerminator) { 4360 uset = createSetForProperty(propName, negated); 4361 } 4362 else 4363 { 4364 // No closing ":]". 4365 // Restore the original scan position. 4366 // The main scanner will retry the input as a normal set expression, 4367 // not a [:Property:] expression. 4368 fScanIndex = savedScanIndex; 4369 fQuoteMode = savedQuoteMode; 4370 fInBackslashQuote = savedInBackslashQuote; 4371 fEOLComments = savedEOLComments; 4372 fLineNum = savedLineNum; 4373 fCharNum = savedCharNum; 4374 fLastChar = savedLastChar; 4375 fPeekChar = savedPeekChar; 4376 fC = savedfC; 4377 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 4378 } 4379 return uset; 4380 } 4381 4382 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4383 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4384 addCategory(set, U_GC_CF_MASK, ec); 4385 } 4386 4387 // 4388 // Create a Unicode Set from a Unicode Property expression. 4389 // This is common code underlying both \p{...} and [:...:] expressions. 4390 // Includes trying the Java "properties" that aren't supported as 4391 // normal ICU UnicodeSet properties 4392 // 4393 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 4394 4395 if (U_FAILURE(*fStatus)) { 4396 return nullptr; 4397 } 4398 LocalPointer<UnicodeSet> set; 4399 UErrorCode status = U_ZERO_ERROR; 4400 4401 do { // non-loop, exists to allow breaks from the block. 4402 // 4403 // First try the property as we received it 4404 // 4405 UnicodeString setExpr; 4406 uint32_t usetFlags = 0; 4407 setExpr.append(u"[\\p{", -1); 4408 setExpr.append(propName); 4409 setExpr.append(u"}]", -1); 4410 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4411 usetFlags |= USET_CASE_INSENSITIVE; 4412 } 4413 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status); 4414 if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) { 4415 break; 4416 } 4417 4418 // 4419 // The incoming property wasn't directly recognized by ICU. 4420 4421 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet. 4422 // Java accepts 'word' with mixed case. 4423 // Java accepts 'all' only in all lower case. 4424 4425 status = U_ZERO_ERROR; 4426 if (propName.caseCompare(u"word", -1, 0) == 0) { 4427 set.adoptInsteadAndCheckErrorCode( 4428 RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status); 4429 break; 4430 } 4431 if (propName.compare(u"all", -1) == 0) { 4432 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status); 4433 break; 4434 } 4435 4436 4437 // Do Java InBlock expressions 4438 // 4439 UnicodeString mPropName = propName; 4440 if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) { 4441 status = U_ZERO_ERROR; 4442 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); 4443 if (U_FAILURE(status)) { 4444 break; 4445 } 4446 UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed. 4447 set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status); 4448 break; 4449 } 4450 4451 // Check for the Java form "IsBooleanPropertyValue", which we will recast 4452 // as "BooleanPropertyValue". The property value can be either a 4453 // a General Category or a Script Name. 4454 4455 if (propName.startsWith(u"Is", 2) && propName.length()>=3) { 4456 mPropName.remove(0, 2); // Strip the "Is" 4457 if (mPropName.indexOf(u'=') >= 0) { 4458 // Reject any "Is..." property expression containing an '=', that is, 4459 // any non-binary property expression. 4460 status = U_REGEX_PROPERTY_SYNTAX; 4461 break; 4462 } 4463 4464 if (mPropName.caseCompare(u"assigned", -1, 0) == 0) { 4465 mPropName.setTo(u"unassigned", -1); 4466 negated = !negated; 4467 } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) { 4468 mPropName.setTo(u"Titlecase_Letter", -1); 4469 } 4470 4471 mPropName.insert(0, u"[\\p{", -1); 4472 mPropName.append(u"}]", -1); 4473 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status); 4474 4475 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { 4476 set->closeOver(USET_CASE_INSENSITIVE); 4477 } 4478 break; 4479 4480 } 4481 4482 if (propName.startsWith(u"java", -1)) { 4483 status = U_ZERO_ERROR; 4484 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); 4485 if (U_FAILURE(status)) { 4486 break; 4487 } 4488 // 4489 // Try the various Java specific properties. 4490 // These all begin with "java" 4491 // 4492 if (propName.compare(u"javaDefined", -1) == 0) { 4493 addCategory(set.getAlias(), U_GC_CN_MASK, status); 4494 set->complement(); 4495 } 4496 else if (propName.compare(u"javaDigit", -1) == 0) { 4497 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4498 } 4499 else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) { 4500 addIdentifierIgnorable(set.getAlias(), status); 4501 } 4502 else if (propName.compare(u"javaISOControl", -1) == 0) { 4503 set->add(0, 0x1F).add(0x7F, 0x9F); 4504 } 4505 else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) { 4506 addCategory(set.getAlias(), U_GC_L_MASK, status); 4507 addCategory(set.getAlias(), U_GC_SC_MASK, status); 4508 addCategory(set.getAlias(), U_GC_PC_MASK, status); 4509 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4510 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4511 addCategory(set.getAlias(), U_GC_MC_MASK, status); 4512 addCategory(set.getAlias(), U_GC_MN_MASK, status); 4513 addIdentifierIgnorable(set.getAlias(), status); 4514 } 4515 else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) { 4516 addCategory(set.getAlias(), U_GC_L_MASK, status); 4517 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4518 addCategory(set.getAlias(), U_GC_SC_MASK, status); 4519 addCategory(set.getAlias(), U_GC_PC_MASK, status); 4520 } 4521 else if (propName.compare(u"javaLetter", -1) == 0) { 4522 addCategory(set.getAlias(), U_GC_L_MASK, status); 4523 } 4524 else if (propName.compare(u"javaLetterOrDigit", -1) == 0) { 4525 addCategory(set.getAlias(), U_GC_L_MASK, status); 4526 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4527 } 4528 else if (propName.compare(u"javaLowerCase", -1) == 0) { 4529 addCategory(set.getAlias(), U_GC_LL_MASK, status); 4530 } 4531 else if (propName.compare(u"javaMirrored", -1) == 0) { 4532 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status); 4533 } 4534 else if (propName.compare(u"javaSpaceChar", -1) == 0) { 4535 addCategory(set.getAlias(), U_GC_Z_MASK, status); 4536 } 4537 else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) { 4538 set->add(0x10000, UnicodeSet::MAX_VALUE); 4539 } 4540 else if (propName.compare(u"javaTitleCase", -1) == 0) { 4541 addCategory(set.getAlias(), U_GC_LT_MASK, status); 4542 } 4543 else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) { 4544 addCategory(set.getAlias(), U_GC_L_MASK, status); 4545 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4546 } 4547 else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) { 4548 addCategory(set.getAlias(), U_GC_L_MASK, status); 4549 addCategory(set.getAlias(), U_GC_PC_MASK, status); 4550 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4551 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4552 addCategory(set.getAlias(), U_GC_MC_MASK, status); 4553 addCategory(set.getAlias(), U_GC_MN_MASK, status); 4554 addIdentifierIgnorable(set.getAlias(), status); 4555 } 4556 else if (propName.compare(u"javaUpperCase", -1) == 0) { 4557 addCategory(set.getAlias(), U_GC_LU_MASK, status); 4558 } 4559 else if (propName.compare(u"javaValidCodePoint", -1) == 0) { 4560 set->add(0, UnicodeSet::MAX_VALUE); 4561 } 4562 else if (propName.compare(u"javaWhitespace", -1) == 0) { 4563 addCategory(set.getAlias(), U_GC_Z_MASK, status); 4564 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 4565 set->add(9, 0x0d).add(0x1c, 0x1f); 4566 } else { 4567 status = U_REGEX_PROPERTY_SYNTAX; 4568 } 4569 4570 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { 4571 set->closeOver(USET_CASE_INSENSITIVE); 4572 } 4573 break; 4574 } 4575 4576 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility 4577 // extensions matched it. 4578 status = U_REGEX_PROPERTY_SYNTAX; 4579 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure. 4580 4581 if (U_SUCCESS(status)) { 4582 // ICU 70 adds emoji properties of strings, but as long as Java does not say how to 4583 // deal with properties of strings and character classes with strings, we ignore them. 4584 // Just in case something downstream might stumble over the strings, 4585 // we remove them from the set. 4586 // Note that when we support strings, the complement of a property (as with \P) 4587 // should be implemented as .complement().removeAllStrings() (code point complement). 4588 set->removeAllStrings(); 4589 U_ASSERT(set.isValid()); 4590 if (negated) { 4591 set->complement(); 4592 } 4593 return set.orphan(); 4594 } else { 4595 if (status == U_ILLEGAL_ARGUMENT_ERROR) { 4596 status = U_REGEX_PROPERTY_SYNTAX; 4597 } 4598 error(status); 4599 return nullptr; 4600 } 4601 } 4602 4603 4604 // 4605 // SetEval Part of the evaluation of [set expressions]. 4606 // Perform any pending (stacked) operations with precedence 4607 // equal or greater to that of the next operator encountered 4608 // in the expression. 4609 // 4610 void RegexCompile::setEval(int32_t nextOp) { 4611 UnicodeSet *rightOperand = nullptr; 4612 UnicodeSet *leftOperand = nullptr; 4613 for (;;) { 4614 U_ASSERT(fSetOpStack.empty()==false); 4615 int32_t pendingSetOperation = fSetOpStack.peeki(); 4616 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 4617 break; 4618 } 4619 fSetOpStack.popi(); 4620 U_ASSERT(fSetStack.empty() == false); 4621 rightOperand = static_cast<UnicodeSet*>(fSetStack.peek()); 4622 // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings 4623 // (see comments there). 4624 // We also do not yet support string literals in character classes, 4625 // so there should not be any strings. 4626 // Note that when we support strings, the complement of a set (as with ^ or \P) 4627 // should be implemented as .complement().removeAllStrings() (code point complement). 4628 U_ASSERT(!rightOperand->hasStrings()); 4629 switch (pendingSetOperation) { 4630 case setNegation: 4631 rightOperand->complement(); 4632 break; 4633 case setCaseClose: 4634 // TODO: need a simple close function. Ticket 6065 4635 rightOperand->closeOver(USET_CASE_INSENSITIVE); 4636 rightOperand->removeAllStrings(); 4637 break; 4638 case setDifference1: 4639 case setDifference2: 4640 fSetStack.pop(); 4641 leftOperand = static_cast<UnicodeSet*>(fSetStack.peek()); 4642 leftOperand->removeAll(*rightOperand); 4643 delete rightOperand; 4644 break; 4645 case setIntersection1: 4646 case setIntersection2: 4647 fSetStack.pop(); 4648 leftOperand = static_cast<UnicodeSet*>(fSetStack.peek()); 4649 leftOperand->retainAll(*rightOperand); 4650 delete rightOperand; 4651 break; 4652 case setUnion: 4653 fSetStack.pop(); 4654 leftOperand = static_cast<UnicodeSet*>(fSetStack.peek()); 4655 leftOperand->addAll(*rightOperand); 4656 delete rightOperand; 4657 break; 4658 default: 4659 UPRV_UNREACHABLE_EXIT; 4660 } 4661 } 4662 } 4663 4664 void RegexCompile::setPushOp(int32_t op) { 4665 setEval(op); 4666 fSetOpStack.push(op, *fStatus); 4667 LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus); 4668 fSetStack.push(lpSet.orphan(), *fStatus); 4669 } 4670 4671 U_NAMESPACE_END 4672 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS