rematch.cpp (224707B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ************************************************************************** 5 * Copyright (C) 2002-2016 International Business Machines Corporation 6 * and others. All rights reserved. 7 ************************************************************************** 8 */ 9 // 10 // file: rematch.cpp 11 // 12 // Contains the implementation of class RegexMatcher, 13 // which is one of the main API classes for the ICU regular expression package. 14 // 15 16 #include "unicode/utypes.h" 17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 18 19 #include "unicode/regex.h" 20 #include "unicode/uniset.h" 21 #include "unicode/uchar.h" 22 #include "unicode/ustring.h" 23 #include "unicode/rbbi.h" 24 #include "unicode/utf.h" 25 #include "unicode/utf16.h" 26 #include "uassert.h" 27 #include "cmemory.h" 28 #include "cstr.h" 29 #include "uvector.h" 30 #include "uvectr32.h" 31 #include "uvectr64.h" 32 #include "regeximp.h" 33 #include "regexst.h" 34 #include "regextxt.h" 35 #include "ucase.h" 36 37 // #include <malloc.h> // Needed for heapcheck testing 38 39 40 U_NAMESPACE_BEGIN 41 42 // Default limit for the size of the back track stack, to avoid system 43 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes. 44 // This value puts ICU's limits higher than most other regexp implementations, 45 // which use recursion rather than the heap, and take more storage per 46 // backtrack point. 47 // 48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 49 50 // Time limit counter constant. 51 // Time limits for expression evaluation are in terms of quanta of work by 52 // the engine, each of which is 10,000 state saves. 53 // This constant determines the number of state saves per tick. 54 static const int32_t TIMER_INITIAL_VALUE = 10000; 55 56 57 // Test for any of the Unicode line terminating characters. 
58 static inline UBool isLineTerminator(UChar32 c) { 59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { 60 return false; 61 } 62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; 63 } 64 65 //----------------------------------------------------------------------------- 66 // 67 // Constructor and Destructor 68 // 69 //----------------------------------------------------------------------------- 70 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 71 fDeferredStatus = U_ZERO_ERROR; 72 init(fDeferredStatus); 73 if (U_FAILURE(fDeferredStatus)) { 74 return; 75 } 76 if (pat==nullptr) { 77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 78 return; 79 } 80 fPattern = pat; 81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 82 } 83 84 85 86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 87 uint32_t flags, UErrorCode &status) { 88 init(status); 89 if (U_FAILURE(status)) { 90 return; 91 } 92 UParseError pe; 93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 94 fPattern = fPatternOwned; 95 96 UText inputText = UTEXT_INITIALIZER; 97 utext_openConstUnicodeString(&inputText, &input, &status); 98 init2(&inputText, status); 99 utext_close(&inputText); 100 101 fInputUniStrMaybeMutable = true; 102 } 103 104 105 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 106 uint32_t flags, UErrorCode &status) { 107 init(status); 108 if (U_FAILURE(status)) { 109 return; 110 } 111 UParseError pe; 112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 113 if (U_FAILURE(status)) { 114 return; 115 } 116 117 fPattern = fPatternOwned; 118 init2(input, status); 119 } 120 121 122 RegexMatcher::RegexMatcher(const UnicodeString ®exp, 123 uint32_t flags, UErrorCode &status) { 124 init(status); 125 if (U_FAILURE(status)) { 126 return; 127 } 128 UParseError pe; 129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 130 if (U_FAILURE(status)) { 131 return; 132 } 133 
fPattern = fPatternOwned; 134 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 135 } 136 137 RegexMatcher::RegexMatcher(UText *regexp, 138 uint32_t flags, UErrorCode &status) { 139 init(status); 140 if (U_FAILURE(status)) { 141 return; 142 } 143 UParseError pe; 144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 145 if (U_FAILURE(status)) { 146 return; 147 } 148 149 fPattern = fPatternOwned; 150 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 151 } 152 153 154 155 156 RegexMatcher::~RegexMatcher() { 157 delete fStack; 158 if (fData != fSmallData) { 159 uprv_free(fData); 160 fData = nullptr; 161 } 162 if (fPatternOwned) { 163 delete fPatternOwned; 164 fPatternOwned = nullptr; 165 fPattern = nullptr; 166 } 167 168 delete fInput; 169 if (fInputText) { 170 utext_close(fInputText); 171 } 172 if (fAltInputText) { 173 utext_close(fAltInputText); 174 } 175 176 #if UCONFIG_NO_BREAK_ITERATION==0 177 delete fWordBreakItr; 178 delete fGCBreakItr; 179 #endif 180 } 181 182 // 183 // init() common initialization for use by all constructors. 184 // Initialize all fields, get the object into a consistent state. 185 // This must be done even when the initial status shows an error, 186 // so that the object is initialized sufficiently well for the destructor 187 // to run safely. 
188 // 189 void RegexMatcher::init(UErrorCode &status) { 190 fPattern = nullptr; 191 fPatternOwned = nullptr; 192 fFrameSize = 0; 193 fRegionStart = 0; 194 fRegionLimit = 0; 195 fAnchorStart = 0; 196 fAnchorLimit = 0; 197 fLookStart = 0; 198 fLookLimit = 0; 199 fActiveStart = 0; 200 fActiveLimit = 0; 201 fTransparentBounds = false; 202 fAnchoringBounds = true; 203 fMatch = false; 204 fMatchStart = 0; 205 fMatchEnd = 0; 206 fLastMatchEnd = -1; 207 fAppendPosition = 0; 208 fHitEnd = false; 209 fRequireEnd = false; 210 fStack = nullptr; 211 fFrame = nullptr; 212 fTimeLimit = 0; 213 fTime = 0; 214 fTickCounter = 0; 215 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 216 fCallbackFn = nullptr; 217 fCallbackContext = nullptr; 218 fFindProgressCallbackFn = nullptr; 219 fFindProgressCallbackContext = nullptr; 220 fTraceDebug = false; 221 fDeferredStatus = status; 222 fData = fSmallData; 223 fWordBreakItr = nullptr; 224 fGCBreakItr = nullptr; 225 226 fStack = nullptr; 227 fInputText = nullptr; 228 fAltInputText = nullptr; 229 fInput = nullptr; 230 fInputLength = 0; 231 fInputUniStrMaybeMutable = false; 232 } 233 234 // 235 // init2() Common initialization for use by RegexMatcher constructors, part 2. 236 // This handles the common setup to be done after the Pattern is available. 
237 // 238 void RegexMatcher::init2(UText *input, UErrorCode &status) { 239 if (U_FAILURE(status)) { 240 fDeferredStatus = status; 241 return; 242 } 243 244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) { 245 fData = static_cast<int64_t*>(uprv_malloc(fPattern->fDataSize * sizeof(int64_t))); 246 if (fData == nullptr) { 247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 248 return; 249 } 250 } 251 252 fStack = new UVector64(status); 253 if (fStack == nullptr) { 254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 255 return; 256 } 257 258 reset(input); 259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 260 if (U_FAILURE(status)) { 261 fDeferredStatus = status; 262 return; 263 } 264 } 265 266 267 static const char16_t BACKSLASH = 0x5c; 268 static const char16_t DOLLARSIGN = 0x24; 269 static const char16_t LEFTBRACKET = 0x7b; 270 static const char16_t RIGHTBRACKET = 0x7d; 271 272 //-------------------------------------------------------------------------------- 273 // 274 // appendReplacement 275 // 276 //-------------------------------------------------------------------------------- 277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 278 const UnicodeString &replacement, 279 UErrorCode &status) { 280 UText replacementText = UTEXT_INITIALIZER; 281 282 utext_openConstUnicodeString(&replacementText, &replacement, &status); 283 if (U_SUCCESS(status)) { 284 UText resultText = UTEXT_INITIALIZER; 285 utext_openUnicodeString(&resultText, &dest, &status); 286 287 if (U_SUCCESS(status)) { 288 appendReplacement(&resultText, &replacementText, status); 289 utext_close(&resultText); 290 } 291 utext_close(&replacementText); 292 } 293 294 return *this; 295 } 296 297 // 298 // appendReplacement, UText mode 299 // 300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 301 UText *replacement, 302 UErrorCode &status) { 303 if (U_FAILURE(status)) { 304 return *this; 305 } 306 if (U_FAILURE(fDeferredStatus)) { 307 status = 
fDeferredStatus; 308 return *this; 309 } 310 if (fMatch == false) { 311 status = U_REGEX_INVALID_STATE; 312 return *this; 313 } 314 315 // Copy input string from the end of previous match to start of current match 316 int64_t destLen = utext_nativeLength(dest); 317 if (fMatchStart > fAppendPosition) { 318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 320 static_cast<int32_t>(fMatchStart - fAppendPosition), &status); 321 } else { 322 int32_t len16; 323 if (UTEXT_USES_U16(fInputText)) { 324 len16 = static_cast<int32_t>(fMatchStart - fAppendPosition); 325 } else { 326 UErrorCode lengthStatus = U_ZERO_ERROR; 327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, nullptr, 0, &lengthStatus); 328 } 329 char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16 + 1))); 330 if (inputChars == nullptr) { 331 status = U_MEMORY_ALLOCATION_ERROR; 332 return *this; 333 } 334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 336 uprv_free(inputChars); 337 } 338 } 339 fAppendPosition = fMatchEnd; 340 341 342 // scan the replacement text, looking for substitutions ($n) and \escapes. 343 // TODO: optimize this loop by efficiently scanning for '$' or '\', 344 // move entire ranges not containing substitutions. 345 UTEXT_SETNATIVEINDEX(replacement, 0); 346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) { 347 if (c == BACKSLASH) { 348 // Backslash Escape. Copy the following char out without further checks. 349 // Note: Surrogate pairs don't need any special handling 350 // The second half wont be a '$' or a '\', and 351 // will move to the dest normally on the next 352 // loop iteration. 
353 c = UTEXT_CURRENT32(replacement); 354 if (c == U_SENTINEL) { 355 break; 356 } 357 358 if (c==0x55/*U*/ || c==0x75/*u*/) { 359 // We have a \udddd or \Udddddddd escape sequence. 360 int32_t offset = 0; 361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 363 if (escapedChar != static_cast<UChar32>(0xFFFFFFFF)) { 364 if (U_IS_BMP(escapedChar)) { 365 char16_t c16 = static_cast<char16_t>(escapedChar); 366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 367 } else { 368 char16_t surrogate[2]; 369 surrogate[0] = U16_LEAD(escapedChar); 370 surrogate[1] = U16_TRAIL(escapedChar); 371 if (U_SUCCESS(status)) { 372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 373 } 374 } 375 // TODO: Report errors for mal-formed \u escapes? 376 // As this is, the original sequence is output, which may be OK. 377 if (context.lastOffset == offset) { 378 (void)UTEXT_PREVIOUS32(replacement); 379 } else if (context.lastOffset != offset-1) { 380 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 381 } 382 } 383 } else { 384 (void)UTEXT_NEXT32(replacement); 385 // Plain backslash escape. Just put out the escaped character. 386 if (U_IS_BMP(c)) { 387 char16_t c16 = static_cast<char16_t>(c); 388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 389 } else { 390 char16_t surrogate[2]; 391 surrogate[0] = U16_LEAD(c); 392 surrogate[1] = U16_TRAIL(c); 393 if (U_SUCCESS(status)) { 394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 395 } 396 } 397 } 398 } else if (c != DOLLARSIGN) { 399 // Normal char, not a $. Copy it out without further checks. 
400 if (U_IS_BMP(c)) { 401 char16_t c16 = static_cast<char16_t>(c); 402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 403 } else { 404 char16_t surrogate[2]; 405 surrogate[0] = U16_LEAD(c); 406 surrogate[1] = U16_TRAIL(c); 407 if (U_SUCCESS(status)) { 408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 409 } 410 } 411 } else { 412 // We've got a $. Pick up a capture group name or number if one follows. 413 // Consume digits so long as the resulting group number <= the number of 414 // number of capture groups in the pattern. 415 416 int32_t groupNum = 0; 417 int32_t numDigits = 0; 418 UChar32 nextChar = utext_current32(replacement); 419 if (nextChar == LEFTBRACKET) { 420 // Scan for a Named Capture Group, ${name}. 421 UnicodeString groupName; 422 utext_next32(replacement); 423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) { 424 nextChar = utext_next32(replacement); 425 if (nextChar == U_SENTINEL) { 426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z 428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z 429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9 430 groupName.append(nextChar); 431 } else if (nextChar == RIGHTBRACKET) { 432 groupNum = fPattern->fNamedCaptureMap ? 
uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0; 433 if (groupNum == 0) { 434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 435 } 436 } else { 437 // Character was something other than a name char or a closing '}' 438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 439 } 440 } 441 442 } else if (u_isdigit(nextChar)) { 443 // $n Scan for a capture group number 444 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 445 for (;;) { 446 nextChar = UTEXT_CURRENT32(replacement); 447 if (nextChar == U_SENTINEL) { 448 break; 449 } 450 if (u_isdigit(nextChar) == false) { 451 break; 452 } 453 int32_t nextDigitVal = u_charDigitValue(nextChar); 454 if (groupNum*10 + nextDigitVal > numCaptureGroups) { 455 // Don't consume the next digit if it makes the capture group number too big. 456 if (numDigits == 0) { 457 status = U_INDEX_OUTOFBOUNDS_ERROR; 458 } 459 break; 460 } 461 (void)UTEXT_NEXT32(replacement); 462 groupNum=groupNum*10 + nextDigitVal; 463 ++numDigits; 464 } 465 } else { 466 // $ not followed by capture group name or number. 467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; 468 } 469 470 if (U_SUCCESS(status)) { 471 destLen += appendGroup(groupNum, dest, status); 472 } 473 } // End of $ capture group handling 474 } // End of per-character loop through the replacement string. 475 476 return *this; 477 } 478 479 480 481 //-------------------------------------------------------------------------------- 482 // 483 // appendTail Intended to be used in conjunction with appendReplacement() 484 // To the destination string, append everything following 485 // the last match position from the input string. 
486 // 487 // Note: Match ranges do not affect appendTail or appendReplacement 488 // 489 //-------------------------------------------------------------------------------- 490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 491 UErrorCode status = U_ZERO_ERROR; 492 UText resultText = UTEXT_INITIALIZER; 493 utext_openUnicodeString(&resultText, &dest, &status); 494 495 if (U_SUCCESS(status)) { 496 appendTail(&resultText, status); 497 utext_close(&resultText); 498 } 499 500 return dest; 501 } 502 503 // 504 // appendTail, UText mode 505 // 506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 507 if (U_FAILURE(status)) { 508 return dest; 509 } 510 if (U_FAILURE(fDeferredStatus)) { 511 status = fDeferredStatus; 512 return dest; 513 } 514 515 if (fInputLength > fAppendPosition) { 516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 517 int64_t destLen = utext_nativeLength(dest); 518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 519 static_cast<int32_t>(fInputLength - fAppendPosition), &status); 520 } else { 521 int32_t len16; 522 if (UTEXT_USES_U16(fInputText)) { 523 len16 = static_cast<int32_t>(fInputLength - fAppendPosition); 524 } else { 525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, nullptr, 0, &status); 526 status = U_ZERO_ERROR; // buffer overflow 527 } 528 529 char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16))); 530 if (inputChars == nullptr) { 531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 532 } else { 533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 534 int64_t destLen = utext_nativeLength(dest); 535 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 536 uprv_free(inputChars); 537 } 538 } 539 } 540 return dest; 541 } 542 543 544 545 //-------------------------------------------------------------------------------- 546 // 547 // end 548 // 549 
//-------------------------------------------------------------------------------- 550 int32_t RegexMatcher::end(UErrorCode &err) const { 551 return end(0, err); 552 } 553 554 int64_t RegexMatcher::end64(UErrorCode &err) const { 555 return end64(0, err); 556 } 557 558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 559 if (U_FAILURE(err)) { 560 return -1; 561 } 562 if (fMatch == false) { 563 err = U_REGEX_INVALID_STATE; 564 return -1; 565 } 566 if (group < 0 || group > fPattern->fGroupMap->size()) { 567 err = U_INDEX_OUTOFBOUNDS_ERROR; 568 return -1; 569 } 570 int64_t e = -1; 571 if (group == 0) { 572 e = fMatchEnd; 573 } else { 574 // Get the position within the stack frame of the variables for 575 // this capture group. 576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 577 U_ASSERT(groupOffset < fPattern->fFrameSize); 578 U_ASSERT(groupOffset >= 0); 579 e = fFrame->fExtra[groupOffset + 1]; 580 } 581 582 return e; 583 } 584 585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 586 return static_cast<int32_t>(end64(group, err)); 587 } 588 589 //-------------------------------------------------------------------------------- 590 // 591 // findProgressInterrupt This function is called once for each advance in the target 592 // string from the find() function, and calls the user progress callback 593 // function if there is one installed. 594 // 595 // Return: true if the find operation is to be terminated. 596 // false if the find operation is to continue running. 
597 // 598 //-------------------------------------------------------------------------------- 599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) { 600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) { 601 status = U_REGEX_STOPPED_BY_CALLER; 602 return true; 603 } 604 return false; 605 } 606 607 //-------------------------------------------------------------------------------- 608 // 609 // find() 610 // 611 //-------------------------------------------------------------------------------- 612 UBool RegexMatcher::find() { 613 if (U_FAILURE(fDeferredStatus)) { 614 return false; 615 } 616 UErrorCode status = U_ZERO_ERROR; 617 UBool result = find(status); 618 return result; 619 } 620 621 //-------------------------------------------------------------------------------- 622 // 623 // find() 624 // 625 //-------------------------------------------------------------------------------- 626 UBool RegexMatcher::find(UErrorCode &status) { 627 // Start at the position of the last match end. (Will be zero if the 628 // matcher has been reset.) 629 // 630 if (U_FAILURE(status)) { 631 return false; 632 } 633 if (U_FAILURE(fDeferredStatus)) { 634 status = fDeferredStatus; 635 return false; 636 } 637 638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 639 return findUsingChunk(status); 640 } 641 642 int64_t startPos = fMatchEnd; 643 if (startPos==0) { 644 startPos = fActiveStart; 645 } 646 647 if (fMatch) { 648 // Save the position of any previous successful match. 649 fLastMatchEnd = fMatchEnd; 650 651 if (fMatchStart == fMatchEnd) { 652 // Previous match had zero length. Move start position up one position 653 // to avoid sending find() into a loop on zero-length matches. 
654 if (startPos >= fActiveLimit) { 655 fMatch = false; 656 fHitEnd = true; 657 return false; 658 } 659 UTEXT_SETNATIVEINDEX(fInputText, startPos); 660 (void)UTEXT_NEXT32(fInputText); 661 startPos = UTEXT_GETNATIVEINDEX(fInputText); 662 } 663 } else { 664 if (fLastMatchEnd >= 0) { 665 // A previous find() failed to match. Don't try again. 666 // (without this test, a pattern with a zero-length match 667 // could match again at the end of an input string.) 668 fHitEnd = true; 669 return false; 670 } 671 } 672 673 674 // Compute the position in the input string beyond which a match can not begin, because 675 // the minimum length match would extend past the end of the input. 676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 677 // Be aware of possible overflows if making changes here. 678 int64_t testStartLimit; 679 if (UTEXT_USES_U16(fInputText)) { 680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 681 if (startPos > testStartLimit) { 682 fMatch = false; 683 fHitEnd = true; 684 return false; 685 } 686 } else { 687 // We don't know exactly how long the minimum match length is in native characters. 688 // Treat anything > 0 as 1. 689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0); 690 } 691 692 UChar32 c; 693 U_ASSERT(startPos >= 0); 694 695 switch (fPattern->fStartType) { 696 case START_NO_INFO: 697 // No optimization was found. 698 // Try a match at each input position. 
699 for (;;) { 700 MatchAt(startPos, false, status); 701 if (U_FAILURE(status)) { 702 return false; 703 } 704 if (fMatch) { 705 return true; 706 } 707 if (startPos >= testStartLimit) { 708 fHitEnd = true; 709 return false; 710 } 711 UTEXT_SETNATIVEINDEX(fInputText, startPos); 712 (void)UTEXT_NEXT32(fInputText); 713 startPos = UTEXT_GETNATIVEINDEX(fInputText); 714 // Note that it's perfectly OK for a pattern to have a zero-length 715 // match at the end of a string, so we must make sure that the loop 716 // runs with startPos == testStartLimit the last time through. 717 if (findProgressInterrupt(startPos, status)) 718 return false; 719 } 720 UPRV_UNREACHABLE_EXIT; 721 722 case START_START: 723 // Matches are only possible at the start of the input string 724 // (pattern begins with ^ or \A) 725 if (startPos > fActiveStart) { 726 fMatch = false; 727 return false; 728 } 729 MatchAt(startPos, false, status); 730 if (U_FAILURE(status)) { 731 return false; 732 } 733 return fMatch; 734 735 736 case START_SET: 737 { 738 // Match may start on any char from a pre-computed set. 739 U_ASSERT(fPattern->fMinMatchLen > 0); 740 UTEXT_SETNATIVEINDEX(fInputText, startPos); 741 for (;;) { 742 int64_t pos = startPos; 743 c = UTEXT_NEXT32(fInputText); 744 startPos = UTEXT_GETNATIVEINDEX(fInputText); 745 // c will be -1 (U_SENTINEL) at end of text, in which case we 746 // skip this next block (so we don't have a negative array index) 747 // and handle end of text in the following block. 
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 749 (c>=256 && fPattern->fInitialChars->contains(c)))) { 750 MatchAt(pos, false, status); 751 if (U_FAILURE(status)) { 752 return false; 753 } 754 if (fMatch) { 755 return true; 756 } 757 UTEXT_SETNATIVEINDEX(fInputText, pos); 758 } 759 if (startPos > testStartLimit) { 760 fMatch = false; 761 fHitEnd = true; 762 return false; 763 } 764 if (findProgressInterrupt(startPos, status)) 765 return false; 766 } 767 } 768 UPRV_UNREACHABLE_EXIT; 769 770 case START_STRING: 771 case START_CHAR: 772 { 773 // Match starts on exactly one char. 774 U_ASSERT(fPattern->fMinMatchLen > 0); 775 UChar32 theChar = fPattern->fInitialChar; 776 UTEXT_SETNATIVEINDEX(fInputText, startPos); 777 for (;;) { 778 int64_t pos = startPos; 779 c = UTEXT_NEXT32(fInputText); 780 startPos = UTEXT_GETNATIVEINDEX(fInputText); 781 if (c == theChar) { 782 MatchAt(pos, false, status); 783 if (U_FAILURE(status)) { 784 return false; 785 } 786 if (fMatch) { 787 return true; 788 } 789 UTEXT_SETNATIVEINDEX(fInputText, startPos); 790 } 791 if (startPos > testStartLimit) { 792 fMatch = false; 793 fHitEnd = true; 794 return false; 795 } 796 if (findProgressInterrupt(startPos, status)) 797 return false; 798 } 799 } 800 UPRV_UNREACHABLE_EXIT; 801 802 case START_LINE: 803 { 804 UChar32 ch; 805 if (startPos == fAnchorStart) { 806 MatchAt(startPos, false, status); 807 if (U_FAILURE(status)) { 808 return false; 809 } 810 if (fMatch) { 811 return true; 812 } 813 UTEXT_SETNATIVEINDEX(fInputText, startPos); 814 ch = UTEXT_NEXT32(fInputText); 815 startPos = UTEXT_GETNATIVEINDEX(fInputText); 816 } else { 817 UTEXT_SETNATIVEINDEX(fInputText, startPos); 818 ch = UTEXT_PREVIOUS32(fInputText); 819 UTEXT_SETNATIVEINDEX(fInputText, startPos); 820 } 821 822 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 823 for (;;) { 824 if (ch == 0x0a) { 825 MatchAt(startPos, false, status); 826 if (U_FAILURE(status)) { 827 return false; 828 } 829 if (fMatch) { 830 return true; 831 
} 832 UTEXT_SETNATIVEINDEX(fInputText, startPos); 833 } 834 if (startPos >= testStartLimit) { 835 fMatch = false; 836 fHitEnd = true; 837 return false; 838 } 839 ch = UTEXT_NEXT32(fInputText); 840 startPos = UTEXT_GETNATIVEINDEX(fInputText); 841 // Note that it's perfectly OK for a pattern to have a zero-length 842 // match at the end of a string, so we must make sure that the loop 843 // runs with startPos == testStartLimit the last time through. 844 if (findProgressInterrupt(startPos, status)) 845 return false; 846 } 847 } else { 848 for (;;) { 849 if (isLineTerminator(ch)) { 850 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 851 (void)UTEXT_NEXT32(fInputText); 852 startPos = UTEXT_GETNATIVEINDEX(fInputText); 853 } 854 MatchAt(startPos, false, status); 855 if (U_FAILURE(status)) { 856 return false; 857 } 858 if (fMatch) { 859 return true; 860 } 861 UTEXT_SETNATIVEINDEX(fInputText, startPos); 862 } 863 if (startPos >= testStartLimit) { 864 fMatch = false; 865 fHitEnd = true; 866 return false; 867 } 868 ch = UTEXT_NEXT32(fInputText); 869 startPos = UTEXT_GETNATIVEINDEX(fInputText); 870 // Note that it's perfectly OK for a pattern to have a zero-length 871 // match at the end of a string, so we must make sure that the loop 872 // runs with startPos == testStartLimit the last time through. 873 if (findProgressInterrupt(startPos, status)) 874 return false; 875 } 876 } 877 } 878 879 default: 880 UPRV_UNREACHABLE_ASSERT; 881 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But 882 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. 883 // See ICU-21669. 
884 status = U_INTERNAL_PROGRAM_ERROR; 885 return false; 886 } 887 888 UPRV_UNREACHABLE_EXIT; 889 } 890 891 892 893 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 894 if (U_FAILURE(status)) { 895 return false; 896 } 897 if (U_FAILURE(fDeferredStatus)) { 898 status = fDeferredStatus; 899 return false; 900 } 901 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 902 // This will reset the region to be the full input length. 903 if (start < 0) { 904 status = U_INDEX_OUTOFBOUNDS_ERROR; 905 return false; 906 } 907 908 int64_t nativeStart = start; 909 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 910 status = U_INDEX_OUTOFBOUNDS_ERROR; 911 return false; 912 } 913 fMatchEnd = nativeStart; 914 return find(status); 915 } 916 917 918 //-------------------------------------------------------------------------------- 919 // 920 // findUsingChunk() -- like find(), but with the advance knowledge that the 921 // entire string is available in the UText's chunk buffer. 922 // 923 //-------------------------------------------------------------------------------- 924 UBool RegexMatcher::findUsingChunk(UErrorCode &status) { 925 // Start at the position of the last match end. (Will be zero if the 926 // matcher has been reset. 927 // 928 929 int32_t startPos = static_cast<int32_t>(fMatchEnd); 930 if (startPos==0) { 931 startPos = static_cast<int32_t>(fActiveStart); 932 } 933 934 const char16_t *inputBuf = fInputText->chunkContents; 935 936 if (fMatch) { 937 // Save the position of any previous successful match. 938 fLastMatchEnd = fMatchEnd; 939 940 if (fMatchStart == fMatchEnd) { 941 // Previous match had zero length. Move start position up one position 942 // to avoid sending find() into a loop on zero-length matches. 
943 if (startPos >= fActiveLimit) { 944 fMatch = false; 945 fHitEnd = true; 946 return false; 947 } 948 U16_FWD_1(inputBuf, startPos, fInputLength); 949 } 950 } else { 951 if (fLastMatchEnd >= 0) { 952 // A previous find() failed to match. Don't try again. 953 // (without this test, a pattern with a zero-length match 954 // could match again at the end of an input string.) 955 fHitEnd = true; 956 return false; 957 } 958 } 959 960 961 // Compute the position in the input string beyond which a match can not begin, because 962 // the minimum length match would extend past the end of the input. 963 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 964 // Be aware of possible overflows if making changes here. 965 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit. 966 int32_t testLen = static_cast<int32_t>(fActiveLimit - fPattern->fMinMatchLen); 967 if (startPos > testLen) { 968 fMatch = false; 969 fHitEnd = true; 970 return false; 971 } 972 973 UChar32 c; 974 U_ASSERT(startPos >= 0); 975 976 switch (fPattern->fStartType) { 977 case START_NO_INFO: 978 // No optimization was found. 979 // Try a match at each input position. 980 for (;;) { 981 MatchChunkAt(startPos, false, status); 982 if (U_FAILURE(status)) { 983 return false; 984 } 985 if (fMatch) { 986 return true; 987 } 988 if (startPos >= testLen) { 989 fHitEnd = true; 990 return false; 991 } 992 U16_FWD_1(inputBuf, startPos, fActiveLimit); 993 // Note that it's perfectly OK for a pattern to have a zero-length 994 // match at the end of a string, so we must make sure that the loop 995 // runs with startPos == testLen the last time through. 
996 if (findProgressInterrupt(startPos, status)) 997 return false; 998 } 999 UPRV_UNREACHABLE_EXIT; 1000 1001 case START_START: 1002 // Matches are only possible at the start of the input string 1003 // (pattern begins with ^ or \A) 1004 if (startPos > fActiveStart) { 1005 fMatch = false; 1006 return false; 1007 } 1008 MatchChunkAt(startPos, false, status); 1009 if (U_FAILURE(status)) { 1010 return false; 1011 } 1012 return fMatch; 1013 1014 1015 case START_SET: 1016 { 1017 // Match may start on any char from a pre-computed set. 1018 U_ASSERT(fPattern->fMinMatchLen > 0); 1019 for (;;) { 1020 int32_t pos = startPos; 1021 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1022 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 1023 (c>=256 && fPattern->fInitialChars->contains(c))) { 1024 MatchChunkAt(pos, false, status); 1025 if (U_FAILURE(status)) { 1026 return false; 1027 } 1028 if (fMatch) { 1029 return true; 1030 } 1031 } 1032 if (startPos > testLen) { 1033 fMatch = false; 1034 fHitEnd = true; 1035 return false; 1036 } 1037 if (findProgressInterrupt(startPos, status)) 1038 return false; 1039 } 1040 } 1041 UPRV_UNREACHABLE_EXIT; 1042 1043 case START_STRING: 1044 case START_CHAR: 1045 { 1046 // Match starts on exactly one char. 
1047 U_ASSERT(fPattern->fMinMatchLen > 0); 1048 UChar32 theChar = fPattern->fInitialChar; 1049 for (;;) { 1050 int32_t pos = startPos; 1051 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1052 if (c == theChar) { 1053 MatchChunkAt(pos, false, status); 1054 if (U_FAILURE(status)) { 1055 return false; 1056 } 1057 if (fMatch) { 1058 return true; 1059 } 1060 } 1061 if (startPos > testLen) { 1062 fMatch = false; 1063 fHitEnd = true; 1064 return false; 1065 } 1066 if (findProgressInterrupt(startPos, status)) 1067 return false; 1068 } 1069 } 1070 UPRV_UNREACHABLE_EXIT; 1071 1072 case START_LINE: 1073 { 1074 UChar32 ch; 1075 if (startPos == fAnchorStart) { 1076 MatchChunkAt(startPos, false, status); 1077 if (U_FAILURE(status)) { 1078 return false; 1079 } 1080 if (fMatch) { 1081 return true; 1082 } 1083 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1084 } 1085 1086 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1087 for (;;) { 1088 ch = inputBuf[startPos-1]; 1089 if (ch == 0x0a) { 1090 MatchChunkAt(startPos, false, status); 1091 if (U_FAILURE(status)) { 1092 return false; 1093 } 1094 if (fMatch) { 1095 return true; 1096 } 1097 } 1098 if (startPos >= testLen) { 1099 fMatch = false; 1100 fHitEnd = true; 1101 return false; 1102 } 1103 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1104 // Note that it's perfectly OK for a pattern to have a zero-length 1105 // match at the end of a string, so we must make sure that the loop 1106 // runs with startPos == testLen the last time through. 
1107 if (findProgressInterrupt(startPos, status)) 1108 return false; 1109 } 1110 } else { 1111 for (;;) { 1112 ch = inputBuf[startPos-1]; 1113 if (isLineTerminator(ch)) { 1114 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1115 startPos++; 1116 } 1117 MatchChunkAt(startPos, false, status); 1118 if (U_FAILURE(status)) { 1119 return false; 1120 } 1121 if (fMatch) { 1122 return true; 1123 } 1124 } 1125 if (startPos >= testLen) { 1126 fMatch = false; 1127 fHitEnd = true; 1128 return false; 1129 } 1130 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1131 // Note that it's perfectly OK for a pattern to have a zero-length 1132 // match at the end of a string, so we must make sure that the loop 1133 // runs with startPos == testLen the last time through. 1134 if (findProgressInterrupt(startPos, status)) 1135 return false; 1136 } 1137 } 1138 } 1139 1140 default: 1141 UPRV_UNREACHABLE_ASSERT; 1142 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But 1143 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. 1144 // See ICU-21669. 
1145 status = U_INTERNAL_PROGRAM_ERROR; 1146 return false; 1147 } 1148 1149 UPRV_UNREACHABLE_EXIT; 1150 } 1151 1152 1153 1154 //-------------------------------------------------------------------------------- 1155 // 1156 // group() 1157 // 1158 //-------------------------------------------------------------------------------- 1159 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1160 return group(0, status); 1161 } 1162 1163 // Return immutable shallow clone 1164 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1165 return group(0, dest, group_len, status); 1166 } 1167 1168 // Return immutable shallow clone 1169 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1170 group_len = 0; 1171 if (U_FAILURE(status)) { 1172 return dest; 1173 } 1174 if (U_FAILURE(fDeferredStatus)) { 1175 status = fDeferredStatus; 1176 } else if (fMatch == false) { 1177 status = U_REGEX_INVALID_STATE; 1178 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1179 status = U_INDEX_OUTOFBOUNDS_ERROR; 1180 } 1181 1182 if (U_FAILURE(status)) { 1183 return dest; 1184 } 1185 1186 int64_t s, e; 1187 if (groupNum == 0) { 1188 s = fMatchStart; 1189 e = fMatchEnd; 1190 } else { 1191 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1192 U_ASSERT(groupOffset < fPattern->fFrameSize); 1193 U_ASSERT(groupOffset >= 0); 1194 s = fFrame->fExtra[groupOffset]; 1195 e = fFrame->fExtra[groupOffset+1]; 1196 } 1197 1198 if (s < 0) { 1199 // A capture group wasn't part of the match 1200 return utext_clone(dest, fInputText, false, true, &status); 1201 } 1202 U_ASSERT(s <= e); 1203 group_len = e - s; 1204 1205 dest = utext_clone(dest, fInputText, false, true, &status); 1206 if (dest) 1207 UTEXT_SETNATIVEINDEX(dest, s); 1208 return dest; 1209 } 1210 1211 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1212 UnicodeString result; 1213 int64_t 
groupStart = start64(groupNum, status); 1214 int64_t groupEnd = end64(groupNum, status); 1215 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { 1216 return result; 1217 } 1218 1219 // Get the group length using a utext_extract preflight. 1220 // UText is actually pretty efficient at this when underlying encoding is UTF-16. 1221 UErrorCode bufferStatus = U_ZERO_ERROR; 1222 int32_t length = utext_extract(fInputText, groupStart, groupEnd, nullptr, 0, &bufferStatus); 1223 if (bufferStatus != U_BUFFER_OVERFLOW_ERROR) { 1224 if (U_FAILURE(bufferStatus)) { 1225 status = bufferStatus; 1226 } 1227 return result; 1228 } 1229 1230 char16_t *buf = result.getBuffer(length); 1231 if (buf == nullptr) { 1232 status = U_MEMORY_ALLOCATION_ERROR; 1233 } else { 1234 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); 1235 result.releaseBuffer(extractLength); 1236 U_ASSERT(length == extractLength); 1237 } 1238 return result; 1239 } 1240 1241 1242 //-------------------------------------------------------------------------------- 1243 // 1244 // appendGroup() -- currently internal only, appends a group to a UText rather 1245 // than replacing its contents 1246 // 1247 //-------------------------------------------------------------------------------- 1248 1249 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1250 if (U_FAILURE(status)) { 1251 return 0; 1252 } 1253 if (U_FAILURE(fDeferredStatus)) { 1254 status = fDeferredStatus; 1255 return 0; 1256 } 1257 int64_t destLen = utext_nativeLength(dest); 1258 1259 if (fMatch == false) { 1260 status = U_REGEX_INVALID_STATE; 1261 return utext_replace(dest, destLen, destLen, nullptr, 0, &status); 1262 } 1263 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1264 status = U_INDEX_OUTOFBOUNDS_ERROR; 1265 return utext_replace(dest, destLen, destLen, nullptr, 0, &status); 1266 } 1267 1268 int64_t s, e; 1269 if (groupNum == 0) { 1270 s 
= fMatchStart; 1271 e = fMatchEnd; 1272 } else { 1273 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1274 U_ASSERT(groupOffset < fPattern->fFrameSize); 1275 U_ASSERT(groupOffset >= 0); 1276 s = fFrame->fExtra[groupOffset]; 1277 e = fFrame->fExtra[groupOffset+1]; 1278 } 1279 1280 if (s < 0) { 1281 // A capture group wasn't part of the match 1282 return utext_replace(dest, destLen, destLen, nullptr, 0, &status); 1283 } 1284 U_ASSERT(s <= e); 1285 1286 int64_t deltaLen; 1287 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1288 U_ASSERT(e <= fInputLength); 1289 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents + s, static_cast<int32_t>(e - s), &status); 1290 } else { 1291 int32_t len16; 1292 if (UTEXT_USES_U16(fInputText)) { 1293 len16 = static_cast<int32_t>(e - s); 1294 } else { 1295 UErrorCode lengthStatus = U_ZERO_ERROR; 1296 len16 = utext_extract(fInputText, s, e, nullptr, 0, &lengthStatus); 1297 } 1298 char16_t* groupChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16 + 1))); 1299 if (groupChars == nullptr) { 1300 status = U_MEMORY_ALLOCATION_ERROR; 1301 return 0; 1302 } 1303 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1304 1305 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1306 uprv_free(groupChars); 1307 } 1308 return deltaLen; 1309 } 1310 1311 1312 1313 //-------------------------------------------------------------------------------- 1314 // 1315 // groupCount() 1316 // 1317 //-------------------------------------------------------------------------------- 1318 int32_t RegexMatcher::groupCount() const { 1319 return fPattern->fGroupMap->size(); 1320 } 1321 1322 //-------------------------------------------------------------------------------- 1323 // 1324 // hasAnchoringBounds() 1325 // 1326 //-------------------------------------------------------------------------------- 1327 UBool RegexMatcher::hasAnchoringBounds() const { 1328 return 
fAnchoringBounds; 1329 } 1330 1331 1332 //-------------------------------------------------------------------------------- 1333 // 1334 // hasTransparentBounds() 1335 // 1336 //-------------------------------------------------------------------------------- 1337 UBool RegexMatcher::hasTransparentBounds() const { 1338 return fTransparentBounds; 1339 } 1340 1341 1342 1343 //-------------------------------------------------------------------------------- 1344 // 1345 // hitEnd() 1346 // 1347 //-------------------------------------------------------------------------------- 1348 UBool RegexMatcher::hitEnd() const { 1349 return fHitEnd; 1350 } 1351 1352 1353 //-------------------------------------------------------------------------------- 1354 // 1355 // input() 1356 // 1357 //-------------------------------------------------------------------------------- 1358 const UnicodeString &RegexMatcher::input() const { 1359 if (!fInput) { 1360 UErrorCode status = U_ZERO_ERROR; 1361 int32_t len16; 1362 if (UTEXT_USES_U16(fInputText)) { 1363 len16 = static_cast<int32_t>(fInputLength); 1364 } else { 1365 len16 = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &status); 1366 status = U_ZERO_ERROR; // overflow, length status 1367 } 1368 UnicodeString *result = new UnicodeString(len16, 0, 0); 1369 1370 char16_t *inputChars = result->getBuffer(len16); 1371 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1372 result->releaseBuffer(len16); 1373 1374 *const_cast<const UnicodeString**>(&fInput) = result; // pointer assignment, rather than operator= 1375 } 1376 1377 return *fInput; 1378 } 1379 1380 //-------------------------------------------------------------------------------- 1381 // 1382 // inputText() 1383 // 1384 //-------------------------------------------------------------------------------- 1385 UText *RegexMatcher::inputText() const { 1386 return fInputText; 1387 } 1388 1389 1390 
//-------------------------------------------------------------------------------- 1391 // 1392 // getInput() -- like inputText(), but makes a clone or copies into another UText 1393 // 1394 //-------------------------------------------------------------------------------- 1395 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1396 if (U_FAILURE(status)) { 1397 return dest; 1398 } 1399 if (U_FAILURE(fDeferredStatus)) { 1400 status = fDeferredStatus; 1401 return dest; 1402 } 1403 1404 if (dest) { 1405 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1406 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, static_cast<int32_t>(fInputLength), &status); 1407 } else { 1408 int32_t input16Len; 1409 if (UTEXT_USES_U16(fInputText)) { 1410 input16Len = static_cast<int32_t>(fInputLength); 1411 } else { 1412 UErrorCode lengthStatus = U_ZERO_ERROR; 1413 input16Len = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &lengthStatus); // buffer overflow error 1414 } 1415 char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (input16Len))); 1416 if (inputChars == nullptr) { 1417 return dest; 1418 } 1419 1420 status = U_ZERO_ERROR; 1421 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1422 status = U_ZERO_ERROR; 1423 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1424 1425 uprv_free(inputChars); 1426 } 1427 return dest; 1428 } else { 1429 return utext_clone(nullptr, fInputText, false, true, &status); 1430 } 1431 } 1432 1433 1434 static UBool compat_SyncMutableUTextContents(UText *ut); 1435 static UBool compat_SyncMutableUTextContents(UText *ut) { 1436 UBool retVal = false; 1437 1438 // In the following test, we're really only interested in whether the UText should switch 1439 // between heap and stack allocation. 
If length hasn't changed, we won't, so the chunkContents 1440 // will still point to the correct data. 1441 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1442 UnicodeString *us=(UnicodeString *)ut->context; 1443 1444 // Update to the latest length. 1445 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1446 int32_t newLength = us->length(); 1447 1448 // Update the chunk description. 1449 // The buffer may have switched between stack- and heap-based. 1450 ut->chunkContents = us->getBuffer(); 1451 ut->chunkLength = newLength; 1452 ut->chunkNativeLimit = newLength; 1453 ut->nativeIndexingLimit = newLength; 1454 retVal = true; 1455 } 1456 1457 return retVal; 1458 } 1459 1460 //-------------------------------------------------------------------------------- 1461 // 1462 // lookingAt() 1463 // 1464 //-------------------------------------------------------------------------------- 1465 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1466 if (U_FAILURE(status)) { 1467 return false; 1468 } 1469 if (U_FAILURE(fDeferredStatus)) { 1470 status = fDeferredStatus; 1471 return false; 1472 } 1473 1474 if (fInputUniStrMaybeMutable) { 1475 if (compat_SyncMutableUTextContents(fInputText)) { 1476 fInputLength = utext_nativeLength(fInputText); 1477 reset(); 1478 } 1479 } 1480 else { 1481 resetPreserveRegion(); 1482 } 1483 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1484 MatchChunkAt(static_cast<int32_t>(fActiveStart), false, status); 1485 } else { 1486 MatchAt(fActiveStart, false, status); 1487 } 1488 return fMatch; 1489 } 1490 1491 1492 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1493 if (U_FAILURE(status)) { 1494 return false; 1495 } 1496 if (U_FAILURE(fDeferredStatus)) { 1497 status = fDeferredStatus; 1498 return false; 1499 } 1500 reset(); 1501 1502 if (start < 0) { 1503 status = U_INDEX_OUTOFBOUNDS_ERROR; 1504 return false; 1505 } 1506 1507 if (fInputUniStrMaybeMutable) { 1508 if 
(compat_SyncMutableUTextContents(fInputText)) { 1509 fInputLength = utext_nativeLength(fInputText); 1510 reset(); 1511 } 1512 } 1513 1514 int64_t nativeStart; 1515 nativeStart = start; 1516 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1517 status = U_INDEX_OUTOFBOUNDS_ERROR; 1518 return false; 1519 } 1520 1521 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1522 MatchChunkAt(static_cast<int32_t>(nativeStart), false, status); 1523 } else { 1524 MatchAt(nativeStart, false, status); 1525 } 1526 return fMatch; 1527 } 1528 1529 1530 1531 //-------------------------------------------------------------------------------- 1532 // 1533 // matches() 1534 // 1535 //-------------------------------------------------------------------------------- 1536 UBool RegexMatcher::matches(UErrorCode &status) { 1537 if (U_FAILURE(status)) { 1538 return false; 1539 } 1540 if (U_FAILURE(fDeferredStatus)) { 1541 status = fDeferredStatus; 1542 return false; 1543 } 1544 1545 if (fInputUniStrMaybeMutable) { 1546 if (compat_SyncMutableUTextContents(fInputText)) { 1547 fInputLength = utext_nativeLength(fInputText); 1548 reset(); 1549 } 1550 } 1551 else { 1552 resetPreserveRegion(); 1553 } 1554 1555 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1556 MatchChunkAt(static_cast<int32_t>(fActiveStart), true, status); 1557 } else { 1558 MatchAt(fActiveStart, true, status); 1559 } 1560 return fMatch; 1561 } 1562 1563 1564 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1565 if (U_FAILURE(status)) { 1566 return false; 1567 } 1568 if (U_FAILURE(fDeferredStatus)) { 1569 status = fDeferredStatus; 1570 return false; 1571 } 1572 reset(); 1573 1574 if (start < 0) { 1575 status = U_INDEX_OUTOFBOUNDS_ERROR; 1576 return false; 1577 } 1578 1579 if (fInputUniStrMaybeMutable) { 1580 if (compat_SyncMutableUTextContents(fInputText)) { 1581 fInputLength = utext_nativeLength(fInputText); 1582 reset(); 1583 } 1584 } 1585 1586 int64_t nativeStart; 1587 nativeStart = 
start; 1588 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1589 status = U_INDEX_OUTOFBOUNDS_ERROR; 1590 return false; 1591 } 1592 1593 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1594 MatchChunkAt(static_cast<int32_t>(nativeStart), true, status); 1595 } else { 1596 MatchAt(nativeStart, true, status); 1597 } 1598 return fMatch; 1599 } 1600 1601 1602 1603 //-------------------------------------------------------------------------------- 1604 // 1605 // pattern 1606 // 1607 //-------------------------------------------------------------------------------- 1608 const RegexPattern &RegexMatcher::pattern() const { 1609 return *fPattern; 1610 } 1611 1612 1613 1614 //-------------------------------------------------------------------------------- 1615 // 1616 // region 1617 // 1618 //-------------------------------------------------------------------------------- 1619 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1620 if (U_FAILURE(status)) { 1621 return *this; 1622 } 1623 1624 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1625 status = U_ILLEGAL_ARGUMENT_ERROR; 1626 } 1627 1628 int64_t nativeStart = regionStart; 1629 int64_t nativeLimit = regionLimit; 1630 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1631 status = U_ILLEGAL_ARGUMENT_ERROR; 1632 } 1633 1634 if (startIndex == -1) 1635 this->reset(); 1636 else 1637 resetPreserveRegion(); 1638 1639 fRegionStart = nativeStart; 1640 fRegionLimit = nativeLimit; 1641 fActiveStart = nativeStart; 1642 fActiveLimit = nativeLimit; 1643 1644 if (startIndex != -1) { 1645 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1646 status = U_INDEX_OUTOFBOUNDS_ERROR; 1647 } 1648 fMatchEnd = startIndex; 1649 } 1650 1651 if (!fTransparentBounds) { 1652 fLookStart = nativeStart; 1653 fLookLimit = nativeLimit; 1654 } 1655 if (fAnchoringBounds) { 1656 fAnchorStart = nativeStart; 1657 
fAnchorLimit = nativeLimit; 1658 } 1659 return *this; 1660 } 1661 1662 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1663 return region(start, limit, -1, status); 1664 } 1665 1666 //-------------------------------------------------------------------------------- 1667 // 1668 // regionEnd 1669 // 1670 //-------------------------------------------------------------------------------- 1671 int32_t RegexMatcher::regionEnd() const { 1672 return static_cast<int32_t>(fRegionLimit); 1673 } 1674 1675 int64_t RegexMatcher::regionEnd64() const { 1676 return fRegionLimit; 1677 } 1678 1679 //-------------------------------------------------------------------------------- 1680 // 1681 // regionStart 1682 // 1683 //-------------------------------------------------------------------------------- 1684 int32_t RegexMatcher::regionStart() const { 1685 return static_cast<int32_t>(fRegionStart); 1686 } 1687 1688 int64_t RegexMatcher::regionStart64() const { 1689 return fRegionStart; 1690 } 1691 1692 1693 //-------------------------------------------------------------------------------- 1694 // 1695 // replaceAll 1696 // 1697 //-------------------------------------------------------------------------------- 1698 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1699 UText replacementText = UTEXT_INITIALIZER; 1700 UText resultText = UTEXT_INITIALIZER; 1701 UnicodeString resultString; 1702 if (U_FAILURE(status)) { 1703 return resultString; 1704 } 1705 1706 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1707 utext_openUnicodeString(&resultText, &resultString, &status); 1708 1709 replaceAll(&replacementText, &resultText, status); 1710 1711 utext_close(&resultText); 1712 utext_close(&replacementText); 1713 1714 return resultString; 1715 } 1716 1717 1718 // 1719 // replaceAll, UText mode 1720 // 1721 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) 
{ 1722 if (U_FAILURE(status)) { 1723 return dest; 1724 } 1725 if (U_FAILURE(fDeferredStatus)) { 1726 status = fDeferredStatus; 1727 return dest; 1728 } 1729 1730 if (dest == nullptr) { 1731 UnicodeString emptyString; 1732 UText empty = UTEXT_INITIALIZER; 1733 1734 utext_openUnicodeString(&empty, &emptyString, &status); 1735 dest = utext_clone(nullptr, &empty, true, false, &status); 1736 utext_close(&empty); 1737 } 1738 1739 if (U_SUCCESS(status)) { 1740 reset(); 1741 while (find()) { 1742 appendReplacement(dest, replacement, status); 1743 if (U_FAILURE(status)) { 1744 break; 1745 } 1746 } 1747 appendTail(dest, status); 1748 } 1749 1750 return dest; 1751 } 1752 1753 1754 //-------------------------------------------------------------------------------- 1755 // 1756 // replaceFirst 1757 // 1758 //-------------------------------------------------------------------------------- 1759 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1760 UText replacementText = UTEXT_INITIALIZER; 1761 UText resultText = UTEXT_INITIALIZER; 1762 UnicodeString resultString; 1763 1764 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1765 utext_openUnicodeString(&resultText, &resultString, &status); 1766 1767 replaceFirst(&replacementText, &resultText, status); 1768 1769 utext_close(&resultText); 1770 utext_close(&replacementText); 1771 1772 return resultString; 1773 } 1774 1775 // 1776 // replaceFirst, UText mode 1777 // 1778 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1779 if (U_FAILURE(status)) { 1780 return dest; 1781 } 1782 if (U_FAILURE(fDeferredStatus)) { 1783 status = fDeferredStatus; 1784 return dest; 1785 } 1786 1787 reset(); 1788 if (!find()) { 1789 return getInput(dest, status); 1790 } 1791 1792 if (dest == nullptr) { 1793 UnicodeString emptyString; 1794 UText empty = UTEXT_INITIALIZER; 1795 1796 utext_openUnicodeString(&empty, &emptyString, &status); 1797 dest = 
utext_clone(nullptr, &empty, true, false, &status); 1798 utext_close(&empty); 1799 } 1800 1801 appendReplacement(dest, replacement, status); 1802 appendTail(dest, status); 1803 1804 return dest; 1805 } 1806 1807 1808 //-------------------------------------------------------------------------------- 1809 // 1810 // requireEnd 1811 // 1812 //-------------------------------------------------------------------------------- 1813 UBool RegexMatcher::requireEnd() const { 1814 return fRequireEnd; 1815 } 1816 1817 1818 //-------------------------------------------------------------------------------- 1819 // 1820 // reset 1821 // 1822 //-------------------------------------------------------------------------------- 1823 RegexMatcher &RegexMatcher::reset() { 1824 fRegionStart = 0; 1825 fRegionLimit = fInputLength; 1826 fActiveStart = 0; 1827 fActiveLimit = fInputLength; 1828 fAnchorStart = 0; 1829 fAnchorLimit = fInputLength; 1830 fLookStart = 0; 1831 fLookLimit = fInputLength; 1832 resetPreserveRegion(); 1833 return *this; 1834 } 1835 1836 1837 1838 void RegexMatcher::resetPreserveRegion() { 1839 fMatchStart = 0; 1840 fMatchEnd = 0; 1841 fLastMatchEnd = -1; 1842 fAppendPosition = 0; 1843 fMatch = false; 1844 fHitEnd = false; 1845 fRequireEnd = false; 1846 fTime = 0; 1847 fTickCounter = TIMER_INITIAL_VALUE; 1848 //resetStack(); // more expensive than it looks... 1849 } 1850 1851 1852 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1853 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1854 if (fPattern->fNeedsAltInput) { 1855 fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus); 1856 } 1857 if (U_FAILURE(fDeferredStatus)) { 1858 return *this; 1859 } 1860 fInputLength = utext_nativeLength(fInputText); 1861 1862 reset(); 1863 delete fInput; 1864 fInput = nullptr; 1865 1866 // Do the following for any UnicodeString. 
1867 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1868 fInputUniStrMaybeMutable = true; 1869 1870 #if UCONFIG_NO_BREAK_ITERATION==0 1871 if (fWordBreakItr) { 1872 fWordBreakItr->setText(fInputText, fDeferredStatus); 1873 } 1874 if (fGCBreakItr) { 1875 fGCBreakItr->setText(fInputText, fDeferredStatus); 1876 } 1877 #endif 1878 1879 return *this; 1880 } 1881 1882 1883 RegexMatcher &RegexMatcher::reset(UText *input) { 1884 if (fInputText != input) { 1885 fInputText = utext_clone(fInputText, input, false, true, &fDeferredStatus); 1886 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus); 1887 if (U_FAILURE(fDeferredStatus)) { 1888 return *this; 1889 } 1890 fInputLength = utext_nativeLength(fInputText); 1891 1892 delete fInput; 1893 fInput = nullptr; 1894 1895 #if UCONFIG_NO_BREAK_ITERATION==0 1896 if (fWordBreakItr) { 1897 fWordBreakItr->setText(input, fDeferredStatus); 1898 } 1899 if (fGCBreakItr) { 1900 fGCBreakItr->setText(fInputText, fDeferredStatus); 1901 } 1902 #endif 1903 } 1904 reset(); 1905 fInputUniStrMaybeMutable = false; 1906 1907 return *this; 1908 } 1909 1910 /*RegexMatcher &RegexMatcher::reset(const char16_t *) { 1911 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1912 return *this; 1913 }*/ 1914 1915 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1916 if (U_FAILURE(status)) { 1917 return *this; 1918 } 1919 reset(); // Reset also resets the region to be the entire string. 
1920 1921 if (position < 0 || position > fActiveLimit) { 1922 status = U_INDEX_OUTOFBOUNDS_ERROR; 1923 return *this; 1924 } 1925 fMatchEnd = position; 1926 return *this; 1927 } 1928 1929 1930 //-------------------------------------------------------------------------------- 1931 // 1932 // refresh 1933 // 1934 //-------------------------------------------------------------------------------- 1935 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1936 if (U_FAILURE(status)) { 1937 return *this; 1938 } 1939 if (input == nullptr) { 1940 status = U_ILLEGAL_ARGUMENT_ERROR; 1941 return *this; 1942 } 1943 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1944 status = U_ILLEGAL_ARGUMENT_ERROR; 1945 return *this; 1946 } 1947 int64_t pos = utext_getNativeIndex(fInputText); 1948 // Shallow read-only clone of the new UText into the existing input UText 1949 fInputText = utext_clone(fInputText, input, false, true, &status); 1950 if (U_FAILURE(status)) { 1951 return *this; 1952 } 1953 utext_setNativeIndex(fInputText, pos); 1954 1955 if (fAltInputText != nullptr) { 1956 pos = utext_getNativeIndex(fAltInputText); 1957 fAltInputText = utext_clone(fAltInputText, input, false, true, &status); 1958 if (U_FAILURE(status)) { 1959 return *this; 1960 } 1961 utext_setNativeIndex(fAltInputText, pos); 1962 } 1963 return *this; 1964 } 1965 1966 1967 1968 //-------------------------------------------------------------------------------- 1969 // 1970 // setTrace 1971 // 1972 //-------------------------------------------------------------------------------- 1973 void RegexMatcher::setTrace(UBool state) { 1974 fTraceDebug = state; 1975 } 1976 1977 1978 1979 /** 1980 * UText, replace entire contents of the destination UText with a substring of the source UText. 1981 * 1982 * @param src The source UText 1983 * @param dest The destination UText. Must be writable. 1984 * May be nullptr, in which case a new UText will be allocated. 
1985 * @param start Start index of source substring. 1986 * @param limit Limit index of source substring. 1987 * @param status An error code. 1988 */ 1989 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { 1990 if (U_FAILURE(*status)) { 1991 return dest; 1992 } 1993 if (start == limit) { 1994 if (dest) { 1995 utext_replace(dest, 0, utext_nativeLength(dest), nullptr, 0, status); 1996 return dest; 1997 } else { 1998 return utext_openUChars(nullptr, nullptr, 0, status); 1999 } 2000 } 2001 UErrorCode bufferStatus = U_ZERO_ERROR; 2002 int32_t length = utext_extract(src, start, limit, nullptr, 0, &bufferStatus); 2003 if (bufferStatus != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(bufferStatus)) { 2004 *status = bufferStatus; 2005 return dest; 2006 } 2007 MaybeStackArray<char16_t, 40> buffer; 2008 if (length >= buffer.getCapacity()) { 2009 char16_t *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. 2010 if (newBuf == nullptr) { 2011 *status = U_MEMORY_ALLOCATION_ERROR; 2012 } 2013 } 2014 utext_extract(src, start, limit, buffer.getAlias(), length+1, status); 2015 if (dest) { 2016 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); 2017 return dest; 2018 } 2019 2020 // Caller did not provide a preexisting UText. 2021 // Open a new one, and have it adopt the text buffer storage. 
2022 if (U_FAILURE(*status)) { 2023 return nullptr; 2024 } 2025 int32_t ownedLength = 0; 2026 char16_t *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); 2027 if (ownedBuf == nullptr) { 2028 *status = U_MEMORY_ALLOCATION_ERROR; 2029 return nullptr; 2030 } 2031 UText *result = utext_openUChars(nullptr, ownedBuf, length, status); 2032 if (U_FAILURE(*status)) { 2033 uprv_free(ownedBuf); 2034 return nullptr; 2035 } 2036 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); 2037 return result; 2038 } 2039 2040 2041 //--------------------------------------------------------------------- 2042 // 2043 // split 2044 // 2045 //--------------------------------------------------------------------- 2046 int32_t RegexMatcher::split(const UnicodeString &input, 2047 UnicodeString dest[], 2048 int32_t destCapacity, 2049 UErrorCode &status) 2050 { 2051 UText inputText = UTEXT_INITIALIZER; 2052 utext_openConstUnicodeString(&inputText, &input, &status); 2053 if (U_FAILURE(status)) { 2054 return 0; 2055 } 2056 2057 UText** destText = static_cast<UText**>(uprv_malloc(sizeof(UText*) * destCapacity)); 2058 if (destText == nullptr) { 2059 status = U_MEMORY_ALLOCATION_ERROR; 2060 return 0; 2061 } 2062 int32_t i; 2063 for (i = 0; i < destCapacity; i++) { 2064 destText[i] = utext_openUnicodeString(nullptr, &dest[i], &status); 2065 } 2066 2067 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2068 2069 for (i = 0; i < destCapacity; i++) { 2070 utext_close(destText[i]); 2071 } 2072 2073 uprv_free(destText); 2074 utext_close(&inputText); 2075 return fieldCount; 2076 } 2077 2078 // 2079 // split, UText mode 2080 // 2081 int32_t RegexMatcher::split(UText *input, 2082 UText *dest[], 2083 int32_t destCapacity, 2084 UErrorCode &status) 2085 { 2086 // 2087 // Check arguments for validity 2088 // 2089 if (U_FAILURE(status)) { 2090 return 0; 2091 } 2092 2093 if (destCapacity < 1) { 2094 status = U_ILLEGAL_ARGUMENT_ERROR; 2095 return 0; 2096 } 2097 2098 // 2099 // 
Reset for the input text 2100 // 2101 reset(input); 2102 int64_t nextOutputStringStart = 0; 2103 if (fActiveLimit == 0) { 2104 return 0; 2105 } 2106 2107 // 2108 // Loop through the input text, searching for the delimiter pattern 2109 // 2110 int32_t i; 2111 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2112 for (i=0; ; i++) { 2113 if (i>=destCapacity-1) { 2114 // There is one or zero output string left. 2115 // Fill the last output string with whatever is left from the input, then exit the loop. 2116 // ( i will be == destCapacity if we filled the output array while processing 2117 // capture groups of the delimiter expression, in which case we will discard the 2118 // last capture group saved in favor of the unprocessed remainder of the 2119 // input string.) 2120 i = destCapacity-1; 2121 if (fActiveLimit > nextOutputStringStart) { 2122 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2123 if (dest[i]) { 2124 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2125 input->chunkContents+nextOutputStringStart, 2126 static_cast<int32_t>(fActiveLimit - nextOutputStringStart), &status); 2127 } else { 2128 UText remainingText = UTEXT_INITIALIZER; 2129 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2130 fActiveLimit-nextOutputStringStart, &status); 2131 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); 2132 utext_close(&remainingText); 2133 } 2134 } else { 2135 UErrorCode lengthStatus = U_ZERO_ERROR; 2136 int32_t remaining16Length = 2137 utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus); 2138 char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1))); 2139 if (remainingChars == nullptr) { 2140 status = U_MEMORY_ALLOCATION_ERROR; 2141 break; 2142 } 2143 2144 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2145 if (dest[i]) { 2146 utext_replace(dest[i], 0, 
utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2147 } else { 2148 UText remainingText = UTEXT_INITIALIZER; 2149 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2150 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); 2151 utext_close(&remainingText); 2152 } 2153 2154 uprv_free(remainingChars); 2155 } 2156 } 2157 break; 2158 } 2159 if (find()) { 2160 // We found another delimiter. Move everything from where we started looking 2161 // up until the start of the delimiter into the next output string. 2162 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2163 if (dest[i]) { 2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2165 input->chunkContents+nextOutputStringStart, 2166 static_cast<int32_t>(fMatchStart - nextOutputStringStart), &status); 2167 } else { 2168 UText remainingText = UTEXT_INITIALIZER; 2169 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2170 fMatchStart-nextOutputStringStart, &status); 2171 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); 2172 utext_close(&remainingText); 2173 } 2174 } else { 2175 UErrorCode lengthStatus = U_ZERO_ERROR; 2176 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, nullptr, 0, &lengthStatus); 2177 char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1))); 2178 if (remainingChars == nullptr) { 2179 status = U_MEMORY_ALLOCATION_ERROR; 2180 break; 2181 } 2182 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2183 if (dest[i]) { 2184 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2185 } else { 2186 UText remainingText = UTEXT_INITIALIZER; 2187 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2188 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); 2189 
utext_close(&remainingText); 2190 } 2191 2192 uprv_free(remainingChars); 2193 } 2194 nextOutputStringStart = fMatchEnd; 2195 2196 // If the delimiter pattern has capturing parentheses, the captured 2197 // text goes out into the next n destination strings. 2198 int32_t groupNum; 2199 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2200 if (i >= destCapacity-2) { 2201 // Never fill the last available output string with capture group text. 2202 // It will filled with the last field, the remainder of the 2203 // unsplit input text. 2204 break; 2205 } 2206 i++; 2207 dest[i] = utext_extract_replace(fInputText, dest[i], 2208 start64(groupNum, status), end64(groupNum, status), &status); 2209 } 2210 2211 if (nextOutputStringStart == fActiveLimit) { 2212 // The delimiter was at the end of the string. We're done, but first 2213 // we output one last empty string, for the empty field following 2214 // the delimiter at the end of input. 2215 if (i+1 < destCapacity) { 2216 ++i; 2217 if (dest[i] == nullptr) { 2218 dest[i] = utext_openUChars(nullptr, nullptr, 0, &status); 2219 } else { 2220 static const char16_t emptyString[] = {static_cast<char16_t>(0)}; 2221 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); 2222 } 2223 } 2224 break; 2225 2226 } 2227 } 2228 else 2229 { 2230 // We ran off the end of the input while looking for the next delimiter. 2231 // All the remaining text goes into the current output string. 
            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
                if (dest[i]) {
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
                                  input->chunkContents+nextOutputStringStart,
                                  static_cast<int32_t>(fActiveLimit - nextOutputStringStart), &status);
                } else {
                    UText remainingText = UTEXT_INITIALIZER;
                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
                                     fActiveLimit-nextOutputStringStart, &status);
                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
                    utext_close(&remainingText);
                }
            } else {
                UErrorCode lengthStatus = U_ZERO_ERROR;
                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
                char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1)));
                if (remainingChars == nullptr) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
                if (dest[i]) {
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
                } else {
                    UText remainingText = UTEXT_INITIALIZER;
                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
                    utext_close(&remainingText);
                }

                uprv_free(remainingChars);
            }
            break;
        }
        if (U_FAILURE(status)) {
            break;
        }
    }   // end of for loop
    return i+1;
}


//--------------------------------------------------------------------------------
//
//     start
//
//          Start index of the entire match. Both functions simply forward to the
//          group-taking overloads with group number 0.
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::start(UErrorCode &status) const {
    return start(0, status);
}

int64_t RegexMatcher::start64(UErrorCode &status) const {
    return start64(0, status);
}

//--------------------------------------------------------------------------------
//
//     start(int32_t group, UErrorCode &status)
//
//          Start index of a capture group in the current match.
//
//          group    Capture group number. 0 is the entire match; the largest
//                   accepted value is fPattern->fGroupMap->size().
//          status   In/out error code. If it already indicates a failure on entry,
//                   nothing is examined and -1 is returned. Otherwise it is set to
//                     - the matcher's deferred status, if that is a failure,
//                     - U_REGEX_INVALID_STATE if there is no current match,
//                     - U_INDEX_OUTOFBOUNDS_ERROR if group is out of range.
//          Return   The start index, or -1 in any of the error cases above.
//
//--------------------------------------------------------------------------------

int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return -1;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return -1;
    }
    if (fMatch == false) {
        status = U_REGEX_INVALID_STATE;
        return -1;
    }
    if (group < 0 || group > fPattern->fGroupMap->size()) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return -1;
    }
    int64_t s;
    if (group == 0) {
        s = fMatchStart;
    } else {
        // Group numbers are 1-based while fGroupMap is indexed from 0. The map entry
        // is the index into the frame's fExtra array that holds the group's start.
        // NOTE(review): presumably the stored value is -1 for a group that did not
        // take part in the match (resetStack() fills the slots with -1) - confirm.
        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
        U_ASSERT(groupOffset < fPattern->fFrameSize);
        U_ASSERT(groupOffset >= 0);
        s = fFrame->fExtra[groupOffset];
    }

    return s;
}


// 32 bit flavor of start64(group, status). The 64 bit result is narrowed with a plain
// cast; no range check is made here.
int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
    return static_cast<int32_t>(start64(group, status));
}

//--------------------------------------------------------------------------------
//
//     useAnchoringBounds
//
//          Records the setting and recomputes the anchor positions:
//          the region boundaries when b is true, otherwise 0 and fInputLength.
//          Returns this matcher.
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
    fAnchoringBounds = b;
    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
    return *this;
}


//--------------------------------------------------------------------------------
//
//     useTransparentBounds
//
//          Records the setting and recomputes the look-around limits:
//          0 and fInputLength when b is true, otherwise the region boundaries.
//          Returns this matcher.
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
    fTransparentBounds = b;
    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
    return *this;
}

//--------------------------------------------------------------------------------
//
//     setTimeLimit
//
//          Stores the time limit for match operations.
//
//          limit    The new limit. Negative values are rejected with
//                   U_ILLEGAL_ARGUMENT_ERROR. IncrementTime() only tests the limit
//                   when it is greater than zero, so 0 means "no limit".
//          status   In/out error code. Nothing is changed if it indicates a failure
//                   on entry, or if the matcher has a deferred failure (which is
//                   then copied into status).
//
//--------------------------------------------------------------------------------
void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return;
    }
    if (limit < 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    fTimeLimit = limit;
}


//--------------------------------------------------------------------------------
//
//     getTimeLimit
//
//          Returns the value most recently stored in fTimeLimit.
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::getTimeLimit() const {
    return fTimeLimit;
}


//--------------------------------------------------------------------------------
//
//     setStackLimit
//
//          Sets the maximum size of the backtracking stack and resets the matcher.
//
//          limit    The new limit, in bytes. 0 removes the limit. Negative values
//                   are rejected with U_ILLEGAL_ARGUMENT_ERROR.
//          status   In/out error code. Nothing is changed if it indicates a failure
//                   on entry, or if the matcher has a deferred failure (which is
//                   then copied into status).
//
//--------------------------------------------------------------------------------
void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return;
    }
    if (limit < 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Reset the matcher.  This is needed here in case there is a current match
    //    whose final stack frame (containing the match results, pointed to by fFrame)
    //    would be lost by resizing to a smaller stack size.
    reset();

    if (limit == 0) {
        // Unlimited stack expansion
        fStack->setMaxCapacity(0);
    } else {
        // Change the units of the limit from bytes to ints, and bump the size up
        //   to be big enough to hold at least one stack frame for the pattern,
        //   if it isn't there already.
        int32_t adjustedLimit = limit / sizeof(int32_t);
        if (adjustedLimit < fPattern->fFrameSize) {
            adjustedLimit = fPattern->fFrameSize;
        }
        fStack->setMaxCapacity(adjustedLimit);
    }
    // Remember the caller's value (in bytes), not the adjusted one, so that
    // getStackLimit() reports what was requested.
    fStackLimit = limit;
}


//--------------------------------------------------------------------------------
//
//     getStackLimit
//
//          Returns the limit as it was passed to setStackLimit().
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::getStackLimit() const {
    return fStackLimit;
}


//--------------------------------------------------------------------------------
//
//     setMatchCallback
//
//          Installs the callback function and its context pointer. This is the
//          callback that IncrementTime() invokes. Does nothing if status
//          indicates a failure on entry; status is never modified.
//
//--------------------------------------------------------------------------------
void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
                                    const void              *context,
                                    UErrorCode              &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fCallbackFn = callback;
    fCallbackContext = context;
}


//--------------------------------------------------------------------------------
//
//     getMatchCallback
//
//          Returns the installed match callback and context through the two
//          reference parameters. They are left untouched if status indicates a
//          failure on entry.
//
//--------------------------------------------------------------------------------
void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
                                    const void            *&context,
                                    UErrorCode            &status) {
    if (U_FAILURE(status)) {
        return;
    }
    callback = fCallbackFn;
    context = fCallbackContext;
}
2464 2465 2466 //-------------------------------------------------------------------------------- 2467 // 2468 // setMatchCallback 2469 // 2470 //-------------------------------------------------------------------------------- 2471 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2472 const void *context, 2473 UErrorCode &status) { 2474 if (U_FAILURE(status)) { 2475 return; 2476 } 2477 fFindProgressCallbackFn = callback; 2478 fFindProgressCallbackContext = context; 2479 } 2480 2481 2482 //-------------------------------------------------------------------------------- 2483 // 2484 // getMatchCallback 2485 // 2486 //-------------------------------------------------------------------------------- 2487 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2488 const void *&context, 2489 UErrorCode &status) { 2490 if (U_FAILURE(status)) { 2491 return; 2492 } 2493 callback = fFindProgressCallbackFn; 2494 context = fFindProgressCallbackContext; 2495 } 2496 2497 2498 //================================================================================ 2499 // 2500 // Code following this point in this file is the internal 2501 // Match Engine Implementation. 2502 // 2503 //================================================================================ 2504 2505 2506 //-------------------------------------------------------------------------------- 2507 // 2508 // resetStack 2509 // Discard any previous contents of the state save stack, and initialize a 2510 // new stack frame to all -1. The -1s are needed for capture group limits, 2511 // where they indicate that a group has not yet matched anything. 2512 //-------------------------------------------------------------------------------- 2513 REStackFrame *RegexMatcher::resetStack() { 2514 // Discard any previous contents of the state save stack, and initialize a 2515 // new stack frame with all -1 data. 
The -1s are needed for capture group limits, 2516 // where they indicate that a group has not yet matched anything. 2517 fStack->removeAllElements(); 2518 2519 REStackFrame* iFrame = reinterpret_cast<REStackFrame*>(fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus)); 2520 if(U_FAILURE(fDeferredStatus)) { 2521 return nullptr; 2522 } 2523 2524 int32_t i; 2525 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2526 iFrame->fExtra[i] = -1; 2527 } 2528 return iFrame; 2529 } 2530 2531 2532 2533 //-------------------------------------------------------------------------------- 2534 // 2535 // isWordBoundary 2536 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2537 // For us, 2538 // If the current char is a combining mark, 2539 // \b is false. 2540 // Else Scan backwards to the first non-combining char. 2541 // We are at a boundary if the this char and the original chars are 2542 // opposite in membership in \w set 2543 // 2544 // parameters: pos - the current position in the input buffer 2545 // 2546 // TODO: double-check edge cases at region boundaries. 2547 // 2548 //-------------------------------------------------------------------------------- 2549 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2550 UBool isBoundary = false; 2551 UBool cIsWord = false; 2552 2553 if (pos >= fLookLimit) { 2554 fHitEnd = true; 2555 } else { 2556 // Determine whether char c at current position is a member of the word set of chars. 2557 // If we're off the end of the string, behave as though we're not at a word char. 2558 UTEXT_SETNATIVEINDEX(fInputText, pos); 2559 UChar32 c = UTEXT_CURRENT32(fInputText); 2560 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2561 // Current char is a combining one. Not a boundary. 
2562 return false; 2563 } 2564 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c); 2565 } 2566 2567 // Back up until we come to a non-combining char, determine whether 2568 // that char is a word char. 2569 UBool prevCIsWord = false; 2570 for (;;) { 2571 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2572 break; 2573 } 2574 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2575 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2576 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2577 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar); 2578 break; 2579 } 2580 } 2581 isBoundary = cIsWord ^ prevCIsWord; 2582 return isBoundary; 2583 } 2584 2585 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2586 UBool isBoundary = false; 2587 UBool cIsWord = false; 2588 2589 const char16_t *inputBuf = fInputText->chunkContents; 2590 2591 if (pos >= fLookLimit) { 2592 fHitEnd = true; 2593 } else { 2594 // Determine whether char c at current position is a member of the word set of chars. 2595 // If we're off the end of the string, behave as though we're not at a word char. 2596 UChar32 c; 2597 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2598 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2599 // Current char is a combining one. Not a boundary. 2600 return false; 2601 } 2602 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c); 2603 } 2604 2605 // Back up until we come to a non-combining char, determine whether 2606 // that char is a word char. 
2607 UBool prevCIsWord = false; 2608 for (;;) { 2609 if (pos <= fLookStart) { 2610 break; 2611 } 2612 UChar32 prevChar; 2613 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2614 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2615 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2616 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar); 2617 break; 2618 } 2619 } 2620 isBoundary = cIsWord ^ prevCIsWord; 2621 return isBoundary; 2622 } 2623 2624 //-------------------------------------------------------------------------------- 2625 // 2626 // isUWordBoundary 2627 // 2628 // Test for a word boundary using RBBI word break. 2629 // 2630 // parameters: pos - the current position in the input buffer 2631 // 2632 //-------------------------------------------------------------------------------- 2633 UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) { 2634 UBool returnVal = false; 2635 2636 #if UCONFIG_NO_BREAK_ITERATION==0 2637 // Note: this point will never be reached if break iteration is configured out. 2638 // Regex patterns that would require this function will fail to compile. 2639 2640 // If we haven't yet created a break iterator for this matcher, do it now. 2641 if (fWordBreakItr == nullptr) { 2642 fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status); 2643 if (U_FAILURE(status)) { 2644 return false; 2645 } 2646 fWordBreakItr->setText(fInputText, status); 2647 } 2648 2649 // Note: zero width boundary tests like \b see through transparent region bounds, 2650 // which is why fLookLimit is used here, rather than fActiveLimit. 2651 if (pos >= fLookLimit) { 2652 fHitEnd = true; 2653 returnVal = true; // With Unicode word rules, only positions within the interior of "real" 2654 // words are not boundaries. All non-word chars stand by themselves, 2655 // with word boundaries on both sides. 
2656 } else { 2657 returnVal = fWordBreakItr->isBoundary(static_cast<int32_t>(pos)); 2658 } 2659 #endif 2660 return returnVal; 2661 } 2662 2663 2664 int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) { 2665 int64_t result = pos; 2666 2667 #if UCONFIG_NO_BREAK_ITERATION==0 2668 // Note: this point will never be reached if break iteration is configured out. 2669 // Regex patterns that would require this function will fail to compile. 2670 2671 // If we haven't yet created a break iterator for this matcher, do it now. 2672 if (fGCBreakItr == nullptr) { 2673 fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 2674 if (U_FAILURE(status)) { 2675 return pos; 2676 } 2677 fGCBreakItr->setText(fInputText, status); 2678 } 2679 result = fGCBreakItr->following(pos); 2680 if (result == BreakIterator::DONE) { 2681 result = pos; 2682 } 2683 #endif 2684 return result; 2685 } 2686 2687 //-------------------------------------------------------------------------------- 2688 // 2689 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2690 // saves. Increment the "time" counter, and call the 2691 // user callback function if there is one installed. 2692 // 2693 // If the match operation needs to be aborted, either for a time-out 2694 // or because the user callback asked for it, just set an error status. 2695 // The engine will pick that up and stop in its outer loop. 
//
//--------------------------------------------------------------------------------
void RegexMatcher::IncrementTime(UErrorCode &status) {
    // Start a new countdown of state saves before the next call.
    fTickCounter = TIMER_INITIAL_VALUE;
    fTime++;
    if (fCallbackFn != nullptr) {
        // A false return from the callback stops the match. The time limit is
        // not examined in that case.
        if ((*fCallbackFn)(fCallbackContext, fTime) == false) {
            status = U_REGEX_STOPPED_BY_CALLER;
            return;
        }
    }
    // A limit of zero is never tested, i.e. it means "no time limit".
    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
        status = U_REGEX_TIME_OUT;
    }
}

//--------------------------------------------------------------------------------
//
//   StateSave
//       Make a new stack frame, initialized as a copy of the current stack frame.
//       Set the pattern index in the original stack frame from the operand value
//       in the opcode.  Execution of the engine continues with the state in
//       the newly created stack frame
//
//       Note that reserveBlock() may grow the stack, resulting in the
//       whole thing being relocated in memory.
//
//    Parameters:
//       fp           The top frame pointer when called.  At return, a new
//                    frame will be present
//       savePatIdx   An index into the compiled pattern.  Goes into the original
//                    (not new) frame.  If execution ever back-tracks out of the
//                    new frame, this will be where we continue from in the pattern.
//       status       In/out error code. If it indicates a failure on entry, nothing
//                    is pushed and fp is returned unchanged. Set to
//                    U_REGEX_STACK_OVERFLOW if the new frame cannot be reserved.
//                    May also be set by IncrementTime().
//    Return
//                    The new frame pointer.
//
//--------------------------------------------------------------------------------
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return fp;
    }
    // push storage for a new frame.
    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
    if (U_FAILURE(status)) {
        // Failure on attempted stack expansion.
        //   Stack function set some other error code, change it to a more
        //   specific one for regular expressions.
        status = U_REGEX_STACK_OVERFLOW;
        // We need to return a writable stack frame, so just return the
        //    previous frame.  The match operation will stop quickly
        //    because of the error status, after which the frame will never
        //    be looked at again.
        return fp;
    }
    // The previous top frame sits immediately below the new block. Recompute its
    // address from newFP rather than trusting the incoming fp.
    fp = reinterpret_cast<REStackFrame*>(newFP - fFrameSize);  // in case of realloc of stack.

    // New stack frame = copy of old top frame.
    // The copy stops when the read pointer reaches the start of the new frame,
    // i.e. after exactly fFrameSize 64 bit words.
    int64_t* source = reinterpret_cast<int64_t*>(fp);
    int64_t *dest = newFP;
    for (;;) {
        *dest++ = *source++;
        if (source == newFP) {
            break;
        }
    }

    // Count this state save toward the next timer tick.
    fTickCounter--;
    if (fTickCounter <= 0) {
        IncrementTime(status);    // Re-initializes fTickCounter
    }
    // Written after the copy, so only the old frame gets the resume position;
    // the new frame keeps the pattern index the engine is currently at.
    fp->fPatIdx = savePatIdx;
    return reinterpret_cast<REStackFrame*>(newFP);
}

#if defined(REGEX_DEBUG)
namespace {
// Debug-only helper: builds a UnicodeString holding every code point of ut,
// iterating from native index 0 until U_SENTINEL is returned.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString result;
    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
        result.append(c);
    }
    return result;
}
}
#endif // REGEX_DEBUG


//--------------------------------------------------------------------------------
//
//   MatchAt      This is the actual matching engine.
//
//                  startIdx:    begin matching a this index.
//                  toEnd:       if true, match must extend to end of the input region
//
//--------------------------------------------------------------------------------
void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
    UBool       isMatch  = false;      // True if the we have a match.

    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards

    int32_t     op;                    // Operation from the compiled pattern, split into
    int32_t     opType;                //    the opcode
    int32_t     opValue;               //    and the operand value.
2799 2800 #ifdef REGEX_RUN_DEBUG 2801 if (fTraceDebug) { 2802 printf("MatchAt(startIdx=%ld)\n", startIdx); 2803 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); 2804 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); 2805 } 2806 #endif 2807 2808 if (U_FAILURE(status)) { 2809 return; 2810 } 2811 2812 // Cache frequently referenced items from the compiled pattern 2813 // 2814 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2815 2816 const char16_t *litText = fPattern->fLiteralText.getBuffer(); 2817 UVector *fSets = fPattern->fSets; 2818 2819 fFrameSize = fPattern->fFrameSize; 2820 REStackFrame *fp = resetStack(); 2821 if (U_FAILURE(fDeferredStatus)) { 2822 status = fDeferredStatus; 2823 return; 2824 } 2825 2826 fp->fPatIdx = 0; 2827 fp->fInputIdx = startIdx; 2828 2829 // Zero out the pattern's static data 2830 int32_t i; 2831 for (i = 0; i<fPattern->fDataSize; i++) { 2832 fData[i] = 0; 2833 } 2834 2835 // 2836 // Main loop for interpreting the compiled pattern. 2837 // One iteration of the loop per pattern operation performed. 2838 // 2839 for (;;) { 2840 op = static_cast<int32_t>(pat[fp->fPatIdx]); 2841 opType = URX_TYPE(op); 2842 opValue = URX_VAL(op); 2843 #ifdef REGEX_RUN_DEBUG 2844 if (fTraceDebug) { 2845 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2846 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 2847 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2848 fPattern->dumpOp(fp->fPatIdx); 2849 } 2850 #endif 2851 fp->fPatIdx++; 2852 2853 switch (opType) { 2854 2855 2856 case URX_NOP: 2857 break; 2858 2859 2860 case URX_BACKTRACK: 2861 // Force a backtrack. In some circumstances, the pattern compiler 2862 // will notice that the pattern can't possibly match anything, and will 2863 // emit one of these at that point. 
2864 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 2865 break; 2866 2867 2868 case URX_ONECHAR: 2869 if (fp->fInputIdx < fActiveLimit) { 2870 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2871 UChar32 c = UTEXT_NEXT32(fInputText); 2872 if (c == opValue) { 2873 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2874 break; 2875 } 2876 } else { 2877 fHitEnd = true; 2878 } 2879 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 2880 break; 2881 2882 2883 case URX_STRING: 2884 { 2885 // Test input against a literal string. 2886 // Strings require two slots in the compiled pattern, one for the 2887 // offset to the string text, and one for the length. 2888 2889 int32_t stringStartIdx = opValue; 2890 op = static_cast<int32_t>(pat[fp->fPatIdx]); // Fetch the second operand 2891 fp->fPatIdx++; 2892 opType = URX_TYPE(op); 2893 int32_t stringLen = URX_VAL(op); 2894 U_ASSERT(opType == URX_STRING_LEN); 2895 U_ASSERT(stringLen >= 2); 2896 2897 const char16_t *patternString = litText+stringStartIdx; 2898 int32_t patternStringIndex = 0; 2899 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2900 UChar32 inputChar; 2901 UChar32 patternChar; 2902 UBool success = true; 2903 while (patternStringIndex < stringLen) { 2904 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 2905 success = false; 2906 fHitEnd = true; 2907 break; 2908 } 2909 inputChar = UTEXT_NEXT32(fInputText); 2910 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); 2911 if (patternChar != inputChar) { 2912 success = false; 2913 break; 2914 } 2915 } 2916 2917 if (success) { 2918 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2919 } else { 2920 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 2921 } 2922 } 2923 break; 2924 2925 2926 case URX_STATE_SAVE: 2927 fp = StateSave(fp, opValue, status); 2928 break; 2929 2930 2931 case URX_END: 2932 // The match loop will exit via this path on a successful match, 2933 // when we reach the end of the 
pattern. 2934 if (toEnd && fp->fInputIdx != fActiveLimit) { 2935 // The pattern matched, but not to the end of input. Try some more. 2936 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 2937 break; 2938 } 2939 isMatch = true; 2940 goto breakFromLoop; 2941 2942 // Start and End Capture stack frame variables are laid out out like this: 2943 // fp->fExtra[opValue] - The start of a completed capture group 2944 // opValue+1 - The end of a completed capture group 2945 // opValue+2 - the start of a capture group whose end 2946 // has not yet been reached (and might not ever be). 2947 case URX_START_CAPTURE: 2948 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2949 fp->fExtra[opValue+2] = fp->fInputIdx; 2950 break; 2951 2952 2953 case URX_END_CAPTURE: 2954 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2955 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2956 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2957 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 2958 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 2959 break; 2960 2961 2962 case URX_DOLLAR: // $, test for End of line 2963 // or for position before new line at end of input 2964 { 2965 if (fp->fInputIdx >= fAnchorLimit) { 2966 // We really are at the end of input. Success. 2967 fHitEnd = true; 2968 fRequireEnd = true; 2969 break; 2970 } 2971 2972 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2973 2974 // If we are positioned just before a new-line that is located at the 2975 // end of input, succeed. 2976 UChar32 c = UTEXT_NEXT32(fInputText); 2977 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2978 if (isLineTerminator(c)) { 2979 // If not in the middle of a CR/LF sequence 2980 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2981 // At new-line at end of input. 
Success 2982 fHitEnd = true; 2983 fRequireEnd = true; 2984 2985 break; 2986 } 2987 } 2988 } else { 2989 UChar32 nextC = UTEXT_NEXT32(fInputText); 2990 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2991 fHitEnd = true; 2992 fRequireEnd = true; 2993 break; // At CR/LF at end of input. Success 2994 } 2995 } 2996 2997 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 2998 } 2999 break; 3000 3001 3002 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 3003 if (fp->fInputIdx >= fAnchorLimit) { 3004 // Off the end of input. Success. 3005 fHitEnd = true; 3006 fRequireEnd = true; 3007 break; 3008 } else { 3009 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3010 UChar32 c = UTEXT_NEXT32(fInputText); 3011 // Either at the last character of input, or off the end. 3012 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 3013 fHitEnd = true; 3014 fRequireEnd = true; 3015 break; 3016 } 3017 } 3018 3019 // Not at end of input. Back-track out. 3020 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3021 break; 3022 3023 3024 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 3025 { 3026 if (fp->fInputIdx >= fAnchorLimit) { 3027 // We really are at the end of input. Success. 3028 fHitEnd = true; 3029 fRequireEnd = true; 3030 break; 3031 } 3032 // If we are positioned just before a new-line, succeed. 3033 // It makes no difference where the new-line is within the input. 3034 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3035 UChar32 c = UTEXT_CURRENT32(fInputText); 3036 if (isLineTerminator(c)) { 3037 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 3038 // In multi-line mode, hitting a new-line just before the end of input does not 3039 // set the hitEnd or requireEnd flags 3040 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 3041 break; 3042 } 3043 } 3044 // not at a new line. 
Fail. 3045 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3046 } 3047 break; 3048 3049 3050 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 3051 { 3052 if (fp->fInputIdx >= fAnchorLimit) { 3053 // We really are at the end of input. Success. 3054 fHitEnd = true; 3055 fRequireEnd = true; // Java set requireEnd in this case, even though 3056 break; // adding a new-line would not lose the match. 3057 } 3058 // If we are not positioned just before a new-line, the test fails; backtrack out. 3059 // It makes no difference where the new-line is within the input. 3060 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3061 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3062 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3063 } 3064 } 3065 break; 3066 3067 3068 case URX_CARET: // ^, test for start of line 3069 if (fp->fInputIdx != fAnchorStart) { 3070 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3071 } 3072 break; 3073 3074 3075 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3076 { 3077 if (fp->fInputIdx == fAnchorStart) { 3078 // We are at the start input. Success. 3079 break; 3080 } 3081 // Check whether character just before the current pos is a new-line 3082 // unless we are at the end of input 3083 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3084 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3085 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { 3086 // It's a new-line. ^ is true. Success. 3087 // TODO: what should be done with positions between a CR and LF? 3088 break; 3089 } 3090 // Not at the start of a line. Fail. 3091 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3092 } 3093 break; 3094 3095 3096 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3097 { 3098 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3099 if (fp->fInputIdx <= fAnchorStart) { 3100 // We are at the start input. Success. 
3101 break; 3102 } 3103 // Check whether character just before the current pos is a new-line 3104 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3105 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3106 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3107 if (c != 0x0a) { 3108 // Not at the start of a line. Back-track out. 3109 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3110 } 3111 } 3112 break; 3113 3114 case URX_BACKSLASH_B: // Test for word boundaries 3115 { 3116 UBool success = isWordBoundary(fp->fInputIdx); 3117 success ^= static_cast<UBool>(opValue != 0); // flip sense for \B 3118 if (!success) { 3119 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3120 } 3121 } 3122 break; 3123 3124 3125 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3126 { 3127 UBool success = isUWordBoundary(fp->fInputIdx, status); 3128 success ^= static_cast<UBool>(opValue != 0); // flip sense for \B 3129 if (!success) { 3130 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3131 } 3132 } 3133 break; 3134 3135 3136 case URX_BACKSLASH_D: // Test for decimal digit 3137 { 3138 if (fp->fInputIdx >= fActiveLimit) { 3139 fHitEnd = true; 3140 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3141 break; 3142 } 3143 3144 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3145 3146 UChar32 c = UTEXT_NEXT32(fInputText); 3147 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 
3148 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3149 success ^= static_cast<UBool>(opValue != 0); // flip sense for \D 3150 if (success) { 3151 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3152 } else { 3153 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3154 } 3155 } 3156 break; 3157 3158 3159 case URX_BACKSLASH_G: // Test for position at end of previous match 3160 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) { 3161 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3162 } 3163 break; 3164 3165 3166 case URX_BACKSLASH_H: // Test for \h, horizontal white space. 3167 { 3168 if (fp->fInputIdx >= fActiveLimit) { 3169 fHitEnd = true; 3170 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3171 break; 3172 } 3173 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3174 UChar32 c = UTEXT_NEXT32(fInputText); 3175 int8_t ctype = u_charType(c); 3176 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB 3177 success ^= static_cast<UBool>(opValue != 0); // flip sense for \H 3178 if (success) { 3179 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3180 } else { 3181 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3182 } 3183 } 3184 break; 3185 3186 3187 case URX_BACKSLASH_R: // Test for \R, any line break sequence. 
3188 { 3189 if (fp->fInputIdx >= fActiveLimit) { 3190 fHitEnd = true; 3191 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3192 break; 3193 } 3194 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3195 UChar32 c = UTEXT_NEXT32(fInputText); 3196 if (isLineTerminator(c)) { 3197 if (c == 0x0d && utext_current32(fInputText) == 0x0a) { 3198 utext_next32(fInputText); 3199 } 3200 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3201 } else { 3202 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3203 } 3204 } 3205 break; 3206 3207 3208 case URX_BACKSLASH_V: // \v, any single line ending character. 3209 { 3210 if (fp->fInputIdx >= fActiveLimit) { 3211 fHitEnd = true; 3212 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3213 break; 3214 } 3215 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3216 UChar32 c = UTEXT_NEXT32(fInputText); 3217 UBool success = isLineTerminator(c); 3218 success ^= static_cast<UBool>(opValue != 0); // flip sense for \V 3219 if (success) { 3220 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3221 } else { 3222 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3223 } 3224 } 3225 break; 3226 3227 3228 case URX_BACKSLASH_X: 3229 // Match a Grapheme, as defined by Unicode UAX 29. 
3230 3231 // Fail if at end of input 3232 if (fp->fInputIdx >= fActiveLimit) { 3233 fHitEnd = true; 3234 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3235 break; 3236 } 3237 3238 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status); 3239 if (fp->fInputIdx >= fActiveLimit) { 3240 fHitEnd = true; 3241 fp->fInputIdx = fActiveLimit; 3242 } 3243 break; 3244 3245 3246 case URX_BACKSLASH_Z: // Test for end of Input 3247 if (fp->fInputIdx < fAnchorLimit) { 3248 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3249 } else { 3250 fHitEnd = true; 3251 fRequireEnd = true; 3252 } 3253 break; 3254 3255 3256 3257 case URX_STATIC_SETREF: 3258 { 3259 // Test input character against one of the predefined sets 3260 // (Word Characters, for example) 3261 // The high bit of the op value is a flag for the match polarity. 3262 // 0: success if input char is in set. 3263 // 1: success if input char is not in set. 3264 if (fp->fInputIdx >= fActiveLimit) { 3265 fHitEnd = true; 3266 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3267 break; 3268 } 3269 3270 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3271 opValue &= ~URX_NEG_SET; 3272 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3273 3274 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3275 UChar32 c = UTEXT_NEXT32(fInputText); 3276 if (c < 256) { 3277 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; 3278 if (s8.contains(c)) { 3279 success = !success; 3280 } 3281 } else { 3282 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; 3283 if (s.contains(c)) { 3284 success = !success; 3285 } 3286 } 3287 if (success) { 3288 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3289 } else { 3290 // the character wasn't in the set. 
3291 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3292 } 3293 } 3294 break; 3295 3296 3297 case URX_STAT_SETREF_N: 3298 { 3299 // Test input character for NOT being a member of one of 3300 // the predefined sets (Word Characters, for example) 3301 if (fp->fInputIdx >= fActiveLimit) { 3302 fHitEnd = true; 3303 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3304 break; 3305 } 3306 3307 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3308 3309 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3310 3311 UChar32 c = UTEXT_NEXT32(fInputText); 3312 if (c < 256) { 3313 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; 3314 if (s8.contains(c) == false) { 3315 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3316 break; 3317 } 3318 } else { 3319 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; 3320 if (s.contains(c) == false) { 3321 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3322 break; 3323 } 3324 } 3325 // the character wasn't in the set. 3326 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3327 } 3328 break; 3329 3330 3331 case URX_SETREF: 3332 if (fp->fInputIdx >= fActiveLimit) { 3333 fHitEnd = true; 3334 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3335 break; 3336 } else { 3337 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3338 3339 // There is input left. Pick up one char and test it for set membership. 3340 UChar32 c = UTEXT_NEXT32(fInputText); 3341 U_ASSERT(opValue > 0 && opValue < fSets->size()); 3342 if (c<256) { 3343 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3344 if (s8->contains(c)) { 3345 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3346 break; 3347 } 3348 } else { 3349 UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue)); 3350 if (s->contains(c)) { 3351 // The character is in the set. A Match. 
3352 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3353 break; 3354 } 3355 } 3356 3357 // the character wasn't in the set. 3358 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3359 } 3360 break; 3361 3362 3363 case URX_DOTANY: 3364 { 3365 // . matches anything, but stops at end-of-line. 3366 if (fp->fInputIdx >= fActiveLimit) { 3367 // At end of input. Match failed. Backtrack out. 3368 fHitEnd = true; 3369 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3370 break; 3371 } 3372 3373 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3374 3375 // There is input left. Advance over one char, unless we've hit end-of-line 3376 UChar32 c = UTEXT_NEXT32(fInputText); 3377 if (isLineTerminator(c)) { 3378 // End of line in normal mode. . does not match. 3379 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3380 break; 3381 } 3382 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3383 } 3384 break; 3385 3386 3387 case URX_DOTANY_ALL: 3388 { 3389 // ., in dot-matches-all (including new lines) mode 3390 if (fp->fInputIdx >= fActiveLimit) { 3391 // At end of input. Match failed. Backtrack out. 3392 fHitEnd = true; 3393 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3394 break; 3395 } 3396 3397 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3398 3399 // There is input left. Advance over one char, except if we are 3400 // at a cr/lf, advance over both of them. 3401 UChar32 c; 3402 c = UTEXT_NEXT32(fInputText); 3403 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3404 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3405 // In the case of a CR/LF, we need to advance over both. 3406 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3407 if (nextc == 0x0a) { 3408 (void)UTEXT_NEXT32(fInputText); 3409 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3410 } 3411 } 3412 } 3413 break; 3414 3415 3416 case URX_DOTANY_UNIX: 3417 { 3418 // '.' operator, matches all, but stops at end-of-line. 
3419 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3420 if (fp->fInputIdx >= fActiveLimit) { 3421 // At end of input. Match failed. Backtrack out. 3422 fHitEnd = true; 3423 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3424 break; 3425 } 3426 3427 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3428 3429 // There is input left. Advance over one char, unless we've hit end-of-line 3430 UChar32 c = UTEXT_NEXT32(fInputText); 3431 if (c == 0x0a) { 3432 // End of line in normal mode. '.' does not match the \n 3433 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3434 } else { 3435 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3436 } 3437 } 3438 break; 3439 3440 3441 case URX_JMP: 3442 fp->fPatIdx = opValue; 3443 break; 3444 3445 case URX_FAIL: 3446 isMatch = false; 3447 goto breakFromLoop; 3448 3449 case URX_JMP_SAV: 3450 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3451 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3452 fp->fPatIdx = opValue; // Then JMP. 3453 break; 3454 3455 case URX_JMP_SAV_X: 3456 // This opcode is used with (x)+, when x can match a zero length string. 3457 // Same as JMP_SAV, except conditional on the match having made forward progress. 3458 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3459 // data address of the input position at the start of the loop. 3460 { 3461 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3462 int32_t stoOp = static_cast<int32_t>(pat[opValue - 1]); 3463 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3464 int32_t frameLoc = URX_VAL(stoOp); 3465 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3466 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3467 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3468 if (prevInputIdx < fp->fInputIdx) { 3469 // The match did make progress. Repeat the loop. 
3470 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3471 fp->fPatIdx = opValue; 3472 fp->fExtra[frameLoc] = fp->fInputIdx; 3473 } 3474 // If the input position did not advance, we do nothing here, 3475 // execution will fall out of the loop. 3476 } 3477 break; 3478 3479 case URX_CTR_INIT: 3480 { 3481 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3482 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3483 3484 // Pick up the three extra operands that CTR_INIT has, and 3485 // skip the pattern location counter past 3486 int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx); 3487 fp->fPatIdx += 3; 3488 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3489 int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]); 3490 int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]); 3491 U_ASSERT(minCount>=0); 3492 U_ASSERT(maxCount>=minCount || maxCount==-1); 3493 U_ASSERT(loopLoc>=fp->fPatIdx); 3494 3495 if (minCount == 0) { 3496 fp = StateSave(fp, loopLoc+1, status); 3497 } 3498 if (maxCount == -1) { 3499 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 3500 } else if (maxCount == 0) { 3501 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3502 } 3503 } 3504 break; 3505 3506 case URX_CTR_LOOP: 3507 { 3508 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3509 int32_t initOp = static_cast<int32_t>(pat[opValue]); 3510 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3511 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3512 int32_t minCount = static_cast<int32_t>(pat[opValue + 2]); 3513 int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]); 3514 (*pCounter)++; 3515 if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) { 3516 U_ASSERT(*pCounter == maxCount); 3517 break; 3518 } 3519 if (*pCounter >= minCount) { 3520 if (maxCount == -1) { 3521 // Loop has no hard upper bound. 
3522 // Check that it is progressing through the input, break if it is not. 3523 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3524 if (fp->fInputIdx == *pLastInputIdx) { 3525 break; 3526 } else { 3527 *pLastInputIdx = fp->fInputIdx; 3528 } 3529 } 3530 fp = StateSave(fp, fp->fPatIdx, status); 3531 } else { 3532 // Increment time-out counter. (StateSave() does it if count >= minCount) 3533 fTickCounter--; 3534 if (fTickCounter <= 0) { 3535 IncrementTime(status); // Re-initializes fTickCounter 3536 } 3537 } 3538 3539 fp->fPatIdx = opValue + 4; // Loop back. 3540 } 3541 break; 3542 3543 case URX_CTR_INIT_NG: 3544 { 3545 // Initialize a non-greedy loop 3546 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3547 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3548 3549 // Pick up the three extra operands that CTR_INIT_NG has, and 3550 // skip the pattern location counter past 3551 int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx); 3552 fp->fPatIdx += 3; 3553 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3554 int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]); 3555 int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]); 3556 U_ASSERT(minCount>=0); 3557 U_ASSERT(maxCount>=minCount || maxCount==-1); 3558 U_ASSERT(loopLoc>fp->fPatIdx); 3559 if (maxCount == -1) { 3560 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 
3561 } 3562 3563 if (minCount == 0) { 3564 if (maxCount != 0) { 3565 fp = StateSave(fp, fp->fPatIdx, status); 3566 } 3567 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3568 } 3569 } 3570 break; 3571 3572 case URX_CTR_LOOP_NG: 3573 { 3574 // Non-greedy {min, max} loops 3575 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3576 int32_t initOp = static_cast<int32_t>(pat[opValue]); 3577 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3578 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3579 int32_t minCount = static_cast<int32_t>(pat[opValue + 2]); 3580 int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]); 3581 3582 (*pCounter)++; 3583 if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) { 3584 // The loop has matched the maximum permitted number of times. 3585 // Break out of here with no action. Matching will 3586 // continue with the following pattern. 3587 U_ASSERT(*pCounter == maxCount); 3588 break; 3589 } 3590 3591 if (*pCounter < minCount) { 3592 // We haven't met the minimum number of matches yet. 3593 // Loop back for another one. 3594 fp->fPatIdx = opValue + 4; // Loop back. 3595 // Increment time-out counter. (StateSave() does it if count >= minCount) 3596 fTickCounter--; 3597 if (fTickCounter <= 0) { 3598 IncrementTime(status); // Re-initializes fTickCounter 3599 } 3600 } else { 3601 // We do have the minimum number of matches. 3602 3603 // If there is no upper bound on the loop iterations, check that the input index 3604 // is progressing, and stop the loop if it is not. 
3605 if (maxCount == -1) { 3606 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 3607 if (fp->fInputIdx == *pLastInputIdx) { 3608 break; 3609 } 3610 *pLastInputIdx = fp->fInputIdx; 3611 } 3612 3613 // Loop Continuation: we will fall into the pattern following the loop 3614 // (non-greedy, don't execute loop body first), but first do 3615 // a state save to the top of the loop, so that a match failure 3616 // in the following pattern will try another iteration of the loop. 3617 fp = StateSave(fp, opValue + 4, status); 3618 } 3619 } 3620 break; 3621 3622 case URX_STO_SP: 3623 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3624 fData[opValue] = fStack->size(); 3625 break; 3626 3627 case URX_LD_SP: 3628 { 3629 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3630 int32_t newStackSize = static_cast<int32_t>(fData[opValue]); 3631 U_ASSERT(newStackSize <= fStack->size()); 3632 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3633 if (newFP == reinterpret_cast<int64_t*>(fp)) { 3634 break; 3635 } 3636 int32_t j; 3637 for (j=0; j<fFrameSize; j++) { 3638 newFP[j] = reinterpret_cast<int64_t*>(fp)[j]; 3639 } 3640 fp = reinterpret_cast<REStackFrame*>(newFP); 3641 fStack->setSize(newStackSize); 3642 } 3643 break; 3644 3645 case URX_BACKREF: 3646 { 3647 U_ASSERT(opValue < fFrameSize); 3648 int64_t groupStartIdx = fp->fExtra[opValue]; 3649 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3650 U_ASSERT(groupStartIdx <= groupEndIdx); 3651 if (groupStartIdx < 0) { 3652 // This capture group has not participated in the match thus far, 3653 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match. 3654 break; 3655 } 3656 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3657 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3658 3659 // Note: if the capture group match was of an empty string the backref 3660 // match succeeds. Verified by testing: Perl matches succeed 3661 // in this case, so we do too. 
3662 3663 UBool success = true; 3664 for (;;) { 3665 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3666 success = true; 3667 break; 3668 } 3669 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { 3670 success = false; 3671 fHitEnd = true; 3672 break; 3673 } 3674 UChar32 captureGroupChar = utext_next32(fAltInputText); 3675 UChar32 inputChar = utext_next32(fInputText); 3676 if (inputChar != captureGroupChar) { 3677 success = false; 3678 break; 3679 } 3680 } 3681 3682 if (success) { 3683 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3684 } else { 3685 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3686 } 3687 } 3688 break; 3689 3690 3691 3692 case URX_BACKREF_I: 3693 { 3694 U_ASSERT(opValue < fFrameSize); 3695 int64_t groupStartIdx = fp->fExtra[opValue]; 3696 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3697 U_ASSERT(groupStartIdx <= groupEndIdx); 3698 if (groupStartIdx < 0) { 3699 // This capture group has not participated in the match thus far, 3700 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match. 3701 break; 3702 } 3703 utext_setNativeIndex(fAltInputText, groupStartIdx); 3704 utext_setNativeIndex(fInputText, fp->fInputIdx); 3705 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); 3706 CaseFoldingUTextIterator inputItr(*fInputText); 3707 3708 // Note: if the capture group match was of an empty string the backref 3709 // match succeeds. Verified by testing: Perl matches succeed 3710 // in this case, so we do too. 
3711 3712 UBool success = true; 3713 for (;;) { 3714 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3715 success = true; 3716 break; 3717 } 3718 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { 3719 success = false; 3720 fHitEnd = true; 3721 break; 3722 } 3723 UChar32 captureGroupChar = captureGroupItr.next(); 3724 UChar32 inputChar = inputItr.next(); 3725 if (inputChar != captureGroupChar) { 3726 success = false; 3727 break; 3728 } 3729 } 3730 3731 if (success && inputItr.inExpansion()) { 3732 // We obtained a match by consuming part of a string obtained from 3733 // case-folding a single code point of the input text. 3734 // This does not count as an overall match. 3735 success = false; 3736 } 3737 3738 if (success) { 3739 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3740 } else { 3741 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3742 } 3743 3744 } 3745 break; 3746 3747 case URX_STO_INP_LOC: 3748 { 3749 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3750 fp->fExtra[opValue] = fp->fInputIdx; 3751 } 3752 break; 3753 3754 case URX_JMPX: 3755 { 3756 int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx); 3757 fp->fPatIdx += 1; 3758 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3759 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3760 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3761 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3762 if (savedInputIdx < fp->fInputIdx) { 3763 fp->fPatIdx = opValue; // JMP 3764 } else { 3765 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no progress in loop. 3766 } 3767 } 3768 break; 3769 3770 case URX_LA_START: 3771 { 3772 // Entering a look around block. 3773 // Save Stack Ptr, Input Pos. 
3774 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); 3775 fData[opValue] = fStack->size(); 3776 fData[opValue+1] = fp->fInputIdx; 3777 fData[opValue+2] = fActiveStart; 3778 fData[opValue+3] = fActiveLimit; 3779 fActiveStart = fLookStart; // Set the match region change for 3780 fActiveLimit = fLookLimit; // transparent bounds. 3781 } 3782 break; 3783 3784 case URX_LA_END: 3785 { 3786 // Leaving a look-ahead block. 3787 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3788 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); 3789 int32_t stackSize = fStack->size(); 3790 int32_t newStackSize = static_cast<int32_t>(fData[opValue]); 3791 U_ASSERT(stackSize >= newStackSize); 3792 if (stackSize > newStackSize) { 3793 // Copy the current top frame back to the new (cut back) top frame. 3794 // This makes the capture groups from within the look-ahead 3795 // expression available. 3796 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3797 int32_t j; 3798 for (j=0; j<fFrameSize; j++) { 3799 newFP[j] = reinterpret_cast<int64_t*>(fp)[j]; 3800 } 3801 fp = reinterpret_cast<REStackFrame*>(newFP); 3802 fStack->setSize(newStackSize); 3803 } 3804 fp->fInputIdx = fData[opValue+1]; 3805 3806 // Restore the active region bounds in the input string; they may have 3807 // been changed because of transparent bounds on a Region. 3808 fActiveStart = fData[opValue+2]; 3809 fActiveLimit = fData[opValue+3]; 3810 U_ASSERT(fActiveStart >= 0); 3811 U_ASSERT(fActiveLimit <= fInputLength); 3812 } 3813 break; 3814 3815 case URX_ONECHAR_I: 3816 // Case insensitive one char. The char from the pattern is already case folded. 3817 // Input text is not, but case folding the input can not reduce two or more code 3818 // points to one. 
3819 if (fp->fInputIdx < fActiveLimit) { 3820 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3821 3822 UChar32 c = UTEXT_NEXT32(fInputText); 3823 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3824 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3825 break; 3826 } 3827 } else { 3828 fHitEnd = true; 3829 } 3830 3831 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3832 break; 3833 3834 case URX_STRING_I: 3835 { 3836 // Case-insensitive test input against a literal string. 3837 // Strings require two slots in the compiled pattern, one for the 3838 // offset to the string text, and one for the length. 3839 // The compiled string has already been case folded. 3840 { 3841 const char16_t *patternString = litText + opValue; 3842 int32_t patternStringIdx = 0; 3843 3844 op = static_cast<int32_t>(pat[fp->fPatIdx]); 3845 fp->fPatIdx++; 3846 opType = URX_TYPE(op); 3847 opValue = URX_VAL(op); 3848 U_ASSERT(opType == URX_STRING_LEN); 3849 int32_t patternStringLen = opValue; // Length of the string from the pattern. 3850 3851 3852 UChar32 cPattern; 3853 UChar32 cText; 3854 UBool success = true; 3855 3856 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3857 CaseFoldingUTextIterator inputIterator(*fInputText); 3858 while (patternStringIdx < patternStringLen) { 3859 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 3860 success = false; 3861 fHitEnd = true; 3862 break; 3863 } 3864 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 3865 cText = inputIterator.next(); 3866 if (cText != cPattern) { 3867 success = false; 3868 break; 3869 } 3870 } 3871 if (inputIterator.inExpansion()) { 3872 success = false; 3873 } 3874 3875 if (success) { 3876 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3877 } else { 3878 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3879 } 3880 } 3881 } 3882 break; 3883 3884 case URX_LB_START: 3885 { 3886 // Entering a look-behind block. 
3887 // Save Stack Ptr, Input Pos and active input region. 3888 // TODO: implement transparent bounds. Ticket #6067 3889 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 3890 fData[opValue] = fStack->size(); 3891 fData[opValue+1] = fp->fInputIdx; 3892 // Save input string length, then reset to pin any matches to end at 3893 // the current position. 3894 fData[opValue+2] = fActiveStart; 3895 fData[opValue+3] = fActiveLimit; 3896 fActiveStart = fRegionStart; 3897 fActiveLimit = fp->fInputIdx; 3898 // Init the variable containing the start index for attempted matches. 3899 fData[opValue+4] = -1; 3900 } 3901 break; 3902 3903 3904 case URX_LB_CONT: 3905 { 3906 // Positive Look-Behind, at top of loop checking for matches of LB expression 3907 // at all possible input starting positions. 3908 3909 // Fetch the min and max possible match lengths. They are the operands 3910 // of this op in the pattern. 3911 int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]); 3912 int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]); 3913 if (!UTEXT_USES_U16(fInputText)) { 3914 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16. 3915 // The max length need not be exact; it just needs to be >= actual maximum. 3916 maxML *= 3; 3917 } 3918 U_ASSERT(minML <= maxML); 3919 U_ASSERT(minML >= 0); 3920 3921 // Fetch (from data) the last input index where a match was attempted. 3922 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 3923 int64_t &lbStartIdx = fData[opValue+4]; 3924 if (lbStartIdx < 0) { 3925 // First time through loop. 3926 lbStartIdx = fp->fInputIdx - minML; 3927 if (lbStartIdx > 0) { 3928 // move index to a code point boundary, if it's not on one already. 3929 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 3930 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3931 } 3932 } else { 3933 // 2nd through nth time through the loop. 3934 // Back up start position for match by one. 
3935 if (lbStartIdx == 0) { 3936 (lbStartIdx)--; 3937 } else { 3938 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 3939 (void)UTEXT_PREVIOUS32(fInputText); 3940 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 3941 } 3942 } 3943 3944 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 3945 // We have tried all potential match starting points without 3946 // getting a match. Backtrack out, and out of the 3947 // Look Behind altogether. 3948 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3949 fActiveStart = fData[opValue+2]; 3950 fActiveLimit = fData[opValue+3]; 3951 U_ASSERT(fActiveStart >= 0); 3952 U_ASSERT(fActiveLimit <= fInputLength); 3953 break; 3954 } 3955 3956 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 3957 // (successful match will fall off the end of the loop.) 3958 fp = StateSave(fp, fp->fPatIdx-3, status); 3959 fp->fInputIdx = lbStartIdx; 3960 } 3961 break; 3962 3963 case URX_LB_END: 3964 // End of a look-behind block, after a successful match. 3965 { 3966 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 3967 if (fp->fInputIdx != fActiveLimit) { 3968 // The look-behind expression matched, but the match did not 3969 // extend all the way to the point that we are looking behind from. 3970 // FAIL out of here, which will take us back to the LB_CONT, which 3971 // will retry the match starting at another position or fail 3972 // the look-behind altogether, whichever is appropriate. 3973 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 3974 break; 3975 } 3976 3977 // Look-behind match is good. Restore the original input string region, 3978 // which had been truncated to pin the end of the lookbehind match to the 3979 // position being looked-behind. 
3980 fActiveStart = fData[opValue+2]; 3981 fActiveLimit = fData[opValue+3]; 3982 U_ASSERT(fActiveStart >= 0); 3983 U_ASSERT(fActiveLimit <= fInputLength); 3984 } 3985 break; 3986 3987 3988 case URX_LBN_CONT: 3989 { 3990 // Negative Look-Behind, at top of loop checking for matches of LB expression 3991 // at all possible input starting positions. 3992 3993 // Fetch the extra parameters of this op. 3994 int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]); 3995 int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]); 3996 if (!UTEXT_USES_U16(fInputText)) { 3997 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16. 3998 // The max length need not be exact; it just needs to be >= actual maximum. 3999 maxML *= 3; 4000 } 4001 int32_t continueLoc = static_cast<int32_t>(pat[fp->fPatIdx++]); 4002 continueLoc = URX_VAL(continueLoc); 4003 U_ASSERT(minML <= maxML); 4004 U_ASSERT(minML >= 0); 4005 U_ASSERT(continueLoc > fp->fPatIdx); 4006 4007 // Fetch (from data) the last input index where a match was attempted. 4008 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 4009 int64_t &lbStartIdx = fData[opValue+4]; 4010 if (lbStartIdx < 0) { 4011 // First time through loop. 4012 lbStartIdx = fp->fInputIdx - minML; 4013 if (lbStartIdx > 0) { 4014 // move index to a code point boundary, if it's not on one already. 4015 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 4016 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4017 } 4018 } else { 4019 // 2nd through nth time through the loop. 4020 // Back up start position for match by one. 
4021 if (lbStartIdx == 0) { 4022 (lbStartIdx)--; 4023 } else { 4024 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); 4025 (void)UTEXT_PREVIOUS32(fInputText); 4026 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4027 } 4028 } 4029 4030 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 4031 // We have tried all potential match starting points without 4032 // getting a match, which means that the negative lookbehind as 4033 // a whole has succeeded. Jump forward to the continue location 4034 fActiveStart = fData[opValue+2]; 4035 fActiveLimit = fData[opValue+3]; 4036 U_ASSERT(fActiveStart >= 0); 4037 U_ASSERT(fActiveLimit <= fInputLength); 4038 fp->fPatIdx = continueLoc; 4039 break; 4040 } 4041 4042 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4043 // (successful match will cause a FAIL out of the loop altogether.) 4044 fp = StateSave(fp, fp->fPatIdx-4, status); 4045 fp->fInputIdx = lbStartIdx; 4046 } 4047 break; 4048 4049 case URX_LBN_END: 4050 // End of a negative look-behind block, after a successful match. 4051 { 4052 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 4053 if (fp->fInputIdx != fActiveLimit) { 4054 // The look-behind expression matched, but the match did not 4055 // extend all the way to the point that we are looking behind from. 4056 // FAIL out of here, which will take us back to the LB_CONT, which 4057 // will retry the match starting at another position or succeed 4058 // the look-behind altogether, whichever is appropriate. 4059 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4060 break; 4061 } 4062 4063 // Look-behind expression matched, which means look-behind test as 4064 // a whole Fails 4065 4066 // Restore the original input string length, which had been truncated 4067 // inorder to pin the end of the lookbehind match 4068 // to the position being looked-behind. 
4069 fActiveStart = fData[opValue+2]; 4070 fActiveLimit = fData[opValue+3]; 4071 U_ASSERT(fActiveStart >= 0); 4072 U_ASSERT(fActiveLimit <= fInputLength); 4073 4074 // Restore original stack position, discarding any state saved 4075 // by the successful pattern match. 4076 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4077 int32_t newStackSize = static_cast<int32_t>(fData[opValue]); 4078 U_ASSERT(fStack->size() > newStackSize); 4079 fStack->setSize(newStackSize); 4080 4081 // FAIL, which will take control back to someplace 4082 // prior to entering the look-behind test. 4083 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4084 } 4085 break; 4086 4087 4088 case URX_LOOP_SR_I: 4089 // Loop Initialization for the optimized implementation of 4090 // [some character set]* 4091 // This op scans through all matching input. 4092 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4093 { 4094 U_ASSERT(opValue > 0 && opValue < fSets->size()); 4095 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4096 UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue)); 4097 4098 // Loop through input, until either the input is exhausted or 4099 // we reach a character that is not a member of the set. 4100 int64_t ix = fp->fInputIdx; 4101 UTEXT_SETNATIVEINDEX(fInputText, ix); 4102 for (;;) { 4103 if (ix >= fActiveLimit) { 4104 fHitEnd = true; 4105 break; 4106 } 4107 UChar32 c = UTEXT_NEXT32(fInputText); 4108 if (c<256) { 4109 if (s8->contains(c) == false) { 4110 break; 4111 } 4112 } else { 4113 if (s->contains(c) == false) { 4114 break; 4115 } 4116 } 4117 ix = UTEXT_GETNATIVEINDEX(fInputText); 4118 } 4119 4120 // If there were no matching characters, skip over the loop altogether. 4121 // The loop doesn't run at all, a * op always succeeds. 4122 if (ix == fp->fInputIdx) { 4123 fp->fPatIdx++; // skip the URX_LOOP_C op. 
4124 break; 4125 } 4126 4127 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4128 // must follow. It's operand is the stack location 4129 // that holds the starting input index for the match of this [set]* 4130 int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]); 4131 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4132 int32_t stackLoc = URX_VAL(loopcOp); 4133 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4134 fp->fExtra[stackLoc] = fp->fInputIdx; 4135 fp->fInputIdx = ix; 4136 4137 // Save State to the URX_LOOP_C op that follows this one, 4138 // so that match failures in the following code will return to there. 4139 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4140 fp = StateSave(fp, fp->fPatIdx, status); 4141 fp->fPatIdx++; 4142 } 4143 break; 4144 4145 4146 case URX_LOOP_DOT_I: 4147 // Loop Initialization for the optimized implementation of .* 4148 // This op scans through all remaining input. 4149 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4150 { 4151 // Loop through input until the input is exhausted (we reach an end-of-line) 4152 // In DOTALL mode, we can just go straight to the end of the input. 4153 int64_t ix; 4154 if ((opValue & 1) == 1) { 4155 // Dot-matches-All mode. Jump straight to the end of the string. 4156 ix = fActiveLimit; 4157 fHitEnd = true; 4158 } else { 4159 // NOT DOT ALL mode. Line endings do not match '.' 4160 // Scan forward until a line ending or end of input. 4161 ix = fp->fInputIdx; 4162 UTEXT_SETNATIVEINDEX(fInputText, ix); 4163 for (;;) { 4164 if (ix >= fActiveLimit) { 4165 fHitEnd = true; 4166 break; 4167 } 4168 UChar32 c = UTEXT_NEXT32(fInputText); 4169 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4170 if ((c == 0x0a) || // 0x0a is newline in both modes. 4171 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4172 isLineTerminator(c))) { 4173 // char is a line ending. Exit the scanning loop. 
4174 break; 4175 } 4176 } 4177 ix = UTEXT_GETNATIVEINDEX(fInputText); 4178 } 4179 } 4180 4181 // If there were no matching characters, skip over the loop altogether. 4182 // The loop doesn't run at all, a * op always succeeds. 4183 if (ix == fp->fInputIdx) { 4184 fp->fPatIdx++; // skip the URX_LOOP_C op. 4185 break; 4186 } 4187 4188 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4189 // must follow. It's operand is the stack location 4190 // that holds the starting input index for the match of this .* 4191 int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]); 4192 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4193 int32_t stackLoc = URX_VAL(loopcOp); 4194 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4195 fp->fExtra[stackLoc] = fp->fInputIdx; 4196 fp->fInputIdx = ix; 4197 4198 // Save State to the URX_LOOP_C op that follows this one, 4199 // so that match failures in the following code will return to there. 4200 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4201 fp = StateSave(fp, fp->fPatIdx, status); 4202 fp->fPatIdx++; 4203 } 4204 break; 4205 4206 4207 case URX_LOOP_C: 4208 { 4209 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4210 backSearchIndex = fp->fExtra[opValue]; 4211 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4212 if (backSearchIndex == fp->fInputIdx) { 4213 // We've backed up the input idx to the point that the loop started. 4214 // The loop is done. Leave here without saving state. 4215 // Subsequent failures won't come back here. 4216 break; 4217 } 4218 // Set up for the next iteration of the loop, with input index 4219 // backed up by one from the last time through, 4220 // and a state save to this instruction in case the following code fails again. 4221 // (We're going backwards because this loop emulates stack unwinding, not 4222 // the initial scan forward.) 
4223 U_ASSERT(fp->fInputIdx > 0); 4224 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4225 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4226 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4227 4228 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4229 if (prevC == 0x0a && 4230 fp->fInputIdx > backSearchIndex && 4231 twoPrevC == 0x0d) { 4232 int32_t prevOp = static_cast<int32_t>(pat[fp->fPatIdx - 2]); 4233 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4234 // .*, stepping back over CRLF pair. 4235 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4236 } 4237 } 4238 4239 4240 fp = StateSave(fp, fp->fPatIdx-1, status); 4241 } 4242 break; 4243 4244 4245 4246 default: 4247 // Trouble. The compiled pattern contains an entry with an 4248 // unrecognized type tag. 4249 UPRV_UNREACHABLE_ASSERT; 4250 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have 4251 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. 4252 // See ICU-21669. 4253 status = U_INTERNAL_PROGRAM_ERROR; 4254 } 4255 4256 if (U_FAILURE(status)) { 4257 isMatch = false; 4258 break; 4259 } 4260 } 4261 4262 breakFromLoop: 4263 fMatch = isMatch; 4264 if (isMatch) { 4265 fLastMatchEnd = fMatchEnd; 4266 fMatchStart = startIdx; 4267 fMatchEnd = fp->fInputIdx; 4268 } 4269 4270 #ifdef REGEX_RUN_DEBUG 4271 if (fTraceDebug) { 4272 if (isMatch) { 4273 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 4274 } else { 4275 printf("No match\n\n"); 4276 } 4277 } 4278 #endif 4279 4280 fFrame = fp; // The active stack frame when the engine stopped. 4281 // Contains the capture group results that we need to 4282 // access later. 4283 } 4284 4285 4286 //-------------------------------------------------------------------------------- 4287 // 4288 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4289 // assumption that the entire string is available in the UText's 4290 // chunk buffer. 
For now, that means we can use int32_t indexes, 4291 // except for anything that needs to be saved (like group starts 4292 // and ends). 4293 // 4294 // startIdx: begin matching at this index. 4295 // toEnd: if true, match must extend to end of the input region 4296 // 4297 //-------------------------------------------------------------------------------- 4298 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4299 UBool isMatch = false; // True if the we have a match. 4300 4301 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4302 4303 int32_t op; // Operation from the compiled pattern, split into 4304 int32_t opType; // the opcode 4305 int32_t opValue; // and the operand value. 4306 4307 #ifdef REGEX_RUN_DEBUG 4308 if (fTraceDebug) { 4309 printf("MatchAt(startIdx=%d)\n", startIdx); 4310 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); 4311 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); 4312 } 4313 #endif 4314 4315 if (U_FAILURE(status)) { 4316 return; 4317 } 4318 4319 // Cache frequently referenced items from the compiled pattern 4320 // 4321 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4322 4323 const char16_t *litText = fPattern->fLiteralText.getBuffer(); 4324 UVector *fSets = fPattern->fSets; 4325 4326 const char16_t *inputBuf = fInputText->chunkContents; 4327 4328 fFrameSize = fPattern->fFrameSize; 4329 REStackFrame *fp = resetStack(); 4330 if (U_FAILURE(fDeferredStatus)) { 4331 status = fDeferredStatus; 4332 return; 4333 } 4334 4335 fp->fPatIdx = 0; 4336 fp->fInputIdx = startIdx; 4337 4338 // Zero out the pattern's static data 4339 int32_t i; 4340 for (i = 0; i<fPattern->fDataSize; i++) { 4341 fData[i] = 0; 4342 } 4343 4344 // 4345 // Main loop for interpreting the compiled pattern. 4346 // One iteration of the loop per pattern operation performed.
4347 // 4348 for (;;) { 4349 op = static_cast<int32_t>(pat[fp->fPatIdx]); 4350 opType = URX_TYPE(op); 4351 opValue = URX_VAL(op); 4352 #ifdef REGEX_RUN_DEBUG 4353 if (fTraceDebug) { 4354 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4355 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 4356 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4357 fPattern->dumpOp(fp->fPatIdx); 4358 } 4359 #endif 4360 fp->fPatIdx++; 4361 4362 switch (opType) { 4363 4364 4365 case URX_NOP: 4366 break; 4367 4368 4369 case URX_BACKTRACK: 4370 // Force a backtrack. In some circumstances, the pattern compiler 4371 // will notice that the pattern can't possibly match anything, and will 4372 // emit one of these at that point. 4373 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4374 break; 4375 4376 4377 case URX_ONECHAR: 4378 if (fp->fInputIdx < fActiveLimit) { 4379 UChar32 c; 4380 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4381 if (c == opValue) { 4382 break; 4383 } 4384 } else { 4385 fHitEnd = true; 4386 } 4387 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4388 break; 4389 4390 4391 case URX_STRING: 4392 { 4393 // Test input against a literal string. 4394 // Strings require two slots in the compiled pattern, one for the 4395 // offset to the string text, and one for the length. 
4396 int32_t stringStartIdx = opValue; 4397 int32_t stringLen; 4398 4399 op = static_cast<int32_t>(pat[fp->fPatIdx]); // Fetch the second operand 4400 fp->fPatIdx++; 4401 opType = URX_TYPE(op); 4402 stringLen = URX_VAL(op); 4403 U_ASSERT(opType == URX_STRING_LEN); 4404 U_ASSERT(stringLen >= 2); 4405 4406 const char16_t * pInp = inputBuf + fp->fInputIdx; 4407 const char16_t * pInpLimit = inputBuf + fActiveLimit; 4408 const char16_t * pPat = litText+stringStartIdx; 4409 const char16_t * pEnd = pInp + stringLen; 4410 UBool success = true; 4411 while (pInp < pEnd) { 4412 if (pInp >= pInpLimit) { 4413 fHitEnd = true; 4414 success = false; 4415 break; 4416 } 4417 if (*pInp++ != *pPat++) { 4418 success = false; 4419 break; 4420 } 4421 } 4422 4423 // If the pattern string ends with an unpaired lead surrogate that 4424 // matched the lead surrogate of a valid pair in the input text, 4425 // this does not count as a match. 4426 if (success && U16_IS_LEAD(*(pInp-1)) && 4427 pInp < pInpLimit && U16_IS_TRAIL(*(pInp))) { 4428 success = false; 4429 } 4430 4431 if (success) { 4432 fp->fInputIdx += stringLen; 4433 } else { 4434 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4435 } 4436 } 4437 break; 4438 4439 4440 case URX_STATE_SAVE: 4441 fp = StateSave(fp, opValue, status); 4442 break; 4443 4444 4445 case URX_END: 4446 // The match loop will exit via this path on a successful match, 4447 // when we reach the end of the pattern. 4448 if (toEnd && fp->fInputIdx != fActiveLimit) { 4449 // The pattern matched, but not to the end of input. Try some more. 
4450 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4451 break; 4452 } 4453 isMatch = true; 4454 goto breakFromLoop; 4455 4456 // Start and End Capture stack frame variables are laid out out like this: 4457 // fp->fExtra[opValue] - The start of a completed capture group 4458 // opValue+1 - The end of a completed capture group 4459 // opValue+2 - the start of a capture group whose end 4460 // has not yet been reached (and might not ever be). 4461 case URX_START_CAPTURE: 4462 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4463 fp->fExtra[opValue+2] = fp->fInputIdx; 4464 break; 4465 4466 4467 case URX_END_CAPTURE: 4468 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4469 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4470 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4471 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4472 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4473 break; 4474 4475 4476 case URX_DOLLAR: // $, test for End of line 4477 // or for position before new line at end of input 4478 if (fp->fInputIdx < fAnchorLimit-2) { 4479 // We are no where near the end of input. Fail. 4480 // This is the common case. Keep it first. 4481 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4482 break; 4483 } 4484 if (fp->fInputIdx >= fAnchorLimit) { 4485 // We really are at the end of input. Success. 4486 fHitEnd = true; 4487 fRequireEnd = true; 4488 break; 4489 } 4490 4491 // If we are positioned just before a new-line that is located at the 4492 // end of input, succeed. 4493 if (fp->fInputIdx == fAnchorLimit-1) { 4494 UChar32 c; 4495 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4496 4497 if (isLineTerminator(c)) { 4498 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4499 // At new-line at end of input. 
Success 4500 fHitEnd = true; 4501 fRequireEnd = true; 4502 break; 4503 } 4504 } 4505 } else if (fp->fInputIdx == fAnchorLimit-2 && 4506 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4507 fHitEnd = true; 4508 fRequireEnd = true; 4509 break; // At CR/LF at end of input. Success 4510 } 4511 4512 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4513 4514 break; 4515 4516 4517 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4518 if (fp->fInputIdx >= fAnchorLimit-1) { 4519 // Either at the last character of input, or off the end. 4520 if (fp->fInputIdx == fAnchorLimit-1) { 4521 // At last char of input. Success if it's a new line. 4522 if (inputBuf[fp->fInputIdx] == 0x0a) { 4523 fHitEnd = true; 4524 fRequireEnd = true; 4525 break; 4526 } 4527 } else { 4528 // Off the end of input. Success. 4529 fHitEnd = true; 4530 fRequireEnd = true; 4531 break; 4532 } 4533 } 4534 4535 // Not at end of input. Back-track out. 4536 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4537 break; 4538 4539 4540 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4541 { 4542 if (fp->fInputIdx >= fAnchorLimit) { 4543 // We really are at the end of input. Success. 4544 fHitEnd = true; 4545 fRequireEnd = true; 4546 break; 4547 } 4548 // If we are positioned just before a new-line, succeed. 4549 // It makes no difference where the new-line is within the input. 4550 UChar32 c = inputBuf[fp->fInputIdx]; 4551 if (isLineTerminator(c)) { 4552 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4553 // In multi-line mode, hitting a new-line just before the end of input does not 4554 // set the hitEnd or requireEnd flags 4555 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4556 break; 4557 } 4558 } 4559 // not at a new line. Fail. 
4560 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4561 } 4562 break; 4563 4564 4565 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4566 { 4567 if (fp->fInputIdx >= fAnchorLimit) { 4568 // We really are at the end of input. Success. 4569 fHitEnd = true; 4570 fRequireEnd = true; // Java set requireEnd in this case, even though 4571 break; // adding a new-line would not lose the match. 4572 } 4573 // If we are not positioned just before a new-line, the test fails; backtrack out. 4574 // It makes no difference where the new-line is within the input. 4575 if (inputBuf[fp->fInputIdx] != 0x0a) { 4576 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4577 } 4578 } 4579 break; 4580 4581 4582 case URX_CARET: // ^, test for start of line 4583 if (fp->fInputIdx != fAnchorStart) { 4584 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4585 } 4586 break; 4587 4588 4589 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4590 { 4591 if (fp->fInputIdx == fAnchorStart) { 4592 // We are at the start input. Success. 4593 break; 4594 } 4595 // Check whether character just before the current pos is a new-line 4596 // unless we are at the end of input 4597 char16_t c = inputBuf[fp->fInputIdx - 1]; 4598 if ((fp->fInputIdx < fAnchorLimit) && 4599 isLineTerminator(c)) { 4600 // It's a new-line. ^ is true. Success. 4601 // TODO: what should be done with positions between a CR and LF? 4602 break; 4603 } 4604 // Not at the start of a line. Fail. 4605 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4606 } 4607 break; 4608 4609 4610 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4611 { 4612 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4613 if (fp->fInputIdx <= fAnchorStart) { 4614 // We are at the start input. Success. 
4615 break; 4616 } 4617 // Check whether character just before the current pos is a new-line 4618 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4619 char16_t c = inputBuf[fp->fInputIdx - 1]; 4620 if (c != 0x0a) { 4621 // Not at the start of a line. Back-track out. 4622 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4623 } 4624 } 4625 break; 4626 4627 case URX_BACKSLASH_B: // Test for word boundaries 4628 { 4629 UBool success = isChunkWordBoundary(static_cast<int32_t>(fp->fInputIdx)); 4630 success ^= static_cast<UBool>(opValue != 0); // flip sense for \B 4631 if (!success) { 4632 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4633 } 4634 } 4635 break; 4636 4637 4638 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4639 { 4640 UBool success = isUWordBoundary(fp->fInputIdx, status); 4641 success ^= static_cast<UBool>(opValue != 0); // flip sense for \B 4642 if (!success) { 4643 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4644 } 4645 } 4646 break; 4647 4648 4649 case URX_BACKSLASH_D: // Test for decimal digit 4650 { 4651 if (fp->fInputIdx >= fActiveLimit) { 4652 fHitEnd = true; 4653 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4654 break; 4655 } 4656 4657 UChar32 c; 4658 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4659 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 
4660 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4661 success ^= static_cast<UBool>(opValue != 0); // flip sense for \D 4662 if (!success) { 4663 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4664 } 4665 } 4666 break; 4667 4668 4669 case URX_BACKSLASH_G: // Test for position at end of previous match 4670 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) { 4671 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4672 } 4673 break; 4674 4675 4676 case URX_BACKSLASH_H: // Test for \h, horizontal white space. 4677 { 4678 if (fp->fInputIdx >= fActiveLimit) { 4679 fHitEnd = true; 4680 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4681 break; 4682 } 4683 UChar32 c; 4684 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4685 int8_t ctype = u_charType(c); 4686 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB 4687 success ^= static_cast<UBool>(opValue != 0); // flip sense for \H 4688 if (!success) { 4689 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4690 } 4691 } 4692 break; 4693 4694 4695 case URX_BACKSLASH_R: // Test for \R, any line break sequence. 4696 { 4697 if (fp->fInputIdx >= fActiveLimit) { 4698 fHitEnd = true; 4699 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4700 break; 4701 } 4702 UChar32 c; 4703 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4704 if (isLineTerminator(c)) { 4705 if (c == 0x0d && fp->fInputIdx < fActiveLimit) { 4706 // Check for CR/LF sequence. Consume both together when found. 4707 char16_t c2; 4708 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2); 4709 if (c2 != 0x0a) { 4710 U16_PREV(inputBuf, 0, fp->fInputIdx, c2); 4711 } 4712 } 4713 } else { 4714 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4715 } 4716 } 4717 break; 4718 4719 4720 case URX_BACKSLASH_V: // Any single code point line ending. 
4721 { 4722 if (fp->fInputIdx >= fActiveLimit) { 4723 fHitEnd = true; 4724 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4725 break; 4726 } 4727 UChar32 c; 4728 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4729 UBool success = isLineTerminator(c); 4730 success ^= static_cast<UBool>(opValue != 0); // flip sense for \V 4731 if (!success) { 4732 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4733 } 4734 } 4735 break; 4736 4737 4738 case URX_BACKSLASH_X: 4739 // Match a Grapheme, as defined by Unicode UAX 29. 4740 4741 // Fail if at end of input 4742 if (fp->fInputIdx >= fActiveLimit) { 4743 fHitEnd = true; 4744 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4745 break; 4746 } 4747 4748 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status); 4749 if (fp->fInputIdx >= fActiveLimit) { 4750 fHitEnd = true; 4751 fp->fInputIdx = fActiveLimit; 4752 } 4753 break; 4754 4755 4756 case URX_BACKSLASH_Z: // Test for end of Input 4757 if (fp->fInputIdx < fAnchorLimit) { 4758 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4759 } else { 4760 fHitEnd = true; 4761 fRequireEnd = true; 4762 } 4763 break; 4764 4765 4766 4767 case URX_STATIC_SETREF: 4768 { 4769 // Test input character against one of the predefined sets 4770 // (Word Characters, for example) 4771 // The high bit of the op value is a flag for the match polarity. 4772 // 0: success if input char is in set. 4773 // 1: success if input char is not in set. 
4774 if (fp->fInputIdx >= fActiveLimit) { 4775 fHitEnd = true; 4776 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4777 break; 4778 } 4779 4780 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4781 opValue &= ~URX_NEG_SET; 4782 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4783 4784 UChar32 c; 4785 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4786 if (c < 256) { 4787 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; 4788 if (s8.contains(c)) { 4789 success = !success; 4790 } 4791 } else { 4792 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; 4793 if (s.contains(c)) { 4794 success = !success; 4795 } 4796 } 4797 if (!success) { 4798 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4799 } 4800 } 4801 break; 4802 4803 4804 case URX_STAT_SETREF_N: 4805 { 4806 // Test input character for NOT being a member of one of 4807 // the predefined sets (Word Characters, for example) 4808 if (fp->fInputIdx >= fActiveLimit) { 4809 fHitEnd = true; 4810 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4811 break; 4812 } 4813 4814 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4815 4816 UChar32 c; 4817 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4818 if (c < 256) { 4819 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; 4820 if (s8.contains(c) == false) { 4821 break; 4822 } 4823 } else { 4824 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; 4825 if (s.contains(c) == false) { 4826 break; 4827 } 4828 } 4829 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4830 } 4831 break; 4832 4833 4834 case URX_SETREF: 4835 { 4836 if (fp->fInputIdx >= fActiveLimit) { 4837 fHitEnd = true; 4838 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4839 break; 4840 } 4841 4842 U_ASSERT(opValue > 0 && opValue < fSets->size()); 4843 4844 // There is input left. 
Pick up one char and test it for set membership. 4845 UChar32 c; 4846 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4847 if (c<256) { 4848 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4849 if (s8->contains(c)) { 4850 // The character is in the set. A Match. 4851 break; 4852 } 4853 } else { 4854 UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue)); 4855 if (s->contains(c)) { 4856 // The character is in the set. A Match. 4857 break; 4858 } 4859 } 4860 4861 // the character wasn't in the set. 4862 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4863 } 4864 break; 4865 4866 4867 case URX_DOTANY: 4868 { 4869 // . matches anything, but stops at end-of-line. 4870 if (fp->fInputIdx >= fActiveLimit) { 4871 // At end of input. Match failed. Backtrack out. 4872 fHitEnd = true; 4873 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4874 break; 4875 } 4876 4877 // There is input left. Advance over one char, unless we've hit end-of-line 4878 UChar32 c; 4879 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4880 if (isLineTerminator(c)) { 4881 // End of line in normal mode. . does not match. 4882 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4883 break; 4884 } 4885 } 4886 break; 4887 4888 4889 case URX_DOTANY_ALL: 4890 { 4891 // . in dot-matches-all (including new lines) mode 4892 if (fp->fInputIdx >= fActiveLimit) { 4893 // At end of input. Match failed. Backtrack out. 4894 fHitEnd = true; 4895 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4896 break; 4897 } 4898 4899 // There is input left. Advance over one char, except if we are 4900 // at a cr/lf, advance over both of them. 4901 UChar32 c; 4902 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4903 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 4904 // In the case of a CR/LF, we need to advance over both. 
4905 if (inputBuf[fp->fInputIdx] == 0x0a) { 4906 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 4907 } 4908 } 4909 } 4910 break; 4911 4912 4913 case URX_DOTANY_UNIX: 4914 { 4915 // '.' operator, matches all, but stops at end-of-line. 4916 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 4917 if (fp->fInputIdx >= fActiveLimit) { 4918 // At end of input. Match failed. Backtrack out. 4919 fHitEnd = true; 4920 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4921 break; 4922 } 4923 4924 // There is input left. Advance over one char, unless we've hit end-of-line 4925 UChar32 c; 4926 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4927 if (c == 0x0a) { 4928 // End of line in normal mode. '.' does not match the \n 4929 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4930 } 4931 } 4932 break; 4933 4934 4935 case URX_JMP: 4936 fp->fPatIdx = opValue; 4937 break; 4938 4939 case URX_FAIL: 4940 isMatch = false; 4941 goto breakFromLoop; 4942 4943 case URX_JMP_SAV: 4944 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 4945 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 4946 fp->fPatIdx = opValue; // Then JMP. 4947 break; 4948 4949 case URX_JMP_SAV_X: 4950 // This opcode is used with (x)+, when x can match a zero length string. 4951 // Same as JMP_SAV, except conditional on the match having made forward progress. 4952 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 4953 // data address of the input position at the start of the loop. 
4954 { 4955 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 4956 int32_t stoOp = static_cast<int32_t>(pat[opValue - 1]); 4957 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 4958 int32_t frameLoc = URX_VAL(stoOp); 4959 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 4960 int32_t prevInputIdx = static_cast<int32_t>(fp->fExtra[frameLoc]); 4961 U_ASSERT(prevInputIdx <= fp->fInputIdx); 4962 if (prevInputIdx < fp->fInputIdx) { 4963 // The match did make progress. Repeat the loop. 4964 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 4965 fp->fPatIdx = opValue; 4966 fp->fExtra[frameLoc] = fp->fInputIdx; 4967 } 4968 // If the input position did not advance, we do nothing here, 4969 // execution will fall out of the loop. 4970 } 4971 break; 4972 4973 case URX_CTR_INIT: 4974 { 4975 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 4976 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 4977 4978 // Pick up the three extra operands that CTR_INIT has, and 4979 // skip the pattern location counter past 4980 int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx); 4981 fp->fPatIdx += 3; 4982 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 4983 int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]); 4984 int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]); 4985 U_ASSERT(minCount>=0); 4986 U_ASSERT(maxCount>=minCount || maxCount==-1); 4987 U_ASSERT(loopLoc>=fp->fPatIdx); 4988 4989 if (minCount == 0) { 4990 fp = StateSave(fp, loopLoc+1, status); 4991 } 4992 if (maxCount == -1) { 4993 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 
4994 } else if (maxCount == 0) { 4995 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 4996 } 4997 } 4998 break; 4999 5000 case URX_CTR_LOOP: 5001 { 5002 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5003 int32_t initOp = static_cast<int32_t>(pat[opValue]); 5004 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 5005 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5006 int32_t minCount = static_cast<int32_t>(pat[opValue + 2]); 5007 int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]); 5008 (*pCounter)++; 5009 if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) { 5010 U_ASSERT(*pCounter == maxCount); 5011 break; 5012 } 5013 if (*pCounter >= minCount) { 5014 if (maxCount == -1) { 5015 // Loop has no hard upper bound. 5016 // Check that it is progressing through the input, break if it is not. 5017 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5018 if (fp->fInputIdx == *pLastInputIdx) { 5019 break; 5020 } else { 5021 *pLastInputIdx = fp->fInputIdx; 5022 } 5023 } 5024 fp = StateSave(fp, fp->fPatIdx, status); 5025 } else { 5026 // Increment time-out counter. (StateSave() does it if count >= minCount) 5027 fTickCounter--; 5028 if (fTickCounter <= 0) { 5029 IncrementTime(status); // Re-initializes fTickCounter 5030 } 5031 } 5032 fp->fPatIdx = opValue + 4; // Loop back. 
5033 } 5034 break; 5035 5036 case URX_CTR_INIT_NG: 5037 { 5038 // Initialize a non-greedy loop 5039 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5040 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5041 5042 // Pick up the three extra operands that CTR_INIT_NG has, and 5043 // skip the pattern location counter past 5044 int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx); 5045 fp->fPatIdx += 3; 5046 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5047 int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]); 5048 int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]); 5049 U_ASSERT(minCount>=0); 5050 U_ASSERT(maxCount>=minCount || maxCount==-1); 5051 U_ASSERT(loopLoc>fp->fPatIdx); 5052 if (maxCount == -1) { 5053 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 5054 } 5055 5056 if (minCount == 0) { 5057 if (maxCount != 0) { 5058 fp = StateSave(fp, fp->fPatIdx, status); 5059 } 5060 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5061 } 5062 } 5063 break; 5064 5065 case URX_CTR_LOOP_NG: 5066 { 5067 // Non-greedy {min, max} loops 5068 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5069 int32_t initOp = static_cast<int32_t>(pat[opValue]); 5070 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5071 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5072 int32_t minCount = static_cast<int32_t>(pat[opValue + 2]); 5073 int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]); 5074 5075 (*pCounter)++; 5076 if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) { 5077 // The loop has matched the maximum permitted number of times. 5078 // Break out of here with no action. Matching will 5079 // continue with the following pattern. 5080 U_ASSERT(*pCounter == maxCount); 5081 break; 5082 } 5083 5084 if (*pCounter < minCount) { 5085 // We haven't met the minimum number of matches yet. 5086 // Loop back for another one. 
5087 fp->fPatIdx = opValue + 4; // Loop back. 5088 fTickCounter--; 5089 if (fTickCounter <= 0) { 5090 IncrementTime(status); // Re-initializes fTickCounter 5091 } 5092 } else { 5093 // We do have the minimum number of matches. 5094 5095 // If there is no upper bound on the loop iterations, check that the input index 5096 // is progressing, and stop the loop if it is not. 5097 if (maxCount == -1) { 5098 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5099 if (fp->fInputIdx == *pLastInputIdx) { 5100 break; 5101 } 5102 *pLastInputIdx = fp->fInputIdx; 5103 } 5104 5105 // Loop Continuation: we will fall into the pattern following the loop 5106 // (non-greedy, don't execute loop body first), but first do 5107 // a state save to the top of the loop, so that a match failure 5108 // in the following pattern will try another iteration of the loop. 5109 fp = StateSave(fp, opValue + 4, status); 5110 } 5111 } 5112 break; 5113 5114 case URX_STO_SP: 5115 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5116 fData[opValue] = fStack->size(); 5117 break; 5118 5119 case URX_LD_SP: 5120 { 5121 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5122 int32_t newStackSize = static_cast<int32_t>(fData[opValue]); 5123 U_ASSERT(newStackSize <= fStack->size()); 5124 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5125 if (newFP == reinterpret_cast<int64_t*>(fp)) { 5126 break; 5127 } 5128 int32_t j; 5129 for (j=0; j<fFrameSize; j++) { 5130 newFP[j] = reinterpret_cast<int64_t*>(fp)[j]; 5131 } 5132 fp = reinterpret_cast<REStackFrame*>(newFP); 5133 fStack->setSize(newStackSize); 5134 } 5135 break; 5136 5137 case URX_BACKREF: 5138 { 5139 U_ASSERT(opValue < fFrameSize); 5140 int64_t groupStartIdx = fp->fExtra[opValue]; 5141 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5142 U_ASSERT(groupStartIdx <= groupEndIdx); 5143 int64_t inputIndex = fp->fInputIdx; 5144 if (groupStartIdx < 0) { 5145 // This capture group has not participated in the match thus far, 
5146 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match. 5147 break; 5148 } 5149 UBool success = true; 5150 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { 5151 if (inputIndex >= fActiveLimit) { 5152 success = false; 5153 fHitEnd = true; 5154 break; 5155 } 5156 if (inputBuf[groupIndex] != inputBuf[inputIndex]) { 5157 success = false; 5158 break; 5159 } 5160 } 5161 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) && 5162 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) { 5163 // Capture group ended with an unpaired lead surrogate. 5164 // Back reference is not permitted to match lead only of a surrogatge pair. 5165 success = false; 5166 } 5167 if (success) { 5168 fp->fInputIdx = inputIndex; 5169 } else { 5170 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5171 } 5172 } 5173 break; 5174 5175 case URX_BACKREF_I: 5176 { 5177 U_ASSERT(opValue < fFrameSize); 5178 int64_t groupStartIdx = fp->fExtra[opValue]; 5179 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5180 U_ASSERT(groupStartIdx <= groupEndIdx); 5181 if (groupStartIdx < 0) { 5182 // This capture group has not participated in the match thus far, 5183 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match. 5184 break; 5185 } 5186 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); 5187 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); 5188 5189 // Note: if the capture group match was of an empty string the backref 5190 // match succeeds. Verified by testing: Perl matches succeed 5191 // in this case, so we do too. 
5192 5193 UBool success = true; 5194 for (;;) { 5195 UChar32 captureGroupChar = captureGroupItr.next(); 5196 if (captureGroupChar == U_SENTINEL) { 5197 success = true; 5198 break; 5199 } 5200 UChar32 inputChar = inputItr.next(); 5201 if (inputChar == U_SENTINEL) { 5202 success = false; 5203 fHitEnd = true; 5204 break; 5205 } 5206 if (inputChar != captureGroupChar) { 5207 success = false; 5208 break; 5209 } 5210 } 5211 5212 if (success && inputItr.inExpansion()) { 5213 // We obtained a match by consuming part of a string obtained from 5214 // case-folding a single code point of the input text. 5215 // This does not count as an overall match. 5216 success = false; 5217 } 5218 5219 if (success) { 5220 fp->fInputIdx = inputItr.getIndex(); 5221 } else { 5222 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5223 } 5224 } 5225 break; 5226 5227 case URX_STO_INP_LOC: 5228 { 5229 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5230 fp->fExtra[opValue] = fp->fInputIdx; 5231 } 5232 break; 5233 5234 case URX_JMPX: 5235 { 5236 int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx); 5237 fp->fPatIdx += 1; 5238 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5239 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5240 int32_t savedInputIdx = static_cast<int32_t>(fp->fExtra[dataLoc]); 5241 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5242 if (savedInputIdx < fp->fInputIdx) { 5243 fp->fPatIdx = opValue; // JMP 5244 } else { 5245 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no progress in loop. 5246 } 5247 } 5248 break; 5249 5250 case URX_LA_START: 5251 { 5252 // Entering a look around block. 5253 // Save Stack Ptr, Input Pos. 
5254 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); 5255 fData[opValue] = fStack->size(); 5256 fData[opValue+1] = fp->fInputIdx; 5257 fData[opValue+2] = fActiveStart; 5258 fData[opValue+3] = fActiveLimit; 5259 fActiveStart = fLookStart; // Set the match region change for 5260 fActiveLimit = fLookLimit; // transparent bounds. 5261 } 5262 break; 5263 5264 case URX_LA_END: 5265 { 5266 // Leaving a look around block. 5267 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5268 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize); 5269 int32_t stackSize = fStack->size(); 5270 int32_t newStackSize = static_cast<int32_t>(fData[opValue]); 5271 U_ASSERT(stackSize >= newStackSize); 5272 if (stackSize > newStackSize) { 5273 // Copy the current top frame back to the new (cut back) top frame. 5274 // This makes the capture groups from within the look-ahead 5275 // expression available. 5276 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5277 int32_t j; 5278 for (j=0; j<fFrameSize; j++) { 5279 newFP[j] = reinterpret_cast<int64_t*>(fp)[j]; 5280 } 5281 fp = reinterpret_cast<REStackFrame*>(newFP); 5282 fStack->setSize(newStackSize); 5283 } 5284 fp->fInputIdx = fData[opValue+1]; 5285 5286 // Restore the active region bounds in the input string; they may have 5287 // been changed because of transparent bounds on a Region. 5288 fActiveStart = fData[opValue+2]; 5289 fActiveLimit = fData[opValue+3]; 5290 U_ASSERT(fActiveStart >= 0); 5291 U_ASSERT(fActiveLimit <= fInputLength); 5292 } 5293 break; 5294 5295 case URX_ONECHAR_I: 5296 if (fp->fInputIdx < fActiveLimit) { 5297 UChar32 c; 5298 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5299 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5300 break; 5301 } 5302 } else { 5303 fHitEnd = true; 5304 } 5305 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5306 break; 5307 5308 case URX_STRING_I: 5309 // Case-insensitive test input against a literal string. 
5310 // Strings require two slots in the compiled pattern, one for the 5311 // offset to the string text, and one for the length. 5312 // The compiled string has already been case folded. 5313 { 5314 const char16_t *patternString = litText + opValue; 5315 5316 op = static_cast<int32_t>(pat[fp->fPatIdx]); 5317 fp->fPatIdx++; 5318 opType = URX_TYPE(op); 5319 opValue = URX_VAL(op); 5320 U_ASSERT(opType == URX_STRING_LEN); 5321 int32_t patternStringLen = opValue; // Length of the string from the pattern. 5322 5323 UChar32 cText; 5324 UChar32 cPattern; 5325 UBool success = true; 5326 int32_t patternStringIdx = 0; 5327 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); 5328 while (patternStringIdx < patternStringLen) { 5329 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 5330 cText = inputIterator.next(); 5331 if (cText != cPattern) { 5332 success = false; 5333 if (cText == U_SENTINEL) { 5334 fHitEnd = true; 5335 } 5336 break; 5337 } 5338 } 5339 if (inputIterator.inExpansion()) { 5340 success = false; 5341 } 5342 5343 if (success) { 5344 fp->fInputIdx = inputIterator.getIndex(); 5345 } else { 5346 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5347 } 5348 } 5349 break; 5350 5351 case URX_LB_START: 5352 { 5353 // Entering a look-behind block. 5354 // Save Stack Ptr, Input Pos and active input region. 5355 // TODO: implement transparent bounds. Ticket #6067 5356 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 5357 fData[opValue] = fStack->size(); 5358 fData[opValue+1] = fp->fInputIdx; 5359 // Save input string length, then reset to pin any matches to end at 5360 // the current position. 5361 fData[opValue+2] = fActiveStart; 5362 fData[opValue+3] = fActiveLimit; 5363 fActiveStart = fRegionStart; 5364 fActiveLimit = fp->fInputIdx; 5365 // Init the variable containing the start index for attempted matches. 
5366 fData[opValue+4] = -1; 5367 } 5368 break; 5369 5370 5371 case URX_LB_CONT: 5372 { 5373 // Positive Look-Behind, at top of loop checking for matches of LB expression 5374 // at all possible input starting positions. 5375 5376 // Fetch the min and max possible match lengths. They are the operands 5377 // of this op in the pattern. 5378 int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]); 5379 int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]); 5380 U_ASSERT(minML <= maxML); 5381 U_ASSERT(minML >= 0); 5382 5383 // Fetch (from data) the last input index where a match was attempted. 5384 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 5385 int64_t &lbStartIdx = fData[opValue+4]; 5386 if (lbStartIdx < 0) { 5387 // First time through loop. 5388 lbStartIdx = fp->fInputIdx - minML; 5389 if (lbStartIdx > 0 && lbStartIdx < fInputLength) { 5390 U16_SET_CP_START(inputBuf, 0, lbStartIdx); 5391 } 5392 } else { 5393 // 2nd through nth time through the loop. 5394 // Back up start position for match by one. 5395 if (lbStartIdx == 0) { 5396 lbStartIdx--; 5397 } else { 5398 U16_BACK_1(inputBuf, 0, lbStartIdx); 5399 } 5400 } 5401 5402 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 5403 // We have tried all potential match starting points without 5404 // getting a match. Backtrack out, and out of the 5405 // Look Behind altogether. 5406 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5407 fActiveStart = fData[opValue+2]; 5408 fActiveLimit = fData[opValue+3]; 5409 U_ASSERT(fActiveStart >= 0); 5410 U_ASSERT(fActiveLimit <= fInputLength); 5411 break; 5412 } 5413 5414 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5415 // (successful match will fall off the end of the loop.) 5416 fp = StateSave(fp, fp->fPatIdx-3, status); 5417 fp->fInputIdx = lbStartIdx; 5418 } 5419 break; 5420 5421 case URX_LB_END: 5422 // End of a look-behind block, after a successful match. 
5423 { 5424 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 5425 if (fp->fInputIdx != fActiveLimit) { 5426 // The look-behind expression matched, but the match did not 5427 // extend all the way to the point that we are looking behind from. 5428 // FAIL out of here, which will take us back to the LB_CONT, which 5429 // will retry the match starting at another position or fail 5430 // the look-behind altogether, whichever is appropriate. 5431 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5432 break; 5433 } 5434 5435 // Look-behind match is good. Restore the original input string region, 5436 // which had been truncated to pin the end of the lookbehind match to the 5437 // position being looked-behind. 5438 fActiveStart = fData[opValue+2]; 5439 fActiveLimit = fData[opValue+3]; 5440 U_ASSERT(fActiveStart >= 0); 5441 U_ASSERT(fActiveLimit <= fInputLength); 5442 } 5443 break; 5444 5445 5446 case URX_LBN_CONT: 5447 { 5448 // Negative Look-Behind, at top of loop checking for matches of LB expression 5449 // at all possible input starting positions. 5450 5451 // Fetch the extra parameters of this op. 5452 int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]); 5453 int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]); 5454 int32_t continueLoc = static_cast<int32_t>(pat[fp->fPatIdx++]); 5455 continueLoc = URX_VAL(continueLoc); 5456 U_ASSERT(minML <= maxML); 5457 U_ASSERT(minML >= 0); 5458 U_ASSERT(continueLoc > fp->fPatIdx); 5459 5460 // Fetch (from data) the last input index where a match was attempted. 5461 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 5462 int64_t &lbStartIdx = fData[opValue+4]; 5463 if (lbStartIdx < 0) { 5464 // First time through loop. 5465 lbStartIdx = fp->fInputIdx - minML; 5466 if (lbStartIdx > 0 && lbStartIdx < fInputLength) { 5467 U16_SET_CP_START(inputBuf, 0, lbStartIdx); 5468 } 5469 } else { 5470 // 2nd through nth time through the loop. 5471 // Back up start position for match by one. 
5472 if (lbStartIdx == 0) { 5473 lbStartIdx--; // Because U16_BACK is unsafe starting at 0. 5474 } else { 5475 U16_BACK_1(inputBuf, 0, lbStartIdx); 5476 } 5477 } 5478 5479 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { 5480 // We have tried all potential match starting points without 5481 // getting a match, which means that the negative lookbehind as 5482 // a whole has succeeded. Jump forward to the continue location 5483 fActiveStart = fData[opValue+2]; 5484 fActiveLimit = fData[opValue+3]; 5485 U_ASSERT(fActiveStart >= 0); 5486 U_ASSERT(fActiveLimit <= fInputLength); 5487 fp->fPatIdx = continueLoc; 5488 break; 5489 } 5490 5491 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5492 // (successful match will cause a FAIL out of the loop altogether.) 5493 fp = StateSave(fp, fp->fPatIdx-4, status); 5494 fp->fInputIdx = lbStartIdx; 5495 } 5496 break; 5497 5498 case URX_LBN_END: 5499 // End of a negative look-behind block, after a successful match. 5500 { 5501 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize); 5502 if (fp->fInputIdx != fActiveLimit) { 5503 // The look-behind expression matched, but the match did not 5504 // extend all the way to the point that we are looking behind from. 5505 // FAIL out of here, which will take us back to the LB_CONT, which 5506 // will retry the match starting at another position or succeed 5507 // the look-behind altogether, whichever is appropriate. 5508 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5509 break; 5510 } 5511 5512 // Look-behind expression matched, which means look-behind test as 5513 // a whole Fails 5514 5515 // Restore the original input string length, which had been truncated 5516 // inorder to pin the end of the lookbehind match 5517 // to the position being looked-behind. 
5518 fActiveStart = fData[opValue+2]; 5519 fActiveLimit = fData[opValue+3]; 5520 U_ASSERT(fActiveStart >= 0); 5521 U_ASSERT(fActiveLimit <= fInputLength); 5522 5523 // Restore original stack position, discarding any state saved 5524 // by the successful pattern match. 5525 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5526 int32_t newStackSize = static_cast<int32_t>(fData[opValue]); 5527 U_ASSERT(fStack->size() > newStackSize); 5528 fStack->setSize(newStackSize); 5529 5530 // FAIL, which will take control back to someplace 5531 // prior to entering the look-behind test. 5532 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); 5533 } 5534 break; 5535 5536 5537 case URX_LOOP_SR_I: 5538 // Loop Initialization for the optimized implementation of 5539 // [some character set]* 5540 // This op scans through all matching input. 5541 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5542 { 5543 U_ASSERT(opValue > 0 && opValue < fSets->size()); 5544 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5545 UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue)); 5546 5547 // Loop through input, until either the input is exhausted or 5548 // we reach a character that is not a member of the set. 5549 int32_t ix = static_cast<int32_t>(fp->fInputIdx); 5550 for (;;) { 5551 if (ix >= fActiveLimit) { 5552 fHitEnd = true; 5553 break; 5554 } 5555 UChar32 c; 5556 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5557 if (c<256) { 5558 if (s8->contains(c) == false) { 5559 U16_BACK_1(inputBuf, 0, ix); 5560 break; 5561 } 5562 } else { 5563 if (s->contains(c) == false) { 5564 U16_BACK_1(inputBuf, 0, ix); 5565 break; 5566 } 5567 } 5568 } 5569 5570 // If there were no matching characters, skip over the loop altogether. 5571 // The loop doesn't run at all, a * op always succeeds. 5572 if (ix == fp->fInputIdx) { 5573 fp->fPatIdx++; // skip the URX_LOOP_C op. 
5574 break; 5575 } 5576 5577 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5578 // must follow. It's operand is the stack location 5579 // that holds the starting input index for the match of this [set]* 5580 int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]); 5581 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5582 int32_t stackLoc = URX_VAL(loopcOp); 5583 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5584 fp->fExtra[stackLoc] = fp->fInputIdx; 5585 fp->fInputIdx = ix; 5586 5587 // Save State to the URX_LOOP_C op that follows this one, 5588 // so that match failures in the following code will return to there. 5589 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5590 fp = StateSave(fp, fp->fPatIdx, status); 5591 fp->fPatIdx++; 5592 } 5593 break; 5594 5595 5596 case URX_LOOP_DOT_I: 5597 // Loop Initialization for the optimized implementation of .* 5598 // This op scans through all remaining input. 5599 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5600 { 5601 // Loop through input until the input is exhausted (we reach an end-of-line) 5602 // In DOTALL mode, we can just go straight to the end of the input. 5603 int32_t ix; 5604 if ((opValue & 1) == 1) { 5605 // Dot-matches-All mode. Jump straight to the end of the string. 5606 ix = static_cast<int32_t>(fActiveLimit); 5607 fHitEnd = true; 5608 } else { 5609 // NOT DOT ALL mode. Line endings do not match '.' 5610 // Scan forward until a line ending or end of input. 5611 ix = static_cast<int32_t>(fp->fInputIdx); 5612 for (;;) { 5613 if (ix >= fActiveLimit) { 5614 fHitEnd = true; 5615 break; 5616 } 5617 UChar32 c; 5618 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 5619 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 5620 if ((c == 0x0a) || // 0x0a is newline in both modes. 5621 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5622 isLineTerminator(c))) { 5623 // char is a line ending. 
Put the input pos back to the 5624 // line ending char, and exit the scanning loop. 5625 U16_BACK_1(inputBuf, 0, ix); 5626 break; 5627 } 5628 } 5629 } 5630 } 5631 5632 // If there were no matching characters, skip over the loop altogether. 5633 // The loop doesn't run at all, a * op always succeeds. 5634 if (ix == fp->fInputIdx) { 5635 fp->fPatIdx++; // skip the URX_LOOP_C op. 5636 break; 5637 } 5638 5639 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5640 // must follow. It's operand is the stack location 5641 // that holds the starting input index for the match of this .* 5642 int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]); 5643 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5644 int32_t stackLoc = URX_VAL(loopcOp); 5645 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5646 fp->fExtra[stackLoc] = fp->fInputIdx; 5647 fp->fInputIdx = ix; 5648 5649 // Save State to the URX_LOOP_C op that follows this one, 5650 // so that match failures in the following code will return to there. 5651 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5652 fp = StateSave(fp, fp->fPatIdx, status); 5653 fp->fPatIdx++; 5654 } 5655 break; 5656 5657 5658 case URX_LOOP_C: 5659 { 5660 U_ASSERT(opValue>=0 && opValue<fFrameSize); 5661 backSearchIndex = static_cast<int32_t>(fp->fExtra[opValue]); 5662 U_ASSERT(backSearchIndex <= fp->fInputIdx); 5663 if (backSearchIndex == fp->fInputIdx) { 5664 // We've backed up the input idx to the point that the loop started. 5665 // The loop is done. Leave here without saving state. 5666 // Subsequent failures won't come back here. 5667 break; 5668 } 5669 // Set up for the next iteration of the loop, with input index 5670 // backed up by one from the last time through, 5671 // and a state save to this instruction in case the following code fails again. 5672 // (We're going backwards because this loop emulates stack unwinding, not 5673 // the initial scan forward.) 
5674 U_ASSERT(fp->fInputIdx > 0); 5675 UChar32 prevC; 5676 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 5677 5678 if (prevC == 0x0a && 5679 fp->fInputIdx > backSearchIndex && 5680 inputBuf[fp->fInputIdx-1] == 0x0d) { 5681 int32_t prevOp = static_cast<int32_t>(pat[fp->fPatIdx - 2]); 5682 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 5683 // .*, stepping back over CRLF pair. 5684 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 5685 } 5686 } 5687 5688 5689 fp = StateSave(fp, fp->fPatIdx-1, status); 5690 } 5691 break; 5692 5693 5694 5695 default: 5696 // Trouble. The compiled pattern contains an entry with an 5697 // unrecognized type tag. 5698 UPRV_UNREACHABLE_ASSERT; 5699 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have 5700 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. 5701 // See ICU-21669. 5702 status = U_INTERNAL_PROGRAM_ERROR; 5703 } 5704 5705 if (U_FAILURE(status)) { 5706 isMatch = false; 5707 break; 5708 } 5709 } 5710 5711 breakFromLoop: 5712 fMatch = isMatch; 5713 if (isMatch) { 5714 fLastMatchEnd = fMatchEnd; 5715 fMatchStart = startIdx; 5716 fMatchEnd = fp->fInputIdx; 5717 } 5718 5719 #ifdef REGEX_RUN_DEBUG 5720 if (fTraceDebug) { 5721 if (isMatch) { 5722 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); 5723 } else { 5724 printf("No match\n\n"); 5725 } 5726 } 5727 #endif 5728 5729 fFrame = fp; // The active stack frame when the engine stopped. 5730 // Contains the capture group results that we need to 5731 // access later. 5732 } 5733 5734 5735 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5736 5737 U_NAMESPACE_END 5738 5739 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS