tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rematch.cpp (224707B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **************************************************************************
      5 *   Copyright (C) 2002-2016 International Business Machines Corporation
      6 *   and others. All rights reserved.
      7 **************************************************************************
      8 */
      9 //
     10 //  file:  rematch.cpp
     11 //
     12 //         Contains the implementation of class RegexMatcher,
     13 //         which is one of the main API classes for the ICU regular expression package.
     14 //
     15 
     16 #include "unicode/utypes.h"
     17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     18 
     19 #include "unicode/regex.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/ustring.h"
     23 #include "unicode/rbbi.h"
     24 #include "unicode/utf.h"
     25 #include "unicode/utf16.h"
     26 #include "uassert.h"
     27 #include "cmemory.h"
     28 #include "cstr.h"
     29 #include "uvector.h"
     30 #include "uvectr32.h"
     31 #include "uvectr64.h"
     32 #include "regeximp.h"
     33 #include "regexst.h"
     34 #include "regextxt.h"
     35 #include "ucase.h"
     36 
     37 // #include <malloc.h>        // Needed for heapcheck testing
     38 
     39 
     40 U_NAMESPACE_BEGIN
     41 
     42 // Default limit for the size of the back track stack, to avoid system
     43 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
     44 // This value puts ICU's limits higher than most other regexp implementations,
     45 //    which use recursion rather than the heap, and take more storage per
     46 //    backtrack point.
     47 //
     48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
     49 
     50 // Time limit counter constant.
     51 //   Time limits for expression evaluation are in terms of quanta of work by
     52 //   the engine, each of which is 10,000 state saves.
     53 //   This constant determines that state saves per tick number.
     54 static const int32_t TIMER_INITIAL_VALUE = 10000;
     55 
     56 
     57 // Test for any of the Unicode line terminating characters.
     58 static inline UBool isLineTerminator(UChar32 c) {
     59    if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
     60        return false;
     61    }
     62    return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
     63 }
     64 
     65 //-----------------------------------------------------------------------------
     66 //
     67 //   Constructor and Destructor
     68 //
     69 //-----------------------------------------------------------------------------
     70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
     71    fDeferredStatus = U_ZERO_ERROR;
     72    init(fDeferredStatus);
     73    if (U_FAILURE(fDeferredStatus)) {
     74        return;
     75    }
     76    if (pat==nullptr) {
     77        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
     78        return;
     79    }
     80    fPattern = pat;
     81    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
     82 }
     83 
     84 
     85 
     86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
     87                           uint32_t flags, UErrorCode &status) {
     88    init(status);
     89    if (U_FAILURE(status)) {
     90        return;
     91    }
     92    UParseError    pe;
     93    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
     94    fPattern           = fPatternOwned;
     95 
     96    UText inputText = UTEXT_INITIALIZER;
     97    utext_openConstUnicodeString(&inputText, &input, &status);
     98    init2(&inputText, status);
     99    utext_close(&inputText);
    100 
    101    fInputUniStrMaybeMutable = true;
    102 }
    103 
    104 
    105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
    106                           uint32_t flags, UErrorCode &status) {
    107    init(status);
    108    if (U_FAILURE(status)) {
    109        return;
    110    }
    111    UParseError    pe;
    112    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    113    if (U_FAILURE(status)) {
    114        return;
    115    }
    116 
    117    fPattern           = fPatternOwned;
    118    init2(input, status);
    119 }
    120 
    121 
    122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
    123                           uint32_t flags, UErrorCode &status) {
    124    init(status);
    125    if (U_FAILURE(status)) {
    126        return;
    127    }
    128    UParseError    pe;
    129    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    130    if (U_FAILURE(status)) {
    131        return;
    132    }
    133    fPattern           = fPatternOwned;
    134    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    135 }
    136 
    137 RegexMatcher::RegexMatcher(UText *regexp,
    138                           uint32_t flags, UErrorCode &status) {
    139    init(status);
    140    if (U_FAILURE(status)) {
    141        return;
    142    }
    143    UParseError    pe;
    144    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
    145        if (U_FAILURE(status)) {
    146        return;
    147    }
    148 
    149    fPattern           = fPatternOwned;
    150    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
    151 }
    152 
    153 
    154 
    155 
    156 RegexMatcher::~RegexMatcher() {
    157    delete fStack;
    158    if (fData != fSmallData) {
    159        uprv_free(fData);
    160        fData = nullptr;
    161    }
    162    if (fPatternOwned) {
    163        delete fPatternOwned;
    164        fPatternOwned = nullptr;
    165        fPattern = nullptr;
    166    }
    167 
    168    delete fInput;
    169    if (fInputText) {
    170        utext_close(fInputText);
    171    }
    172    if (fAltInputText) {
    173        utext_close(fAltInputText);
    174    }
    175 
    176    #if UCONFIG_NO_BREAK_ITERATION==0
    177    delete fWordBreakItr;
    178    delete fGCBreakItr;
    179    #endif
    180 }
    181 
    182 //
    183 //   init()   common initialization for use by all constructors.
    184 //            Initialize all fields, get the object into a consistent state.
    185 //            This must be done even when the initial status shows an error,
    186 //            so that the object is initialized sufficiently well for the destructor
    187 //            to run safely.
    188 //
    189 void RegexMatcher::init(UErrorCode &status) {
    190    fPattern           = nullptr;
    191    fPatternOwned      = nullptr;
    192    fFrameSize         = 0;
    193    fRegionStart       = 0;
    194    fRegionLimit       = 0;
    195    fAnchorStart       = 0;
    196    fAnchorLimit       = 0;
    197    fLookStart         = 0;
    198    fLookLimit         = 0;
    199    fActiveStart       = 0;
    200    fActiveLimit       = 0;
    201    fTransparentBounds = false;
    202    fAnchoringBounds   = true;
    203    fMatch             = false;
    204    fMatchStart        = 0;
    205    fMatchEnd          = 0;
    206    fLastMatchEnd      = -1;
    207    fAppendPosition    = 0;
    208    fHitEnd            = false;
    209    fRequireEnd        = false;
    210    fStack             = nullptr;
    211    fFrame             = nullptr;
    212    fTimeLimit         = 0;
    213    fTime              = 0;
    214    fTickCounter       = 0;
    215    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    216    fCallbackFn        = nullptr;
    217    fCallbackContext   = nullptr;
    218    fFindProgressCallbackFn      = nullptr;
    219    fFindProgressCallbackContext = nullptr;
    220    fTraceDebug        = false;
    221    fDeferredStatus    = status;
    222    fData              = fSmallData;
    223    fWordBreakItr      = nullptr;
    224    fGCBreakItr        = nullptr;
    225 
    226    fStack             = nullptr;
    227    fInputText         = nullptr;
    228    fAltInputText      = nullptr;
    229    fInput             = nullptr;
    230    fInputLength       = 0;
    231    fInputUniStrMaybeMutable = false;
    232 }
    233 
    234 //
    235 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
    236 //            This handles the common setup to be done after the Pattern is available.
    237 //
    238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
    239    if (U_FAILURE(status)) {
    240        fDeferredStatus = status;
    241        return;
    242    }
    243 
    244    if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
    245        fData = static_cast<int64_t*>(uprv_malloc(fPattern->fDataSize * sizeof(int64_t)));
    246        if (fData == nullptr) {
    247            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    248            return;
    249        }
    250    }
    251 
    252    fStack = new UVector64(status);
    253    if (fStack == nullptr) {
    254        status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    255        return;
    256    }
    257 
    258    reset(input);
    259    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    260    if (U_FAILURE(status)) {
    261        fDeferredStatus = status;
    262        return;
    263    }
    264 }
    265 
    266 
    267 static const char16_t BACKSLASH  = 0x5c;
    268 static const char16_t DOLLARSIGN = 0x24;
    269 static const char16_t LEFTBRACKET = 0x7b;
    270 static const char16_t RIGHTBRACKET = 0x7d;
    271 
    272 //--------------------------------------------------------------------------------
    273 //
    274 //    appendReplacement
    275 //
    276 //--------------------------------------------------------------------------------
    277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
    278                                              const UnicodeString &replacement,
    279                                              UErrorCode &status) {
    280    UText replacementText = UTEXT_INITIALIZER;
    281 
    282    utext_openConstUnicodeString(&replacementText, &replacement, &status);
    283    if (U_SUCCESS(status)) {
    284        UText resultText = UTEXT_INITIALIZER;
    285        utext_openUnicodeString(&resultText, &dest, &status);
    286 
    287        if (U_SUCCESS(status)) {
    288            appendReplacement(&resultText, &replacementText, status);
    289            utext_close(&resultText);
    290        }
    291        utext_close(&replacementText);
    292    }
    293 
    294    return *this;
    295 }
    296 
    297 //
    298 //    appendReplacement, UText mode
    299 //
    300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
    301                                              UText *replacement,
    302                                              UErrorCode &status) {
    303    if (U_FAILURE(status)) {
    304        return *this;
    305    }
    306    if (U_FAILURE(fDeferredStatus)) {
    307        status = fDeferredStatus;
    308        return *this;
    309    }
    310    if (fMatch == false) {
    311        status = U_REGEX_INVALID_STATE;
    312        return *this;
    313    }
    314 
    315    // Copy input string from the end of previous match to start of current match
    316    int64_t  destLen = utext_nativeLength(dest);
    317    if (fMatchStart > fAppendPosition) {
    318        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    319            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    320                                     static_cast<int32_t>(fMatchStart - fAppendPosition), &status);
    321        } else {
    322            int32_t len16;
    323            if (UTEXT_USES_U16(fInputText)) {
    324                len16 = static_cast<int32_t>(fMatchStart - fAppendPosition);
    325            } else {
    326                UErrorCode lengthStatus = U_ZERO_ERROR;
    327                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, nullptr, 0, &lengthStatus);
    328            }
    329            char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16 + 1)));
    330            if (inputChars == nullptr) {
    331                status = U_MEMORY_ALLOCATION_ERROR;
    332                return *this;
    333            }
    334            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
    335            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    336            uprv_free(inputChars);
    337        }
    338    }
    339    fAppendPosition = fMatchEnd;
    340 
    341 
    342    // scan the replacement text, looking for substitutions ($n) and \escapes.
    343    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
    344    //         move entire ranges not containing substitutions.
    345    UTEXT_SETNATIVEINDEX(replacement, 0);
    346    for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
    347        if (c == BACKSLASH) {
    348            // Backslash Escape.  Copy the following char out without further checks.
    349            //                    Note:  Surrogate pairs don't need any special handling
    350            //                           The second half wont be a '$' or a '\', and
    351            //                           will move to the dest normally on the next
    352            //                           loop iteration.
    353            c = UTEXT_CURRENT32(replacement);
    354            if (c == U_SENTINEL) {
    355                break;
    356            }
    357 
    358            if (c==0x55/*U*/ || c==0x75/*u*/) {
    359                // We have a \udddd or \Udddddddd escape sequence.
    360                int32_t offset = 0;
    361                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
    362                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
    363                if (escapedChar != static_cast<UChar32>(0xFFFFFFFF)) {
    364                    if (U_IS_BMP(escapedChar)) {
    365                        char16_t c16 = static_cast<char16_t>(escapedChar);
    366                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    367                    } else {
    368                        char16_t surrogate[2];
    369                        surrogate[0] = U16_LEAD(escapedChar);
    370                        surrogate[1] = U16_TRAIL(escapedChar);
    371                        if (U_SUCCESS(status)) {
    372                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    373                        }
    374                    }
    375                    // TODO:  Report errors for mal-formed \u escapes?
    376                    //        As this is, the original sequence is output, which may be OK.
    377                    if (context.lastOffset == offset) {
    378                        (void)UTEXT_PREVIOUS32(replacement);
    379                    } else if (context.lastOffset != offset-1) {
    380                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
    381                    }
    382                }
    383            } else {
    384                (void)UTEXT_NEXT32(replacement);
    385                // Plain backslash escape.  Just put out the escaped character.
    386                if (U_IS_BMP(c)) {
    387                    char16_t c16 = static_cast<char16_t>(c);
    388                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    389                } else {
    390                    char16_t surrogate[2];
    391                    surrogate[0] = U16_LEAD(c);
    392                    surrogate[1] = U16_TRAIL(c);
    393                    if (U_SUCCESS(status)) {
    394                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    395                    }
    396                }
    397            }
    398        } else if (c != DOLLARSIGN) {
    399            // Normal char, not a $.  Copy it out without further checks.
    400            if (U_IS_BMP(c)) {
    401                char16_t c16 = static_cast<char16_t>(c);
    402                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
    403            } else {
    404                char16_t surrogate[2];
    405                surrogate[0] = U16_LEAD(c);
    406                surrogate[1] = U16_TRAIL(c);
    407                if (U_SUCCESS(status)) {
    408                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
    409                }
    410            }
    411        } else {
    412            // We've got a $.  Pick up a capture group name or number if one follows.
    413            // Consume digits so long as the resulting group number <= the number of
    414            // number of capture groups in the pattern.
    415 
    416            int32_t groupNum  = 0;
    417            int32_t numDigits = 0;
    418            UChar32 nextChar = utext_current32(replacement);
    419            if (nextChar == LEFTBRACKET) {
    420                // Scan for a Named Capture Group, ${name}.
    421                UnicodeString groupName;
    422                utext_next32(replacement);
    423                while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
    424                    nextChar = utext_next32(replacement);
    425                    if (nextChar == U_SENTINEL) {
    426                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    427                    } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
    428                               (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
    429                               (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
    430                        groupName.append(nextChar);
    431                    } else if (nextChar == RIGHTBRACKET) {
    432                        groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
    433                        if (groupNum == 0) {
    434                            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    435                        }
    436                    } else {
    437                        // Character was something other than a name char or a closing '}'
    438                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    439                    }
    440                }
    441 
    442            } else if (u_isdigit(nextChar)) {
    443                // $n    Scan for a capture group number
    444                int32_t numCaptureGroups = fPattern->fGroupMap->size();
    445                for (;;) {
    446                    nextChar = UTEXT_CURRENT32(replacement);
    447                    if (nextChar == U_SENTINEL) {
    448                        break;
    449                    }
    450                    if (u_isdigit(nextChar) == false) {
    451                        break;
    452                    }
    453                    int32_t nextDigitVal = u_charDigitValue(nextChar);
    454                    if (groupNum*10 + nextDigitVal > numCaptureGroups) {
    455                        // Don't consume the next digit if it makes the capture group number too big.
    456                        if (numDigits == 0) {
    457                            status = U_INDEX_OUTOFBOUNDS_ERROR;
    458                        }
    459                        break;
    460                    }
    461                    (void)UTEXT_NEXT32(replacement);
    462                    groupNum=groupNum*10 + nextDigitVal;
    463                    ++numDigits;
    464                }
    465            } else {
    466                // $ not followed by capture group name or number.
    467                status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    468            }
    469 
    470            if (U_SUCCESS(status)) {
    471                destLen += appendGroup(groupNum, dest, status);
    472            }
    473        }  // End of $ capture group handling
    474    }  // End of per-character loop through the replacement string.
    475 
    476    return *this;
    477 }
    478 
    479 
    480 
    481 //--------------------------------------------------------------------------------
    482 //
    483 //    appendTail     Intended to be used in conjunction with appendReplacement()
    484 //                   To the destination string, append everything following
    485 //                   the last match position from the input string.
    486 //
    487 //                   Note:  Match ranges do not affect appendTail or appendReplacement
    488 //
    489 //--------------------------------------------------------------------------------
    490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    491    UErrorCode status = U_ZERO_ERROR;
    492    UText resultText = UTEXT_INITIALIZER;
    493    utext_openUnicodeString(&resultText, &dest, &status);
    494 
    495    if (U_SUCCESS(status)) {
    496        appendTail(&resultText, status);
    497        utext_close(&resultText);
    498    }
    499 
    500    return dest;
    501 }
    502 
    503 //
    504 //   appendTail, UText mode
    505 //
    506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    507    if (U_FAILURE(status)) {
    508        return dest;
    509    }
    510    if (U_FAILURE(fDeferredStatus)) {
    511        status = fDeferredStatus;
    512        return dest;
    513    }
    514 
    515    if (fInputLength > fAppendPosition) {
    516        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    517            int64_t destLen = utext_nativeLength(dest);
    518            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
    519                          static_cast<int32_t>(fInputLength - fAppendPosition), &status);
    520        } else {
    521            int32_t len16;
    522            if (UTEXT_USES_U16(fInputText)) {
    523                len16 = static_cast<int32_t>(fInputLength - fAppendPosition);
    524            } else {
    525                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, nullptr, 0, &status);
    526                status = U_ZERO_ERROR; // buffer overflow
    527            }
    528 
    529            char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16)));
    530            if (inputChars == nullptr) {
    531                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    532            } else {
    533                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
    534                int64_t destLen = utext_nativeLength(dest);
    535                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
    536                uprv_free(inputChars);
    537            }
    538        }
    539    }
    540    return dest;
    541 }
    542 
    543 
    544 
    545 //--------------------------------------------------------------------------------
    546 //
    547 //   end
    548 //
    549 //--------------------------------------------------------------------------------
    550 int32_t RegexMatcher::end(UErrorCode &err) const {
    551    return end(0, err);
    552 }
    553 
    554 int64_t RegexMatcher::end64(UErrorCode &err) const {
    555    return end64(0, err);
    556 }
    557 
    558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    559    if (U_FAILURE(err)) {
    560        return -1;
    561    }
    562    if (fMatch == false) {
    563        err = U_REGEX_INVALID_STATE;
    564        return -1;
    565    }
    566    if (group < 0 || group > fPattern->fGroupMap->size()) {
    567        err = U_INDEX_OUTOFBOUNDS_ERROR;
    568        return -1;
    569    }
    570    int64_t e = -1;
    571    if (group == 0) {
    572        e = fMatchEnd;
    573    } else {
    574        // Get the position within the stack frame of the variables for
    575        //    this capture group.
    576        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
    577        U_ASSERT(groupOffset < fPattern->fFrameSize);
    578        U_ASSERT(groupOffset >= 0);
    579        e = fFrame->fExtra[groupOffset + 1];
    580    }
    581 
    582        return e;
    583 }
    584 
    585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    586    return static_cast<int32_t>(end64(group, err));
    587 }
    588 
    589 //--------------------------------------------------------------------------------
    590 //
    591 //   findProgressInterrupt  This function is called once for each advance in the target
    592 //                          string from the find() function, and calls the user progress callback
    593 //                          function if there is one installed.
    594 //
    595 //         Return:  true if the find operation is to be terminated.
    596 //                  false if the find operation is to continue running.
    597 //
    598 //--------------------------------------------------------------------------------
    599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
    600    if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
    601        status = U_REGEX_STOPPED_BY_CALLER;
    602        return true;
    603    }
    604    return false;
    605 }
    606 
    607 //--------------------------------------------------------------------------------
    608 //
    609 //   find()
    610 //
    611 //--------------------------------------------------------------------------------
    612 UBool RegexMatcher::find() {
    613    if (U_FAILURE(fDeferredStatus)) {
    614        return false;
    615    }
    616    UErrorCode status = U_ZERO_ERROR;
    617    UBool result = find(status);
    618    return result;
    619 }
    620 
    621 //--------------------------------------------------------------------------------
    622 //
    623 //   find()
    624 //
    625 //--------------------------------------------------------------------------------
    626 UBool RegexMatcher::find(UErrorCode &status) {
    627    // Start at the position of the last match end.  (Will be zero if the
    628    //   matcher has been reset.)
    629    //
    630    if (U_FAILURE(status)) {
    631        return false;
    632    }
    633    if (U_FAILURE(fDeferredStatus)) {
    634        status = fDeferredStatus;
    635        return false;
    636    }
    637 
    638    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
    639        return findUsingChunk(status);
    640    }
    641 
    642    int64_t startPos = fMatchEnd;
    643    if (startPos==0) {
    644        startPos = fActiveStart;
    645    }
    646 
    647    if (fMatch) {
    648        // Save the position of any previous successful match.
    649        fLastMatchEnd = fMatchEnd;
    650 
    651        if (fMatchStart == fMatchEnd) {
    652            // Previous match had zero length.  Move start position up one position
    653            //  to avoid sending find() into a loop on zero-length matches.
    654            if (startPos >= fActiveLimit) {
    655                fMatch = false;
    656                fHitEnd = true;
    657                return false;
    658            }
    659            UTEXT_SETNATIVEINDEX(fInputText, startPos);
    660            (void)UTEXT_NEXT32(fInputText);
    661            startPos = UTEXT_GETNATIVEINDEX(fInputText);
    662        }
    663    } else {
    664        if (fLastMatchEnd >= 0) {
    665            // A previous find() failed to match.  Don't try again.
    666            //   (without this test, a pattern with a zero-length match
    667            //    could match again at the end of an input string.)
    668            fHitEnd = true;
    669            return false;
    670        }
    671    }
    672 
    673 
    674    // Compute the position in the input string beyond which a match can not begin, because
    675    //   the minimum length match would extend past the end of the input.
    676    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    677    //          Be aware of possible overflows if making changes here.
    678    int64_t testStartLimit;
    679    if (UTEXT_USES_U16(fInputText)) {
    680        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
    681        if (startPos > testStartLimit) {
    682            fMatch = false;
    683            fHitEnd = true;
    684            return false;
    685        }
    686    } else {
    687        // We don't know exactly how long the minimum match length is in native characters.
    688        // Treat anything > 0 as 1.
    689        testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
    690    }
    691 
    692    UChar32  c;
    693    U_ASSERT(startPos >= 0);
    694 
    695    switch (fPattern->fStartType) {
    696    case START_NO_INFO:
    697        // No optimization was found.
    698        //  Try a match at each input position.
    699        for (;;) {
    700            MatchAt(startPos, false, status);
    701            if (U_FAILURE(status)) {
    702                return false;
    703            }
    704            if (fMatch) {
    705                return true;
    706            }
    707            if (startPos >= testStartLimit) {
    708                fHitEnd = true;
    709                return false;
    710            }
    711            UTEXT_SETNATIVEINDEX(fInputText, startPos);
    712            (void)UTEXT_NEXT32(fInputText);
    713            startPos = UTEXT_GETNATIVEINDEX(fInputText);
    714            // Note that it's perfectly OK for a pattern to have a zero-length
    715            //   match at the end of a string, so we must make sure that the loop
    716            //   runs with startPos == testStartLimit the last time through.
    717            if  (findProgressInterrupt(startPos, status))
    718                return false;
    719        }
    720        UPRV_UNREACHABLE_EXIT;
    721 
    722    case START_START:
    723        // Matches are only possible at the start of the input string
    724        //   (pattern begins with ^ or \A)
    725        if (startPos > fActiveStart) {
    726            fMatch = false;
    727            return false;
    728        }
    729        MatchAt(startPos, false, status);
    730        if (U_FAILURE(status)) {
    731            return false;
    732        }
    733        return fMatch;
    734 
    735 
    736    case START_SET:
    737        {
    738            // Match may start on any char from a pre-computed set.
    739            U_ASSERT(fPattern->fMinMatchLen > 0);
    740            UTEXT_SETNATIVEINDEX(fInputText, startPos);
    741            for (;;) {
    742                int64_t pos = startPos;
    743                c = UTEXT_NEXT32(fInputText);
    744                startPos = UTEXT_GETNATIVEINDEX(fInputText);
    745                // c will be -1 (U_SENTINEL) at end of text, in which case we
    746                // skip this next block (so we don't have a negative array index)
    747                // and handle end of text in the following block.
    748                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
    749                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
    750                    MatchAt(pos, false, status);
    751                    if (U_FAILURE(status)) {
    752                        return false;
    753                    }
    754                    if (fMatch) {
    755                        return true;
    756                    }
    757                    UTEXT_SETNATIVEINDEX(fInputText, pos);
    758                }
    759                if (startPos > testStartLimit) {
    760                    fMatch = false;
    761                    fHitEnd = true;
    762                    return false;
    763                }
    764                if  (findProgressInterrupt(startPos, status))
    765                    return false;
    766            }
    767        }
    768        UPRV_UNREACHABLE_EXIT;
    769 
    770    case START_STRING:
    771    case START_CHAR:
    772        {
    773            // Match starts on exactly one char.
    774            U_ASSERT(fPattern->fMinMatchLen > 0);
    775            UChar32 theChar = fPattern->fInitialChar;
    776            UTEXT_SETNATIVEINDEX(fInputText, startPos);
    777            for (;;) {
    778                int64_t pos = startPos;
    779                c = UTEXT_NEXT32(fInputText);
    780                startPos = UTEXT_GETNATIVEINDEX(fInputText);
    781                if (c == theChar) {
    782                    MatchAt(pos, false, status);
    783                    if (U_FAILURE(status)) {
    784                        return false;
    785                    }
    786                    if (fMatch) {
    787                        return true;
    788                    }
    789                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
    790                }
    791                if (startPos > testStartLimit) {
    792                    fMatch = false;
    793                    fHitEnd = true;
    794                    return false;
    795                }
    796                if  (findProgressInterrupt(startPos, status))
    797                    return false;
    798           }
    799        }
    800        UPRV_UNREACHABLE_EXIT;
    801 
    802    case START_LINE:
    803        {
    804            UChar32 ch;
    805            if (startPos == fAnchorStart) {
    806                MatchAt(startPos, false, status);
    807                if (U_FAILURE(status)) {
    808                    return false;
    809                }
    810                if (fMatch) {
    811                    return true;
    812                }
    813                UTEXT_SETNATIVEINDEX(fInputText, startPos);
    814                ch = UTEXT_NEXT32(fInputText);
    815                startPos = UTEXT_GETNATIVEINDEX(fInputText);
    816            } else {
    817                UTEXT_SETNATIVEINDEX(fInputText, startPos);
    818                ch = UTEXT_PREVIOUS32(fInputText);
    819                UTEXT_SETNATIVEINDEX(fInputText, startPos);
    820            }
    821 
    822            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
    823                for (;;) {
    824                    if (ch == 0x0a) {
    825                            MatchAt(startPos, false, status);
    826                            if (U_FAILURE(status)) {
    827                                return false;
    828                            }
    829                            if (fMatch) {
    830                                return true;
    831                            }
    832                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
    833                    }
    834                    if (startPos >= testStartLimit) {
    835                        fMatch = false;
    836                        fHitEnd = true;
    837                        return false;
    838                    }
    839                    ch = UTEXT_NEXT32(fInputText);
    840                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
    841                    // Note that it's perfectly OK for a pattern to have a zero-length
    842                    //   match at the end of a string, so we must make sure that the loop
    843                    //   runs with startPos == testStartLimit the last time through.
    844                    if  (findProgressInterrupt(startPos, status))
    845                        return false;
    846                }
    847            } else {
    848                for (;;) {
    849                    if (isLineTerminator(ch)) {
    850                        if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
    851                            (void)UTEXT_NEXT32(fInputText);
    852                            startPos = UTEXT_GETNATIVEINDEX(fInputText);
    853                        }
    854                        MatchAt(startPos, false, status);
    855                        if (U_FAILURE(status)) {
    856                            return false;
    857                        }
    858                        if (fMatch) {
    859                            return true;
    860                        }
    861                        UTEXT_SETNATIVEINDEX(fInputText, startPos);
    862                    }
    863                    if (startPos >= testStartLimit) {
    864                        fMatch = false;
    865                        fHitEnd = true;
    866                        return false;
    867                    }
    868                    ch = UTEXT_NEXT32(fInputText);
    869                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
    870                    // Note that it's perfectly OK for a pattern to have a zero-length
    871                    //   match at the end of a string, so we must make sure that the loop
    872                    //   runs with startPos == testStartLimit the last time through.
    873                    if  (findProgressInterrupt(startPos, status))
    874                        return false;
    875                }
    876            }
    877        }
    878 
    879    default:
    880        UPRV_UNREACHABLE_ASSERT;
    881        // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
    882        // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
    883        // See ICU-21669.
    884        status = U_INTERNAL_PROGRAM_ERROR;
    885        return false;
    886    }
    887 
    888    UPRV_UNREACHABLE_EXIT;
    889 }
    890 
    891 
    892 
    893 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    894    if (U_FAILURE(status)) {
    895        return false;
    896    }
    897    if (U_FAILURE(fDeferredStatus)) {
    898        status = fDeferredStatus;
    899        return false;
    900    }
    901    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
    902                                          //        This will reset the region to be the full input length.
    903    if (start < 0) {
    904        status = U_INDEX_OUTOFBOUNDS_ERROR;
    905        return false;
    906    }
    907 
    908    int64_t nativeStart = start;
    909    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
    910        status = U_INDEX_OUTOFBOUNDS_ERROR;
    911        return false;
    912    }
    913    fMatchEnd = nativeStart;
    914    return find(status);
    915 }
    916 
    917 
//--------------------------------------------------------------------------------
//
//   findUsingChunk() -- like find(), but with the advance knowledge that the
//                       entire string is available in the UText's chunk buffer.
//
//   The input is read directly from fInputText->chunkContents as UTF-16 and
//   positions are handled as int32_t indexes into that buffer.
//
//   status   in/out error code.  Set to U_INTERNAL_PROGRAM_ERROR if
//            fPattern->fStartType holds an unrecognized value; otherwise only
//            changed by MatchChunkAt() / findProgressInterrupt().
//   returns  true if MatchChunkAt() left fMatch set at some candidate position,
//            false if no match was found, the search was interrupted, or
//            status reports a failure.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
    // Start at the position of the last match end.  (Will be zero if the
    //   matcher has been reset.)
    //

    int32_t startPos = static_cast<int32_t>(fMatchEnd);
    if (startPos==0) {
        // A zero fMatchEnd is replaced by the start of the active range.
        startPos = static_cast<int32_t>(fActiveStart);
    }

    // Direct pointer to the UTF-16 contents of the chunk; indexed by startPos below.
    const char16_t *inputBuf = fInputText->chunkContents;

    if (fMatch) {
        // Save the position of any previous successful match.
        fLastMatchEnd = fMatchEnd;

        if (fMatchStart == fMatchEnd) {
            // Previous match had zero length.  Move start position up one position
            //  to avoid sending find() into a loop on zero-length matches.
            if (startPos >= fActiveLimit) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            // Advance by one code point (a surrogate pair counts as one step).
            U16_FWD_1(inputBuf, startPos, fInputLength);
        }
    } else {
        if (fLastMatchEnd >= 0) {
            // A previous find() failed to match.  Don't try again.
            //   (without this test, a pattern with a zero-length match
            //    could match again at the end of an input string.)
            fHitEnd = true;
            return false;
        }
    }


    // Compute the position in the input string beyond which a match can not begin, because
    //   the minimum length match would extend past the end of the input.
    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    //          Be aware of possible overflows if making changes here.
    //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
    int32_t testLen = static_cast<int32_t>(fActiveLimit - fPattern->fMinMatchLen);
    if (startPos > testLen) {
        fMatch = false;
        fHitEnd = true;
        return false;
    }

    UChar32  c;
    U_ASSERT(startPos >= 0);

    // Choose the scanning strategy from the start-of-match classification
    // stored in the pattern.
    switch (fPattern->fStartType) {
    case START_NO_INFO:
        // No optimization was found.
        //  Try a match at each input position.
        for (;;) {
            MatchChunkAt(startPos, false, status);
            if (U_FAILURE(status)) {
                return false;
            }
            if (fMatch) {
                return true;
            }
            if (startPos >= testLen) {
                fHitEnd = true;
                return false;
            }
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
            // Note that it's perfectly OK for a pattern to have a zero-length
            //   match at the end of a string, so we must make sure that the loop
            //   runs with startPos == testLen the last time through.
            if  (findProgressInterrupt(startPos, status))
                return false;
        }
        UPRV_UNREACHABLE_EXIT;

    case START_START:
        // Matches are only possible at the start of the input string
        //   (pattern begins with ^ or \A)
        if (startPos > fActiveStart) {
            fMatch = false;
            return false;
        }
        MatchChunkAt(startPos, false, status);
        if (U_FAILURE(status)) {
            return false;
        }
        return fMatch;


    case START_SET:
    {
        // Match may start on any char from a pre-computed set.
        U_ASSERT(fPattern->fMinMatchLen > 0);
        for (;;) {
            // pos is the index of the code point being tested; startPos moves past it.
            int32_t pos = startPos;
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
            // Code points below 256 are looked up in fInitialChars8, all others in fInitialChars.
            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
                (c>=256 && fPattern->fInitialChars->contains(c))) {
                MatchChunkAt(pos, false, status);
                if (U_FAILURE(status)) {
                    return false;
                }
                if (fMatch) {
                    return true;
                }
            }
            if (startPos > testLen) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            if  (findProgressInterrupt(startPos, status))
                return false;
        }
    }
    UPRV_UNREACHABLE_EXIT;

    case START_STRING:
    case START_CHAR:
    {
        // Match starts on exactly one char.
        U_ASSERT(fPattern->fMinMatchLen > 0);
        UChar32 theChar = fPattern->fInitialChar;
        for (;;) {
            // pos is the index of the code point being tested; startPos moves past it.
            int32_t pos = startPos;
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
            if (c == theChar) {
                MatchChunkAt(pos, false, status);
                if (U_FAILURE(status)) {
                    return false;
                }
                if (fMatch) {
                    return true;
                }
            }
            if (startPos > testLen) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            if  (findProgressInterrupt(startPos, status))
                return false;
        }
    }
    UPRV_UNREACHABLE_EXIT;

    case START_LINE:
    {
        UChar32 ch;
        // The anchor start is itself tried as a candidate position first.
        if (startPos == fAnchorStart) {
            MatchChunkAt(startPos, false, status);
            if (U_FAILURE(status)) {
                return false;
            }
            if (fMatch) {
                return true;
            }
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
        }

        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
            // UREGEX_UNIX_LINES: only 0x0a is treated as ending a line.
            for (;;) {
                // Look at the code unit just before startPos; a match is only
                // attempted when that unit ended a line.
                ch = inputBuf[startPos-1];
                if (ch == 0x0a) {
                    MatchChunkAt(startPos, false, status);
                    if (U_FAILURE(status)) {
                        return false;
                    }
                    if (fMatch) {
                        return true;
                    }
                }
                if (startPos >= testLen) {
                    fMatch = false;
                    fHitEnd = true;
                    return false;
                }
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
                // Note that it's perfectly OK for a pattern to have a zero-length
                //   match at the end of a string, so we must make sure that the loop
                //   runs with startPos == testLen the last time through.
                if  (findProgressInterrupt(startPos, status))
                    return false;
            }
        } else {
            for (;;) {
                ch = inputBuf[startPos-1];
                if (isLineTerminator(ch)) {
                    // Treat a 0x0d 0x0a pair as one terminator: step over the 0x0a
                    // so the match attempt starts after the whole pair.
                    if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
                        startPos++;
                    }
                    MatchChunkAt(startPos, false, status);
                    if (U_FAILURE(status)) {
                        return false;
                    }
                    if (fMatch) {
                        return true;
                    }
                }
                if (startPos >= testLen) {
                    fMatch = false;
                    fHitEnd = true;
                    return false;
                }
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
                // Note that it's perfectly OK for a pattern to have a zero-length
                //   match at the end of a string, so we must make sure that the loop
                //   runs with startPos == testLen the last time through.
                if  (findProgressInterrupt(startPos, status))
                    return false;
            }
        }
    }
    // Both loops above leave only through a return statement, so control
    // does not actually fall from START_LINE into the default case.

    default:
        UPRV_UNREACHABLE_ASSERT;
        // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
        // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
        // See ICU-21669.
        status = U_INTERNAL_PROGRAM_ERROR;
        return false;
    }

    UPRV_UNREACHABLE_EXIT;
}
   1151 
   1152 
   1153 
   1154 //--------------------------------------------------------------------------------
   1155 //
   1156 //  group()
   1157 //
   1158 //--------------------------------------------------------------------------------
   1159 UnicodeString RegexMatcher::group(UErrorCode &status) const {
   1160    return group(0, status);
   1161 }
   1162 
   1163 //  Return immutable shallow clone
   1164 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
   1165    return group(0, dest, group_len, status);
   1166 }
   1167 
   1168 //  Return immutable shallow clone
   1169 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
   1170    group_len = 0;
   1171    if (U_FAILURE(status)) {
   1172        return dest;
   1173    }
   1174    if (U_FAILURE(fDeferredStatus)) {
   1175        status = fDeferredStatus;
   1176    } else if (fMatch == false) {
   1177        status = U_REGEX_INVALID_STATE;
   1178    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1179        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1180    }
   1181 
   1182    if (U_FAILURE(status)) {
   1183        return dest;
   1184    }
   1185 
   1186    int64_t s, e;
   1187    if (groupNum == 0) {
   1188        s = fMatchStart;
   1189        e = fMatchEnd;
   1190    } else {
   1191        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1192        U_ASSERT(groupOffset < fPattern->fFrameSize);
   1193        U_ASSERT(groupOffset >= 0);
   1194        s = fFrame->fExtra[groupOffset];
   1195        e = fFrame->fExtra[groupOffset+1];
   1196    }
   1197 
   1198    if (s < 0) {
   1199        // A capture group wasn't part of the match
   1200        return utext_clone(dest, fInputText, false, true, &status);
   1201    }
   1202    U_ASSERT(s <= e);
   1203    group_len = e - s;
   1204 
   1205    dest = utext_clone(dest, fInputText, false, true, &status);
   1206    if (dest)
   1207        UTEXT_SETNATIVEINDEX(dest, s);
   1208    return dest;
   1209 }
   1210 
   1211 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
   1212    UnicodeString result;
   1213    int64_t groupStart = start64(groupNum, status);
   1214    int64_t groupEnd = end64(groupNum, status);
   1215    if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
   1216        return result;
   1217    }
   1218 
   1219    // Get the group length using a utext_extract preflight.
   1220    //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
   1221    UErrorCode bufferStatus = U_ZERO_ERROR;
   1222    int32_t length = utext_extract(fInputText, groupStart, groupEnd, nullptr, 0, &bufferStatus);
   1223    if (bufferStatus != U_BUFFER_OVERFLOW_ERROR) {
   1224        if (U_FAILURE(bufferStatus)) {
   1225            status = bufferStatus;
   1226        }
   1227        return result;
   1228    }
   1229 
   1230    char16_t *buf = result.getBuffer(length);
   1231    if (buf == nullptr) {
   1232        status = U_MEMORY_ALLOCATION_ERROR;
   1233    } else {
   1234        int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
   1235        result.releaseBuffer(extractLength);
   1236        U_ASSERT(length == extractLength);
   1237    }
   1238    return result;
   1239 }
   1240 
   1241 
   1242 //--------------------------------------------------------------------------------
   1243 //
   1244 //  appendGroup() -- currently internal only, appends a group to a UText rather
   1245 //                   than replacing its contents
   1246 //
   1247 //--------------------------------------------------------------------------------
   1248 
   1249 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
   1250    if (U_FAILURE(status)) {
   1251        return 0;
   1252    }
   1253    if (U_FAILURE(fDeferredStatus)) {
   1254        status = fDeferredStatus;
   1255        return 0;
   1256    }
   1257    int64_t destLen = utext_nativeLength(dest);
   1258 
   1259    if (fMatch == false) {
   1260        status = U_REGEX_INVALID_STATE;
   1261        return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
   1262    }
   1263    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
   1264        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1265        return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
   1266    }
   1267 
   1268    int64_t s, e;
   1269    if (groupNum == 0) {
   1270        s = fMatchStart;
   1271        e = fMatchEnd;
   1272    } else {
   1273        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
   1274        U_ASSERT(groupOffset < fPattern->fFrameSize);
   1275        U_ASSERT(groupOffset >= 0);
   1276        s = fFrame->fExtra[groupOffset];
   1277        e = fFrame->fExtra[groupOffset+1];
   1278    }
   1279 
   1280    if (s < 0) {
   1281        // A capture group wasn't part of the match
   1282        return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
   1283    }
   1284    U_ASSERT(s <= e);
   1285 
   1286    int64_t deltaLen;
   1287    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1288        U_ASSERT(e <= fInputLength);
   1289        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents + s, static_cast<int32_t>(e - s), &status);
   1290    } else {
   1291        int32_t len16;
   1292        if (UTEXT_USES_U16(fInputText)) {
   1293            len16 = static_cast<int32_t>(e - s);
   1294        } else {
   1295            UErrorCode lengthStatus = U_ZERO_ERROR;
   1296            len16 = utext_extract(fInputText, s, e, nullptr, 0, &lengthStatus);
   1297        }
   1298        char16_t* groupChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16 + 1)));
   1299        if (groupChars == nullptr) {
   1300            status = U_MEMORY_ALLOCATION_ERROR;
   1301            return 0;
   1302        }
   1303        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
   1304 
   1305        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
   1306        uprv_free(groupChars);
   1307    }
   1308    return deltaLen;
   1309 }
   1310 
   1311 
   1312 
   1313 //--------------------------------------------------------------------------------
   1314 //
   1315 //  groupCount()
   1316 //
   1317 //--------------------------------------------------------------------------------
   1318 int32_t RegexMatcher::groupCount() const {
   1319    return fPattern->fGroupMap->size();
   1320 }
   1321 
   1322 //--------------------------------------------------------------------------------
   1323 //
   1324 //  hasAnchoringBounds()
   1325 //
   1326 //--------------------------------------------------------------------------------
   1327 UBool RegexMatcher::hasAnchoringBounds() const {
   1328    return fAnchoringBounds;
   1329 }
   1330 
   1331 
   1332 //--------------------------------------------------------------------------------
   1333 //
   1334 //  hasTransparentBounds()
   1335 //
   1336 //--------------------------------------------------------------------------------
   1337 UBool RegexMatcher::hasTransparentBounds() const {
   1338    return fTransparentBounds;
   1339 }
   1340 
   1341 
   1342 
   1343 //--------------------------------------------------------------------------------
   1344 //
   1345 //  hitEnd()
   1346 //
   1347 //--------------------------------------------------------------------------------
   1348 UBool RegexMatcher::hitEnd() const {
   1349    return fHitEnd;
   1350 }
   1351 
   1352 
   1353 //--------------------------------------------------------------------------------
   1354 //
   1355 //  input()
   1356 //
   1357 //--------------------------------------------------------------------------------
   1358 const UnicodeString &RegexMatcher::input() const {
   1359    if (!fInput) {
   1360        UErrorCode status = U_ZERO_ERROR;
   1361        int32_t len16;
   1362        if (UTEXT_USES_U16(fInputText)) {
   1363            len16 = static_cast<int32_t>(fInputLength);
   1364        } else {
   1365            len16 = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &status);
   1366            status = U_ZERO_ERROR; // overflow, length status
   1367        }
   1368        UnicodeString *result = new UnicodeString(len16, 0, 0);
   1369 
   1370        char16_t *inputChars = result->getBuffer(len16);
   1371        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
   1372        result->releaseBuffer(len16);
   1373 
   1374        *const_cast<const UnicodeString**>(&fInput) = result; // pointer assignment, rather than operator=
   1375    }
   1376 
   1377    return *fInput;
   1378 }
   1379 
   1380 //--------------------------------------------------------------------------------
   1381 //
   1382 //  inputText()
   1383 //
   1384 //--------------------------------------------------------------------------------
   1385 UText *RegexMatcher::inputText() const {
   1386    return fInputText;
   1387 }
   1388 
   1389 
   1390 //--------------------------------------------------------------------------------
   1391 //
   1392 //  getInput() -- like inputText(), but makes a clone or copies into another UText
   1393 //
   1394 //--------------------------------------------------------------------------------
   1395 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
   1396    if (U_FAILURE(status)) {
   1397        return dest;
   1398    }
   1399    if (U_FAILURE(fDeferredStatus)) {
   1400        status = fDeferredStatus;
   1401        return dest;
   1402    }
   1403 
   1404    if (dest) {
   1405        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1406            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, static_cast<int32_t>(fInputLength), &status);
   1407        } else {
   1408            int32_t input16Len;
   1409            if (UTEXT_USES_U16(fInputText)) {
   1410                input16Len = static_cast<int32_t>(fInputLength);
   1411            } else {
   1412                UErrorCode lengthStatus = U_ZERO_ERROR;
   1413                input16Len = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &lengthStatus); // buffer overflow error
   1414            }
   1415            char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (input16Len)));
   1416            if (inputChars == nullptr) {
   1417                return dest;
   1418            }
   1419 
   1420            status = U_ZERO_ERROR;
   1421            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
   1422            status = U_ZERO_ERROR;
   1423            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
   1424 
   1425            uprv_free(inputChars);
   1426        }
   1427        return dest;
   1428    } else {
   1429        return utext_clone(nullptr, fInputText, false, true, &status);
   1430    }
   1431 }
   1432 
   1433 
   1434 static UBool compat_SyncMutableUTextContents(UText *ut);
   1435 static UBool compat_SyncMutableUTextContents(UText *ut) {
   1436    UBool retVal = false;
   1437 
   1438    //  In the following test, we're really only interested in whether the UText should switch
   1439    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
   1440    //  will still point to the correct data.
   1441    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
   1442        UnicodeString *us=(UnicodeString *)ut->context;
   1443 
   1444        // Update to the latest length.
   1445        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
   1446        int32_t newLength = us->length();
   1447 
   1448        // Update the chunk description.
   1449        // The buffer may have switched between stack- and heap-based.
   1450        ut->chunkContents    = us->getBuffer();
   1451        ut->chunkLength      = newLength;
   1452        ut->chunkNativeLimit = newLength;
   1453        ut->nativeIndexingLimit = newLength;
   1454        retVal = true;
   1455    }
   1456 
   1457    return retVal;
   1458 }
   1459 
   1460 //--------------------------------------------------------------------------------
   1461 //
   1462 //  lookingAt()
   1463 //
   1464 //--------------------------------------------------------------------------------
   1465 UBool RegexMatcher::lookingAt(UErrorCode &status) {
   1466    if (U_FAILURE(status)) {
   1467        return false;
   1468    }
   1469    if (U_FAILURE(fDeferredStatus)) {
   1470        status = fDeferredStatus;
   1471        return false;
   1472    }
   1473 
   1474    if (fInputUniStrMaybeMutable) {
   1475        if (compat_SyncMutableUTextContents(fInputText)) {
   1476        fInputLength = utext_nativeLength(fInputText);
   1477        reset();
   1478        }
   1479    }
   1480    else {
   1481        resetPreserveRegion();
   1482    }
   1483    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1484        MatchChunkAt(static_cast<int32_t>(fActiveStart), false, status);
   1485    } else {
   1486        MatchAt(fActiveStart, false, status);
   1487    }
   1488    return fMatch;
   1489 }
   1490 
   1491 
   1492 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
   1493    if (U_FAILURE(status)) {
   1494        return false;
   1495    }
   1496    if (U_FAILURE(fDeferredStatus)) {
   1497        status = fDeferredStatus;
   1498        return false;
   1499    }
   1500    reset();
   1501 
   1502    if (start < 0) {
   1503        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1504        return false;
   1505    }
   1506 
   1507    if (fInputUniStrMaybeMutable) {
   1508        if (compat_SyncMutableUTextContents(fInputText)) {
   1509        fInputLength = utext_nativeLength(fInputText);
   1510        reset();
   1511        }
   1512    }
   1513 
   1514    int64_t nativeStart;
   1515    nativeStart = start;
   1516    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1517        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1518        return false;
   1519    }
   1520 
   1521    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1522        MatchChunkAt(static_cast<int32_t>(nativeStart), false, status);
   1523    } else {
   1524        MatchAt(nativeStart, false, status);
   1525    }
   1526    return fMatch;
   1527 }
   1528 
   1529 
   1530 
   1531 //--------------------------------------------------------------------------------
   1532 //
   1533 //  matches()
   1534 //
   1535 //--------------------------------------------------------------------------------
   1536 UBool RegexMatcher::matches(UErrorCode &status) {
   1537    if (U_FAILURE(status)) {
   1538        return false;
   1539    }
   1540    if (U_FAILURE(fDeferredStatus)) {
   1541        status = fDeferredStatus;
   1542        return false;
   1543    }
   1544 
   1545    if (fInputUniStrMaybeMutable) {
   1546        if (compat_SyncMutableUTextContents(fInputText)) {
   1547        fInputLength = utext_nativeLength(fInputText);
   1548        reset();
   1549        }
   1550    }
   1551    else {
   1552        resetPreserveRegion();
   1553    }
   1554 
   1555    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1556        MatchChunkAt(static_cast<int32_t>(fActiveStart), true, status);
   1557    } else {
   1558        MatchAt(fActiveStart, true, status);
   1559    }
   1560    return fMatch;
   1561 }
   1562 
   1563 
   1564 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
   1565    if (U_FAILURE(status)) {
   1566        return false;
   1567    }
   1568    if (U_FAILURE(fDeferredStatus)) {
   1569        status = fDeferredStatus;
   1570        return false;
   1571    }
   1572    reset();
   1573 
   1574    if (start < 0) {
   1575        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1576        return false;
   1577    }
   1578 
   1579    if (fInputUniStrMaybeMutable) {
   1580        if (compat_SyncMutableUTextContents(fInputText)) {
   1581        fInputLength = utext_nativeLength(fInputText);
   1582        reset();
   1583        }
   1584    }
   1585 
   1586    int64_t nativeStart;
   1587    nativeStart = start;
   1588    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
   1589        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1590        return false;
   1591    }
   1592 
   1593    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
   1594        MatchChunkAt(static_cast<int32_t>(nativeStart), true, status);
   1595    } else {
   1596        MatchAt(nativeStart, true, status);
   1597    }
   1598    return fMatch;
   1599 }
   1600 
   1601 
   1602 
   1603 //--------------------------------------------------------------------------------
   1604 //
   1605 //    pattern
   1606 //
   1607 //--------------------------------------------------------------------------------
   1608 const RegexPattern &RegexMatcher::pattern() const {
   1609    return *fPattern;
   1610 }
   1611 
   1612 
   1613 
   1614 //--------------------------------------------------------------------------------
   1615 //
   1616 //    region
   1617 //
   1618 //--------------------------------------------------------------------------------
   1619 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
   1620    if (U_FAILURE(status)) {
   1621        return *this;
   1622    }
   1623 
   1624    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
   1625        status = U_ILLEGAL_ARGUMENT_ERROR;
   1626    }
   1627 
   1628    int64_t nativeStart = regionStart;
   1629    int64_t nativeLimit = regionLimit;
   1630    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
   1631      status = U_ILLEGAL_ARGUMENT_ERROR;
   1632    }
   1633 
   1634    if (startIndex == -1)
   1635      this->reset();
   1636    else
   1637      resetPreserveRegion();
   1638 
   1639    fRegionStart = nativeStart;
   1640    fRegionLimit = nativeLimit;
   1641    fActiveStart = nativeStart;
   1642    fActiveLimit = nativeLimit;
   1643 
   1644    if (startIndex != -1) {
   1645      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
   1646          status = U_INDEX_OUTOFBOUNDS_ERROR;
   1647      }
   1648      fMatchEnd = startIndex;
   1649    }
   1650 
   1651    if (!fTransparentBounds) {
   1652        fLookStart = nativeStart;
   1653        fLookLimit = nativeLimit;
   1654    }
   1655    if (fAnchoringBounds) {
   1656        fAnchorStart = nativeStart;
   1657        fAnchorLimit = nativeLimit;
   1658    }
   1659    return *this;
   1660 }
   1661 
   1662 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
   1663  return region(start, limit, -1, status);
   1664 }
   1665 
   1666 //--------------------------------------------------------------------------------
   1667 //
   1668 //    regionEnd
   1669 //
   1670 //--------------------------------------------------------------------------------
   1671 int32_t RegexMatcher::regionEnd() const {
   1672    return static_cast<int32_t>(fRegionLimit);
   1673 }
   1674 
   1675 int64_t RegexMatcher::regionEnd64() const {
   1676    return fRegionLimit;
   1677 }
   1678 
   1679 //--------------------------------------------------------------------------------
   1680 //
   1681 //    regionStart
   1682 //
   1683 //--------------------------------------------------------------------------------
   1684 int32_t RegexMatcher::regionStart() const {
   1685    return static_cast<int32_t>(fRegionStart);
   1686 }
   1687 
   1688 int64_t RegexMatcher::regionStart64() const {
   1689    return fRegionStart;
   1690 }
   1691 
   1692 
   1693 //--------------------------------------------------------------------------------
   1694 //
   1695 //    replaceAll
   1696 //
   1697 //--------------------------------------------------------------------------------
   1698 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
   1699    UText replacementText = UTEXT_INITIALIZER;
   1700    UText resultText = UTEXT_INITIALIZER;
   1701    UnicodeString resultString;
   1702    if (U_FAILURE(status)) {
   1703        return resultString;
   1704    }
   1705 
   1706    utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1707    utext_openUnicodeString(&resultText, &resultString, &status);
   1708 
   1709    replaceAll(&replacementText, &resultText, status);
   1710 
   1711    utext_close(&resultText);
   1712    utext_close(&replacementText);
   1713 
   1714    return resultString;
   1715 }
   1716 
   1717 
   1718 //
   1719 //    replaceAll, UText mode
   1720 //
   1721 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
   1722    if (U_FAILURE(status)) {
   1723        return dest;
   1724    }
   1725    if (U_FAILURE(fDeferredStatus)) {
   1726        status = fDeferredStatus;
   1727        return dest;
   1728    }
   1729 
   1730    if (dest == nullptr) {
   1731        UnicodeString emptyString;
   1732        UText empty = UTEXT_INITIALIZER;
   1733 
   1734        utext_openUnicodeString(&empty, &emptyString, &status);
   1735        dest = utext_clone(nullptr, &empty, true, false, &status);
   1736        utext_close(&empty);
   1737    }
   1738 
   1739    if (U_SUCCESS(status)) {
   1740        reset();
   1741        while (find()) {
   1742            appendReplacement(dest, replacement, status);
   1743            if (U_FAILURE(status)) {
   1744                break;
   1745            }
   1746        }
   1747        appendTail(dest, status);
   1748    }
   1749 
   1750    return dest;
   1751 }
   1752 
   1753 
   1754 //--------------------------------------------------------------------------------
   1755 //
   1756 //    replaceFirst
   1757 //
   1758 //--------------------------------------------------------------------------------
   1759 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
   1760    UText replacementText = UTEXT_INITIALIZER;
   1761    UText resultText = UTEXT_INITIALIZER;
   1762    UnicodeString resultString;
   1763 
   1764    utext_openConstUnicodeString(&replacementText, &replacement, &status);
   1765    utext_openUnicodeString(&resultText, &resultString, &status);
   1766 
   1767    replaceFirst(&replacementText, &resultText, status);
   1768 
   1769    utext_close(&resultText);
   1770    utext_close(&replacementText);
   1771 
   1772    return resultString;
   1773 }
   1774 
   1775 //
   1776 //    replaceFirst, UText mode
   1777 //
   1778 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
   1779    if (U_FAILURE(status)) {
   1780        return dest;
   1781    }
   1782    if (U_FAILURE(fDeferredStatus)) {
   1783        status = fDeferredStatus;
   1784        return dest;
   1785    }
   1786 
   1787    reset();
   1788    if (!find()) {
   1789        return getInput(dest, status);
   1790    }
   1791 
   1792    if (dest == nullptr) {
   1793        UnicodeString emptyString;
   1794        UText empty = UTEXT_INITIALIZER;
   1795 
   1796        utext_openUnicodeString(&empty, &emptyString, &status);
   1797        dest = utext_clone(nullptr, &empty, true, false, &status);
   1798        utext_close(&empty);
   1799    }
   1800 
   1801    appendReplacement(dest, replacement, status);
   1802    appendTail(dest, status);
   1803 
   1804    return dest;
   1805 }
   1806 
   1807 
   1808 //--------------------------------------------------------------------------------
   1809 //
   1810 //     requireEnd
   1811 //
   1812 //--------------------------------------------------------------------------------
   1813 UBool RegexMatcher::requireEnd() const {
   1814    return fRequireEnd;
   1815 }
   1816 
   1817 
   1818 //--------------------------------------------------------------------------------
   1819 //
   1820 //     reset
   1821 //
   1822 //--------------------------------------------------------------------------------
   1823 RegexMatcher &RegexMatcher::reset() {
   1824    fRegionStart    = 0;
   1825    fRegionLimit    = fInputLength;
   1826    fActiveStart    = 0;
   1827    fActiveLimit    = fInputLength;
   1828    fAnchorStart    = 0;
   1829    fAnchorLimit    = fInputLength;
   1830    fLookStart      = 0;
   1831    fLookLimit      = fInputLength;
   1832    resetPreserveRegion();
   1833    return *this;
   1834 }
   1835 
   1836 
   1837 
   1838 void RegexMatcher::resetPreserveRegion() {
   1839    fMatchStart     = 0;
   1840    fMatchEnd       = 0;
   1841    fLastMatchEnd   = -1;
   1842    fAppendPosition = 0;
   1843    fMatch          = false;
   1844    fHitEnd         = false;
   1845    fRequireEnd     = false;
   1846    fTime           = 0;
   1847    fTickCounter    = TIMER_INITIAL_VALUE;
   1848    //resetStack(); // more expensive than it looks...
   1849 }
   1850 
   1851 
   1852 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
   1853    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
   1854    if (fPattern->fNeedsAltInput) {
   1855        fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
   1856    }
   1857    if (U_FAILURE(fDeferredStatus)) {
   1858        return *this;
   1859    }
   1860    fInputLength = utext_nativeLength(fInputText);
   1861 
   1862    reset();
   1863    delete fInput;
   1864    fInput = nullptr;
   1865 
   1866    //  Do the following for any UnicodeString.
   1867    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
   1868    fInputUniStrMaybeMutable = true;
   1869 
   1870 #if UCONFIG_NO_BREAK_ITERATION==0
   1871    if (fWordBreakItr) {
   1872        fWordBreakItr->setText(fInputText, fDeferredStatus);
   1873    }
   1874    if (fGCBreakItr) {
   1875        fGCBreakItr->setText(fInputText, fDeferredStatus);
   1876    }
   1877 #endif
   1878 
   1879    return *this;
   1880 }
   1881 
   1882 
   1883 RegexMatcher &RegexMatcher::reset(UText *input) {
   1884    if (fInputText != input) {
   1885        fInputText = utext_clone(fInputText, input, false, true, &fDeferredStatus);
   1886        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
   1887        if (U_FAILURE(fDeferredStatus)) {
   1888            return *this;
   1889        }
   1890        fInputLength = utext_nativeLength(fInputText);
   1891 
   1892        delete fInput;
   1893        fInput = nullptr;
   1894 
   1895 #if UCONFIG_NO_BREAK_ITERATION==0
   1896        if (fWordBreakItr) {
   1897            fWordBreakItr->setText(input, fDeferredStatus);
   1898        }
   1899        if (fGCBreakItr) {
   1900            fGCBreakItr->setText(fInputText, fDeferredStatus);
   1901        }
   1902 #endif
   1903    }
   1904    reset();
   1905    fInputUniStrMaybeMutable = false;
   1906 
   1907    return *this;
   1908 }
   1909 
   1910 /*RegexMatcher &RegexMatcher::reset(const char16_t *) {
   1911    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
   1912    return *this;
   1913 }*/
   1914 
   1915 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
   1916    if (U_FAILURE(status)) {
   1917        return *this;
   1918    }
   1919    reset();       // Reset also resets the region to be the entire string.
   1920 
   1921    if (position < 0 || position > fActiveLimit) {
   1922        status = U_INDEX_OUTOFBOUNDS_ERROR;
   1923        return *this;
   1924    }
   1925    fMatchEnd = position;
   1926    return *this;
   1927 }
   1928 
   1929 
   1930 //--------------------------------------------------------------------------------
   1931 //
   1932 //    refresh
   1933 //
   1934 //--------------------------------------------------------------------------------
   1935 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
   1936    if (U_FAILURE(status)) {
   1937        return *this;
   1938    }
   1939    if (input == nullptr) {
   1940        status = U_ILLEGAL_ARGUMENT_ERROR;
   1941        return *this;
   1942    }
   1943    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
   1944        status = U_ILLEGAL_ARGUMENT_ERROR;
   1945        return *this;
   1946    }
   1947    int64_t  pos = utext_getNativeIndex(fInputText);
   1948    //  Shallow read-only clone of the new UText into the existing input UText
   1949    fInputText = utext_clone(fInputText, input, false, true, &status);
   1950    if (U_FAILURE(status)) {
   1951        return *this;
   1952    }
   1953    utext_setNativeIndex(fInputText, pos);
   1954 
   1955    if (fAltInputText != nullptr) {
   1956        pos = utext_getNativeIndex(fAltInputText);
   1957        fAltInputText = utext_clone(fAltInputText, input, false, true, &status);
   1958        if (U_FAILURE(status)) {
   1959            return *this;
   1960        }
   1961        utext_setNativeIndex(fAltInputText, pos);
   1962    }
   1963    return *this;
   1964 }
   1965 
   1966 
   1967 
   1968 //--------------------------------------------------------------------------------
   1969 //
   1970 //    setTrace
   1971 //
   1972 //--------------------------------------------------------------------------------
   1973 void RegexMatcher::setTrace(UBool state) {
   1974    fTraceDebug = state;
   1975 }
   1976 
   1977 
   1978 
   1979 /**
   1980  *  UText, replace entire contents of the destination UText with a substring of the source UText.
   1981  *
   1982  *     @param src    The source UText
   1983  *     @param dest   The destination UText. Must be writable.
   1984  *                   May be nullptr, in which case a new UText will be allocated.
   1985  *     @param start  Start index of source substring.
   1986  *     @param limit  Limit index of source substring.
   1987  *     @param status An error code.
   1988  */
   1989 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
   1990    if (U_FAILURE(*status)) {
   1991        return dest;
   1992    }
   1993    if (start == limit) {
   1994        if (dest) {
   1995            utext_replace(dest, 0, utext_nativeLength(dest), nullptr, 0, status);
   1996            return dest;
   1997        } else {
   1998            return utext_openUChars(nullptr, nullptr, 0, status);
   1999        }
   2000    }
   2001    UErrorCode bufferStatus = U_ZERO_ERROR;
   2002    int32_t length = utext_extract(src, start, limit, nullptr, 0, &bufferStatus);
   2003    if (bufferStatus != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(bufferStatus)) {
   2004        *status = bufferStatus;
   2005        return dest;
   2006    }
   2007    MaybeStackArray<char16_t, 40> buffer;
   2008    if (length >= buffer.getCapacity()) {
   2009        char16_t *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
   2010        if (newBuf == nullptr) {
   2011            *status = U_MEMORY_ALLOCATION_ERROR;
   2012        }
   2013    }
   2014    utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
   2015    if (dest) {
   2016        utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
   2017        return dest;
   2018    }
   2019 
   2020    // Caller did not provide a preexisting UText.
   2021    // Open a new one, and have it adopt the text buffer storage.
   2022    if (U_FAILURE(*status)) {
   2023        return nullptr;
   2024    }
   2025    int32_t ownedLength = 0;
   2026    char16_t *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
   2027    if (ownedBuf == nullptr) {
   2028        *status = U_MEMORY_ALLOCATION_ERROR;
   2029        return nullptr;
   2030    }
   2031    UText *result = utext_openUChars(nullptr, ownedBuf, length, status);
   2032    if (U_FAILURE(*status)) {
   2033        uprv_free(ownedBuf);
   2034        return nullptr;
   2035    }
   2036    result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
   2037    return result;
   2038 }
   2039 
   2040 
   2041 //---------------------------------------------------------------------
   2042 //
   2043 //   split
   2044 //
   2045 //---------------------------------------------------------------------
   2046 int32_t  RegexMatcher::split(const UnicodeString &input,
   2047        UnicodeString    dest[],
   2048        int32_t          destCapacity,
   2049        UErrorCode      &status)
   2050 {
   2051    UText inputText = UTEXT_INITIALIZER;
   2052    utext_openConstUnicodeString(&inputText, &input, &status);
   2053    if (U_FAILURE(status)) {
   2054        return 0;
   2055    }
   2056 
   2057    UText** destText = static_cast<UText**>(uprv_malloc(sizeof(UText*) * destCapacity));
   2058    if (destText == nullptr) {
   2059        status = U_MEMORY_ALLOCATION_ERROR;
   2060        return 0;
   2061    }
   2062    int32_t i;
   2063    for (i = 0; i < destCapacity; i++) {
   2064        destText[i] = utext_openUnicodeString(nullptr, &dest[i], &status);
   2065    }
   2066 
   2067    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
   2068 
   2069    for (i = 0; i < destCapacity; i++) {
   2070        utext_close(destText[i]);
   2071    }
   2072 
   2073    uprv_free(destText);
   2074    utext_close(&inputText);
   2075    return fieldCount;
   2076 }
   2077 
   2078 //
   2079 //   split, UText mode
   2080 //
   2081 int32_t  RegexMatcher::split(UText *input,
   2082        UText           *dest[],
   2083        int32_t          destCapacity,
   2084        UErrorCode      &status)
   2085 {
   2086    //
   2087    // Check arguments for validity
   2088    //
   2089    if (U_FAILURE(status)) {
   2090        return 0;
   2091    }
   2092 
   2093    if (destCapacity < 1) {
   2094        status = U_ILLEGAL_ARGUMENT_ERROR;
   2095        return 0;
   2096    }
   2097 
   2098    //
   2099    // Reset for the input text
   2100    //
   2101    reset(input);
   2102    int64_t   nextOutputStringStart = 0;
   2103    if (fActiveLimit == 0) {
   2104        return 0;
   2105    }
   2106 
   2107    //
   2108    // Loop through the input text, searching for the delimiter pattern
   2109    //
   2110    int32_t i;
   2111    int32_t numCaptureGroups = fPattern->fGroupMap->size();
   2112    for (i=0; ; i++) {
   2113        if (i>=destCapacity-1) {
   2114            // There is one or zero output string left.
   2115            // Fill the last output string with whatever is left from the input, then exit the loop.
   2116            //  ( i will be == destCapacity if we filled the output array while processing
   2117            //    capture groups of the delimiter expression, in which case we will discard the
   2118            //    last capture group saved in favor of the unprocessed remainder of the
   2119            //    input string.)
   2120            i = destCapacity-1;
   2121            if (fActiveLimit > nextOutputStringStart) {
   2122                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2123                    if (dest[i]) {
   2124                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2125                                      input->chunkContents+nextOutputStringStart,
   2126                                      static_cast<int32_t>(fActiveLimit - nextOutputStringStart), &status);
   2127                    } else {
   2128                        UText remainingText = UTEXT_INITIALIZER;
   2129                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2130                                         fActiveLimit-nextOutputStringStart, &status);
   2131                        dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
   2132                        utext_close(&remainingText);
   2133                    }
   2134                } else {
   2135                    UErrorCode lengthStatus = U_ZERO_ERROR;
   2136                    int32_t remaining16Length =
   2137                        utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
   2138                    char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1)));
   2139                    if (remainingChars == nullptr) {
   2140                        status = U_MEMORY_ALLOCATION_ERROR;
   2141                        break;
   2142                    }
   2143 
   2144                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2145                    if (dest[i]) {
   2146                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2147                    } else {
   2148                        UText remainingText = UTEXT_INITIALIZER;
   2149                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2150                        dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
   2151                        utext_close(&remainingText);
   2152                    }
   2153 
   2154                    uprv_free(remainingChars);
   2155                }
   2156            }
   2157            break;
   2158        }
   2159        if (find()) {
   2160            // We found another delimiter.  Move everything from where we started looking
   2161            //  up until the start of the delimiter into the next output string.
   2162            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2163                if (dest[i]) {
   2164                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2165                                  input->chunkContents+nextOutputStringStart,
   2166                                  static_cast<int32_t>(fMatchStart - nextOutputStringStart), &status);
   2167                } else {
   2168                    UText remainingText = UTEXT_INITIALIZER;
   2169                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2170                                      fMatchStart-nextOutputStringStart, &status);
   2171                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
   2172                    utext_close(&remainingText);
   2173                }
   2174            } else {
   2175                UErrorCode lengthStatus = U_ZERO_ERROR;
   2176                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, nullptr, 0, &lengthStatus);
   2177                char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1)));
   2178                if (remainingChars == nullptr) {
   2179                    status = U_MEMORY_ALLOCATION_ERROR;
   2180                    break;
   2181                }
   2182                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
   2183                if (dest[i]) {
   2184                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2185                } else {
   2186                    UText remainingText = UTEXT_INITIALIZER;
   2187                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2188                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
   2189                    utext_close(&remainingText);
   2190                }
   2191 
   2192                uprv_free(remainingChars);
   2193            }
   2194            nextOutputStringStart = fMatchEnd;
   2195 
   2196            // If the delimiter pattern has capturing parentheses, the captured
   2197            //  text goes out into the next n destination strings.
   2198            int32_t groupNum;
   2199            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   2200                if (i >= destCapacity-2) {
   2201                    // Never fill the last available output string with capture group text.
   2202                    // It will filled with the last field, the remainder of the
   2203                    //  unsplit input text.
   2204                    break;
   2205                }
   2206                i++;
   2207                dest[i] = utext_extract_replace(fInputText, dest[i],
   2208                                               start64(groupNum, status), end64(groupNum, status), &status);
   2209            }
   2210 
   2211            if (nextOutputStringStart == fActiveLimit) {
   2212                // The delimiter was at the end of the string.  We're done, but first
   2213                // we output one last empty string, for the empty field following
   2214                //   the delimiter at the end of input.
   2215                if (i+1 < destCapacity) {
   2216                    ++i;
   2217                    if (dest[i] == nullptr) {
   2218                        dest[i] = utext_openUChars(nullptr, nullptr, 0, &status);
   2219                    } else {
   2220                        static const char16_t emptyString[] = {static_cast<char16_t>(0)};
   2221                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
   2222                    }
   2223                }
   2224                break;
   2225 
   2226            }
   2227        }
   2228        else
   2229        {
   2230            // We ran off the end of the input while looking for the next delimiter.
   2231            // All the remaining text goes into the current output string.
   2232            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
   2233                if (dest[i]) {
   2234                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
   2235                                  input->chunkContents+nextOutputStringStart,
   2236                                  static_cast<int32_t>(fActiveLimit - nextOutputStringStart), &status);
   2237                } else {
   2238                    UText remainingText = UTEXT_INITIALIZER;
   2239                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
   2240                                     fActiveLimit-nextOutputStringStart, &status);
   2241                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
   2242                    utext_close(&remainingText);
   2243                }
   2244            } else {
   2245                UErrorCode lengthStatus = U_ZERO_ERROR;
   2246                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
   2247                char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1)));
   2248                if (remainingChars == nullptr) {
   2249                    status = U_MEMORY_ALLOCATION_ERROR;
   2250                    break;
   2251                }
   2252 
   2253                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
   2254                if (dest[i]) {
   2255                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
   2256                } else {
   2257                    UText remainingText = UTEXT_INITIALIZER;
   2258                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
   2259                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
   2260                    utext_close(&remainingText);
   2261                }
   2262 
   2263                uprv_free(remainingChars);
   2264            }
   2265            break;
   2266        }
   2267        if (U_FAILURE(status)) {
   2268            break;
   2269        }
   2270    }   // end of for loop
   2271    return i+1;
   2272 }
   2273 
   2274 
   2275 //--------------------------------------------------------------------------------
   2276 //
   2277 //     start
   2278 //
   2279 //--------------------------------------------------------------------------------
   2280 int32_t RegexMatcher::start(UErrorCode &status) const {
   2281    return start(0, status);
   2282 }
   2283 
   2284 int64_t RegexMatcher::start64(UErrorCode &status) const {
   2285    return start64(0, status);
   2286 }
   2287 
   2288 //--------------------------------------------------------------------------------
   2289 //
   2290 //     start(int32_t group, UErrorCode &status)
   2291 //
   2292 //--------------------------------------------------------------------------------
   2293 
   2294 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
   2295    if (U_FAILURE(status)) {
   2296        return -1;
   2297    }
   2298    if (U_FAILURE(fDeferredStatus)) {
   2299        status = fDeferredStatus;
   2300        return -1;
   2301    }
   2302    if (fMatch == false) {
   2303        status = U_REGEX_INVALID_STATE;
   2304        return -1;
   2305    }
   2306    if (group < 0 || group > fPattern->fGroupMap->size()) {
   2307        status = U_INDEX_OUTOFBOUNDS_ERROR;
   2308        return -1;
   2309    }
   2310    int64_t s;
   2311    if (group == 0) {
   2312        s = fMatchStart;
   2313    } else {
   2314        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
   2315        U_ASSERT(groupOffset < fPattern->fFrameSize);
   2316        U_ASSERT(groupOffset >= 0);
   2317        s = fFrame->fExtra[groupOffset];
   2318    }
   2319 
   2320    return s;
   2321 }
   2322 
   2323 
   2324 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
   2325    return static_cast<int32_t>(start64(group, status));
   2326 }
   2327 
   2328 //--------------------------------------------------------------------------------
   2329 //
   2330 //     useAnchoringBounds
   2331 //
   2332 //--------------------------------------------------------------------------------
   2333 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
   2334    fAnchoringBounds = b;
   2335    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
   2336    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
   2337    return *this;
   2338 }
   2339 
   2340 
   2341 //--------------------------------------------------------------------------------
   2342 //
   2343 //     useTransparentBounds
   2344 //
   2345 //--------------------------------------------------------------------------------
   2346 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
   2347    fTransparentBounds = b;
   2348    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
   2349    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
   2350    return *this;
   2351 }
   2352 
   2353 //--------------------------------------------------------------------------------
   2354 //
   2355 //     setTimeLimit
   2356 //
   2357 //--------------------------------------------------------------------------------
   2358 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
   2359    if (U_FAILURE(status)) {
   2360        return;
   2361    }
   2362    if (U_FAILURE(fDeferredStatus)) {
   2363        status = fDeferredStatus;
   2364        return;
   2365    }
   2366    if (limit < 0) {
   2367        status = U_ILLEGAL_ARGUMENT_ERROR;
   2368        return;
   2369    }
   2370    fTimeLimit = limit;
   2371 }
   2372 
   2373 
   2374 //--------------------------------------------------------------------------------
   2375 //
   2376 //     getTimeLimit
   2377 //
   2378 //--------------------------------------------------------------------------------
   2379 int32_t RegexMatcher::getTimeLimit() const {
   2380    return fTimeLimit;
   2381 }
   2382 
   2383 
   2384 //--------------------------------------------------------------------------------
   2385 //
   2386 //     setStackLimit
   2387 //
   2388 //--------------------------------------------------------------------------------
   2389 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
   2390    if (U_FAILURE(status)) {
   2391        return;
   2392    }
   2393    if (U_FAILURE(fDeferredStatus)) {
   2394        status = fDeferredStatus;
   2395        return;
   2396    }
   2397    if (limit < 0) {
   2398        status = U_ILLEGAL_ARGUMENT_ERROR;
   2399        return;
   2400    }
   2401 
   2402    // Reset the matcher.  This is needed here in case there is a current match
   2403    //    whose final stack frame (containing the match results, pointed to by fFrame)
   2404    //    would be lost by resizing to a smaller stack size.
   2405    reset();
   2406 
   2407    if (limit == 0) {
   2408        // Unlimited stack expansion
   2409        fStack->setMaxCapacity(0);
   2410    } else {
   2411        // Change the units of the limit  from bytes to ints, and bump the size up
   2412        //   to be big enough to hold at least one stack frame for the pattern,
   2413        //   if it isn't there already.
   2414        int32_t adjustedLimit = limit / sizeof(int32_t);
   2415        if (adjustedLimit < fPattern->fFrameSize) {
   2416            adjustedLimit = fPattern->fFrameSize;
   2417        }
   2418        fStack->setMaxCapacity(adjustedLimit);
   2419    }
   2420    fStackLimit = limit;
   2421 }
   2422 
   2423 
   2424 //--------------------------------------------------------------------------------
   2425 //
   2426 //     getStackLimit
   2427 //
   2428 //--------------------------------------------------------------------------------
   2429 int32_t RegexMatcher::getStackLimit() const {
   2430    return fStackLimit;
   2431 }
   2432 
   2433 
   2434 //--------------------------------------------------------------------------------
   2435 //
   2436 //     setMatchCallback
   2437 //
   2438 //--------------------------------------------------------------------------------
   2439 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
   2440                                    const void              *context,
   2441                                    UErrorCode              &status) {
   2442    if (U_FAILURE(status)) {
   2443        return;
   2444    }
   2445    fCallbackFn = callback;
   2446    fCallbackContext = context;
   2447 }
   2448 
   2449 
   2450 //--------------------------------------------------------------------------------
   2451 //
   2452 //     getMatchCallback
   2453 //
   2454 //--------------------------------------------------------------------------------
   2455 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
   2456                                  const void              *&context,
   2457                                  UErrorCode              &status) {
   2458    if (U_FAILURE(status)) {
   2459       return;
   2460    }
   2461    callback = fCallbackFn;
   2462    context  = fCallbackContext;
   2463 }
   2464 
   2465 
   2466 //--------------------------------------------------------------------------------
   2467 //
   2468 //     setFindProgressCallback
   2469 //
   2470 //--------------------------------------------------------------------------------
   2471 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
   2472                                                const void                      *context,
   2473                                                UErrorCode                      &status) {
   2474    if (U_FAILURE(status)) {
   2475        return;
   2476    }
   2477    fFindProgressCallbackFn = callback;
   2478    fFindProgressCallbackContext = context;
   2479 }
   2480 
   2481 
   2482 //--------------------------------------------------------------------------------
   2483 //
   2484 //     getFindProgressCallback
   2485 //
   2486 //--------------------------------------------------------------------------------
   2487 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
   2488                                                const void                    *&context,
   2489                                                UErrorCode                    &status) {
   2490    if (U_FAILURE(status)) {
   2491       return;
   2492    }
   2493    callback = fFindProgressCallbackFn;
   2494    context  = fFindProgressCallbackContext;
   2495 }
   2496 
   2497 
   2498 //================================================================================
   2499 //
   2500 //    Code following this point in this file is the internal
   2501 //    Match Engine Implementation.
   2502 //
   2503 //================================================================================
   2504 
   2505 
   2506 //--------------------------------------------------------------------------------
   2507 //
   2508 //   resetStack
   2509 //           Discard any previous contents of the state save stack, and initialize a
   2510 //           new stack frame to all -1.  The -1s are needed for capture group limits,
   2511 //           where they indicate that a group has not yet matched anything.
   2512 //--------------------------------------------------------------------------------
   2513 REStackFrame *RegexMatcher::resetStack() {
   2514    // Discard any previous contents of the state save stack, and initialize a
   2515    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
   2516    //  where they indicate that a group has not yet matched anything.
   2517    fStack->removeAllElements();
   2518 
   2519    REStackFrame* iFrame = reinterpret_cast<REStackFrame*>(fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus));
   2520    if(U_FAILURE(fDeferredStatus)) {
   2521        return nullptr;
   2522    }
   2523 
   2524    int32_t i;
   2525    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
   2526        iFrame->fExtra[i] = -1;
   2527    }
   2528    return iFrame;
   2529 }
   2530 
   2531 
   2532 
   2533 //--------------------------------------------------------------------------------
   2534 //
   2535 //   isWordBoundary
   2536 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
   2537 //                     For us,
   2538 //                       If the current char is a combining mark,
   2539 //                          \b is false.
   2540 //                       Else Scan backwards to the first non-combining char.
   2541 //                            We are at a boundary if this char and the original char are
   2542 //                               opposite in membership in \w set
   2543 //
   2544 //          parameters:   pos   - the current position in the input buffer
   2545 //
   2546 //              TODO:  double-check edge cases at region boundaries.
   2547 //
   2548 //--------------------------------------------------------------------------------
   2549 UBool RegexMatcher::isWordBoundary(int64_t pos) {
   2550    UBool isBoundary = false;
   2551    UBool cIsWord    = false;
   2552 
   2553    if (pos >= fLookLimit) {
   2554        fHitEnd = true;
   2555    } else {
   2556        // Determine whether char c at current position is a member of the word set of chars.
   2557        // If we're off the end of the string, behave as though we're not at a word char.
   2558        UTEXT_SETNATIVEINDEX(fInputText, pos);
   2559        UChar32  c = UTEXT_CURRENT32(fInputText);
   2560        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2561            // Current char is a combining one.  Not a boundary.
   2562            return false;
   2563        }
   2564        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
   2565    }
   2566 
   2567    // Back up until we come to a non-combining char, determine whether
   2568    //  that char is a word char.
   2569    UBool prevCIsWord = false;
   2570    for (;;) {
   2571        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
   2572            break;
   2573        }
   2574        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
   2575        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2576              || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2577            prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
   2578            break;
   2579        }
   2580    }
   2581    isBoundary = cIsWord ^ prevCIsWord;
   2582    return isBoundary;
   2583 }
   2584 
   2585 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
   2586    UBool isBoundary = false;
   2587    UBool cIsWord    = false;
   2588 
   2589    const char16_t *inputBuf = fInputText->chunkContents;
   2590 
   2591    if (pos >= fLookLimit) {
   2592        fHitEnd = true;
   2593    } else {
   2594        // Determine whether char c at current position is a member of the word set of chars.
   2595        // If we're off the end of the string, behave as though we're not at a word char.
   2596        UChar32 c;
   2597        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
   2598        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
   2599            // Current char is a combining one.  Not a boundary.
   2600            return false;
   2601        }
   2602        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
   2603    }
   2604 
   2605    // Back up until we come to a non-combining char, determine whether
   2606    //  that char is a word char.
   2607    UBool prevCIsWord = false;
   2608    for (;;) {
   2609        if (pos <= fLookStart) {
   2610            break;
   2611        }
   2612        UChar32 prevChar;
   2613        U16_PREV(inputBuf, fLookStart, pos, prevChar);
   2614        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
   2615              || u_charType(prevChar) == U_FORMAT_CHAR)) {
   2616            prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
   2617            break;
   2618        }
   2619    }
   2620    isBoundary = cIsWord ^ prevCIsWord;
   2621    return isBoundary;
   2622 }
   2623 
   2624 //--------------------------------------------------------------------------------
   2625 //
   2626 //   isUWordBoundary
   2627 //
   2628 //         Test for a word boundary using RBBI word break.
   2629 //
   2630 //          parameters:   pos   - the current position in the input buffer
   2631 //
   2632 //--------------------------------------------------------------------------------
   2633 UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
   2634    UBool       returnVal = false;
   2635 
   2636 #if UCONFIG_NO_BREAK_ITERATION==0
   2637    // Note: this point will never be reached if break iteration is configured out.
   2638    //       Regex patterns that would require this function will fail to compile.
   2639 
   2640    // If we haven't yet created a break iterator for this matcher, do it now.
   2641    if (fWordBreakItr == nullptr) {
   2642        fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
   2643        if (U_FAILURE(status)) {
   2644            return false;
   2645        }
   2646        fWordBreakItr->setText(fInputText, status);
   2647    }
   2648 
   2649    // Note: zero width boundary tests like \b see through transparent region bounds,
   2650    //       which is why fLookLimit is used here, rather than fActiveLimit.
   2651    if (pos >= fLookLimit) {
   2652        fHitEnd = true;
   2653        returnVal = true;   // With Unicode word rules, only positions within the interior of "real"
   2654                            //    words are not boundaries.  All non-word chars stand by themselves,
   2655                            //    with word boundaries on both sides.
   2656    } else {
   2657        returnVal = fWordBreakItr->isBoundary(static_cast<int32_t>(pos));
   2658    }
   2659 #endif
   2660    return   returnVal;
   2661 }
   2662 
   2663 
   2664 int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
   2665    int64_t result = pos;
   2666 
   2667 #if UCONFIG_NO_BREAK_ITERATION==0
   2668    // Note: this point will never be reached if break iteration is configured out.
   2669    //       Regex patterns that would require this function will fail to compile.
   2670 
   2671    // If we haven't yet created a break iterator for this matcher, do it now.
   2672    if (fGCBreakItr == nullptr) {
   2673        fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
   2674        if (U_FAILURE(status)) {
   2675            return pos;
   2676        }
   2677        fGCBreakItr->setText(fInputText, status);
   2678    }
   2679    result = fGCBreakItr->following(pos);
   2680    if (result == BreakIterator::DONE) {
   2681        result = pos;
   2682    }
   2683 #endif
   2684    return result;
   2685 }
   2686 
   2687 //--------------------------------------------------------------------------------
   2688 //
   2689 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
   2690 //                     saves. Increment the "time" counter, and call the
   2691 //                     user callback function if there is one installed.
   2692 //
   2693 //                     If the match operation needs to be aborted, either for a time-out
   2694 //                     or because the user callback asked for it, just set an error status.
   2695 //                     The engine will pick that up and stop in its outer loop.
   2696 //
   2697 //--------------------------------------------------------------------------------
   2698 void RegexMatcher::IncrementTime(UErrorCode &status) {
   2699    fTickCounter = TIMER_INITIAL_VALUE;
   2700    fTime++;
   2701    if (fCallbackFn != nullptr) {
   2702        if ((*fCallbackFn)(fCallbackContext, fTime) == false) {
   2703            status = U_REGEX_STOPPED_BY_CALLER;
   2704            return;
   2705        }
   2706    }
   2707    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
   2708        status = U_REGEX_TIME_OUT;
   2709    }
   2710 }
   2711 
   2712 //--------------------------------------------------------------------------------
   2713 //
   2714 //   StateSave
   2715 //       Make a new stack frame, initialized as a copy of the current stack frame.
   2716 //       Set the pattern index in the original stack frame from the operand value
   2717 //       in the opcode.  Execution of the engine continues with the state in
   2718 //       the newly created stack frame
   2719 //
   2720 //       Note that reserveBlock() may grow the stack, resulting in the
   2721 //       whole thing being relocated in memory.
   2722 //
   2723 //    Parameters:
   2724 //       fp           The top frame pointer when called.  At return, a new
   2725 //                    frame will be present
   2726 //       savePatIdx   An index into the compiled pattern.  Goes into the original
   2727 //                    (not new) frame.  If execution ever back-tracks out of the
   2728 //                    new frame, this will be where we continue from in the pattern.
   2729 //    Return
   2730 //                    The new frame pointer.
   2731 //
   2732 //--------------------------------------------------------------------------------
   2733 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
   2734    if (U_FAILURE(status)) {
   2735        return fp;
   2736    }
   2737    // push storage for a new frame.
   2738    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
   2739    if (U_FAILURE(status)) {
   2740        // Failure on attempted stack expansion.
   2741        //   Stack function set some other error code, change it to a more
   2742        //   specific one for regular expressions.
   2743        status = U_REGEX_STACK_OVERFLOW;
   2744        // We need to return a writable stack frame, so just return the
   2745        //    previous frame.  The match operation will stop quickly
   2746        //    because of the error status, after which the frame will never
   2747        //    be looked at again.
   2748        return fp;
   2749    }
   2750    fp = reinterpret_cast<REStackFrame*>(newFP - fFrameSize); // in case of realloc of stack.
   2751 
   2752    // New stack frame = copy of old top frame.
   2753    int64_t* source = reinterpret_cast<int64_t*>(fp);
   2754    int64_t *dest   = newFP;
   2755    for (;;) {
   2756        *dest++ = *source++;
   2757        if (source == newFP) {
   2758            break;
   2759        }
   2760    }
   2761 
   2762    fTickCounter--;
   2763    if (fTickCounter <= 0) {
   2764       IncrementTime(status);    // Re-initializes fTickCounter
   2765    }
   2766    fp->fPatIdx = savePatIdx;
   2767    return reinterpret_cast<REStackFrame*>(newFP);
   2768 }
   2769 
   2770 #if defined(REGEX_DEBUG)
   2771 namespace {
   2772 UnicodeString StringFromUText(UText *ut) {
   2773    UnicodeString result;
   2774    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
   2775        result.append(c);
   2776    }
   2777    return result;
   2778 }
   2779 }
   2780 #endif // REGEX_DEBUG
   2781 
   2782 
   2783 //--------------------------------------------------------------------------------
   2784 //
   2785 //   MatchAt      This is the actual matching engine.
   2786 //
   2787 //                  startIdx:    begin matching at this index.
   2788 //                  toEnd:       if true, match must extend to end of the input region
   2789 //
   2790 //--------------------------------------------------------------------------------
   2791 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
   2792    UBool       isMatch  = false;      // True if the we have a match.
   2793 
   2794    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
   2795 
   2796    int32_t     op;                    // Operation from the compiled pattern, split into
   2797    int32_t     opType;                //    the opcode
   2798    int32_t     opValue;               //    and the operand value.
   2799 
   2800 #ifdef REGEX_RUN_DEBUG
   2801    if (fTraceDebug) {
   2802        printf("MatchAt(startIdx=%ld)\n", startIdx);
   2803        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
   2804        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
   2805    }
   2806 #endif
   2807 
   2808    if (U_FAILURE(status)) {
   2809        return;
   2810    }
   2811 
   2812    //  Cache frequently referenced items from the compiled pattern
   2813    //
   2814    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   2815 
   2816    const char16_t      *litText       = fPattern->fLiteralText.getBuffer();
   2817    UVector             *fSets         = fPattern->fSets;
   2818 
   2819    fFrameSize = fPattern->fFrameSize;
   2820    REStackFrame        *fp            = resetStack();
   2821    if (U_FAILURE(fDeferredStatus)) {
   2822        status = fDeferredStatus;
   2823        return;
   2824    }
   2825 
   2826    fp->fPatIdx   = 0;
   2827    fp->fInputIdx = startIdx;
   2828 
   2829    // Zero out the pattern's static data
   2830    int32_t i;
   2831    for (i = 0; i<fPattern->fDataSize; i++) {
   2832        fData[i] = 0;
   2833    }
   2834 
   2835    //
   2836    //  Main loop for interpreting the compiled pattern.
   2837    //  One iteration of the loop per pattern operation performed.
   2838    //
   2839    for (;;) {
   2840        op = static_cast<int32_t>(pat[fp->fPatIdx]);
   2841        opType  = URX_TYPE(op);
   2842        opValue = URX_VAL(op);
   2843 #ifdef REGEX_RUN_DEBUG
   2844        if (fTraceDebug) {
   2845            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2846            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   2847                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   2848            fPattern->dumpOp(fp->fPatIdx);
   2849        }
   2850 #endif
   2851        fp->fPatIdx++;
   2852 
   2853        switch (opType) {
   2854 
   2855 
   2856        case URX_NOP:
   2857            break;
   2858 
   2859 
   2860        case URX_BACKTRACK:
   2861            // Force a backtrack.  In some circumstances, the pattern compiler
   2862            //   will notice that the pattern can't possibly match anything, and will
   2863            //   emit one of these at that point.
   2864            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   2865            break;
   2866 
   2867 
   2868        case URX_ONECHAR:
   2869            if (fp->fInputIdx < fActiveLimit) {
   2870                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2871                UChar32 c = UTEXT_NEXT32(fInputText);
   2872                if (c == opValue) {
   2873                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2874                    break;
   2875                }
   2876            } else {
   2877                fHitEnd = true;
   2878            }
   2879            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   2880            break;
   2881 
   2882 
   2883        case URX_STRING:
   2884            {
   2885                // Test input against a literal string.
   2886                // Strings require two slots in the compiled pattern, one for the
   2887                //   offset to the string text, and one for the length.
   2888 
   2889                int32_t   stringStartIdx = opValue;
   2890                op = static_cast<int32_t>(pat[fp->fPatIdx]); // Fetch the second operand
   2891                fp->fPatIdx++;
   2892                opType    = URX_TYPE(op);
   2893                int32_t stringLen = URX_VAL(op);
   2894                U_ASSERT(opType == URX_STRING_LEN);
   2895                U_ASSERT(stringLen >= 2);
   2896 
   2897                const char16_t *patternString = litText+stringStartIdx;
   2898                int32_t patternStringIndex = 0;
   2899                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2900                UChar32 inputChar;
   2901                UChar32 patternChar;
   2902                UBool success = true;
   2903                while (patternStringIndex < stringLen) {
   2904                    if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   2905                        success = false;
   2906                        fHitEnd = true;
   2907                        break;
   2908                    }
   2909                    inputChar = UTEXT_NEXT32(fInputText);
   2910                    U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
   2911                    if (patternChar != inputChar) {
   2912                        success = false;
   2913                        break;
   2914                    }
   2915                }
   2916 
   2917                if (success) {
   2918                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   2919                } else {
   2920                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   2921                }
   2922            }
   2923            break;
   2924 
   2925 
   2926        case URX_STATE_SAVE:
   2927            fp = StateSave(fp, opValue, status);
   2928            break;
   2929 
   2930 
   2931        case URX_END:
   2932            // The match loop will exit via this path on a successful match,
   2933            //   when we reach the end of the pattern.
   2934            if (toEnd && fp->fInputIdx != fActiveLimit) {
   2935                // The pattern matched, but not to the end of input.  Try some more.
   2936                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   2937                break;
   2938            }
   2939            isMatch = true;
   2940            goto  breakFromLoop;
   2941 
   2942        // Start and End Capture stack frame variables are laid out out like this:
   2943            //  fp->fExtra[opValue]  - The start of a completed capture group
   2944            //             opValue+1 - The end   of a completed capture group
   2945            //             opValue+2 - the start of a capture group whose end
   2946            //                          has not yet been reached (and might not ever be).
   2947        case URX_START_CAPTURE:
   2948            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2949            fp->fExtra[opValue+2] = fp->fInputIdx;
   2950            break;
   2951 
   2952 
   2953        case URX_END_CAPTURE:
   2954            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   2955            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   2956            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   2957            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   2958            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   2959            break;
   2960 
   2961 
   2962        case URX_DOLLAR:                   //  $, test for End of line
   2963                                           //     or for position before new line at end of input
   2964            {
   2965                if (fp->fInputIdx >= fAnchorLimit) {
   2966                    // We really are at the end of input.  Success.
   2967                    fHitEnd = true;
   2968                    fRequireEnd = true;
   2969                    break;
   2970                }
   2971 
   2972                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   2973 
   2974                // If we are positioned just before a new-line that is located at the
   2975                //   end of input, succeed.
   2976                UChar32 c = UTEXT_NEXT32(fInputText);
   2977                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2978                    if (isLineTerminator(c)) {
   2979                        // If not in the middle of a CR/LF sequence
   2980                        if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
   2981                            // At new-line at end of input. Success
   2982                            fHitEnd = true;
   2983                            fRequireEnd = true;
   2984 
   2985                            break;
   2986                        }
   2987                    }
   2988                } else {
   2989                    UChar32 nextC = UTEXT_NEXT32(fInputText);
   2990                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
   2991                        fHitEnd = true;
   2992                        fRequireEnd = true;
   2993                        break;                         // At CR/LF at end of input.  Success
   2994                    }
   2995                }
   2996 
   2997                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   2998            }
   2999            break;
   3000 
   3001 
   3002         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   3003            if (fp->fInputIdx >= fAnchorLimit) {
   3004                // Off the end of input.  Success.
   3005                fHitEnd = true;
   3006                fRequireEnd = true;
   3007                break;
   3008            } else {
   3009                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3010                UChar32 c = UTEXT_NEXT32(fInputText);
   3011                // Either at the last character of input, or off the end.
   3012                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
   3013                    fHitEnd = true;
   3014                    fRequireEnd = true;
   3015                    break;
   3016                }
   3017            }
   3018 
   3019            // Not at end of input.  Back-track out.
   3020            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3021            break;
   3022 
   3023 
   3024         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   3025             {
   3026                 if (fp->fInputIdx >= fAnchorLimit) {
   3027                     // We really are at the end of input.  Success.
   3028                     fHitEnd = true;
   3029                     fRequireEnd = true;
   3030                     break;
   3031                 }
   3032                 // If we are positioned just before a new-line, succeed.
   3033                 // It makes no difference where the new-line is within the input.
   3034                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3035                 UChar32 c = UTEXT_CURRENT32(fInputText);
   3036                 if (isLineTerminator(c)) {
   3037                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   3038                     //  In multi-line mode, hitting a new-line just before the end of input does not
   3039                     //   set the hitEnd or requireEnd flags
   3040                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
   3041                        break;
   3042                     }
   3043                 }
   3044                 // not at a new line.  Fail.
   3045                 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3046             }
   3047             break;
   3048 
   3049 
   3050         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   3051             {
   3052                 if (fp->fInputIdx >= fAnchorLimit) {
   3053                     // We really are at the end of input.  Success.
   3054                     fHitEnd = true;
   3055                     fRequireEnd = true;  // Java set requireEnd in this case, even though
   3056                     break;               //   adding a new-line would not lose the match.
   3057                 }
   3058                 // If we are not positioned just before a new-line, the test fails; backtrack out.
   3059                 // It makes no difference where the new-line is within the input.
   3060                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3061                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
   3062                     fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3063                 }
   3064             }
   3065             break;
   3066 
   3067 
   3068       case URX_CARET:                    //  ^, test for start of line
   3069            if (fp->fInputIdx != fAnchorStart) {
   3070                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3071            }
   3072            break;
   3073 
   3074 
   3075       case URX_CARET_M:                   //  ^, test for start of line in multi-line mode
   3076           {
   3077               if (fp->fInputIdx == fAnchorStart) {
   3078                   // We are at the start of input.  Success.
   3079                   break;
   3080               }
   3081               // Check whether character just before the current pos is a new-line
   3082               //   unless we are at the end of input
   3083               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3084               UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3085               if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
   3086                   //  It's a new-line.  ^ is true.  Success.
   3087                   //  TODO:  what should be done with positions between a CR and LF?
   3088                   break;
   3089               }
   3090               // Not at the start of a line.  Fail.
   3091               fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3092           }
   3093           break;
   3094 
   3095 
   3096       case URX_CARET_M_UNIX:       //  ^, test for start of line in multi-line + Unix-line mode
   3097           {
   3098               U_ASSERT(fp->fInputIdx >= fAnchorStart);
   3099               if (fp->fInputIdx <= fAnchorStart) {
   3100                   // We are at the start of input.  Success.
   3101                   break;
   3102               }
   3103               // Check whether character just before the current pos is a new-line
   3104               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   3105               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3106               UChar32  c = UTEXT_PREVIOUS32(fInputText);
   3107               if (c != 0x0a) {
   3108                   // Not at the start of a line.  Back-track out.
   3109                   fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3110               }
   3111           }
   3112           break;
   3113 
   3114        case URX_BACKSLASH_B:          // Test for word boundaries
   3115            {
   3116                UBool success = isWordBoundary(fp->fInputIdx);
   3117                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
   3118                if (!success) {
   3119                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3120                }
   3121            }
   3122            break;
   3123 
   3124 
   3125        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   3126            {
   3127                UBool success = isUWordBoundary(fp->fInputIdx, status);
   3128                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
   3129                if (!success) {
   3130                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3131                }
   3132            }
   3133            break;
   3134 
   3135 
   3136        case URX_BACKSLASH_D:            // Test for decimal digit
   3137            {
   3138                if (fp->fInputIdx >= fActiveLimit) {
   3139                    fHitEnd = true;
   3140                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3141                    break;
   3142                }
   3143 
   3144                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3145 
   3146                UChar32 c = UTEXT_NEXT32(fInputText);
   3147                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   3148                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   3149                success ^= static_cast<UBool>(opValue != 0); // flip sense for \D
   3150                if (success) {
   3151                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3152                } else {
   3153                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3154                }
   3155            }
   3156            break;
   3157 
   3158 
   3159        case URX_BACKSLASH_G:          // Test for position at end of previous match
   3160            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
   3161                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3162            }
   3163            break;
   3164 
   3165 
   3166        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
   3167            {
   3168                if (fp->fInputIdx >= fActiveLimit) {
   3169                    fHitEnd = true;
   3170                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3171                    break;
   3172                }
   3173                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3174                UChar32 c = UTEXT_NEXT32(fInputText);
   3175                int8_t ctype = u_charType(c);
   3176                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
   3177                success ^= static_cast<UBool>(opValue != 0);  // flip sense for \H
   3178                if (success) {
   3179                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3180                } else {
   3181                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3182                }
   3183            }
   3184            break;
   3185 
   3186 
   3187        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
   3188            {
   3189                if (fp->fInputIdx >= fActiveLimit) {
   3190                    fHitEnd = true;
   3191                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3192                    break;
   3193                }
   3194                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3195                UChar32 c = UTEXT_NEXT32(fInputText);
   3196                if (isLineTerminator(c)) {
   3197                    if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
   3198                        utext_next32(fInputText);
   3199                    }
   3200                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3201                } else {
   3202                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3203                }
   3204            }
   3205            break;
   3206 
   3207 
   3208        case URX_BACKSLASH_V:            // \v, any single line ending character.
   3209            {
   3210                if (fp->fInputIdx >= fActiveLimit) {
   3211                    fHitEnd = true;
   3212                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3213                    break;
   3214                }
   3215                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3216                UChar32 c = UTEXT_NEXT32(fInputText);
   3217                UBool success = isLineTerminator(c);
   3218                success ^= static_cast<UBool>(opValue != 0); // flip sense for \V
   3219                if (success) {
   3220                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3221                } else {
   3222                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3223                }
   3224            }
   3225            break;
   3226 
   3227 
   3228        case URX_BACKSLASH_X:
   3229            //  Match a Grapheme, as defined by Unicode UAX 29.
   3230 
   3231            // Fail if at end of input
   3232            if (fp->fInputIdx >= fActiveLimit) {
   3233                fHitEnd = true;
   3234                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3235                break;
   3236            }
   3237 
   3238            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
   3239            if (fp->fInputIdx >= fActiveLimit) {
   3240                fHitEnd = true;
   3241                fp->fInputIdx = fActiveLimit;
   3242            }
   3243            break;
   3244 
   3245 
   3246        case URX_BACKSLASH_Z:          // Test for end of Input
   3247            if (fp->fInputIdx < fAnchorLimit) {
   3248                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3249            } else {
   3250                fHitEnd = true;
   3251                fRequireEnd = true;
   3252            }
   3253            break;
   3254 
   3255 
   3256 
   3257        case URX_STATIC_SETREF:
   3258            {
   3259                // Test input character against one of the predefined sets
   3260                //    (Word Characters, for example)
   3261                // The high bit of the op value is a flag for the match polarity.
   3262                //    0:   success if input char is in set.
   3263                //    1:   success if input char is not in set.
   3264                if (fp->fInputIdx >= fActiveLimit) {
   3265                    fHitEnd = true;
   3266                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3267                    break;
   3268                }
   3269 
   3270                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   3271                opValue &= ~URX_NEG_SET;
   3272                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3273 
   3274                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3275                UChar32 c = UTEXT_NEXT32(fInputText);
   3276                if (c < 256) {
   3277                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
   3278                    if (s8.contains(c)) {
   3279                        success = !success;
   3280                    }
   3281                } else {
   3282                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
   3283                    if (s.contains(c)) {
   3284                        success = !success;
   3285                    }
   3286                }
   3287                if (success) {
   3288                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3289                } else {
   3290                    // the character failed the set test (membership combined with the polarity flag).
   3291                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3292                }
   3293            }
   3294            break;
   3295 
   3296 
   3297        case URX_STAT_SETREF_N:
   3298            {
   3299                // Test input character for NOT being a member of  one of
   3300                //    the predefined sets (Word Characters, for example)
   3301                if (fp->fInputIdx >= fActiveLimit) {
   3302                    fHitEnd = true;
   3303                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3304                    break;
   3305                }
   3306 
   3307                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   3308 
   3309                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3310 
   3311                UChar32 c = UTEXT_NEXT32(fInputText);
   3312                if (c < 256) {
   3313                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
   3314                    if (s8.contains(c) == false) {
   3315                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3316                        break;
   3317                    }
   3318                } else {
   3319                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
   3320                    if (s.contains(c) == false) {
   3321                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3322                        break;
   3323                    }
   3324                }
   3325                // the character was in the set, so this negated test fails.  Backtrack.
   3326                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3327            }
   3328            break;
   3329 
   3330 
   3331        case URX_SETREF:
   3332            if (fp->fInputIdx >= fActiveLimit) {
   3333                fHitEnd = true;
   3334                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3335                break;
   3336            } else {
   3337                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3338 
   3339                // There is input left.  Pick up one char and test it for set membership.
   3340                UChar32 c = UTEXT_NEXT32(fInputText);
   3341                U_ASSERT(opValue > 0 && opValue < fSets->size());
   3342                if (c<256) {
   3343                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   3344                    if (s8->contains(c)) {
   3345                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3346                        break;
   3347                    }
   3348                } else {
   3349                    UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
   3350                    if (s->contains(c)) {
   3351                        // The character is in the set.  A Match.
   3352                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3353                        break;
   3354                    }
   3355                }
   3356 
   3357                // the character wasn't in the set.
   3358                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3359            }
   3360            break;
   3361 
   3362 
   3363        case URX_DOTANY:
   3364            {
   3365                // . matches anything, but stops at end-of-line.
   3366                if (fp->fInputIdx >= fActiveLimit) {
   3367                    // At end of input.  Match failed.  Backtrack out.
   3368                    fHitEnd = true;
   3369                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3370                    break;
   3371                }
   3372 
   3373                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3374 
   3375                // There is input left.  Advance over one char, unless we've hit end-of-line
   3376                UChar32 c = UTEXT_NEXT32(fInputText);
   3377                if (isLineTerminator(c)) {
   3378                    // End of line in normal mode.   . does not match.
   3379                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3380                    break;
   3381                }
   3382                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3383            }
   3384            break;
   3385 
   3386 
   3387        case URX_DOTANY_ALL:
   3388            {
   3389                // ., in dot-matches-all (including new lines) mode
   3390                if (fp->fInputIdx >= fActiveLimit) {
   3391                    // At end of input.  Match failed.  Backtrack out.
   3392                    fHitEnd = true;
   3393                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3394                    break;
   3395                }
   3396 
   3397                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3398 
   3399                // There is input left.  Advance over one char, except if we are
   3400                //   at a cr/lf, advance over both of them.
   3401                UChar32 c;
   3402                c = UTEXT_NEXT32(fInputText);
   3403                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3404                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   3405                    // In the case of a CR/LF, we need to advance over both.
   3406                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
   3407                    if (nextc == 0x0a) {
   3408                        (void)UTEXT_NEXT32(fInputText);
   3409                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3410                    }
   3411                }
   3412            }
   3413            break;
   3414 
   3415 
   3416        case URX_DOTANY_UNIX:
   3417            {
   3418                // '.' operator, matches all, but stops at end-of-line.
   3419                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   3420                if (fp->fInputIdx >= fActiveLimit) {
   3421                    // At end of input.  Match failed.  Backtrack out.
   3422                    fHitEnd = true;
   3423                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3424                    break;
   3425                }
   3426 
   3427                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3428 
   3429                // There is input left.  Advance over one char, unless we've hit end-of-line
   3430                UChar32 c = UTEXT_NEXT32(fInputText);
   3431                if (c == 0x0a) {
   3432                    // End of line in UNIX_LINES mode.   '.' does not match the \n
   3433                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3434                } else {
   3435                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3436                }
   3437            }
   3438            break;
   3439 
   3440 
   3441        case URX_JMP:
   3442            fp->fPatIdx = opValue;
   3443            break;
   3444 
   3445        case URX_FAIL:
   3446            isMatch = false;
   3447            goto breakFromLoop;
   3448 
   3449        case URX_JMP_SAV:
   3450            U_ASSERT(opValue < fPattern->fCompiledPat->size());
   3451            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   3452            fp->fPatIdx = opValue;                         // Then JMP.
   3453            break;
   3454 
   3455        case URX_JMP_SAV_X:
   3456            // This opcode is used with (x)+, when x can match a zero length string.
   3457            // Same as JMP_SAV, except conditional on the match having made forward progress.
   3458            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   3459            //   data address of the input position at the start of the loop.
   3460            {
   3461                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   3462                int32_t stoOp = static_cast<int32_t>(pat[opValue - 1]);
   3463                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   3464                int32_t  frameLoc = URX_VAL(stoOp);
   3465                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   3466                int64_t prevInputIdx = fp->fExtra[frameLoc];
   3467                U_ASSERT(prevInputIdx <= fp->fInputIdx);
   3468                if (prevInputIdx < fp->fInputIdx) {
   3469                    // The match did make progress.  Repeat the loop.
   3470                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   3471                    fp->fPatIdx = opValue;
   3472                    fp->fExtra[frameLoc] = fp->fInputIdx;
   3473                }
   3474                // If the input position did not advance, we do nothing here,
   3475                //   execution will fall out of the loop.
   3476            }
   3477            break;
   3478 
   3479        case URX_CTR_INIT:
   3480            {
   3481                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3482                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3483 
   3484                // Pick up the three extra operands that CTR_INIT has, and
   3485                //    skip the pattern location counter past
   3486                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
   3487                fp->fPatIdx += 3;
   3488                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3489                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
   3490                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
   3491                U_ASSERT(minCount>=0);
   3492                U_ASSERT(maxCount>=minCount || maxCount==-1);
   3493                U_ASSERT(loopLoc>=fp->fPatIdx);
   3494 
   3495                if (minCount == 0) {
   3496                    fp = StateSave(fp, loopLoc+1, status);
   3497                }
   3498                if (maxCount == -1) {
   3499                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   3500                } else if (maxCount == 0) {
   3501                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3502                }
   3503            }
   3504            break;
   3505 
   3506        case URX_CTR_LOOP:
   3507            {
   3508                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3509                int32_t initOp = static_cast<int32_t>(pat[opValue]);
   3510                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   3511                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3512                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
   3513                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
   3514                (*pCounter)++;
   3515                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
   3516                    U_ASSERT(*pCounter == maxCount);
   3517                    break;
   3518                }
   3519                if (*pCounter >= minCount) {
   3520                    if (maxCount == -1) {
   3521                        // Loop has no hard upper bound.
   3522                        // Check that it is progressing through the input, break if it is not.
   3523                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3524                        if (fp->fInputIdx == *pLastInputIdx) {
   3525                            break;
   3526                        } else {
   3527                            *pLastInputIdx = fp->fInputIdx;
   3528                        }
   3529                    }
   3530                    fp = StateSave(fp, fp->fPatIdx, status);
   3531                } else {
   3532                    // Increment time-out counter. (StateSave() does it if count >= minCount)
   3533                    fTickCounter--;
   3534                    if (fTickCounter <= 0) {
   3535                        IncrementTime(status);    // Re-initializes fTickCounter
   3536                    }
   3537                }
   3538 
   3539                fp->fPatIdx = opValue + 4;    // Loop back.
   3540            }
   3541            break;
   3542 
   3543        case URX_CTR_INIT_NG:
   3544            {
   3545                // Initialize a non-greedy loop
   3546                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   3547                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   3548 
   3549                // Pick up the three extra operands that CTR_INIT_NG has, and
   3550                //    skip the pattern location counter past
   3551                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
   3552                fp->fPatIdx += 3;
   3553                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   3554                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
   3555                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
   3556                U_ASSERT(minCount>=0);
   3557                U_ASSERT(maxCount>=minCount || maxCount==-1);
   3558                U_ASSERT(loopLoc>fp->fPatIdx);
   3559                if (maxCount == -1) {
   3560                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   3561                }
   3562 
   3563                if (minCount == 0) {
   3564                    if (maxCount != 0) {
   3565                        fp = StateSave(fp, fp->fPatIdx, status);
   3566                    }
   3567                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   3568                }
   3569            }
   3570            break;
   3571 
   3572        case URX_CTR_LOOP_NG:
   3573            {
   3574                // Non-greedy {min, max} loops
   3575                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   3576                int32_t initOp = static_cast<int32_t>(pat[opValue]);
   3577                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   3578                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   3579                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
   3580                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
   3581 
   3582                (*pCounter)++;
   3583                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
   3584                    // The loop has matched the maximum permitted number of times.
   3585                    //   Break out of here with no action.  Matching will
   3586                    //   continue with the following pattern.
   3587                    U_ASSERT(*pCounter == maxCount);
   3588                    break;
   3589                }
   3590 
   3591                if (*pCounter < minCount) {
   3592                    // We haven't met the minimum number of matches yet.
   3593                    //   Loop back for another one.
   3594                    fp->fPatIdx = opValue + 4;    // Loop back.
   3595                    // Increment time-out counter. (StateSave() does it if count >= minCount)
   3596                    fTickCounter--;
   3597                    if (fTickCounter <= 0) {
   3598                        IncrementTime(status);    // Re-initializes fTickCounter
   3599                    }
   3600                } else {
   3601                    // We do have the minimum number of matches.
   3602 
   3603                    // If there is no upper bound on the loop iterations, check that the input index
   3604                    // is progressing, and stop the loop if it is not.
   3605                    if (maxCount == -1) {
   3606                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   3607                        if (fp->fInputIdx == *pLastInputIdx) {
   3608                            break;
   3609                        }
   3610                        *pLastInputIdx = fp->fInputIdx;
   3611                    }
   3612 
   3613                    // Loop Continuation: we will fall into the pattern following the loop
   3614                    //   (non-greedy, don't execute loop body first), but first do
   3615                    //   a state save to the top of the loop, so that a match failure
   3616                    //   in the following pattern will try another iteration of the loop.
   3617                    fp = StateSave(fp, opValue + 4, status);
   3618                }
   3619            }
   3620            break;
   3621 
   3622        case URX_STO_SP:
   3623            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3624            fData[opValue] = fStack->size();
   3625            break;
   3626 
   3627        case URX_LD_SP:
   3628            {
   3629                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   3630                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
   3631                U_ASSERT(newStackSize <= fStack->size());
   3632                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3633                if (newFP == reinterpret_cast<int64_t*>(fp)) {
   3634                    break;
   3635                }
   3636                int32_t j;
   3637                for (j=0; j<fFrameSize; j++) {
   3638                    newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
   3639                }
   3640                fp = reinterpret_cast<REStackFrame*>(newFP);
   3641                fStack->setSize(newStackSize);
   3642            }
   3643            break;
   3644 
   3645        case URX_BACKREF:
   3646            {
   3647                U_ASSERT(opValue < fFrameSize);
   3648                int64_t groupStartIdx = fp->fExtra[opValue];
   3649                int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3650                U_ASSERT(groupStartIdx <= groupEndIdx);
   3651                if (groupStartIdx < 0) {
   3652                    // This capture group has not participated in the match thus far,
   3653                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
   3654                    break;
   3655                }
   3656                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
   3657                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3658 
   3659                //   Note: if the capture group match was of an empty string the backref
   3660                //         match succeeds.  Verified by testing:  Perl matches succeed
   3661                //         in this case, so we do too.
   3662 
   3663                UBool success = true;
   3664                for (;;) {
   3665                    if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3666                        success = true;
   3667                        break;
   3668                    }
   3669                    if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3670                        success = false;
   3671                        fHitEnd = true;
   3672                        break;
   3673                    }
   3674                    UChar32 captureGroupChar = utext_next32(fAltInputText);
   3675                    UChar32 inputChar = utext_next32(fInputText);
   3676                    if (inputChar != captureGroupChar) {
   3677                        success = false;
   3678                        break;
   3679                    }
   3680                }
   3681 
   3682                if (success) {
   3683                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3684                } else {
   3685                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3686                }
   3687            }
   3688            break;
   3689 
   3690 
   3691 
   3692        case URX_BACKREF_I:
   3693            {
   3694                U_ASSERT(opValue < fFrameSize);
   3695                int64_t groupStartIdx = fp->fExtra[opValue];
   3696                int64_t groupEndIdx   = fp->fExtra[opValue+1];
   3697                U_ASSERT(groupStartIdx <= groupEndIdx);
   3698                if (groupStartIdx < 0) {
   3699                    // This capture group has not participated in the match thus far,
   3700                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
   3701                    break;
   3702                }
   3703                utext_setNativeIndex(fAltInputText, groupStartIdx);
   3704                utext_setNativeIndex(fInputText, fp->fInputIdx);
   3705                CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
   3706                CaseFoldingUTextIterator inputItr(*fInputText);
   3707 
   3708                //   Note: if the capture group match was of an empty string the backref
   3709                //         match succeeds.  Verified by testing:  Perl matches succeed
   3710                //         in this case, so we do too.
   3711 
   3712                UBool success = true;
   3713                for (;;) {
   3714                    if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
   3715                        success = true;
   3716                        break;
   3717                    }
   3718                    if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
   3719                        success = false;
   3720                        fHitEnd = true;
   3721                        break;
   3722                    }
   3723                    UChar32 captureGroupChar = captureGroupItr.next();
   3724                    UChar32 inputChar = inputItr.next();
   3725                    if (inputChar != captureGroupChar) {
   3726                        success = false;
   3727                        break;
   3728                    }
   3729                }
   3730 
   3731                if (success && inputItr.inExpansion()) {
   3732                    // We obtained a match by consuming part of a string obtained from
   3733                    // case-folding a single code point of the input text.
   3734                    // This does not count as an overall match.
   3735                    success = false;
   3736                }
   3737 
   3738                if (success) {
   3739                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3740                } else {
   3741                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3742                }
   3743 
   3744            }
   3745            break;
   3746 
   3747        case URX_STO_INP_LOC:
   3748            {
   3749                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   3750                fp->fExtra[opValue] = fp->fInputIdx;
   3751            }
   3752            break;
   3753 
   3754        case URX_JMPX:
   3755            {
   3756                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
   3757                fp->fPatIdx += 1;
   3758                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   3759                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   3760                int64_t savedInputIdx = fp->fExtra[dataLoc];
   3761                U_ASSERT(savedInputIdx <= fp->fInputIdx);
   3762                if (savedInputIdx < fp->fInputIdx) {
   3763                    fp->fPatIdx = opValue;                               // JMP
   3764                } else {
   3765                     fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no progress in loop.
   3766                }
   3767            }
   3768            break;
   3769 
   3770        case URX_LA_START:
   3771            {
   3772                // Entering a look around block.
   3773                // Save Stack Ptr, Input Pos.
   3774                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
   3775                fData[opValue]   = fStack->size();
   3776                fData[opValue+1] = fp->fInputIdx;
   3777                fData[opValue+2] = fActiveStart;
   3778                fData[opValue+3] = fActiveLimit;
   3779                fActiveStart     = fLookStart;          // Set the match region change for
   3780                fActiveLimit     = fLookLimit;          //   transparent bounds.
   3781            }
   3782            break;
   3783 
   3784        case URX_LA_END:
   3785            {
   3786                // Leaving a look-ahead block.
   3787                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   3788                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
   3789                int32_t stackSize = fStack->size();
   3790                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
   3791                U_ASSERT(stackSize >= newStackSize);
   3792                if (stackSize > newStackSize) {
   3793                    // Copy the current top frame back to the new (cut back) top frame.
   3794                    //   This makes the capture groups from within the look-ahead
   3795                    //   expression available.
   3796                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   3797                    int32_t j;
   3798                    for (j=0; j<fFrameSize; j++) {
   3799                        newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
   3800                    }
   3801                    fp = reinterpret_cast<REStackFrame*>(newFP);
   3802                    fStack->setSize(newStackSize);
   3803                }
   3804                fp->fInputIdx = fData[opValue+1];
   3805 
   3806                // Restore the active region bounds in the input string; they may have
   3807                //    been changed because of transparent bounds on a Region.
   3808                fActiveStart = fData[opValue+2];
   3809                fActiveLimit = fData[opValue+3];
   3810                U_ASSERT(fActiveStart >= 0);
   3811                U_ASSERT(fActiveLimit <= fInputLength);
   3812            }
   3813            break;
   3814 
   3815        case URX_ONECHAR_I:
   3816            // Case insensitive one char.  The char from the pattern is already case folded.
   3817            // Input text is not, but case folding the input can not reduce two or more code
   3818            // points to one.
   3819            if (fp->fInputIdx < fActiveLimit) {
   3820                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3821 
   3822                UChar32 c = UTEXT_NEXT32(fInputText);
   3823                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   3824                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3825                    break;
   3826                }
   3827            } else {
   3828                fHitEnd = true;
   3829            }
   3830 
   3831            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3832            break;
   3833 
   3834        case URX_STRING_I:
   3835            {
   3836                // Case-insensitive test input against a literal string.
   3837                // Strings require two slots in the compiled pattern, one for the
   3838                //   offset to the string text, and one for the length.
   3839                //   The compiled string has already been case folded.
   3840                {
   3841                    const char16_t *patternString = litText + opValue;
   3842                    int32_t      patternStringIdx  = 0;
   3843 
   3844                    op = static_cast<int32_t>(pat[fp->fPatIdx]);
   3845                    fp->fPatIdx++;
   3846                    opType  = URX_TYPE(op);
   3847                    opValue = URX_VAL(op);
   3848                    U_ASSERT(opType == URX_STRING_LEN);
   3849                    int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   3850 
   3851 
   3852                    UChar32   cPattern;
   3853                    UChar32   cText;
   3854                    UBool     success = true;
   3855 
   3856                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   3857                    CaseFoldingUTextIterator inputIterator(*fInputText);
   3858                    while (patternStringIdx < patternStringLen) {
   3859                        if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
   3860                            success = false;
   3861                            fHitEnd = true;
   3862                            break;
   3863                        }
   3864                        U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   3865                        cText = inputIterator.next();
   3866                        if (cText != cPattern) {
   3867                            success = false;
   3868                            break;
   3869                        }
   3870                    }
   3871                    if (inputIterator.inExpansion()) {
   3872                        success = false;
   3873                    }
   3874 
   3875                    if (success) {
   3876                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3877                    } else {
   3878                        fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3879                    }
   3880                }
   3881            }
   3882            break;
   3883 
   3884        case URX_LB_START:
   3885            {
   3886                // Entering a look-behind block.
   3887                // Save Stack Ptr, Input Pos and active input region.
   3888                //   TODO:  implement transparent bounds.  Ticket #6067
   3889                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   3890                fData[opValue]   = fStack->size();
   3891                fData[opValue+1] = fp->fInputIdx;
   3892                // Save input string length, then reset to pin any matches to end at
   3893                //   the current position.
   3894                fData[opValue+2] = fActiveStart;
   3895                fData[opValue+3] = fActiveLimit;
   3896                fActiveStart     = fRegionStart;
   3897                fActiveLimit     = fp->fInputIdx;
   3898                // Init the variable containing the start index for attempted matches.
   3899                fData[opValue+4] = -1;
   3900            }
   3901            break;
   3902 
   3903 
   3904        case URX_LB_CONT:
   3905            {
   3906                // Positive Look-Behind, at top of loop checking for matches of LB expression
   3907                //    at all possible input starting positions.
   3908 
   3909                // Fetch the min and max possible match lengths.  They are the operands
   3910                //   of this op in the pattern.
   3911                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   3912                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   3913                if (!UTEXT_USES_U16(fInputText)) {
   3914                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
   3915                    // The max length need not be exact; it just needs to be >= actual maximum.
   3916                    maxML *= 3;
   3917                }
   3918                U_ASSERT(minML <= maxML);
   3919                U_ASSERT(minML >= 0);
   3920 
   3921                // Fetch (from data) the last input index where a match was attempted.
   3922                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   3923                int64_t  &lbStartIdx = fData[opValue+4];
   3924                if (lbStartIdx < 0) {
   3925                    // First time through loop.
   3926                    lbStartIdx = fp->fInputIdx - minML;
   3927                    if (lbStartIdx > 0) {
   3928                        // move index to a code point boundary, if it's not on one already.
   3929                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   3930                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3931                    }
   3932                } else {
   3933                    // 2nd through nth time through the loop.
   3934                    // Back up start position for match by one.
   3935                    if (lbStartIdx == 0) {
   3936                        (lbStartIdx)--;
   3937                    } else {
   3938                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   3939                        (void)UTEXT_PREVIOUS32(fInputText);
   3940                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   3941                    }
   3942                }
   3943 
   3944                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   3945                    // We have tried all potential match starting points without
   3946                    //  getting a match.  Backtrack out, and out of the
   3947                    //   Look Behind altogether.
   3948                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3949                    fActiveStart = fData[opValue+2];
   3950                    fActiveLimit = fData[opValue+3];
   3951                    U_ASSERT(fActiveStart >= 0);
   3952                    U_ASSERT(fActiveLimit <= fInputLength);
   3953                    break;
   3954                }
   3955 
   3956                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   3957                //      (successful match will fall off the end of the loop.)
   3958                fp = StateSave(fp, fp->fPatIdx-3, status);
   3959                fp->fInputIdx = lbStartIdx;
   3960            }
   3961            break;
   3962 
   3963        case URX_LB_END:
   3964            // End of a look-behind block, after a successful match.
   3965            {
   3966                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   3967                if (fp->fInputIdx != fActiveLimit) {
   3968                    //  The look-behind expression matched, but the match did not
   3969                    //    extend all the way to the point that we are looking behind from.
   3970                    //  FAIL out of here, which will take us back to the LB_CONT, which
   3971                    //     will retry the match starting at another position or fail
   3972                    //     the look-behind altogether, whichever is appropriate.
   3973                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   3974                    break;
   3975                }
   3976 
   3977                // Look-behind match is good.  Restore the original input string region,
   3978                //   which had been truncated to pin the end of the lookbehind match to the
   3979                //   position being looked-behind.
   3980                fActiveStart = fData[opValue+2];
   3981                fActiveLimit = fData[opValue+3];
   3982                U_ASSERT(fActiveStart >= 0);
   3983                U_ASSERT(fActiveLimit <= fInputLength);
   3984            }
   3985            break;
   3986 
   3987 
   3988        case URX_LBN_CONT:
   3989            {
   3990                // Negative Look-Behind, at top of loop checking for matches of LB expression
   3991                //    at all possible input starting positions.
   3992 
   3993                // Fetch the extra parameters of this op.
   3994                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   3995                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   3996                if (!UTEXT_USES_U16(fInputText)) {
   3997                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
   3998                    // The max length need not be exact; it just needs to be >= actual maximum.
   3999                    maxML *= 3;
   4000                }
   4001                int32_t continueLoc = static_cast<int32_t>(pat[fp->fPatIdx++]);
   4002                        continueLoc = URX_VAL(continueLoc);
   4003                U_ASSERT(minML <= maxML);
   4004                U_ASSERT(minML >= 0);
   4005                U_ASSERT(continueLoc > fp->fPatIdx);
   4006 
   4007                // Fetch (from data) the last input index where a match was attempted.
   4008                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   4009                int64_t  &lbStartIdx = fData[opValue+4];
   4010                if (lbStartIdx < 0) {
   4011                    // First time through loop.
   4012                    lbStartIdx = fp->fInputIdx - minML;
   4013                    if (lbStartIdx > 0) {
   4014                        // move index to a code point boundary, if it's not on one already.
   4015                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   4016                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4017                    }
   4018                } else {
   4019                    // 2nd through nth time through the loop.
   4020                    // Back up start position for match by one.
   4021                    if (lbStartIdx == 0) {
   4022                        (lbStartIdx)--;
   4023                    } else {
   4024                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
   4025                        (void)UTEXT_PREVIOUS32(fInputText);
   4026                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4027                    }
   4028                }
   4029 
   4030                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   4031                    // We have tried all potential match starting points without
   4032                    //  getting a match, which means that the negative lookbehind as
   4033                    //  a whole has succeeded.  Jump forward to the continue location
   4034                    fActiveStart = fData[opValue+2];
   4035                    fActiveLimit = fData[opValue+3];
   4036                    U_ASSERT(fActiveStart >= 0);
   4037                    U_ASSERT(fActiveLimit <= fInputLength);
   4038                    fp->fPatIdx = continueLoc;
   4039                    break;
   4040                }
   4041 
   4042                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   4043                //      (successful match will cause a FAIL out of the loop altogether.)
   4044                fp = StateSave(fp, fp->fPatIdx-4, status);
   4045                fp->fInputIdx = lbStartIdx;
   4046            }
   4047            break;
   4048 
   4049        case URX_LBN_END:
   4050            // End of a negative look-behind block, after a successful match.
   4051            {
   4052                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   4053                if (fp->fInputIdx != fActiveLimit) {
   4054                    //  The look-behind expression matched, but the match did not
   4055                    //    extend all the way to the point that we are looking behind from.
   4056                    //  FAIL out of here, which will take us back to the LB_CONT, which
   4057                    //     will retry the match starting at another position or succeed
   4058                    //     the look-behind altogether, whichever is appropriate.
   4059                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4060                    break;
   4061                }
   4062 
   4063                // Look-behind expression matched, which means look-behind test as
   4064                //   a whole Fails
   4065 
   4066                //   Restore the original input string length, which had been truncated
   4067                //   inorder to pin the end of the lookbehind match
   4068                //   to the position being looked-behind.
   4069                fActiveStart = fData[opValue+2];
   4070                fActiveLimit = fData[opValue+3];
   4071                U_ASSERT(fActiveStart >= 0);
   4072                U_ASSERT(fActiveLimit <= fInputLength);
   4073 
   4074                // Restore original stack position, discarding any state saved
   4075                //   by the successful pattern match.
   4076                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   4077                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
   4078                U_ASSERT(fStack->size() > newStackSize);
   4079                fStack->setSize(newStackSize);
   4080 
   4081                //  FAIL, which will take control back to someplace
   4082                //  prior to entering the look-behind test.
   4083                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4084            }
   4085            break;
   4086 
   4087 
   4088        case URX_LOOP_SR_I:
   4089            // Loop Initialization for the optimized implementation of
   4090            //     [some character set]*
   4091            //   This op scans through all matching input.
   4092            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4093            {
   4094                U_ASSERT(opValue > 0 && opValue < fSets->size());
   4095                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4096                UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
   4097 
   4098                // Loop through input, until either the input is exhausted or
   4099                //   we reach a character that is not a member of the set.
   4100                int64_t ix = fp->fInputIdx;
   4101                UTEXT_SETNATIVEINDEX(fInputText, ix);
   4102                for (;;) {
   4103                    if (ix >= fActiveLimit) {
   4104                        fHitEnd = true;
   4105                        break;
   4106                    }
   4107                    UChar32 c = UTEXT_NEXT32(fInputText);
   4108                    if (c<256) {
   4109                        if (s8->contains(c) == false) {
   4110                            break;
   4111                        }
   4112                    } else {
   4113                        if (s->contains(c) == false) {
   4114                            break;
   4115                        }
   4116                    }
   4117                    ix = UTEXT_GETNATIVEINDEX(fInputText);
   4118                }
   4119 
   4120                // If there were no matching characters, skip over the loop altogether.
   4121                //   The loop doesn't run at all, a * op always succeeds.
   4122                if (ix == fp->fInputIdx) {
   4123                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4124                    break;
   4125                }
   4126 
   4127                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4128                //   must follow.  It's operand is the stack location
   4129                //   that holds the starting input index for the match of this [set]*
   4130                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
   4131                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4132                int32_t stackLoc = URX_VAL(loopcOp);
   4133                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4134                fp->fExtra[stackLoc] = fp->fInputIdx;
   4135                fp->fInputIdx = ix;
   4136 
   4137                // Save State to the URX_LOOP_C op that follows this one,
   4138                //   so that match failures in the following code will return to there.
   4139                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4140                fp = StateSave(fp, fp->fPatIdx, status);
   4141                fp->fPatIdx++;
   4142            }
   4143            break;
   4144 
   4145 
   4146        case URX_LOOP_DOT_I:
   4147            // Loop Initialization for the optimized implementation of .*
   4148            //   This op scans through all remaining input.
   4149            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   4150            {
   4151                // Loop through input until the input is exhausted (we reach an end-of-line)
   4152                // In DOTALL mode, we can just go straight to the end of the input.
   4153                int64_t ix;
   4154                if ((opValue & 1) == 1) {
   4155                    // Dot-matches-All mode.  Jump straight to the end of the string.
   4156                    ix = fActiveLimit;
   4157                    fHitEnd = true;
   4158                } else {
   4159                    // NOT DOT ALL mode.  Line endings do not match '.'
   4160                    // Scan forward until a line ending or end of input.
   4161                    ix = fp->fInputIdx;
   4162                    UTEXT_SETNATIVEINDEX(fInputText, ix);
   4163                    for (;;) {
   4164                        if (ix >= fActiveLimit) {
   4165                            fHitEnd = true;
   4166                            break;
   4167                        }
   4168                        UChar32 c = UTEXT_NEXT32(fInputText);
   4169                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   4170                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   4171                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   4172                                    isLineTerminator(c))) {
   4173                                //  char is a line ending.  Exit the scanning loop.
   4174                                break;
   4175                            }
   4176                        }
   4177                        ix = UTEXT_GETNATIVEINDEX(fInputText);
   4178                    }
   4179                }
   4180 
   4181                // If there were no matching characters, skip over the loop altogether.
   4182                //   The loop doesn't run at all, a * op always succeeds.
   4183                if (ix == fp->fInputIdx) {
   4184                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
   4185                    break;
   4186                }
   4187 
   4188                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   4189                //   must follow.  It's operand is the stack location
   4190                //   that holds the starting input index for the match of this .*
   4191                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
   4192                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   4193                int32_t stackLoc = URX_VAL(loopcOp);
   4194                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   4195                fp->fExtra[stackLoc] = fp->fInputIdx;
   4196                fp->fInputIdx = ix;
   4197 
   4198                // Save State to the URX_LOOP_C op that follows this one,
   4199                //   so that match failures in the following code will return to there.
   4200                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   4201                fp = StateSave(fp, fp->fPatIdx, status);
   4202                fp->fPatIdx++;
   4203            }
   4204            break;
   4205 
   4206 
   4207        case URX_LOOP_C:
   4208            {
   4209                U_ASSERT(opValue>=0 && opValue<fFrameSize);
   4210                backSearchIndex = fp->fExtra[opValue];
   4211                U_ASSERT(backSearchIndex <= fp->fInputIdx);
   4212                if (backSearchIndex == fp->fInputIdx) {
   4213                    // We've backed up the input idx to the point that the loop started.
   4214                    // The loop is done.  Leave here without saving state.
   4215                    //  Subsequent failures won't come back here.
   4216                    break;
   4217                }
   4218                // Set up for the next iteration of the loop, with input index
   4219                //   backed up by one from the last time through,
   4220                //   and a state save to this instruction in case the following code fails again.
   4221                //   (We're going backwards because this loop emulates stack unwinding, not
   4222                //    the initial scan forward.)
   4223                U_ASSERT(fp->fInputIdx > 0);
   4224                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4225                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
   4226                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4227 
   4228                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
   4229                if (prevC == 0x0a &&
   4230                    fp->fInputIdx > backSearchIndex &&
   4231                    twoPrevC == 0x0d) {
   4232                    int32_t prevOp = static_cast<int32_t>(pat[fp->fPatIdx - 2]);
   4233                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   4234                        // .*, stepping back over CRLF pair.
   4235                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
   4236                    }
   4237                }
   4238 
   4239 
   4240                fp = StateSave(fp, fp->fPatIdx-1, status);
   4241            }
   4242            break;
   4243 
   4244 
   4245 
   4246        default:
   4247            // Trouble.  The compiled pattern contains an entry with an
   4248            //           unrecognized type tag.
   4249            UPRV_UNREACHABLE_ASSERT;
   4250            // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
   4251            // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
   4252            // See ICU-21669.
   4253            status = U_INTERNAL_PROGRAM_ERROR;
   4254        }
   4255 
   4256        if (U_FAILURE(status)) {
   4257            isMatch = false;
   4258            break;
   4259        }
   4260    }
   4261 
   4262 breakFromLoop:
   4263    fMatch = isMatch;
   4264    if (isMatch) {
   4265        fLastMatchEnd = fMatchEnd;
   4266        fMatchStart   = startIdx;
   4267        fMatchEnd     = fp->fInputIdx;
   4268    }
   4269 
   4270 #ifdef REGEX_RUN_DEBUG
   4271    if (fTraceDebug) {
   4272        if (isMatch) {
   4273            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
   4274        } else {
   4275            printf("No match\n\n");
   4276        }
   4277    }
   4278 #endif
   4279 
   4280    fFrame = fp;                // The active stack frame when the engine stopped.
   4281                                //   Contains the capture group results that we need to
   4282                                //    access later.
   4283 }
   4284 
   4285 
   4286 //--------------------------------------------------------------------------------
   4287 //
   4288 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
   4289 //                  assumption that the entire string is available in the UText's
   4290 //                  chunk buffer. For now, that means we can use int32_t indexes,
   4291 //                  except for anything that needs to be saved (like group starts
   4292 //                  and ends).
   4293 //
   4294 //                  startIdx:    begin matching at this index.
   4295 //                  toEnd:       if true, match must extend to end of the input region
   4296 //
   4297 //--------------------------------------------------------------------------------
   4298 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
   4299    UBool       isMatch  = false;      // True if the we have a match.
   4300 
   4301    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
   4302 
   4303    int32_t     op;                    // Operation from the compiled pattern, split into
   4304    int32_t     opType;                //    the opcode
   4305    int32_t     opValue;               //    and the operand value.
   4306 
   4307 #ifdef REGEX_RUN_DEBUG
   4308    if (fTraceDebug) {
   4309        printf("MatchAt(startIdx=%d)\n", startIdx);
   4310        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
   4311        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
   4312    }
   4313 #endif
   4314 
   4315    if (U_FAILURE(status)) {
   4316        return;
   4317    }
   4318 
   4319    //  Cache frequently referenced items from the compiled pattern
   4320    //
   4321    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
   4322 
   4323    const char16_t      *litText       = fPattern->fLiteralText.getBuffer();
   4324    UVector             *fSets         = fPattern->fSets;
   4325 
   4326    const char16_t      *inputBuf      = fInputText->chunkContents;
   4327 
   4328    fFrameSize = fPattern->fFrameSize;
   4329    REStackFrame        *fp            = resetStack();
   4330    if (U_FAILURE(fDeferredStatus)) {
   4331        status = fDeferredStatus;
   4332        return;
   4333    }
   4334 
   4335    fp->fPatIdx   = 0;
   4336    fp->fInputIdx = startIdx;
   4337 
   4338    // Zero out the pattern's static data
   4339    int32_t i;
   4340    for (i = 0; i<fPattern->fDataSize; i++) {
   4341        fData[i] = 0;
   4342    }
   4343 
   4344    //
   4345    //  Main loop for interpreting the compiled pattern.
   4346    //  One iteration of the loop per pattern operation performed.
   4347    //
   4348    for (;;) {
   4349        op = static_cast<int32_t>(pat[fp->fPatIdx]);
   4350        opType  = URX_TYPE(op);
   4351        opValue = URX_VAL(op);
   4352 #ifdef REGEX_RUN_DEBUG
   4353        if (fTraceDebug) {
   4354            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
   4355            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
   4356                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
   4357            fPattern->dumpOp(fp->fPatIdx);
   4358        }
   4359 #endif
   4360        fp->fPatIdx++;
   4361 
   4362        switch (opType) {
   4363 
   4364 
   4365        case URX_NOP:
   4366            break;
   4367 
   4368 
   4369        case URX_BACKTRACK:
   4370            // Force a backtrack.  In some circumstances, the pattern compiler
   4371            //   will notice that the pattern can't possibly match anything, and will
   4372            //   emit one of these at that point.
   4373            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4374            break;
   4375 
   4376 
   4377        case URX_ONECHAR:
   4378            if (fp->fInputIdx < fActiveLimit) {
   4379                UChar32 c;
   4380                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4381                if (c == opValue) {
   4382                    break;
   4383                }
   4384            } else {
   4385                fHitEnd = true;
   4386            }
   4387            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4388            break;
   4389 
   4390 
   4391        case URX_STRING:
   4392            {
   4393                // Test input against a literal string.
   4394                // Strings require two slots in the compiled pattern, one for the
   4395                //   offset to the string text, and one for the length.
   4396                int32_t   stringStartIdx = opValue;
   4397                int32_t   stringLen;
   4398 
   4399                op = static_cast<int32_t>(pat[fp->fPatIdx]); // Fetch the second operand
   4400                fp->fPatIdx++;
   4401                opType    = URX_TYPE(op);
   4402                stringLen = URX_VAL(op);
   4403                U_ASSERT(opType == URX_STRING_LEN);
   4404                U_ASSERT(stringLen >= 2);
   4405 
   4406                const char16_t * pInp = inputBuf + fp->fInputIdx;
   4407                const char16_t * pInpLimit = inputBuf + fActiveLimit;
   4408                const char16_t * pPat = litText+stringStartIdx;
   4409                const char16_t * pEnd = pInp + stringLen;
   4410                UBool success = true;
   4411                while (pInp < pEnd) {
   4412                    if (pInp >= pInpLimit) {
   4413                        fHitEnd = true;
   4414                        success = false;
   4415                        break;
   4416                    }
   4417                    if (*pInp++ != *pPat++) {
   4418                        success = false;
   4419                        break;
   4420                    }
   4421                }
   4422 
   4423                // If the pattern string ends with an unpaired lead surrogate that
   4424                // matched the lead surrogate of a valid pair in the input text,
   4425                // this does not count as a match.
   4426                if (success && U16_IS_LEAD(*(pInp-1)) &&
   4427                        pInp < pInpLimit && U16_IS_TRAIL(*(pInp))) {
   4428                    success = false;
   4429                }
   4430 
   4431                if (success) {
   4432                    fp->fInputIdx += stringLen;
   4433                } else {
   4434                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4435                }
   4436            }
   4437            break;
   4438 
   4439 
   4440        case URX_STATE_SAVE:
   4441            fp = StateSave(fp, opValue, status);
   4442            break;
   4443 
   4444 
   4445        case URX_END:
   4446            // The match loop will exit via this path on a successful match,
   4447            //   when we reach the end of the pattern.
   4448            if (toEnd && fp->fInputIdx != fActiveLimit) {
   4449                // The pattern matched, but not to the end of input.  Try some more.
   4450                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4451                break;
   4452            }
   4453            isMatch = true;
   4454            goto  breakFromLoop;
   4455 
   4456            // Start and End Capture stack frame variables are laid out like this:
   4457            //  fp->fExtra[opValue]  - The start of a completed capture group
   4458            //             opValue+1 - The end   of a completed capture group
   4459            //             opValue+2 - the start of a capture group whose end
   4460            //                          has not yet been reached (and might not ever be).
   4461        case URX_START_CAPTURE:
   4462            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4463            fp->fExtra[opValue+2] = fp->fInputIdx;
   4464            break;
   4465 
   4466 
   4467        case URX_END_CAPTURE:
   4468            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
   4469            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
   4470            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
   4471            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
   4472            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
   4473            break;
   4474 
   4475 
   4476        case URX_DOLLAR:                   //  $, test for End of line
   4477            //     or for position before new line at end of input
   4478            if (fp->fInputIdx < fAnchorLimit-2) {
   4479                // We are no where near the end of input.  Fail.
   4480                //   This is the common case.  Keep it first.
   4481                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4482                break;
   4483            }
   4484            if (fp->fInputIdx >= fAnchorLimit) {
   4485                // We really are at the end of input.  Success.
   4486                fHitEnd = true;
   4487                fRequireEnd = true;
   4488                break;
   4489            }
   4490 
   4491            // If we are positioned just before a new-line that is located at the
   4492            //   end of input, succeed.
   4493            if (fp->fInputIdx == fAnchorLimit-1) {
   4494                UChar32 c;
   4495                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
   4496 
   4497                if (isLineTerminator(c)) {
   4498                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4499                        // At new-line at end of input. Success
   4500                        fHitEnd = true;
   4501                        fRequireEnd = true;
   4502                        break;
   4503                    }
   4504                }
   4505            } else if (fp->fInputIdx == fAnchorLimit-2 &&
   4506                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
   4507                    fHitEnd = true;
   4508                    fRequireEnd = true;
   4509                    break;                         // At CR/LF at end of input.  Success
   4510            }
   4511 
   4512            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4513 
   4514            break;
   4515 
   4516 
   4517        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
   4518            if (fp->fInputIdx >= fAnchorLimit-1) {
   4519                // Either at the last character of input, or off the end.
   4520                if (fp->fInputIdx == fAnchorLimit-1) {
   4521                    // At last char of input.  Success if it's a new line.
   4522                    if (inputBuf[fp->fInputIdx] == 0x0a) {
   4523                        fHitEnd = true;
   4524                        fRequireEnd = true;
   4525                        break;
   4526                    }
   4527                } else {
   4528                    // Off the end of input.  Success.
   4529                    fHitEnd = true;
   4530                    fRequireEnd = true;
   4531                    break;
   4532                }
   4533            }
   4534 
   4535            // Not at end of input.  Back-track out.
   4536            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4537            break;
   4538 
   4539 
   4540        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
   4541            {
   4542                if (fp->fInputIdx >= fAnchorLimit) {
   4543                    // We really are at the end of input.  Success.
   4544                    fHitEnd = true;
   4545                    fRequireEnd = true;
   4546                    break;
   4547                }
   4548                // If we are positioned just before a new-line, succeed.
   4549                // It makes no difference where the new-line is within the input.
   4550                UChar32 c = inputBuf[fp->fInputIdx];
   4551                if (isLineTerminator(c)) {
   4552                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
   4553                    //  In multi-line mode, hitting a new-line just before the end of input does not
   4554                    //   set the hitEnd or requireEnd flags
   4555                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
   4556                        break;
   4557                    }
   4558                }
   4559                // not at a new line.  Fail.
   4560                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4561            }
   4562            break;
   4563 
   4564 
   4565        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
   4566            {
   4567                if (fp->fInputIdx >= fAnchorLimit) {
   4568                    // We really are at the end of input.  Success.
   4569                    fHitEnd = true;
   4570                    fRequireEnd = true;  // Java set requireEnd in this case, even though
   4571                    break;               //   adding a new-line would not lose the match.
   4572                }
   4573                // If we are not positioned just before a new-line, the test fails; backtrack out.
   4574                // It makes no difference where the new-line is within the input.
   4575                if (inputBuf[fp->fInputIdx] != 0x0a) {
   4576                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4577                }
   4578            }
   4579            break;
   4580 
   4581 
   4582        case URX_CARET:                    //  ^, test for start of line
   4583            if (fp->fInputIdx != fAnchorStart) {
   4584                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4585            }
   4586            break;
   4587 
   4588 
   4589        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
   4590            {
   4591                if (fp->fInputIdx == fAnchorStart) {
   4592                    // We are at the start input.  Success.
   4593                    break;
   4594                }
   4595                // Check whether character just before the current pos is a new-line
   4596                //   unless we are at the end of input
   4597                char16_t  c = inputBuf[fp->fInputIdx - 1];
   4598                if ((fp->fInputIdx < fAnchorLimit) &&
   4599                    isLineTerminator(c)) {
   4600                    //  It's a new-line.  ^ is true.  Success.
   4601                    //  TODO:  what should be done with positions between a CR and LF?
   4602                    break;
   4603                }
   4604                // Not at the start of a line.  Fail.
   4605                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4606            }
   4607            break;
   4608 
   4609 
   4610        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
   4611            {
   4612                U_ASSERT(fp->fInputIdx >= fAnchorStart);
   4613                if (fp->fInputIdx <= fAnchorStart) {
   4614                    // We are at the start input.  Success.
   4615                    break;
   4616                }
   4617                // Check whether character just before the current pos is a new-line
   4618                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
   4619                char16_t  c = inputBuf[fp->fInputIdx - 1];
   4620                if (c != 0x0a) {
   4621                    // Not at the start of a line.  Back-track out.
   4622                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4623                }
   4624            }
   4625            break;
   4626 
   4627        case URX_BACKSLASH_B:          // Test for word boundaries
   4628            {
   4629                UBool success = isChunkWordBoundary(static_cast<int32_t>(fp->fInputIdx));
   4630                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
   4631                if (!success) {
   4632                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4633                }
   4634            }
   4635            break;
   4636 
   4637 
   4638        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
   4639            {
   4640                UBool success = isUWordBoundary(fp->fInputIdx, status);
   4641                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
   4642                if (!success) {
   4643                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4644                }
   4645            }
   4646            break;
   4647 
   4648 
   4649        case URX_BACKSLASH_D:            // Test for decimal digit
   4650            {
   4651                if (fp->fInputIdx >= fActiveLimit) {
   4652                    fHitEnd = true;
   4653                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4654                    break;
   4655                }
   4656 
   4657                UChar32 c;
   4658                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4659                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
   4660                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
   4661                success ^= static_cast<UBool>(opValue != 0); // flip sense for \D
   4662                if (!success) {
   4663                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4664                }
   4665            }
   4666            break;
   4667 
   4668 
   4669        case URX_BACKSLASH_G:          // Test for position at end of previous match
   4670            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
   4671                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4672            }
   4673            break;
   4674 
   4675 
   4676        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
   4677            {
   4678                if (fp->fInputIdx >= fActiveLimit) {
   4679                    fHitEnd = true;
   4680                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4681                    break;
   4682                }
   4683                UChar32 c;
   4684                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4685                int8_t ctype = u_charType(c);
   4686                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
   4687                success ^= static_cast<UBool>(opValue != 0);  // flip sense for \H
   4688                if (!success) {
   4689                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4690                }
   4691            }
   4692            break;
   4693 
   4694 
   4695        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
   4696            {
   4697                if (fp->fInputIdx >= fActiveLimit) {
   4698                    fHitEnd = true;
   4699                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4700                    break;
   4701                }
   4702                UChar32 c;
   4703                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4704                if (isLineTerminator(c)) {
   4705                    if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
   4706                        // Check for CR/LF sequence. Consume both together when found.
   4707                        char16_t c2;
   4708                        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
   4709                        if (c2 != 0x0a) {
   4710                            U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
   4711                        }
   4712                    }
   4713                } else {
   4714                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4715                }
   4716            }
   4717            break;
   4718 
   4719 
   4720        case URX_BACKSLASH_V:         // Any single code point line ending.
   4721            {
   4722                if (fp->fInputIdx >= fActiveLimit) {
   4723                    fHitEnd = true;
   4724                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4725                    break;
   4726                }
   4727                UChar32 c;
   4728                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4729                UBool success = isLineTerminator(c);
   4730                success ^= static_cast<UBool>(opValue != 0); // flip sense for \V
   4731                if (!success) {
   4732                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4733                }
   4734            }
   4735            break;
   4736 
   4737 
   4738        case URX_BACKSLASH_X:
   4739            //  Match a Grapheme, as defined by Unicode UAX 29.
   4740 
   4741            // Fail if at end of input
   4742            if (fp->fInputIdx >= fActiveLimit) {
   4743                fHitEnd = true;
   4744                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4745                break;
   4746            }
   4747 
   4748            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
   4749            if (fp->fInputIdx >= fActiveLimit) {
   4750                fHitEnd = true;
   4751                fp->fInputIdx = fActiveLimit;
   4752            }
   4753            break;
   4754 
   4755 
   4756        case URX_BACKSLASH_Z:          // Test for end of Input
   4757            if (fp->fInputIdx < fAnchorLimit) {
   4758                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4759            } else {
   4760                fHitEnd = true;
   4761                fRequireEnd = true;
   4762            }
   4763            break;
   4764 
   4765 
   4766 
   4767        case URX_STATIC_SETREF:
   4768            {
   4769                // Test input character against one of the predefined sets
   4770                //    (Word Characters, for example)
   4771                // The high bit of the op value is a flag for the match polarity.
   4772                //    0:   success if input char is in set.
   4773                //    1:   success if input char is not in set.
   4774                if (fp->fInputIdx >= fActiveLimit) {
   4775                    fHitEnd = true;
   4776                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4777                    break;
   4778                }
   4779 
   4780                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
   4781                opValue &= ~URX_NEG_SET;
   4782                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4783 
   4784                UChar32 c;
   4785                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4786                if (c < 256) {
   4787                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
   4788                    if (s8.contains(c)) {
   4789                        success = !success;
   4790                    }
   4791                } else {
   4792                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
   4793                    if (s.contains(c)) {
   4794                        success = !success;
   4795                    }
   4796                }
   4797                if (!success) {
   4798                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4799                }
   4800            }
   4801            break;
   4802 
   4803 
   4804        case URX_STAT_SETREF_N:
   4805            {
   4806                // Test input character for NOT being a member of  one of
   4807                //    the predefined sets (Word Characters, for example)
   4808                if (fp->fInputIdx >= fActiveLimit) {
   4809                    fHitEnd = true;
   4810                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4811                    break;
   4812                }
   4813 
   4814                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
   4815 
   4816                UChar32  c;
   4817                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4818                if (c < 256) {
   4819                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
   4820                    if (s8.contains(c) == false) {
   4821                        break;
   4822                    }
   4823                } else {
   4824                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
   4825                    if (s.contains(c) == false) {
   4826                        break;
   4827                    }
   4828                }
   4829                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4830            }
   4831            break;
   4832 
   4833 
   4834        case URX_SETREF:
   4835            {
   4836                if (fp->fInputIdx >= fActiveLimit) {
   4837                    fHitEnd = true;
   4838                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4839                    break;
   4840                }
   4841 
   4842                U_ASSERT(opValue > 0 && opValue < fSets->size());
   4843 
   4844                // There is input left.  Pick up one char and test it for set membership.
   4845                UChar32  c;
   4846                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4847                if (c<256) {
   4848                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   4849                    if (s8->contains(c)) {
   4850                        // The character is in the set.  A Match.
   4851                        break;
   4852                    }
   4853                } else {
   4854                    UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
   4855                    if (s->contains(c)) {
   4856                        // The character is in the set.  A Match.
   4857                        break;
   4858                    }
   4859                }
   4860 
   4861                // the character wasn't in the set.
   4862                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4863            }
   4864            break;
   4865 
   4866 
   4867        case URX_DOTANY:
   4868            {
   4869                // . matches anything, but stops at end-of-line.
   4870                if (fp->fInputIdx >= fActiveLimit) {
   4871                    // At end of input.  Match failed.  Backtrack out.
   4872                    fHitEnd = true;
   4873                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4874                    break;
   4875                }
   4876 
   4877                // There is input left.  Advance over one char, unless we've hit end-of-line
   4878                UChar32  c;
   4879                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4880                if (isLineTerminator(c)) {
   4881                    // End of line in normal mode.   . does not match.
   4882                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4883                    break;
   4884                }
   4885            }
   4886            break;
   4887 
   4888 
   4889        case URX_DOTANY_ALL:
   4890            {
   4891                // . in dot-matches-all (including new lines) mode
   4892                if (fp->fInputIdx >= fActiveLimit) {
   4893                    // At end of input.  Match failed.  Backtrack out.
   4894                    fHitEnd = true;
   4895                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4896                    break;
   4897                }
   4898 
   4899                // There is input left.  Advance over one char, except if we are
   4900                //   at a cr/lf, advance over both of them.
   4901                UChar32 c;
   4902                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4903                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
   4904                    // In the case of a CR/LF, we need to advance over both.
   4905                    if (inputBuf[fp->fInputIdx] == 0x0a) {
   4906                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
   4907                    }
   4908                }
   4909            }
   4910            break;
   4911 
   4912 
   4913        case URX_DOTANY_UNIX:
   4914            {
   4915                // '.' operator, matches all, but stops at end-of-line.
   4916                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
   4917                if (fp->fInputIdx >= fActiveLimit) {
   4918                    // At end of input.  Match failed.  Backtrack out.
   4919                    fHitEnd = true;
   4920                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4921                    break;
   4922                }
   4923 
   4924                // There is input left.  Advance over one char, unless we've hit end-of-line
   4925                UChar32 c;
   4926                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   4927                if (c == 0x0a) {
   4928                    // End of line in normal mode.   '.' does not match the \n
   4929                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4930                }
   4931            }
   4932            break;
   4933 
   4934 
   4935        case URX_JMP:
   4936            fp->fPatIdx = opValue;
   4937            break;
   4938 
   4939        case URX_FAIL:
   4940            isMatch = false;
   4941            goto breakFromLoop;
   4942 
   4943        case URX_JMP_SAV:
   4944            U_ASSERT(opValue < fPattern->fCompiledPat->size());
   4945            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
   4946            fp->fPatIdx = opValue;                         // Then JMP.
   4947            break;
   4948 
   4949        case URX_JMP_SAV_X:
   4950            // This opcode is used with (x)+, when x can match a zero length string.
   4951            // Same as JMP_SAV, except conditional on the match having made forward progress.
   4952            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
   4953            //   data address of the input position at the start of the loop.
   4954            {
   4955                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
   4956                int32_t stoOp = static_cast<int32_t>(pat[opValue - 1]);
   4957                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
   4958                int32_t  frameLoc = URX_VAL(stoOp);
   4959                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
   4960                int32_t prevInputIdx = static_cast<int32_t>(fp->fExtra[frameLoc]);
   4961                U_ASSERT(prevInputIdx <= fp->fInputIdx);
   4962                if (prevInputIdx < fp->fInputIdx) {
   4963                    // The match did make progress.  Repeat the loop.
   4964                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
   4965                    fp->fPatIdx = opValue;
   4966                    fp->fExtra[frameLoc] = fp->fInputIdx;
   4967                }
   4968                // If the input position did not advance, we do nothing here,
   4969                //   execution will fall out of the loop.
   4970            }
   4971            break;
   4972 
   4973        case URX_CTR_INIT:
   4974            {
   4975                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   4976                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   4977 
   4978                // Pick up the three extra operands that CTR_INIT has, and
   4979                //    skip the pattern location counter past
   4980                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
   4981                fp->fPatIdx += 3;
   4982                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   4983                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
   4984                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
   4985                U_ASSERT(minCount>=0);
   4986                U_ASSERT(maxCount>=minCount || maxCount==-1);
   4987                U_ASSERT(loopLoc>=fp->fPatIdx);
   4988 
   4989                if (minCount == 0) {
   4990                    fp = StateSave(fp, loopLoc+1, status);
   4991                }
   4992                if (maxCount == -1) {
   4993                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
   4994                } else if (maxCount == 0) {
   4995                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   4996                }
   4997            }
   4998            break;
   4999 
   5000        case URX_CTR_LOOP:
   5001            {
   5002                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5003                int32_t initOp = static_cast<int32_t>(pat[opValue]);
   5004                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
   5005                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5006                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
   5007                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
   5008                (*pCounter)++;
   5009                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
   5010                    U_ASSERT(*pCounter == maxCount);
   5011                    break;
   5012                }
   5013                if (*pCounter >= minCount) {
   5014                    if (maxCount == -1) {
   5015                        // Loop has no hard upper bound.
   5016                        // Check that it is progressing through the input, break if it is not.
   5017                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   5018                        if (fp->fInputIdx == *pLastInputIdx) {
   5019                            break;
   5020                        } else {
   5021                            *pLastInputIdx = fp->fInputIdx;
   5022                        }
   5023                    }
   5024                    fp = StateSave(fp, fp->fPatIdx, status);
   5025                } else {
   5026                    // Increment time-out counter. (StateSave() does it if count >= minCount)
   5027                    fTickCounter--;
   5028                    if (fTickCounter <= 0) {
   5029                        IncrementTime(status);    // Re-initializes fTickCounter
   5030                    }
   5031                }
   5032                fp->fPatIdx = opValue + 4;    // Loop back.
   5033            }
   5034            break;
   5035 
   5036        case URX_CTR_INIT_NG:
   5037            {
   5038                // Initialize a non-greedy loop
   5039                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
   5040                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
   5041 
   5042                // Pick up the three extra operands that CTR_INIT_NG has, and
   5043                //    skip the pattern location counter past
   5044                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
   5045                fp->fPatIdx += 3;
   5046                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
   5047                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
   5048                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
   5049                U_ASSERT(minCount>=0);
   5050                U_ASSERT(maxCount>=minCount || maxCount==-1);
   5051                U_ASSERT(loopLoc>fp->fPatIdx);
   5052                if (maxCount == -1) {
   5053                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
   5054                }
   5055 
   5056                if (minCount == 0) {
   5057                    if (maxCount != 0) {
   5058                        fp = StateSave(fp, fp->fPatIdx, status);
   5059                    }
   5060                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
   5061                }
   5062            }
   5063            break;
   5064 
   5065        case URX_CTR_LOOP_NG:
   5066            {
   5067                // Non-greedy {min, max} loops
   5068                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
   5069                int32_t initOp = static_cast<int32_t>(pat[opValue]);
   5070                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
   5071                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
   5072                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
   5073                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
   5074 
   5075                (*pCounter)++;
   5076                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
   5077                    // The loop has matched the maximum permitted number of times.
   5078                    //   Break out of here with no action.  Matching will
   5079                    //   continue with the following pattern.
   5080                    U_ASSERT(*pCounter == maxCount);
   5081                    break;
   5082                }
   5083 
   5084                if (*pCounter < minCount) {
   5085                    // We haven't met the minimum number of matches yet.
   5086                    //   Loop back for another one.
   5087                    fp->fPatIdx = opValue + 4;    // Loop back.
   5088                    fTickCounter--;
   5089                    if (fTickCounter <= 0) {
   5090                        IncrementTime(status);    // Re-initializes fTickCounter
   5091                    }
   5092                } else {
   5093                    // We do have the minimum number of matches.
   5094 
   5095                    // If there is no upper bound on the loop iterations, check that the input index
   5096                    // is progressing, and stop the loop if it is not.
   5097                    if (maxCount == -1) {
   5098                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
   5099                        if (fp->fInputIdx == *pLastInputIdx) {
   5100                            break;
   5101                        }
   5102                        *pLastInputIdx = fp->fInputIdx;
   5103                    }
   5104 
   5105                    // Loop Continuation: we will fall into the pattern following the loop
   5106                    //   (non-greedy, don't execute loop body first), but first do
   5107                    //   a state save to the top of the loop, so that a match failure
   5108                    //   in the following pattern will try another iteration of the loop.
   5109                    fp = StateSave(fp, opValue + 4, status);
   5110                }
   5111            }
   5112            break;
   5113 
   5114        case URX_STO_SP:
   5115            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5116            fData[opValue] = fStack->size();
   5117            break;
   5118 
   5119        case URX_LD_SP:
   5120            {
   5121                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
   5122                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
   5123                U_ASSERT(newStackSize <= fStack->size());
   5124                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5125                if (newFP == reinterpret_cast<int64_t*>(fp)) {
   5126                    break;
   5127                }
   5128                int32_t j;
   5129                for (j=0; j<fFrameSize; j++) {
   5130                    newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
   5131                }
   5132                fp = reinterpret_cast<REStackFrame*>(newFP);
   5133                fStack->setSize(newStackSize);
   5134            }
   5135            break;
   5136 
   5137        case URX_BACKREF:
   5138            {
   5139                U_ASSERT(opValue < fFrameSize);
   5140                int64_t groupStartIdx = fp->fExtra[opValue];
   5141                int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5142                U_ASSERT(groupStartIdx <= groupEndIdx);
   5143                int64_t inputIndex = fp->fInputIdx;
   5144                if (groupStartIdx < 0) {
   5145                    // This capture group has not participated in the match thus far,
   5146                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
   5147                    break;
   5148                }
   5149                UBool success = true;
   5150                for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
   5151                    if (inputIndex >= fActiveLimit) {
   5152                        success = false;
   5153                        fHitEnd = true;
   5154                        break;
   5155                    }
   5156                    if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
   5157                        success = false;
   5158                        break;
   5159                    }
   5160                }
   5161                if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
   5162                        inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
   5163                    // Capture group ended with an unpaired lead surrogate.
   5164                    // Back reference is not permitted to match lead only of a surrogate pair.
   5165                    success = false;
   5166                }
   5167                if (success) {
   5168                    fp->fInputIdx = inputIndex;
   5169                } else {
   5170                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5171                }
   5172            }
   5173            break;
   5174 
   5175        case URX_BACKREF_I:
   5176            {
   5177                U_ASSERT(opValue < fFrameSize);
   5178                int64_t groupStartIdx = fp->fExtra[opValue];
   5179                int64_t groupEndIdx   = fp->fExtra[opValue+1];
   5180                U_ASSERT(groupStartIdx <= groupEndIdx);
   5181                if (groupStartIdx < 0) {
   5182                    // This capture group has not participated in the match thus far,
   5183                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
   5184                    break;
   5185                }
   5186                CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
   5187                CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
   5188 
   5189                //   Note: if the capture group match was of an empty string the backref
   5190                //         match succeeds.  Verified by testing:  Perl matches succeed
   5191                //         in this case, so we do too.
   5192 
   5193                UBool success = true;
   5194                for (;;) {
   5195                    UChar32 captureGroupChar = captureGroupItr.next();
   5196                    if (captureGroupChar == U_SENTINEL) {
   5197                        success = true;
   5198                        break;
   5199                    }
   5200                    UChar32 inputChar = inputItr.next();
   5201                    if (inputChar == U_SENTINEL) {
   5202                        success = false;
   5203                        fHitEnd = true;
   5204                        break;
   5205                    }
   5206                    if (inputChar != captureGroupChar) {
   5207                        success = false;
   5208                        break;
   5209                    }
   5210                }
   5211 
   5212                if (success && inputItr.inExpansion()) {
   5213                    // We obtained a match by consuming part of a string obtained from
   5214                    // case-folding a single code point of the input text.
   5215                    // This does not count as an overall match.
   5216                    success = false;
   5217                }
   5218 
   5219                if (success) {
   5220                    fp->fInputIdx = inputItr.getIndex();
   5221                } else {
   5222                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5223                }
   5224            }
   5225            break;
   5226 
   5227        case URX_STO_INP_LOC:
   5228            {
   5229                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
   5230                fp->fExtra[opValue] = fp->fInputIdx;
   5231            }
   5232            break;
   5233 
   5234        case URX_JMPX:
   5235            {
   5236                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
   5237                fp->fPatIdx += 1;
   5238                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
   5239                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
   5240                int32_t savedInputIdx = static_cast<int32_t>(fp->fExtra[dataLoc]);
   5241                U_ASSERT(savedInputIdx <= fp->fInputIdx);
   5242                if (savedInputIdx < fp->fInputIdx) {
   5243                    fp->fPatIdx = opValue;                               // JMP
   5244                } else {
   5245                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no progress in loop.
   5246                }
   5247            }
   5248            break;
   5249 
   5250        case URX_LA_START:
   5251            {
   5252                // Entering a look around block.
   5253                // Save Stack Ptr, Input Pos.
   5254                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
   5255                fData[opValue]   = fStack->size();
   5256                fData[opValue+1] = fp->fInputIdx;
   5257                fData[opValue+2] = fActiveStart;
   5258                fData[opValue+3] = fActiveLimit;
   5259                fActiveStart     = fLookStart;          // Set the match region change for
   5260                fActiveLimit     = fLookLimit;          //   transparent bounds.
   5261            }
   5262            break;
   5263 
   5264        case URX_LA_END:
   5265            {
   5266                // Leaving a look around block.
   5267                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
   5268                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
   5269                int32_t stackSize = fStack->size();
   5270                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
   5271                U_ASSERT(stackSize >= newStackSize);
   5272                if (stackSize > newStackSize) {
   5273                    // Copy the current top frame back to the new (cut back) top frame.
   5274                    //   This makes the capture groups from within the look-ahead
   5275                    //   expression available.
   5276                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
   5277                    int32_t j;
   5278                    for (j=0; j<fFrameSize; j++) {
   5279                        newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
   5280                    }
   5281                    fp = reinterpret_cast<REStackFrame*>(newFP);
   5282                    fStack->setSize(newStackSize);
   5283                }
   5284                fp->fInputIdx = fData[opValue+1];
   5285 
   5286                // Restore the active region bounds in the input string; they may have
   5287                //    been changed because of transparent bounds on a Region.
   5288                fActiveStart = fData[opValue+2];
   5289                fActiveLimit = fData[opValue+3];
   5290                U_ASSERT(fActiveStart >= 0);
   5291                U_ASSERT(fActiveLimit <= fInputLength);
   5292            }
   5293            break;
   5294 
   5295        case URX_ONECHAR_I:
   5296            if (fp->fInputIdx < fActiveLimit) {
   5297                UChar32 c;
   5298                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
   5299                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
   5300                    break;
   5301                }
   5302            } else {
   5303                fHitEnd = true;
   5304            }
   5305            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5306            break;
   5307 
   5308        case URX_STRING_I:
   5309            // Case-insensitive test input against a literal string.
   5310            // Strings require two slots in the compiled pattern, one for the
   5311            //   offset to the string text, and one for the length.
   5312            //   The compiled string has already been case folded.
   5313            {
   5314                const char16_t *patternString = litText + opValue;
   5315 
   5316                op = static_cast<int32_t>(pat[fp->fPatIdx]);
   5317                fp->fPatIdx++;
   5318                opType  = URX_TYPE(op);
   5319                opValue = URX_VAL(op);
   5320                U_ASSERT(opType == URX_STRING_LEN);
   5321                int32_t patternStringLen = opValue;  // Length of the string from the pattern.
   5322 
   5323                UChar32      cText;
   5324                UChar32      cPattern;
   5325                UBool        success = true;
   5326                int32_t      patternStringIdx  = 0;
   5327                CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
   5328                while (patternStringIdx < patternStringLen) {
   5329                    U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
   5330                    cText = inputIterator.next();
   5331                    if (cText != cPattern) {
   5332                        success = false;
   5333                        if (cText == U_SENTINEL) {
   5334                            fHitEnd = true;
   5335                        }
   5336                        break;
   5337                    }
   5338                }
   5339                if (inputIterator.inExpansion()) {
   5340                    success = false;
   5341                }
   5342 
   5343                if (success) {
   5344                    fp->fInputIdx = inputIterator.getIndex();
   5345                } else {
   5346                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5347                }
   5348            }
   5349            break;
   5350 
   5351        case URX_LB_START:
   5352            {
   5353                // Entering a look-behind block.
   5354                // Save Stack Ptr, Input Pos and active input region.
   5355                //   TODO:  implement transparent bounds.  Ticket #6067
   5356                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   5357                fData[opValue]   = fStack->size();
   5358                fData[opValue+1] = fp->fInputIdx;
   5359                // Save input string length, then reset to pin any matches to end at
   5360                //   the current position.
   5361                fData[opValue+2] = fActiveStart;
   5362                fData[opValue+3] = fActiveLimit;
   5363                fActiveStart     = fRegionStart;
   5364                fActiveLimit     = fp->fInputIdx;
   5365                // Init the variable containing the start index for attempted matches.
   5366                fData[opValue+4] = -1;
   5367            }
   5368            break;
   5369 
   5370 
   5371        case URX_LB_CONT:
   5372            {
   5373                // Positive Look-Behind, at top of loop checking for matches of LB expression
   5374                //    at all possible input starting positions.
   5375 
   5376                // Fetch the min and max possible match lengths.  They are the operands
   5377                //   of this op in the pattern.
   5378                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   5379                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   5380                U_ASSERT(minML <= maxML);
   5381                U_ASSERT(minML >= 0);
   5382 
   5383                // Fetch (from data) the last input index where a match was attempted.
   5384                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   5385                int64_t  &lbStartIdx = fData[opValue+4];
   5386                if (lbStartIdx < 0) {
   5387                    // First time through loop.
   5388                    lbStartIdx = fp->fInputIdx - minML;
   5389                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
   5390                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
   5391                    }
   5392                } else {
   5393                    // 2nd through nth time through the loop.
   5394                    // Back up start position for match by one.
   5395                    if (lbStartIdx == 0) {
   5396                        lbStartIdx--;
   5397                    } else {
   5398                        U16_BACK_1(inputBuf, 0, lbStartIdx);
   5399                    }
   5400                }
   5401 
   5402                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   5403                    // We have tried all potential match starting points without
   5404                    //  getting a match.  Backtrack out, and out of the
   5405                    //   Look Behind altogether.
   5406                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5407                    fActiveStart = fData[opValue+2];
   5408                    fActiveLimit = fData[opValue+3];
   5409                    U_ASSERT(fActiveStart >= 0);
   5410                    U_ASSERT(fActiveLimit <= fInputLength);
   5411                    break;
   5412                }
   5413 
   5414                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
   5415                //      (successful match will fall off the end of the loop.)
   5416                fp = StateSave(fp, fp->fPatIdx-3, status);
   5417                fp->fInputIdx =  lbStartIdx;
   5418            }
   5419            break;
   5420 
   5421        case URX_LB_END:
   5422            // End of a look-behind block, after a successful match.
   5423            {
   5424                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   5425                if (fp->fInputIdx != fActiveLimit) {
   5426                    //  The look-behind expression matched, but the match did not
   5427                    //    extend all the way to the point that we are looking behind from.
   5428                    //  FAIL out of here, which will take us back to the LB_CONT, which
   5429                    //     will retry the match starting at another position or fail
   5430                    //     the look-behind altogether, whichever is appropriate.
   5431                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5432                    break;
   5433                }
   5434 
   5435                // Look-behind match is good.  Restore the original input string region,
   5436                //   which had been truncated to pin the end of the lookbehind match to the
   5437                //   position being looked-behind.
   5438                fActiveStart = fData[opValue+2];
   5439                fActiveLimit = fData[opValue+3];
   5440                U_ASSERT(fActiveStart >= 0);
   5441                U_ASSERT(fActiveLimit <= fInputLength);
   5442            }
   5443            break;
   5444 
   5445 
   5446        case URX_LBN_CONT:
   5447            {
   5448                // Negative Look-Behind, at top of loop checking for matches of LB expression
   5449                //    at all possible input starting positions.
   5450 
   5451                // Fetch the extra parameters of this op.
   5452                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   5453                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
   5454                int32_t continueLoc = static_cast<int32_t>(pat[fp->fPatIdx++]);
   5455                continueLoc = URX_VAL(continueLoc);
   5456                U_ASSERT(minML <= maxML);
   5457                U_ASSERT(minML >= 0);
   5458                U_ASSERT(continueLoc > fp->fPatIdx);
   5459 
   5460                // Fetch (from data) the last input index where a match was attempted.
   5461                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   5462                int64_t  &lbStartIdx = fData[opValue+4];
   5463                if (lbStartIdx < 0) {
   5464                    // First time through loop.
   5465                    lbStartIdx = fp->fInputIdx - minML;
   5466                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
   5467                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
   5468                    }
   5469                } else {
   5470                    // 2nd through nth time through the loop.
   5471                    // Back up start position for match by one.
   5472                    if (lbStartIdx == 0) {
   5473                        lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
   5474                    } else {
   5475                        U16_BACK_1(inputBuf, 0, lbStartIdx);
   5476                    }
   5477                }
   5478 
   5479                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
   5480                    // We have tried all potential match starting points without
   5481                    //  getting a match, which means that the negative lookbehind as
   5482                    //  a whole has succeeded.  Jump forward to the continue location
   5483                    fActiveStart = fData[opValue+2];
   5484                    fActiveLimit = fData[opValue+3];
   5485                    U_ASSERT(fActiveStart >= 0);
   5486                    U_ASSERT(fActiveLimit <= fInputLength);
   5487                    fp->fPatIdx = continueLoc;
   5488                    break;
   5489                }
   5490 
   5491                //    Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
   5492                //      (successful match will cause a FAIL out of the loop altogether.)
   5493                fp = StateSave(fp, fp->fPatIdx-4, status);
   5494                fp->fInputIdx =  lbStartIdx;
   5495            }
   5496            break;
   5497 
   5498        case URX_LBN_END:
   5499            // End of a negative look-behind block, after a successful match.
   5500            {
   5501                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
   5502                if (fp->fInputIdx != fActiveLimit) {
   5503                    //  The look-behind expression matched, but the match did not
   5504                    //    extend all the way to the point that we are looking behind from.
   5505                    //  FAIL out of here, which will take us back to the LB_CONT, which
   5506                    //     will retry the match starting at another position or succeed
   5507                    //     the look-behind altogether, whichever is appropriate.
   5508                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5509                    break;
   5510                }
   5511 
   5512                // Look-behind expression matched, which means look-behind test as
   5513                //   a whole Fails
   5514 
   5515                //   Restore the original input string length, which had been truncated
   5516                //   in order to pin the end of the lookbehind match
   5517                //   to the position being looked-behind.
   5518                fActiveStart = fData[opValue+2];
   5519                fActiveLimit = fData[opValue+3];
   5520                U_ASSERT(fActiveStart >= 0);
   5521                U_ASSERT(fActiveLimit <= fInputLength);
   5522 
   5523                // Restore original stack position, discarding any state saved
   5524                //   by the successful pattern match.
   5525                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
   5526                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
   5527                U_ASSERT(fStack->size() > newStackSize);
   5528                fStack->setSize(newStackSize);
   5529 
   5530                //  FAIL, which will take control back to someplace
   5531                //  prior to entering the look-behind test.
   5532                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
   5533            }
   5534            break;
   5535 
   5536 
   5537        case URX_LOOP_SR_I:
   5538            // Loop Initialization for the optimized implementation of
   5539            //     [some character set]*
   5540            //   This op scans through all matching input.
   5541            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5542            {
   5543                U_ASSERT(opValue > 0 && opValue < fSets->size());
   5544                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
   5545                UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
   5546 
   5547                // Loop through input, until either the input is exhausted or
   5548                //   we reach a character that is not a member of the set.
   5549                int32_t ix = static_cast<int32_t>(fp->fInputIdx);
   5550                for (;;) {
   5551                    if (ix >= fActiveLimit) {
   5552                        fHitEnd = true;
   5553                        break;
   5554                    }
   5555                    UChar32   c;
   5556                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
   5557                    if (c<256) {
   5558                        if (s8->contains(c) == false) {
   5559                            U16_BACK_1(inputBuf, 0, ix);
   5560                            break;
   5561                        }
   5562                    } else {
   5563                        if (s->contains(c) == false) {
   5564                            U16_BACK_1(inputBuf, 0, ix);
   5565                            break;
   5566                        }
   5567                    }
   5568                }
   5569 
   5570                // If there were no matching characters, skip over the loop altogether.
   5571                //   The loop doesn't run at all, a * op always succeeds.
   5572                if (ix == fp->fInputIdx) {
   5573                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5574                    break;
   5575                }
   5576 
   5577                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5578                //   must follow.  Its operand is the stack location
   5579                //   that holds the starting input index for the match of this [set]*
   5580                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
   5581                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5582                int32_t stackLoc = URX_VAL(loopcOp);
   5583                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5584                fp->fExtra[stackLoc] = fp->fInputIdx;
   5585                fp->fInputIdx = ix;
   5586 
   5587                // Save State to the URX_LOOP_C op that follows this one,
   5588                //   so that match failures in the following code will return to there.
   5589                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5590                fp = StateSave(fp, fp->fPatIdx, status);
   5591                fp->fPatIdx++;
   5592            }
   5593            break;
   5594 
   5595 
   5596        case URX_LOOP_DOT_I:
   5597            // Loop Initialization for the optimized implementation of .*
   5598            //   This op scans through all remaining input.
   5599            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
   5600            {
   5601                // Loop through input until the input is exhausted or we reach an end-of-line.
   5602                // In DOTALL mode, we can just go straight to the end of the input.
   5603                int32_t ix;
   5604                if ((opValue & 1) == 1) {
   5605                    // Dot-matches-All mode.  Jump straight to the end of the string.
   5606                    ix = static_cast<int32_t>(fActiveLimit);
   5607                    fHitEnd = true;
   5608                } else {
   5609                    // NOT DOT ALL mode.  Line endings do not match '.'
   5610                    // Scan forward until a line ending or end of input.
   5611                    ix = static_cast<int32_t>(fp->fInputIdx);
   5612                    for (;;) {
   5613                        if (ix >= fActiveLimit) {
   5614                            fHitEnd = true;
   5615                            break;
   5616                        }
   5617                        UChar32   c;
   5618                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
   5619                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
   5620                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
   5621                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
   5622                                   isLineTerminator(c))) {
   5623                                //  char is a line ending.  Put the input pos back to the
   5624                                //    line ending char, and exit the scanning loop.
   5625                                U16_BACK_1(inputBuf, 0, ix);
   5626                                break;
   5627                            }
   5628                        }
   5629                    }
   5630                }
   5631 
   5632                // If there were no matching characters, skip over the loop altogether.
   5633                //   The loop doesn't run at all, a * op always succeeds.
   5634                if (ix == fp->fInputIdx) {
   5635                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
   5636                    break;
   5637                }
   5638 
   5639                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
   5640                //   must follow.  Its operand is the stack location
   5641                //   that holds the starting input index for the match of this .*
   5642                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
   5643                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
   5644                int32_t stackLoc = URX_VAL(loopcOp);
   5645                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
   5646                fp->fExtra[stackLoc] = fp->fInputIdx;
   5647                fp->fInputIdx = ix;
   5648 
   5649                // Save State to the URX_LOOP_C op that follows this one,
   5650                //   so that match failures in the following code will return to there.
   5651                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
   5652                fp = StateSave(fp, fp->fPatIdx, status);
   5653                fp->fPatIdx++;
   5654            }
   5655            break;
   5656 
   5657 
   5658        case URX_LOOP_C:
   5659            {
   5660                U_ASSERT(opValue>=0 && opValue<fFrameSize);
   5661                backSearchIndex = static_cast<int32_t>(fp->fExtra[opValue]);
   5662                U_ASSERT(backSearchIndex <= fp->fInputIdx);
   5663                if (backSearchIndex == fp->fInputIdx) {
   5664                    // We've backed up the input idx to the point that the loop started.
   5665                    // The loop is done.  Leave here without saving state.
   5666                    //  Subsequent failures won't come back here.
   5667                    break;
   5668                }
   5669                // Set up for the next iteration of the loop, with input index
   5670                //   backed up by one from the last time through,
   5671                //   and a state save to this instruction in case the following code fails again.
   5672                //   (We're going backwards because this loop emulates stack unwinding, not
   5673                //    the initial scan forward.)
   5674                U_ASSERT(fp->fInputIdx > 0);
   5675                UChar32 prevC;
   5676                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
   5677 
   5678                if (prevC == 0x0a &&
   5679                    fp->fInputIdx > backSearchIndex &&
   5680                    inputBuf[fp->fInputIdx-1] == 0x0d) {
   5681                    int32_t prevOp = static_cast<int32_t>(pat[fp->fPatIdx - 2]);
   5682                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
   5683                        // .*, stepping back over CRLF pair.
   5684                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
   5685                    }
   5686                }
   5687 
   5688 
   5689                fp = StateSave(fp, fp->fPatIdx-1, status);
   5690            }
   5691            break;
   5692 
   5693 
   5694 
   5695        default:
   5696            // Trouble.  The compiled pattern contains an entry with an
   5697            //           unrecognized type tag.
   5698            UPRV_UNREACHABLE_ASSERT;
   5699            // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
   5700            // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
   5701            // See ICU-21669.
   5702            status = U_INTERNAL_PROGRAM_ERROR;
   5703        }
   5704 
   5705        if (U_FAILURE(status)) {
   5706            isMatch = false;
   5707            break;
   5708        }
   5709    }
   5710 
   5711 breakFromLoop:
   5712    fMatch = isMatch;
   5713    if (isMatch) {
   5714        fLastMatchEnd = fMatchEnd;
   5715        fMatchStart   = startIdx;
   5716        fMatchEnd     = fp->fInputIdx;
   5717    }
   5718 
   5719 #ifdef REGEX_RUN_DEBUG
   5720    if (fTraceDebug) {
   5721        if (isMatch) {
   5722            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
   5723        } else {
   5724            printf("No match\n\n");
   5725        }
   5726    }
   5727 #endif
   5728 
   5729    fFrame = fp;                // The active stack frame when the engine stopped.
   5730                                //   Contains the capture group results that we need to
   5731                                //    access later.
   5732 }
   5733 
   5734 
   5735 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)  // NOTE(review): presumably emits RegexMatcher's ICU class-ID (RTTI) method definitions; the macro is defined outside this file -- confirm.
   5736 
   5737 U_NAMESPACE_END
   5738 
   5739 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS